#-----------------------------------------------------------------------
#
#  Nordic Summer School in Cancer Epidemiology, Copenhagen, 2019
#
#  Exercises on measures of frequency and effects
#  R solutions for Wednesday 14 August -- Esa Läärä
#
#-----------------------------------------------------------------------

#  Exercise 2.7: Standardization: colon cancer
#  -------------------------------------------

#  Item 1.
#---------

#  Crude rates are obtained simply from the bottom line figures:
#  males 23.5, females 27.6, both per 100000 y, their ratio 0.85
crM <- 592/(25.23)
crF <- 732/(26.48)
crRR <- crM/crF
round( c(crM, crF, crRR), 2)

#  Item 2.
#---------

#  Age-specific rates and weights are assigned into own vectors
ratesF.a <- c(2.0, 8.6, 55, 155)
ratesM.a <- c(0.9, 9.4, 67, 196)
wM <- c(46, 32, 18, 4)

#  Weighted averages are obtained by function sum()
stdRateM_wM <- sum( wM * ratesM.a )/sum( wM )
stdRateF_wM <- sum( wM * ratesF.a )/sum( wM )
ratio_wM <- stdRateM_wM/stdRateF_wM

#  Rates (per 100000 y): men 23.3, women 19.8; ratio 1.18
round( c(stdRateM_wM, stdRateF_wM, ratio_wM), 2)


#  Item 3.
#---------

#  Weights from the World standard population: take sums
#  of the weights of the 5-year age groups belonging to each
#  of the broader age bands
wW <- c(62, 23, 13, 2)
stdRateM_wW <- sum( wW * ratesM.a )/sum( wW )
stdRateF_wW <- sum( wW * ratesF.a )/sum( wW )
ratio_wW <- stdRateM_wW/stdRateF_wW 

#  Rates (per 100000 y): men 15.4, women 13.5; ratio 1.14
round( c(stdRateM_wW, stdRateF_wW, ratio_wW), 2)

#  Item 4.
#---------

#  Widths (in years) of the age intervals to be covered:
wC <- c(35, 20, 20, 0)

#  In the calculations express rates as per year!
cumRateM <- sum( wC * ratesM.a/100000 )
cumRateF <- sum( wC * ratesF.a/100000 )
cumRateR <- cumRateM/cumRateF

#  Cum. rates (per 100): men 1.56, women 1.34, ratio 1.16
round( c(100*cumRateM, 100*cumRateF, cumRateR), 2)

#  Item 5.
#---------

#  Just take the negative exponential transformation of the
#  cumulative rates
cumRiskM <- 1 - exp( - cumRateM )
cumRiskF <- 1 - exp( - cumRateF )
cumRiskR <- cumRiskM/cumRiskF
#  Cum. risks (%): men 1.55, women 1.33, ratio 1.16
round( c(100*cumRiskM, 100*cumRiskF, cumRiskR), 2)

#---------------------------------------------------------------------

#  Exercise 2.9: Survival of tongue cancer
#  ---------------------------------------

#  Item 1.
#---------

#  Assign the sizes of risk sets, numbers of deaths and 
#  censorings (losses) into their own vectors
Nk <- c(130, 78, 45, 33, 25, 19, 12)
Dk <- c(45, 24, 5, 2, 1, 0, 0)
Lk <- c(7, 9, 7, 6, 5, 7, 6)

#  Compute effective denominators, Nk.e, conditional proportions
#  of death, qk, conditional survival proportions, pk, and
#  cumulative survival proportions Sk for all intervals
Nk.e <- Nk - Lk/2
qk <- Dk/Nk.e
pk <- 1 - qk
Sk <- cumprod( pk )

#  Present all of these columns in one table, rounding the
#  proportions to 3 decimal points
data.frame( Nk, Dk, Lk, Nk.e, 
      qk = round(qk, 3), pk = round(pk, 3), Sk = round(Sk,3))

#  Item 2.
#---------

#  Survival curves start at time zero on the x-axis, at which 
#  point the survival proportion is 1 (or 100 %). Therefore, 
#  the vector containing survival proportions must be added
#  "from the left" by value 1 before one starts plotting the
#  curve.

#  The basic plotting function in R is 'plot()'. The first
#  argument 0:7 denotes the sequence of time points 0, 1, 2, ...,
#  6, 7, at which survival proportions are computed. The
#  second argument contains these proportions ordered according
#  to time. Argument 'pch' specifies the plotting character
#  (16 referring to bullet), and type = "o" means that both the
#  points and the line segments between the points are plotted.
#  Argument ylim= c(0,1.05) specifies the range of the y-axis.
plot( 0:7, c(1,Sk), pch=16, type="o", ylim=c(0,1.05), yaxs="i",
      main = "Survival of tongue cancer patients",
      ylab = "Survival proportion", xlab="Years since diagnosis" )

#  In the following, horizontal reference lines are drawn at
#  levels 0.25, 0.5 and 0.75 to facilitate graphical estimation
#  of the median and quartiles of the survival time distribution 
abline( h=c(1:3/4), col="gray", lwd = 0.5)
segments( c(0.72, 1.7), c(0,0), c(0.72, 1.7), c(0.75, 0.5), 
          lty = 3)
 
#  The median survival time is about 1.7 years, and the lower
#  quartile about 0.72 y ~ 8.6 months.

#  HOMEWORK: Try running otherwise the same lines of plotting 
#  script but changing type="b". See the result. Next, run it again 
#  but with type="l" ("ell", not "one"). Repeat the exercise
#  but now write type="p". -- Which one you would prefer? 

#---------------------------------------------------------------------

#  Exercise 2.11: Lexis diagram
#  ----------------------------

#  Item 1.
#---------

#  Organize the age-specific person-years and cases into vectors,
#  for each calendar period separately. Within these vectors the
#  total person-years in each cell can be left for the program to 
#  compute, because those totals may be expressed as sums of the
#  individual person-times included in that cell
D90_94 <- c(0, 0, 1)           # 1990-94
Y90_94 <- c(4+4+2+1, 2+3+1, 3+3)
D95_99 <- c(1, 0, 1)           # 1995-99
Y95_99 <- c(1+2.5+3+3, 2.2+1.5+4.5+4, 2+3+2+1.6)
D00_04 <- c(0, 2, 1)           # 2000-04
Y00_04 <- c(2+4, 0.5+3+4+3, 1.4+1.8+1.0)
D05_09 <- c(0, 0, 1)           # 2005_09
Y05_09 <- c(0, 2+3.7, 3.1+3)
AgeGrp <- c("40-44", "45-49", "50-54")
data.frame( AgeGrp, D90_94, Y90_94, D95_99, Y95_99,
                    D00_04, Y00_04, D05_09, Y05_09)

#  Item 2.
#---------

#  Do the same thing for the 10-year birth cohort 1952-61
Dcoh <- c(1, 1, 1)
Ycoh <- c(16.5, 15.7, 7.1)
data.frame(AgeGrp, Dcoh, Ycoh)

#  Item 3.
#---------

#  As the width of all age intervals is the same,
#  we obtain a weighted sum of the rates easily:
cumRate <- sum( 5*Dcoh/Ycoh ); cumRate

#  The cumulative rate is 1.31, i.e. greater than 1!
#  This is not a problem for a cumulative rate. 
#  The estimated cumulative risk, however, stays btw 0 and 1
cumRisk <- 1 - exp( - cumRate) ; cumRisk

#  Item 4.
#---------

#  For this exercise it is actually more practical to organize 
#  the person-years differently from item 1, i.e. so that
#  we collect the period-specific person-years of each age group
#  into own vectors
Y40_44 <- c(11, 9.5, 6, 0)
Y45_49 <- c(6, 12.2, 10.5, 5.7)
Y50_54 <- c(6, 8.6, 4.2, 6.1)

#  If the age-specific reference rates are the same over time
#  we may pool the age-specific person-times over the periods
#  when computing the total expected number Exp = 0.1945 
Exp <- (100/100000)*sum(Y40_44) + (200/100000)*sum(Y45_49) +
       (400/100000)*sum(Y50_54) 
Exp 

#  The observed number Obs = 7 is obtained by summing the numbers
#  of cases over all ages and periods.

Obs <- sum(D90_94) + sum(D95_99) + sum(D00_04) + sum(D05_09)
Obs

#  The standardized incidence ratio SIR = 7/0.1945 = 35.9 -- High!
SIR <- Obs/Exp 
SIR

#  Comment: With real cohort data one would work with a file
#  of individual records containing the key dates: date of birth,
#  date of entry, date of exit, and status at exit. The individual
#  follow-up times would then be split by age and calendar time
#  (or perhaps by cohort) using the splitLexis() function in the 
#  Epi package of R. After splitting one would proceed by 
#  tabulating the sums of follow-up times and nummbers of cases
#  by age and period, etc.

#-------------------------------------------------------------------------