Survival Case Study

Introduction

We reconsider the case study from Chapter 20 in Frank Harrell's book.

First, the data set is acquired.

con <- url("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/prostate.sav")
print(load(con)) ## Prints "prostate" if data loads correctly.
## [1] "prostate"
close(con)

Then we load the packages that are needed.

library(rms)      ### Harrell's R package ~~ the S-package is 'Design'
library(ggplot2)  ### Extensive graphics package
library(reshape2) ### For the 'melt' function
library(lattice)  ### More graphics (specifically, scatter plot matrices)
library(hexbin)   ### and more graphics
library(survival)
library(xtable)   ### for nice html tables

Descriptive statistics

The data set contains the following variables with the following class labels:

names(prostate)
##  [1] "patno"  "stage"  "rx"     "dtime"  "status" "age"    "wt"    
##  [8] "pf"     "hx"     "sbp"    "dbp"    "ekg"    "hg"     "sz"    
## [15] "sg"     "ap"     "bm"     "sdate"
sapply(prostate, function(x) class(x)[1])
##      patno      stage         rx      dtime     status        age 
## "labelled" "labelled"   "factor" "labelled"   "factor" "labelled" 
##         wt         pf         hx        sbp        dbp        ekg 
## "labelled"   "factor" "labelled" "labelled" "labelled"   "factor" 
##         hg         sz         sg         ap         bm      sdate 
## "labelled" "labelled" "labelled" "labelled" "labelled" "labelled"

Using the labels that Harrell has attached to the columns as attributes we can get more informative variable names.


sapply(prostate, function(x) {
  if(is.factor(x)) {
    attr(x, "levels")
  } else {
    attr(x, "label")
  }
}
       )
## $patno
## [1] "Patient Number"
## 
## $stage
## [1] "Stage"
## 
## $rx
## [1] "placebo"         "0.2 mg estrogen" "1.0 mg estrogen" "5.0 mg estrogen"
## 
## $dtime
## [1] "Months of Follow-up"
## 
## $status
##  [1] "alive"                        "dead - prostatic ca"         
##  [3] "dead - heart or vascular"     "dead - cerebrovascular"      
##  [5] "dead - pulmonary embolus"     "dead - other ca"             
##  [7] "dead - respiratory disease"   "dead - other specific non-ca"
##  [9] "dead - unspecified non-ca"    "dead - unknown cause"        
## 
## $age
## [1] "Age in Years"
## 
## $wt
## [1] "Weight Index = wt(kg)-ht(cm)+200"
## 
## $pf
## [1] "normal activity"      "in bed < 50% daytime" "in bed > 50% daytime"
## [4] "confined to bed"     
## 
## $hx
## [1] "History of Cardiovascular Disease"
## 
## $sbp
## [1] "Systolic Blood Pressure/10"
## 
## $dbp
## [1] "Diastolic Blood Pressure/10"
## 
## $ekg
## [1] "normal"                            "benign"                           
## [3] "rhythmic disturb & electrolyte ch" "heart block or conduction def"    
## [5] "heart strain"                      "old MI"                           
## [7] "recent MI"                        
## 
## $hg
## [1] "Serum Hemoglobin (g/100ml)"
## 
## $sz
## [1] "Size of Primary Tumor (cm^2)"
## 
## $sg
## [1] "Combined Index of Stage and Hist. Grade"
## 
## $ap
## [1] "Serum Prostatic Acid Phosphatase"
## 
## $bm
## [1] "Bone Metastases"
## 
## $sdate
## [1] "Date on study"

A standard R summary can also be obtained.

summary(prostate)
##      patno         stage                    rx          dtime     
##  Min.   :  1   Min.   :3.00   placebo        :127   Min.   : 0.0  
##  1st Qu.:126   1st Qu.:3.00   0.2 mg estrogen:124   1st Qu.:14.2  
##  Median :252   Median :3.00   1.0 mg estrogen:126   Median :34.0  
##  Mean   :252   Mean   :3.42   5.0 mg estrogen:125   Mean   :36.1  
##  3rd Qu.:377   3rd Qu.:4.00                         3rd Qu.:57.8  
##  Max.   :506   Max.   :4.00                         Max.   :76.0  
##                                                                   
##                           status         age             wt     
##  alive                       :148   Min.   :48.0   Min.   : 69  
##  dead - prostatic ca         :130   1st Qu.:70.0   1st Qu.: 90  
##  dead - heart or vascular    : 96   Median :73.0   Median : 98  
##  dead - cerebrovascular      : 31   Mean   :71.5   Mean   : 99  
##  dead - other specific non-ca: 28   3rd Qu.:76.0   3rd Qu.:107  
##  dead - other ca             : 25   Max.   :89.0   Max.   :152  
##  (Other)                     : 44   NA's   :1      NA's   :2    
##                     pf            hx             sbp            dbp       
##  normal activity     :450   Min.   :0.000   Min.   : 8.0   Min.   : 4.00  
##  in bed < 50% daytime: 37   1st Qu.:0.000   1st Qu.:13.0   1st Qu.: 7.00  
##  in bed > 50% daytime: 13   Median :0.000   Median :14.0   Median : 8.00  
##  confined to bed     :  2   Mean   :0.424   Mean   :14.4   Mean   : 8.15  
##                             3rd Qu.:1.000   3rd Qu.:16.0   3rd Qu.: 9.00  
##                             Max.   :1.000   Max.   :30.0   Max.   :18.00  
##                                                                           
##                                 ekg            hg             sz      
##  normal                           :168   Min.   : 5.9   Min.   : 0.0  
##  heart strain                     :150   1st Qu.:12.3   1st Qu.: 5.0  
##  old MI                           : 75   Median :13.7   Median :11.0  
##  rhythmic disturb & electrolyte ch: 51   Mean   :13.4   Mean   :14.6  
##  heart block or conduction def    : 26   3rd Qu.:14.7   3rd Qu.:21.0  
##  (Other)                          : 24   Max.   :21.2   Max.   :69.0  
##  NA's                             :  8                  NA's   :5     
##        sg             ap               bm            sdate     
##  Min.   : 5.0   Min.   :   0.1   Min.   :0.000   Min.   :2652  
##  1st Qu.: 9.0   1st Qu.:   0.5   1st Qu.:0.000   1st Qu.:2860  
##  Median :10.0   Median :   0.7   Median :0.000   Median :3021  
##  Mean   :10.3   Mean   :  12.2   Mean   :0.163   Mean   :3039  
##  3rd Qu.:11.0   3rd Qu.:   3.0   3rd Qu.:0.000   3rd Qu.:3204  
##  Max.   :15.0   Max.   : 999.9   Max.   :1.000   Max.   :3465  
##  NA's   :11

Then we look at the marginal distributions of the variables in terms of histograms and barplots.

To study the continuous variables we exclude the patient number, factors, and some of the numeric variables that are, in fact, dichotomous variables encoded as numerics and the date. Then we make histograms. Subsequently, to study the discrete variables, we consider barplots.

conVar <- c("dtime", "age", "wt", "sbp", "dbp","hg", "sz", "sg", "ap")
disVar <- c( "stage", "rx", "status", "pf", "hx", "ekg", "bm")

meltedProstate <- melt(prostate[, conVar])
qplot(value, data = meltedProstate, geom = "histogram",
      xlab = "", ylab = "") + 
  facet_wrap(~ variable, scales = "free")

plot of chunk histograms


meltedProstate <- melt(prostate[, disVar], id.vars = c())

qplot(value, data = meltedProstate, geom = "bar",
      xlab = "", ylab = "") + 
  facet_wrap(~ variable, scales = "free", nrow = 2) +
  theme(axis.text.x = element_text(angle = -30, 
         size = 8, hjust = 0, vjust = 1))

plot of chunk histograms

The number of missing values for each variable.

sapply(prostate, function(x) sum(is.na(x)))
##  patno  stage     rx  dtime status    age     wt     pf     hx    sbp 
##      0      0      0      0      0      1      2      0      0      0 
##    dbp    ekg     hg     sz     sg     ap     bm  sdate 
##      0      8      0      5     11      0      0      0

The scatter plot matrix.


  splom(na.omit(prostate[, conVar]),
        upper.panel = panel.hexbinplot,
        pscale = 0, 
        varname.cex = 0.7,
        nbins = 15,
        lower.panel = function(x, y) {
          panel.text(mean(range(x)), mean(range(y)), 
                     round(cor(x, y), digits = 2),
                     cex = 0.7
                     )
        }
        )

plot of chunk ScatterPlotMatrix

We will for now just drop the missing observations but see Harrell Chapter 8 for a treatment on imputation for this particular data set. In the following plots we study the relation between the main response (the followup time) and the explanatory variables. This is complicated by the censoring.

### Dropping observations with missing variables.

subProstate <- transform(na.omit(prostate),
                         patno = as.factor(patno),
                         dtime = as.numeric(dtime)
                         )

meltedProstate <- melt(subProstate[, c("patno", conVar, "status")], 
                       id.vars = c("patno", "dtime", "status")
                       )

qplot(value, dtime, data = meltedProstate,
      xlab = "", ylab = "", geom = "point", 
      colour = status != "alive", alpha = I(0.3), size = I(3)) + 
  facet_wrap(~ variable, scales = "free_x") 

plot of chunk margReg


meltedProstate <- melt(subProstate[, c("patno", "dtime", disVar)], 
                       id.vars = c("patno", "dtime", "status")
                       )

ggplot(meltedProstate, aes(value, dtime)) +
  geom_violin(scale = 'width', aes(fill = status != "alive"), alpha = 0.2) +
  facet_wrap(~ variable, scales = "free_x") +
  theme(axis.text.x = element_text(angle = -30, 
         size = 8, hjust = 0, vjust = 1), 
       legend.position = "top")

plot of chunk margReg

The figures shed some light on how survival is related marginally to the explanatory variables. To focus on the treatment we turn to a Kaplan-Meier estimator for each of the four treatment groups.

pbcSurv <- survfit(Surv(dtime, status != "alive") ~ rx, 
                                 data = prostate)
pbcSurv
## Call: survfit.formula(formula = Surv(dtime, status != "alive") ~ rx, 
##     data = prostate)
## 
##                    records n.max n.start events median 0.95LCL 0.95UCL
## rx=placebo             127   127     127     95   33.0      27      40
## rx=0.2 mg estrogen     124   124     124     95   30.0      26      41
## rx=1.0 mg estrogen     126   126     126     71   41.5      31      NA
## rx=5.0 mg estrogen     125   125     125     93   35.0      28      46
plot(pbcSurv, mark.time = FALSE, conf.int = FALSE, 
     col = c("red", "blue", "purple", "cyan"))

plot of chunk surv1

The plot suggests that those individuals treated with 1.0 mg estrogen has a longer survival time than the other three groups. Adding confidence bands (which will mesh up the reading of the figure in general) shows that the difference is significant.

The Cox model


form <- Surv(dtime, status != "alive") ~ rx + age + wt + pf + 
  hx + sbp + dbp + ekg +  hg + sz + sg + ap + bm
prostateCox <- coxph(form, data = subProstate)
print(xtable(coef(summary(prostateCox))), type = "html")
coef exp(coef) se(coef) z Pr(> |z|)
rx0.2 mg estrogen -0.03 0.97 0.15 -0.18 0.86
rx1.0 mg estrogen -0.31 0.73 0.17 -1.85 0.06
rx5.0 mg estrogen -0.00 1.00 0.16 -0.00 1.00
age 0.02 1.03 0.01 2.72 0.01
wt -0.01 0.99 0.00 -1.98 0.05
pfin bed < 50% daytime 0.29 1.34 0.21 1.40 0.16
pfin bed > 50% daytime 0.43 1.54 0.33 1.31 0.19
pfconfined to bed 1.60 4.94 0.81 1.96 0.05
hx 0.50 1.66 0.12 4.21 0.00
sbp -0.03 0.97 0.03 -1.03 0.30
dbp 0.04 1.04 0.05 0.83 0.41
ekgbenign 0.06 1.06 0.28 0.22 0.83
ekgrhythmic disturb & electrolyte ch 0.34 1.41 0.19 1.77 0.08
ekgheart block or conduction def -0.10 0.91 0.29 -0.34 0.74
ekgheart strain 0.45 1.57 0.14 3.17 0.00
ekgold MI 0.06 1.07 0.18 0.35 0.72
ekgrecent MI 0.89 2.43 1.02 0.87 0.38
hg -0.07 0.93 0.03 -2.18 0.03
sz 0.02 1.02 0.00 3.83 0.00
sg 0.07 1.07 0.03 2.25 0.02
ap -0.00 1.00 0.00 -1.57 0.12
bm 0.27 1.31 0.17 1.60 0.11
print(xtable(drop1(prostateCox, test = "Chisq")), type = "html")
Df AIC LRT Pr(> Chi)
< none> 3730.75
rx 3 3729.65 4.91 0.1789
age 1 3736.54 7.80 0.0052
wt 1 3732.73 3.99 0.0459
pf 3 3730.35 5.61 0.1325
hx 1 3746.31 17.57 0.0000
sbp 1 3729.82 1.08 0.2994
dbp 1 3729.44 0.69 0.4064
ekg 6 3732.11 13.37 0.0376
hg 1 3733.43 4.68 0.0305
sz 1 3742.43 13.68 0.0002
sg 1 3733.73 4.99 0.0255
ap 1 3731.64 2.89 0.0891
bm 1 3731.24 2.49 0.1142
prostateData <- subProstate[, c("patno", all.vars(form)[-c(1, 2)])]
predProstate <- cbind(prostateData, 
                      data.frame(mgres = residuals(prostateCox))
                      )

meltedPredProstate <- melt(predProstate, id.vars = c("patno", "mgres"),
                           measure.vars = conVar[conVar %in% all.vars(form)[-c(1, 2)]])

qplot(value, mgres, data = meltedPredProstate,
      xlab = "", ylab = "", geom = "point", alpha = I(0.3), size = I(3)) + 
  facet_wrap(~ variable, scales = "free_x") + 
  geom_smooth(size = 1, fill = "blue") + 
  coord_cartesian(ylim = c(-5, 2))

plot of chunk residualPlots



meltedPredProstate <- melt(predProstate, id.vars = c("patno", "mgres"),
                           measure.vars = disVar[disVar %in% all.vars(form)[-c(1, 2)]])

qplot(value, mgres, data = meltedPredProstate,
      xlab = "", ylab = "", geom = "boxplot") + 
  facet_wrap(~ variable, scales = "free_x") +
    opts(axis.text.x = theme_text(angle = -30, 
         size = 8, hjust = 0, vjust = 1))

plot of chunk residualPlots

form <- Surv(dtime, status != "alive") ~ rx + rcs(age, 4) + rcs(wt, 4) + pf + hx + 
  rcs(sbp, 4) + rcs(dbp, 4) + ekg +  rcs(hg, 4) + rcs(sz, 4) + rcs(sg, 4) + 
  rcs(ap, 4) + bm
prostateCox2 <- coxph(form, data = subProstate)
print(xtable(drop1(prostateCox2, test = "Chisq")), type = "html")
Df AIC LRT Pr(> Chi)
< none> 3727.40
rx 3 3724.80 3.40 0.3334
rcs(age, 4) 3 3737.25 15.85 0.0012
rcs(wt, 4) 3 3728.44 7.05 0.0704
pf 3 3729.94 8.54 0.0360
hx 1 3742.48 17.09 0.0000
rcs(sbp, 4) 3 3722.89 1.49 0.6847
rcs(dbp, 4) 3 3728.42 7.02 0.0712
ekg 6 3728.87 13.48 0.0360
rcs(hg, 4) 3 3730.63 9.23 0.0264
rcs(sz, 4) 3 3733.44 12.04 0.0073
rcs(sg, 4) 3 3725.26 3.87 0.2762
rcs(ap, 4) 3 3732.55 11.15 0.0109
bm 1 3725.76 0.36 0.5479
predictProstate <- predict(prostateCox2, type = "terms", se.fit = TRUE)
predictData <- melt(subProstate[, c("patno", conVar[-1])], 
                    id.vars = "patno")

selectedTerms <- which(all.vars(form)[-c(1,2)] %in% conVar)

plotData <- cbind(predictData,
                  se = melt(predictProstate$se.fit[, selectedTerms])$value,
                  y = melt(predictProstate$fit[, selectedTerms])$value
                  )

ggplot(plotData, aes(x = value), xlab = "", ylab = "") +
  geom_ribbon(aes(ymin = y - 2*se, ymax = y + 2*se),
              alpha = I(0.2),
              fill = I("blue")) +
  geom_line(aes(y = y), size = 1) +
  geom_rug(data = predictData,
           aes(x = value, y = NULL), 
           colour = I("black"), 
           alpha = I(0.1)
           ) +
  facet_wrap(~ variable, nrow = 2, scale = "free") 

plot of chunk termplot

predictData <- melt(subProstate[, c("patno", disVar[-c(1, 3)])], 
                    id.vars = "patno")

selectedTerms <- which(all.vars(form)[-c(1, 2)] %in% disVar)

plotData <- cbind(predictData,
                  se = melt(predictProstate$se.fit[, selectedTerms])$value,
                  y = melt(predictProstate$fit[, selectedTerms])$value
                  )[!duplicated(interaction(predictData[, c("variable", "value")])), ]

ggplot(plotData, aes(x = value), xlab = "", ylab = "") +
  geom_errorbar(aes(ymin = y - 2*se, ymax = y + 2*se),
                colour = I("blue")) +
  geom_point(aes(y = y), size = 3) +
  facet_wrap(~ variable, nrow = 2, scale = "free") +
  theme(axis.text.x = element_text(angle = -30, 
         size = 8, hjust = 0, vjust = 1))

plot of chunk termplot2