We reconsider the case study from Chapter 20 in Frank Harrell's book.
First, the data set is acquired.
con <- url("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/prostate.sav")
print(load(con)) ## Prints "prostate" if data loads correctly.
## [1] "prostate"
close(con)
Then we load the packages that are needed.
library(rms) ### Harrell's R package ~~ the S-package is 'Design'
library(ggplot2) ### Extensive graphics package
library(reshape2) ### For the 'melt' function
library(lattice) ### More graphics (specifically, scatter plot matrices)
library(hexbin) ### and more graphics
library(survival)
library(xtable) ### for nice html tables
The data set contains the following variables with the following class labels:
names(prostate)
## [1] "patno" "stage" "rx" "dtime" "status" "age" "wt"
## [8] "pf" "hx" "sbp" "dbp" "ekg" "hg" "sz"
## [15] "sg" "ap" "bm" "sdate"
sapply(prostate, function(x) class(x)[1])
## patno stage rx dtime status age
## "labelled" "labelled" "factor" "labelled" "factor" "labelled"
## wt pf hx sbp dbp ekg
## "labelled" "factor" "labelled" "labelled" "labelled" "factor"
## hg sz sg ap bm sdate
## "labelled" "labelled" "labelled" "labelled" "labelled" "labelled"
Using the labels that Harrell has attached to the columns as attributes we can get more informative variable names.
sapply(prostate, function(x) {
if(is.factor(x)) {
attr(x, "levels")
} else {
attr(x, "label")
}
}
)
## $patno
## [1] "Patient Number"
##
## $stage
## [1] "Stage"
##
## $rx
## [1] "placebo" "0.2 mg estrogen" "1.0 mg estrogen" "5.0 mg estrogen"
##
## $dtime
## [1] "Months of Follow-up"
##
## $status
## [1] "alive" "dead - prostatic ca"
## [3] "dead - heart or vascular" "dead - cerebrovascular"
## [5] "dead - pulmonary embolus" "dead - other ca"
## [7] "dead - respiratory disease" "dead - other specific non-ca"
## [9] "dead - unspecified non-ca" "dead - unknown cause"
##
## $age
## [1] "Age in Years"
##
## $wt
## [1] "Weight Index = wt(kg)-ht(cm)+200"
##
## $pf
## [1] "normal activity" "in bed < 50% daytime" "in bed > 50% daytime"
## [4] "confined to bed"
##
## $hx
## [1] "History of Cardiovascular Disease"
##
## $sbp
## [1] "Systolic Blood Pressure/10"
##
## $dbp
## [1] "Diastolic Blood Pressure/10"
##
## $ekg
## [1] "normal" "benign"
## [3] "rhythmic disturb & electrolyte ch" "heart block or conduction def"
## [5] "heart strain" "old MI"
## [7] "recent MI"
##
## $hg
## [1] "Serum Hemoglobin (g/100ml)"
##
## $sz
## [1] "Size of Primary Tumor (cm^2)"
##
## $sg
## [1] "Combined Index of Stage and Hist. Grade"
##
## $ap
## [1] "Serum Prostatic Acid Phosphatase"
##
## $bm
## [1] "Bone Metastases"
##
## $sdate
## [1] "Date on study"
A standard R summary can also be obtained.
summary(prostate)
## patno stage rx dtime
## Min. : 1 Min. :3.00 placebo :127 Min. : 0.0
## 1st Qu.:126 1st Qu.:3.00 0.2 mg estrogen:124 1st Qu.:14.2
## Median :252 Median :3.00 1.0 mg estrogen:126 Median :34.0
## Mean :252 Mean :3.42 5.0 mg estrogen:125 Mean :36.1
## 3rd Qu.:377 3rd Qu.:4.00 3rd Qu.:57.8
## Max. :506 Max. :4.00 Max. :76.0
##
## status age wt
## alive :148 Min. :48.0 Min. : 69
## dead - prostatic ca :130 1st Qu.:70.0 1st Qu.: 90
## dead - heart or vascular : 96 Median :73.0 Median : 98
## dead - cerebrovascular : 31 Mean :71.5 Mean : 99
## dead - other specific non-ca: 28 3rd Qu.:76.0 3rd Qu.:107
## dead - other ca : 25 Max. :89.0 Max. :152
## (Other) : 44 NA's :1 NA's :2
## pf hx sbp dbp
## normal activity :450 Min. :0.000 Min. : 8.0 Min. : 4.00
## in bed < 50% daytime: 37 1st Qu.:0.000 1st Qu.:13.0 1st Qu.: 7.00
## in bed > 50% daytime: 13 Median :0.000 Median :14.0 Median : 8.00
## confined to bed : 2 Mean :0.424 Mean :14.4 Mean : 8.15
## 3rd Qu.:1.000 3rd Qu.:16.0 3rd Qu.: 9.00
## Max. :1.000 Max. :30.0 Max. :18.00
##
## ekg hg sz
## normal :168 Min. : 5.9 Min. : 0.0
## heart strain :150 1st Qu.:12.3 1st Qu.: 5.0
## old MI : 75 Median :13.7 Median :11.0
## rhythmic disturb & electrolyte ch: 51 Mean :13.4 Mean :14.6
## heart block or conduction def : 26 3rd Qu.:14.7 3rd Qu.:21.0
## (Other) : 24 Max. :21.2 Max. :69.0
## NA's : 8 NA's :5
## sg ap bm sdate
## Min. : 5.0 Min. : 0.1 Min. :0.000 Min. :2652
## 1st Qu.: 9.0 1st Qu.: 0.5 1st Qu.:0.000 1st Qu.:2860
## Median :10.0 Median : 0.7 Median :0.000 Median :3021
## Mean :10.3 Mean : 12.2 Mean :0.163 Mean :3039
## 3rd Qu.:11.0 3rd Qu.: 3.0 3rd Qu.:0.000 3rd Qu.:3204
## Max. :15.0 Max. : 999.9 Max. :1.000 Max. :3465
## NA's :11
Then we look at the marginal distributions of the variables in terms of histograms and barplots.
To study the continuous variables we exclude the patient number, factors, and some of the numeric variables that are, in fact, dichotomous variables encoded as numerics and the date. Then we make histograms. Subsequently, to study the discrete variables, we consider barplots.
conVar <- c("dtime", "age", "wt", "sbp", "dbp","hg", "sz", "sg", "ap")
disVar <- c( "stage", "rx", "status", "pf", "hx", "ekg", "bm")
meltedProstate <- melt(prostate[, conVar])
qplot(value, data = meltedProstate, geom = "histogram",
xlab = "", ylab = "") +
facet_wrap(~ variable, scales = "free")
meltedProstate <- melt(prostate[, disVar], id.vars = c())
qplot(value, data = meltedProstate, geom = "bar",
xlab = "", ylab = "") +
facet_wrap(~ variable, scales = "free", nrow = 2) +
theme(axis.text.x = element_text(angle = -30,
size = 8, hjust = 0, vjust = 1))
The number of missing values for each variable.
sapply(prostate, function(x) sum(is.na(x)))
## patno stage rx dtime status age wt pf hx sbp
## 0 0 0 0 0 1 2 0 0 0
## dbp ekg hg sz sg ap bm sdate
## 0 8 0 5 11 0 0 0
The scatter plot matrix.
splom(na.omit(prostate[, conVar]),
upper.panel = panel.hexbinplot,
pscale = 0,
varname.cex = 0.7,
nbins = 15,
lower.panel = function(x, y) {
panel.text(mean(range(x)), mean(range(y)),
round(cor(x, y), digits = 2),
cex = 0.7
)
}
)
We will for now just drop the missing observations but see Harrell Chapter 8 for a treatment on imputation for this particular data set. In the following plots we study the relation between the main response (the followup time) and the explanatory variables. This is complicated by the censoring.
### Dropping observations with missing variables.
subProstate <- transform(na.omit(prostate),
patno = as.factor(patno),
dtime = as.numeric(dtime)
)
meltedProstate <- melt(subProstate[, c("patno", conVar, "status")],
id.vars = c("patno", "dtime", "status")
)
qplot(value, dtime, data = meltedProstate,
xlab = "", ylab = "", geom = "point",
colour = status != "alive", alpha = I(0.3), size = I(3)) +
facet_wrap(~ variable, scales = "free_x")
meltedProstate <- melt(subProstate[, c("patno", "dtime", disVar)],
id.vars = c("patno", "dtime", "status")
)
ggplot(meltedProstate, aes(value, dtime)) +
geom_violin(scale = 'width', aes(fill = status != "alive"), alpha = 0.2) +
facet_wrap(~ variable, scales = "free_x") +
theme(axis.text.x = element_text(angle = -30,
size = 8, hjust = 0, vjust = 1),
legend.position = "top")
The figures shed some light on how survival is related marginally to the explanatory variables. To focus on the treatment we turn to a Kaplan-Meier estimator for each of the four treatment groups.
pbcSurv <- survfit(Surv(dtime, status != "alive") ~ rx,
data = prostate)
pbcSurv
## Call: survfit.formula(formula = Surv(dtime, status != "alive") ~ rx,
## data = prostate)
##
## records n.max n.start events median 0.95LCL 0.95UCL
## rx=placebo 127 127 127 95 33.0 27 40
## rx=0.2 mg estrogen 124 124 124 95 30.0 26 41
## rx=1.0 mg estrogen 126 126 126 71 41.5 31 NA
## rx=5.0 mg estrogen 125 125 125 93 35.0 28 46
plot(pbcSurv, mark.time = FALSE, conf.int = FALSE,
col = c("red", "blue", "purple", "cyan"))
The plot suggests that those individuals treated with 1.0 mg estrogen has a longer survival time than the other three groups. Adding confidence bands (which will mesh up the reading of the figure in general) shows that the difference is significant.
form <- Surv(dtime, status != "alive") ~ rx + age + wt + pf +
hx + sbp + dbp + ekg + hg + sz + sg + ap + bm
prostateCox <- coxph(form, data = subProstate)
print(xtable(coef(summary(prostateCox))), type = "html")
| coef | exp(coef) | se(coef) | z | Pr(> |z|) | |
|---|---|---|---|---|---|
| rx0.2 mg estrogen | -0.03 | 0.97 | 0.15 | -0.18 | 0.86 |
| rx1.0 mg estrogen | -0.31 | 0.73 | 0.17 | -1.85 | 0.06 |
| rx5.0 mg estrogen | -0.00 | 1.00 | 0.16 | -0.00 | 1.00 |
| age | 0.02 | 1.03 | 0.01 | 2.72 | 0.01 |
| wt | -0.01 | 0.99 | 0.00 | -1.98 | 0.05 |
| pfin bed < 50% daytime | 0.29 | 1.34 | 0.21 | 1.40 | 0.16 |
| pfin bed > 50% daytime | 0.43 | 1.54 | 0.33 | 1.31 | 0.19 |
| pfconfined to bed | 1.60 | 4.94 | 0.81 | 1.96 | 0.05 |
| hx | 0.50 | 1.66 | 0.12 | 4.21 | 0.00 |
| sbp | -0.03 | 0.97 | 0.03 | -1.03 | 0.30 |
| dbp | 0.04 | 1.04 | 0.05 | 0.83 | 0.41 |
| ekgbenign | 0.06 | 1.06 | 0.28 | 0.22 | 0.83 |
| ekgrhythmic disturb & electrolyte ch | 0.34 | 1.41 | 0.19 | 1.77 | 0.08 |
| ekgheart block or conduction def | -0.10 | 0.91 | 0.29 | -0.34 | 0.74 |
| ekgheart strain | 0.45 | 1.57 | 0.14 | 3.17 | 0.00 |
| ekgold MI | 0.06 | 1.07 | 0.18 | 0.35 | 0.72 |
| ekgrecent MI | 0.89 | 2.43 | 1.02 | 0.87 | 0.38 |
| hg | -0.07 | 0.93 | 0.03 | -2.18 | 0.03 |
| sz | 0.02 | 1.02 | 0.00 | 3.83 | 0.00 |
| sg | 0.07 | 1.07 | 0.03 | 2.25 | 0.02 |
| ap | -0.00 | 1.00 | 0.00 | -1.57 | 0.12 |
| bm | 0.27 | 1.31 | 0.17 | 1.60 | 0.11 |
print(xtable(drop1(prostateCox, test = "Chisq")), type = "html")
| Df | AIC | LRT | Pr(> Chi) | |
|---|---|---|---|---|
| < none> | 3730.75 | |||
| rx | 3 | 3729.65 | 4.91 | 0.1789 |
| age | 1 | 3736.54 | 7.80 | 0.0052 |
| wt | 1 | 3732.73 | 3.99 | 0.0459 |
| pf | 3 | 3730.35 | 5.61 | 0.1325 |
| hx | 1 | 3746.31 | 17.57 | 0.0000 |
| sbp | 1 | 3729.82 | 1.08 | 0.2994 |
| dbp | 1 | 3729.44 | 0.69 | 0.4064 |
| ekg | 6 | 3732.11 | 13.37 | 0.0376 |
| hg | 1 | 3733.43 | 4.68 | 0.0305 |
| sz | 1 | 3742.43 | 13.68 | 0.0002 |
| sg | 1 | 3733.73 | 4.99 | 0.0255 |
| ap | 1 | 3731.64 | 2.89 | 0.0891 |
| bm | 1 | 3731.24 | 2.49 | 0.1142 |
prostateData <- subProstate[, c("patno", all.vars(form)[-c(1, 2)])]
predProstate <- cbind(prostateData,
data.frame(mgres = residuals(prostateCox))
)
meltedPredProstate <- melt(predProstate, id.vars = c("patno", "mgres"),
measure.vars = conVar[conVar %in% all.vars(form)[-c(1, 2)]])
qplot(value, mgres, data = meltedPredProstate,
xlab = "", ylab = "", geom = "point", alpha = I(0.3), size = I(3)) +
facet_wrap(~ variable, scales = "free_x") +
geom_smooth(size = 1, fill = "blue") +
coord_cartesian(ylim = c(-5, 2))
meltedPredProstate <- melt(predProstate, id.vars = c("patno", "mgres"),
measure.vars = disVar[disVar %in% all.vars(form)[-c(1, 2)]])
qplot(value, mgres, data = meltedPredProstate,
xlab = "", ylab = "", geom = "boxplot") +
facet_wrap(~ variable, scales = "free_x") +
opts(axis.text.x = theme_text(angle = -30,
size = 8, hjust = 0, vjust = 1))
form <- Surv(dtime, status != "alive") ~ rx + rcs(age, 4) + rcs(wt, 4) + pf + hx +
rcs(sbp, 4) + rcs(dbp, 4) + ekg + rcs(hg, 4) + rcs(sz, 4) + rcs(sg, 4) +
rcs(ap, 4) + bm
prostateCox2 <- coxph(form, data = subProstate)
print(xtable(drop1(prostateCox2, test = "Chisq")), type = "html")
| Df | AIC | LRT | Pr(> Chi) | |
|---|---|---|---|---|
| < none> | 3727.40 | |||
| rx | 3 | 3724.80 | 3.40 | 0.3334 |
| rcs(age, 4) | 3 | 3737.25 | 15.85 | 0.0012 |
| rcs(wt, 4) | 3 | 3728.44 | 7.05 | 0.0704 |
| pf | 3 | 3729.94 | 8.54 | 0.0360 |
| hx | 1 | 3742.48 | 17.09 | 0.0000 |
| rcs(sbp, 4) | 3 | 3722.89 | 1.49 | 0.6847 |
| rcs(dbp, 4) | 3 | 3728.42 | 7.02 | 0.0712 |
| ekg | 6 | 3728.87 | 13.48 | 0.0360 |
| rcs(hg, 4) | 3 | 3730.63 | 9.23 | 0.0264 |
| rcs(sz, 4) | 3 | 3733.44 | 12.04 | 0.0073 |
| rcs(sg, 4) | 3 | 3725.26 | 3.87 | 0.2762 |
| rcs(ap, 4) | 3 | 3732.55 | 11.15 | 0.0109 |
| bm | 1 | 3725.76 | 0.36 | 0.5479 |
predictProstate <- predict(prostateCox2, type = "terms", se.fit = TRUE)
predictData <- melt(subProstate[, c("patno", conVar[-1])],
id.vars = "patno")
selectedTerms <- which(all.vars(form)[-c(1,2)] %in% conVar)
plotData <- cbind(predictData,
se = melt(predictProstate$se.fit[, selectedTerms])$value,
y = melt(predictProstate$fit[, selectedTerms])$value
)
ggplot(plotData, aes(x = value), xlab = "", ylab = "") +
geom_ribbon(aes(ymin = y - 2*se, ymax = y + 2*se),
alpha = I(0.2),
fill = I("blue")) +
geom_line(aes(y = y), size = 1) +
geom_rug(data = predictData,
aes(x = value, y = NULL),
colour = I("black"),
alpha = I(0.1)
) +
facet_wrap(~ variable, nrow = 2, scale = "free")
predictData <- melt(subProstate[, c("patno", disVar[-c(1, 3)])],
id.vars = "patno")
selectedTerms <- which(all.vars(form)[-c(1, 2)] %in% disVar)
plotData <- cbind(predictData,
se = melt(predictProstate$se.fit[, selectedTerms])$value,
y = melt(predictProstate$fit[, selectedTerms])$value
)[!duplicated(interaction(predictData[, c("variable", "value")])), ]
ggplot(plotData, aes(x = value), xlab = "", ylab = "") +
geom_errorbar(aes(ymin = y - 2*se, ymax = y + 2*se),
colour = I("blue")) +
geom_point(aes(y = y), size = 3) +
facet_wrap(~ variable, nrow = 2, scale = "free") +
theme(axis.text.x = element_text(angle = -30,
size = 8, hjust = 0, vjust = 1))