Fitting an Exponential Curve to a Stepwise Survival Curve
Written by Peter Rosenmai on 27 Aug 2016. Last revised 13 Mar 2017.
Let's fit a function of the form f(t) = exp(λt) to a stepwise survival curve (e.g. a Kaplan Meier curve).
Here's the stepwise survival curve we'll be using in this demonstration:
surv_time <-
c(0.2, 0.5, 1.2, 2, 5, 6.5, 9, 15, 17.3, 35, 80, 200)
surv_prob <-
c( 1, 0.94, 0.9, 0.8, 0.6, 0.57, 0.41, 0.2, 0.18, 0.07, 0.02, 0.01)
plot_stepwise_surv_fn <-
function(time, prob, col=
"black"){
time <-
c(0,
rep(time, each=2))
prob <-
c(1, 1,
rep(prob, each=2)[1:(2 *
length(prob) - 1)])
plot(x=time, y=prob, ylim=
c(0, 1), type=
"l", col=col, xlab=
"Time", ylab=
"Probability")
}
plot_stepwise_surv_fn(surv_time, surv_prob)
We get two estimates for λ:
fit <- fit_exp_to_stepwise_survival(surv_time, surv_prob)
print(fit)
lambda_method_1 lambda_method_2
-0.07377279 -0.07375238
Those estimates are very similar, so let's just add the exponential curve defined by one of them to the curve shown above:
curve(expr=
sapply(x,
function(x){
exp(fit[
"lambda_method_2"] * x)}), from=0, to=
max(surv_time), add=TRUE, col=
"red")
As you can see, the curve corresponding to the λ estimate appears to fit the stepwise survival curve quite well.
Here's the function that does the work:
fit_exp_to_stepwise_survival <- function(surv_time, surv_prob, min_lambda=-10, method_1_subs=1000){
if (length(surv_time) != length(surv_prob)) stop("surv_time and surv_prob must be the same length.")
if (min_lambda >= 0) stop("min_lambda must be less than zero.")
ord <- order(surv_time)
surv_time <- surv_time[ord]
surv_prob <- surv_prob[ord]
neg_exp <- function(lambda, t) exp(-lambda * t)
stepwise_surv_prob <- function(t){
prob <- function(t_i){
if (t_i < 0 || max(surv_time) < t_i) return(NA)
if (t_i < min(surv_time)) return(1)
return(surv_prob[max(which(surv_time <= t_i))])
}
sapply(t, prob)
}
fn_dist1 <- function(lambda) integrate(function(t) abs(neg_exp(lambda, t) - stepwise_surv_prob(t)), lower=0, upper=max(surv_time), subdivisions=method_1_subs)$value
lambda_v1 <- optimise(f=fn_dist1, interval=c(0, -min_lambda), maximum=FALSE)$minimum
fn_dist2 <- function(lambda){
df_calc <- data.frame("a"=c(0, surv_time[1:(length(surv_time) - 1)]), "b"=surv_time, "prob"=c(1, surv_prob[1:(length(surv_time) - 1)]))
df_calc$intercept <- -log(df_calc$prob) / lambda
df_calc$polarity <- (df_calc$intercept < df_calc$b) - (df_calc$intercept > df_calc$a)
for (i in rev(which(df_calc$polarity == 0))){
df_calc2 <- NULL
if (i > 1) df_calc2 <- df_calc[1:(i-1),]
df_calc2 <- rbind(df_calc2, data.frame("a"=df_calc$a[i] , "b"=df_calc$intercept[i], "prob"=df_calc$prob[i], "intercept"=df_calc$intercept[i], "polarity"=-1))
df_calc2 <- rbind(df_calc2, data.frame("a"=df_calc$intercept[i], "b"=df_calc$b[i] , "prob"=df_calc$prob[i], "intercept"=df_calc$intercept[i], "polarity"= 1))
if (i < nrow(df_calc)) df_calc2 <- rbind(df_calc2, df_calc[(i+1):nrow(df_calc),])
df_calc <- df_calc2
}
n <- nrow(df_calc)
df_calc$step_area <- df_calc$prob * (df_calc$b - df_calc$a)
exp_sum <- df_calc$polarity[n] * neg_exp(lambda, df_calc$b[n]) - df_calc$polarity[1] * neg_exp(lambda, df_calc$a[1])
if (n > 1){
pol <- df_calc$polarity
exp_sum <- exp_sum + sum((pol[1:(n-1)] != pol[2:n]) * pol[1:(n-1)] * 2 * neg_exp(lambda, df_calc$b[1:(n-1)]))
}
return(sum(df_calc$polarity * df_calc$step_area) + exp_sum / lambda)
}
lambda_v2 <- optimise(f=fn_dist2, interval=c(0, -min_lambda), maximum=FALSE)$minimum
return(c("lambda_method_1"=-lambda_v1, "lambda_method_2"=-lambda_v2))
}
The Flexsurv Package
Here's some code that uses the flexsurv package to fit an exponential curve to a Kaplan Meier curve. The fit_best_survival_curve function allows various distributions in addition to the exponential distribution to be fitted; the best fitting distribution is selected via the AIC statistic.
require(
"flexsurv")
require(
"survival")
fit_best_survival_curve =
function(days, event, silent=FALSE,
dists_to_try =
c(
"exp",
"weibull",
"gamma",
"lnorm",
"gompertz",
"llogis",
"gengamma",
"gengamma.orig",
"genf",
"genf.orig")){
logmsg =
function(..., silent=FALSE){if (!silent)
cat(
paste0(...,
"\n"))}
if (
length(days) !=
length(event))
stop(
"The days and event vectors must be of equal length.")
best_fit = NULL
for (dist in dists_to_try){
fit = NULL
options(warn=-1)
try({fit =
flexsurvreg(
Surv(days, event) ~ 1, data=
data.frame(days, event), dist=dist)}, silent=TRUE)
options(warn=0)
if (
is.null(fit))
logmsg(
"Distribution ", dist,
" could not be fitted.", silent=silent)
else{
logmsg(
"Distribution ", dist,
" was fitted with AIC=", fit$AIC, silent=silent)
if (
is.null(best_fit) || fit$AIC < best_fit$AIC) best_fit = fit
}
}
if (
is.null(best_fit))
logmsg(
"No distributions fitted.", silent=silent)
else
logmsg(
"Best distribution was ", best_fit$dlist$name,
" with AIC=", best_fit$AIC, silent=silent)
return(best_fit)
}
fitted_survival_curve =
function(fit){
fn = NULL
dist_name = fit$dlist$name
if (dist_name ==
"exp"){
rate =
exp(fit$coefficients)
fn =
function(x)
exp(-fit$dfns$
H(x, rate=rate, log=FALSE))
}
if (dist_name ==
"gompertz"){
shape = fit$coefficients[1]
rate =
exp(fit$coefficients[2])
fn =
function(x)
exp(-fit$dfns$
H(x, shape=shape, rate=rate, log=FALSE))
}
if (dist_name ==
"weibull.quiet"){
shape =
exp(fit$coefficients[1])
scale =
exp(fit$coefficients[2])
fn =
function(x)
exp(-fit$dfns$
H(x, shape=shape, scale=scale, log=FALSE))
}
if (dist_name ==
"gamma"){
shape =
exp(fit$coefficients[1])
rate =
exp(fit$coefficients[2])
fn =
function(x)
exp(-fit$dfns$
H(x, shape=shape, rate=rate, log=FALSE))
}
if (dist_name ==
"lnorm"){
meanlog = fit$coefficients[1]
sdlog =
exp(fit$coefficients[2])
fn =
function(x)
exp(-fit$dfns$
H(x, meanlog=meanlog, sdlog=sdlog, log=FALSE))
}
if (dist_name ==
"llogis"){
shape =
exp(fit$coefficients[1])
scale =
exp(fit$coefficients[2])
fn =
function(x)
exp(-fit$dfns$
H(x, shape=shape, scale=scale, log=FALSE))
}
if (dist_name ==
"gengamma"){
mu = fit$coefficients[1]
sigma =
exp(fit$coefficients[2])
Q = fit$coefficients[3]
fn =
function(x)
exp(-fit$dfns$
H(x, mu=mu, sigma=sigma, Q=Q))
}
if (dist_name ==
"gengamma.orig"){
shape =
exp(fit$coefficients[1])
scale =
exp(fit$coefficients[2])
k =
exp(fit$coefficients[3])
fn =
function(x)
exp(-fit$dfns$
H(x, shape=shape, scale=scale, k=k))
}
if (dist_name ==
"genf"){
mu = fit$coefficients[1]
sigma =
exp(fit$coefficients[2])
Q = fit$coefficients[3]
P =
exp(fit$coefficients[4])
fn =
function(x)
exp(-fit$dfns$
H(x, mu=mu, sigma=sigma, Q=Q, P=P))
}
if (dist_name ==
"genf.orig"){
mu = fit$coefficients[1]
sigma =
exp(fit$coefficients[2])
s1 =
exp(fit$coefficients[3])
s2 =
exp(fit$coefficients[4])
fn =
function(x)
exp(-fit$dfns$
H(x, mu=mu, sigma=sigma, s1=s1, s2=s2))
}
return(fn)
}
df_surv =
data.frame(
"time" =
c(3.1, 10.0, 2.3, 19.9, 3.1, 4.2, 40.2, 1.3, 0.9),
"event" =
c( 1, 1, 1, 1, 1, 1, 1, 1, 1))
fit =
fit_best_survival_curve(df_surv$time, df_surv$event, dists_to_try=
c(
"exp"))
fn =
fitted_survival_curve(fit)
col_kaplan_meier =
"black"
col_parametric =
"red"
plot(fit, col.obs=col_kaplan_meier, ci=FALSE, est=FALSE, xlab=
"Time", ylab=
"Survival Probability", main=
"Kaplan Meier and Parametric Survival Curves")
curve(fn, from=0, to=
max(df_surv$time), add=TRUE, col=col_parametric)
legend(
"topright", legend=
c(
"Kaplan Meier",
"Kaplan Meier 95% CI",
"Parametric"), col=
c(col_kaplan_meier, col_kaplan_meier, col_parametric), lwd=
rep(1, 3), lty=
c(1, 2, 1))