## Fitting an Exponential Curve to a Stepwise Survival Curve

Let's fit a function of the form f(t) = exp(λt) to a stepwise survival curve (e.g. a Kaplan Meier curve).

Here's the stepwise survival curve we'll be using in this demonstration:

# The data that describes the stepwise survival curve.
surv_time <- c(0.2,  0.5, 1.2,   2,   5,  6.5,    9,   15, 17.3,   35,   80,  200)
surv_prob <- c(  1, 0.94, 0.9, 0.8, 0.6, 0.57, 0.41,  0.2, 0.18, 0.07, 0.02, 0.01)

# Plot the curve.
plot_stepwise_surv_fn <- function(time, prob, col="black"){
time <- c(0, rep(time, each=2))
prob <- c(1, 1, rep(prob, each=2)[1:(2 * length(prob) - 1)])
plot(x=time, y=prob, ylim=c(0, 1), type="l", col=col, xlab="Time", ylab="Probability")
}
plot_stepwise_surv_fn(surv_time, surv_prob) We get two estimates for λ:

fit <- fit_exp_to_stepwise_survival(surv_time, surv_prob)
print(fit)
lambda_method_1 lambda_method_2
-0.07377279     -0.07375238

Those estimates are very similar, so let's just add the exponential curve defined by one of them to the curve shown above:

curve(expr=sapply(x, function(x){exp(fit["lambda_method_2"] * x)}), from=0, to=max(surv_time), add=TRUE, col="red") As you can see, the curve corresponding to the λ estimate appears to fit the stepwise survival curve quite well.

Here's the function that does the work:

fit_exp_to_stepwise_survival <- function(surv_time, surv_prob, min_lambda=-10, method_1_subs=1000){
#
# Fits a curve of the form y = exp(lambda * t) to the stepwise survival curve defined by surv_time and surv_prob.
#
# Two methods are used for the curve fitting. Method 1 uses numerical integration to find the lambda that minimises
# the area between the stepwise and fitted curve. Method 2 is a bit cleverer. It makes use of the fact that the
# exponential and stepwise functions are easy to integrate.
#
# min_lambda is the minimum value that lambda can take in the fitted curve.
#
# method_1_subs is the the maximum number of integration subintervals used in method 1
#
# Returns a vector with two values:
# lambda_method_1: The lambda for the fitted curve from the first method.
# lambda_method_2: The lambda for the fitted curve from the second method.
#

# Check the parameters.
if (length(surv_time) != length(surv_prob)) stop("surv_time and surv_prob must be the same length.")
if (min_lambda >= 0) stop("min_lambda must be less than zero.")

# The code that follows assumes that surv_time and surv_prob are sorted by surv_time.
ord <- order(surv_time)
surv_time <- surv_time[ord]
surv_prob <- surv_prob[ord]

# The negative exponential function that we'll be fitting to the stepwise survival curve.
neg_exp <- function(lambda, t) exp(-lambda * t)

#
# METHOD 1
#
stepwise_surv_prob <- function(t){
# A stepwise survival function based on surv_time and surv_prob.
prob <- function(t_i){
if (t_i < 0 || max(surv_time) < t_i) return(NA)
if (t_i < min(surv_time)) return(1)
return(surv_prob[max(which(surv_time <= t_i))])
}
sapply(t, prob)
}
fn_dist1 <- function(lambda) integrate(function(t) abs(neg_exp(lambda, t) - stepwise_surv_prob(t)), lower=0, upper=max(surv_time), subdivisions=method_1_subs)\$value
lambda_v1 <- optimise(f=fn_dist1, interval=c(0, -min_lambda), maximum=FALSE)\$minimum

#
# METHOD 2
#
fn_dist2 <- function(lambda){
# Create df_calc, the dataframe we'll be using for our calculation of curve areas. "a" ("b") is interval start (end).
df_calc <- data.frame("a"=c(0, surv_time[1:(length(surv_time) - 1)]), "b"=surv_time, "prob"=c(1, surv_prob[1:(length(surv_time) - 1)]))
df_calc\$intercept <- -log(df_calc\$prob) / lambda
df_calc\$polarity <- (df_calc\$intercept < df_calc\$b) - (df_calc\$intercept > df_calc\$a)
for (i in rev(which(df_calc\$polarity == 0))){
# Split in two an interval for which the stepwise survival and the exponential curves intersect.
df_calc2 <- NULL
if (i > 1) df_calc2 <- df_calc[1:(i-1),]
df_calc2 <- rbind(df_calc2, data.frame("a"=df_calc\$a[i]        , "b"=df_calc\$intercept[i], "prob"=df_calc\$prob[i], "intercept"=df_calc\$intercept[i], "polarity"=-1))
df_calc2 <- rbind(df_calc2, data.frame("a"=df_calc\$intercept[i], "b"=df_calc\$b[i]        , "prob"=df_calc\$prob[i], "intercept"=df_calc\$intercept[i], "polarity"= 1))
if (i < nrow(df_calc)) df_calc2 <- rbind(df_calc2, df_calc[(i+1):nrow(df_calc),])
df_calc <- df_calc2
}

# Do the calculations.
n <- nrow(df_calc)
df_calc\$step_area <- df_calc\$prob * (df_calc\$b - df_calc\$a)
exp_sum <- df_calc\$polarity[n] * neg_exp(lambda, df_calc\$b[n]) - df_calc\$polarity * neg_exp(lambda, df_calc\$a)
if (n > 1){
pol <- df_calc\$polarity
exp_sum <- exp_sum + sum((pol[1:(n-1)] != pol[2:n]) * pol[1:(n-1)] * 2 * neg_exp(lambda, df_calc\$b[1:(n-1)]))
}
return(sum(df_calc\$polarity * df_calc\$step_area) + exp_sum / lambda)
}
lambda_v2 <- optimise(f=fn_dist2, interval=c(0, -min_lambda), maximum=FALSE)\$minimum

# Return the results from methods 1 and 2.
return(c("lambda_method_1"=-lambda_v1, "lambda_method_2"=-lambda_v2))
}

### The Flexsurv Package

Here's some code that uses the flexsurv package to fit an exponential curve to a Kaplan Meier curve. The fit_best_survival_curve function allows various distributions in addition to the exponential distribution to be fitted; the best fitting distribution is selected via the AIC statistic.

require("flexsurv")
require("survival")

fit_best_survival_curve = function(days, event, silent=FALSE,
dists_to_try = c("exp", "weibull", "gamma", "lnorm", "gompertz", "llogis", "gengamma", "gengamma.orig", "genf", "genf.orig")){
#
# Fits survival curves based on different distributions to the supplied data.
# Returns the best fit (by AIC) as an flexsurvreg object, or NULL if nothing fitted.
#

logmsg = function(..., silent=FALSE){if (!silent) cat(paste0(..., "\n"))}

if (length(days) != length(event)) stop("The days and event vectors must be of equal length.")

best_fit = NULL
for (dist in dists_to_try){
fit = NULL
options(warn=-1) # Turn off warnings.
try({fit = flexsurvreg(Surv(days, event) ~ 1, data=data.frame(days, event), dist=dist)}, silent=TRUE)
options(warn=0) # Turn on warnings (the default).
if (is.null(fit)) logmsg("Distribution ", dist, " could not be fitted.", silent=silent)
else{
logmsg("Distribution ", dist, " was fitted with AIC=", fit\$AIC, silent=silent)
if (is.null(best_fit) || fit\$AIC < best_fit\$AIC) best_fit = fit
}
}

if (is.null(best_fit)) logmsg("No distributions fitted.", silent=silent)
else logmsg("Best distribution was ", best_fit\$dlist\$name, " with AIC=", best_fit\$AIC, silent=silent)

return(best_fit)
}

fitted_survival_curve = function(fit){
#
# Extracts as an ordinary function taking one parameter (time) the fitted survival curve from a flexsurvreg object.
#

fn = NULL
dist_name = fit\$dlist\$name

if (dist_name == "exp"){
rate = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, rate=rate, log=FALSE))
}

if (dist_name == "gompertz"){
shape = fit\$coefficients
rate  = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, shape=shape, rate=rate, log=FALSE))
}

if (dist_name == "weibull.quiet"){
shape = exp(fit\$coefficients)
scale = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, shape=shape, scale=scale, log=FALSE))
}

if (dist_name == "gamma"){
shape = exp(fit\$coefficients)
rate  = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, shape=shape, rate=rate, log=FALSE))
}

if (dist_name == "lnorm"){
meanlog = fit\$coefficients
sdlog   = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, meanlog=meanlog, sdlog=sdlog, log=FALSE))
}

if (dist_name == "llogis"){
shape = exp(fit\$coefficients)
scale = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, shape=shape, scale=scale, log=FALSE))
}

if (dist_name == "gengamma"){
mu    = fit\$coefficients
sigma = exp(fit\$coefficients)
Q     = fit\$coefficients
fn = function(x) exp(-fit\$dfns\$H(x, mu=mu, sigma=sigma, Q=Q))
}

if (dist_name == "gengamma.orig"){
shape = exp(fit\$coefficients)
scale = exp(fit\$coefficients)
k     = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, shape=shape, scale=scale, k=k))
}

if (dist_name == "genf"){
mu    = fit\$coefficients
sigma = exp(fit\$coefficients)
Q     = fit\$coefficients
P     = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, mu=mu, sigma=sigma, Q=Q, P=P))
}

if (dist_name == "genf.orig"){
mu    = fit\$coefficients
sigma = exp(fit\$coefficients)
s1    = exp(fit\$coefficients)
s2    = exp(fit\$coefficients)
fn = function(x) exp(-fit\$dfns\$H(x, mu=mu, sigma=sigma, s1=s1, s2=s2))
}

return(fn)
}

df_surv = data.frame("time"  = c(3.1, 10.0,  2.3, 19.9,  3.1,  4.2, 40.2,  1.3,  0.9),
"event" = c(  1,    1,    1,    1,    1,    1,    1,    1,    1))

fit = fit_best_survival_curve(df_surv\$time, df_surv\$event, dists_to_try=c("exp"))
fn = fitted_survival_curve(fit)

col_kaplan_meier = "black"
col_parametric   = "red"
plot(fit, col.obs=col_kaplan_meier, ci=FALSE, est=FALSE, xlab="Time", ylab="Survival Probability", main="Kaplan Meier and Parametric Survival Curves") 