Fitting an Exponential Curve to a Stepwise Survival Curve

Let's fit a function of the form f(t) = exp(λt) to a stepwise survival curve (e.g. a Kaplan Meier curve).

Here's the stepwise survival curve we'll be using in this demonstration:

# The data that describes the stepwise survival curve.
surv_time <- c(0.2,  0.5, 1.2,   2,   5,  6.5,    9,   15, 17.3,   35,   80,  200)
surv_prob <- c(  1, 0.94, 0.9, 0.8, 0.6, 0.57, 0.41,  0.2, 0.18, 0.07, 0.02, 0.01)

# Plot the curve.
plot_stepwise_surv_fn <- function(time, prob, col="black"){
   time <- c(0, rep(time, each=2))
   prob <- c(1, 1, rep(prob, each=2)[1:(2 * length(prob) - 1)])
   plot(x=time, y=prob, ylim=c(0, 1), type="l", col=col, xlab="Time", ylab="Probability")
}
plot_stepwise_surv_fn(surv_time, surv_prob)
A stepwise survival curve.

We get two estimates for λ:

fit <- fit_exp_to_stepwise_survival(surv_time, surv_prob)
print(fit)
lambda_method_1 lambda_method_2
    -0.07377279     -0.07375238

Those estimates are very similar, so let's just add the exponential curve defined by one of them to the curve shown above:

curve(expr=sapply(x, function(x){exp(fit["lambda_method_2"] * x)}), from=0, to=max(surv_time), add=TRUE, col="red")
A stepwise survival curve and an exponential curve fitted to it.

As you can see, the curve corresponding to the λ estimate appears to fit the stepwise survival curve quite well.

Here's the function that does the work:

fit_exp_to_stepwise_survival <- function(surv_time, surv_prob, min_lambda=-10, method_1_subs=1000){
   #
   # Fits a curve of the form y = exp(lambda * t) to the stepwise survival curve defined by surv_time and surv_prob.
   #
   # Two methods are used for the curve fitting. Method 1 uses numerical integration to find the lambda that minimises
   # the area between the stepwise and fitted curve. Method 2 is a bit cleverer. It makes use of the fact that the
   # exponential and stepwise functions are easy to integrate.
   #
   # min_lambda is the minimum value that lambda can take in the fitted curve.
   #
   # method_1_subs is the the maximum number of integration subintervals used in method 1
   #
   # Returns a vector with two values:
   # lambda_method_1: The lambda for the fitted curve from the first method.
   # lambda_method_2: The lambda for the fitted curve from the second method.
   #

   # Check the parameters.
   if (length(surv_time) != length(surv_prob)) stop("surv_time and surv_prob must be the same length.")
   if (min_lambda >= 0) stop("min_lambda must be less than zero.")

   # The code that follows assumes that surv_time and surv_prob are sorted by surv_time.
   ord <- order(surv_time)
   surv_time <- surv_time[ord]
   surv_prob <- surv_prob[ord]

   # The negative exponential function that we'll be fitting to the stepwise survival curve.
   neg_exp <- function(lambda, t) exp(-lambda * t)

   #
   # METHOD 1
   #
   stepwise_surv_prob <- function(t){
      # A stepwise survival function based on surv_time and surv_prob.
      prob <- function(t_i){
         if (t_i < 0 || max(surv_time) < t_i) return(NA)
         if (t_i < min(surv_time)) return(1)
         return(surv_prob[max(which(surv_time <= t_i))])
      }
      sapply(t, prob)
   }
   fn_dist1 <- function(lambda) integrate(function(t) abs(neg_exp(lambda, t) - stepwise_surv_prob(t)), lower=0, upper=max(surv_time), subdivisions=method_1_subs)$value
   lambda_v1 <- optimise(f=fn_dist1, interval=c(0, -min_lambda), maximum=FALSE)$minimum

   #
   # METHOD 2
   #
   fn_dist2 <- function(lambda){
      # Create df_calc, the dataframe we'll be using for our calculation of curve areas. "a" ("b") is interval start (end).
      df_calc <- data.frame("a"=c(0, surv_time[1:(length(surv_time) - 1)]), "b"=surv_time, "prob"=c(1, surv_prob[1:(length(surv_time) - 1)]))
      df_calc$intercept <- -log(df_calc$prob) / lambda
      df_calc$polarity <- (df_calc$intercept < df_calc$b) - (df_calc$intercept > df_calc$a)
      for (i in rev(which(df_calc$polarity == 0))){
         # Split in two an interval for which the stepwise survival and the exponential curves intersect.
         df_calc2 <- NULL
         if (i > 1) df_calc2 <- df_calc[1:(i-1),]
         df_calc2 <- rbind(df_calc2, data.frame("a"=df_calc$a[i]        , "b"=df_calc$intercept[i], "prob"=df_calc$prob[i], "intercept"=df_calc$intercept[i], "polarity"=-1))
         df_calc2 <- rbind(df_calc2, data.frame("a"=df_calc$intercept[i], "b"=df_calc$b[i]        , "prob"=df_calc$prob[i], "intercept"=df_calc$intercept[i], "polarity"= 1))
         if (i < nrow(df_calc)) df_calc2 <- rbind(df_calc2, df_calc[(i+1):nrow(df_calc),])
         df_calc <- df_calc2
      }

      # Do the calculations.
      n <- nrow(df_calc)
      df_calc$step_area <- df_calc$prob * (df_calc$b - df_calc$a)
      exp_sum <- df_calc$polarity[n] * neg_exp(lambda, df_calc$b[n]) - df_calc$polarity[1] * neg_exp(lambda, df_calc$a[1])
      if (n > 1){
         pol <- df_calc$polarity
         exp_sum <- exp_sum + sum((pol[1:(n-1)] != pol[2:n]) * pol[1:(n-1)] * 2 * neg_exp(lambda, df_calc$b[1:(n-1)]))
      }
      return(sum(df_calc$polarity * df_calc$step_area) + exp_sum / lambda)
   }
  lambda_v2 <- optimise(f=fn_dist2, interval=c(0, -min_lambda), maximum=FALSE)$minimum

  # Return the results from methods 1 and 2.
  return(c("lambda_method_1"=-lambda_v1, "lambda_method_2"=-lambda_v2))
}

The Flexsurv Package

Here's some code that uses the flexsurv package to fit an exponential curve to a Kaplan Meier curve. The fit_best_survival_curve function allows various distributions in addition to the exponential distribution to be fitted; the best fitting distribution is selected via the AIC statistic.

require("flexsurv")
require("survival")

fit_best_survival_curve = function(days, event, silent=FALSE,
                                   dists_to_try = c("exp", "weibull", "gamma", "lnorm", "gompertz", "llogis", "gengamma", "gengamma.orig", "genf", "genf.orig")){
   #
   # Fits survival curves based on different distributions to the supplied data.
   # Returns the best fit (by AIC) as an flexsurvreg object, or NULL if nothing fitted.
   #

   logmsg = function(..., silent=FALSE){if (!silent) cat(paste0(..., "\n"))}

   if (length(days) != length(event)) stop("The days and event vectors must be of equal length.")

   best_fit = NULL
   for (dist in dists_to_try){
      fit = NULL
      options(warn=-1) # Turn off warnings.
      try({fit = flexsurvreg(Surv(days, event) ~ 1, data=data.frame(days, event), dist=dist)}, silent=TRUE)
      options(warn=0) # Turn on warnings (the default).
      if (is.null(fit)) logmsg("Distribution ", dist, " could not be fitted.", silent=silent)
      else{
         logmsg("Distribution ", dist, " was fitted with AIC=", fit$AIC, silent=silent)
         if (is.null(best_fit) || fit$AIC < best_fit$AIC) best_fit = fit
      }
   }

   if (is.null(best_fit)) logmsg("No distributions fitted.", silent=silent)
   else logmsg("Best distribution was ", best_fit$dlist$name, " with AIC=", best_fit$AIC, silent=silent)

   return(best_fit)
}

fitted_survival_curve = function(fit){
   #
   # Extracts as an ordinary function taking one parameter (time) the fitted survival curve from a flexsurvreg object.
   #

   fn = NULL
   dist_name = fit$dlist$name

   if (dist_name == "exp"){
      rate = exp(fit$coefficients)
      fn = function(x) exp(-fit$dfns$H(x, rate=rate, log=FALSE))
   }

   if (dist_name == "gompertz"){
      shape = fit$coefficients[1]
      rate  = exp(fit$coefficients[2])
      fn = function(x) exp(-fit$dfns$H(x, shape=shape, rate=rate, log=FALSE))
   }

   if (dist_name == "weibull.quiet"){
      shape = exp(fit$coefficients[1])
      scale = exp(fit$coefficients[2])
      fn = function(x) exp(-fit$dfns$H(x, shape=shape, scale=scale, log=FALSE))
   }

   if (dist_name == "gamma"){
      shape = exp(fit$coefficients[1])
      rate  = exp(fit$coefficients[2])
      fn = function(x) exp(-fit$dfns$H(x, shape=shape, rate=rate, log=FALSE))
   }

   if (dist_name == "lnorm"){
      meanlog = fit$coefficients[1]
      sdlog   = exp(fit$coefficients[2])
      fn = function(x) exp(-fit$dfns$H(x, meanlog=meanlog, sdlog=sdlog, log=FALSE))
   }

   if (dist_name == "llogis"){
      shape = exp(fit$coefficients[1])
      scale = exp(fit$coefficients[2])
      fn = function(x) exp(-fit$dfns$H(x, shape=shape, scale=scale, log=FALSE))
   }

   if (dist_name == "gengamma"){
      mu    = fit$coefficients[1]
      sigma = exp(fit$coefficients[2])
      Q     = fit$coefficients[3]
      fn = function(x) exp(-fit$dfns$H(x, mu=mu, sigma=sigma, Q=Q))
   }

   if (dist_name == "gengamma.orig"){
      shape = exp(fit$coefficients[1])
      scale = exp(fit$coefficients[2])
      k     = exp(fit$coefficients[3])
      fn = function(x) exp(-fit$dfns$H(x, shape=shape, scale=scale, k=k))
   }

   if (dist_name == "genf"){
      mu    = fit$coefficients[1]
      sigma = exp(fit$coefficients[2])
      Q     = fit$coefficients[3]
      P     = exp(fit$coefficients[4])
      fn = function(x) exp(-fit$dfns$H(x, mu=mu, sigma=sigma, Q=Q, P=P))
   }

   if (dist_name == "genf.orig"){
      mu    = fit$coefficients[1]
      sigma = exp(fit$coefficients[2])
      s1    = exp(fit$coefficients[3])
      s2    = exp(fit$coefficients[4])
      fn = function(x) exp(-fit$dfns$H(x, mu=mu, sigma=sigma, s1=s1, s2=s2))
   }

   return(fn)
}

df_surv = data.frame("time"  = c(3.1, 10.0,  2.3, 19.9,  3.1,  4.2, 40.2,  1.3,  0.9),
                     "event" = c(  1,    1,    1,    1,    1,    1,    1,    1,    1))

fit = fit_best_survival_curve(df_surv$time, df_surv$event, dists_to_try=c("exp"))
fn = fitted_survival_curve(fit)

col_kaplan_meier = "black"
col_parametric   = "red"
plot(fit, col.obs=col_kaplan_meier, ci=FALSE, est=FALSE, xlab="Time", ylab="Survival Probability", main="Kaplan Meier and Parametric Survival Curves")
curve(fn, from=0, to=max(df_surv$time), add=TRUE, col=col_parametric)
legend("topright", legend=c("Kaplan Meier", "Kaplan Meier 95% CI", "Parametric"), col=c(col_kaplan_meier, col_kaplan_meier, col_parametric), lwd=rep(1, 3), lty=c(1, 2, 1))

A stepwise survival curve and an exponential curve fitted to it using the flexsurv package.