Written by Peter Rosenmai on 2 Aug 2014.
Here's some R code that generates random numbers from the probability distribution described by a given non-negative function. This can be useful when running simulations or generating datasets for testing purposes.
For example, suppose you want to generate random data from a distribution that looks something like this back-of-an-envelope sketch:
spline_function <-
function(x, y){
fn <-
splinefun(x, y, method=
"natural")
min_x <-
min(x)
max_x <-
max(x)
f <-
function(x){
y <-
fn(x)
y[x < min_x | max_x < x] <- NA
return(y)
}
return(f)
}
cdf <-
function(fn, min_x, max_x){
f <-
function(x){
y <-
rep(NA,
length(x))
total_area_under_curve <-
integrate(fn, min_x, max_x)$value
apply_fn <-
function(xval){
integrate(fn, min_x,
min(xval, max_x))$value}
y[min_x <= x & x <= max_x] <-
sapply(x[min_x <= x & x <= max_x], FUN=apply_fn) / total_area_under_curve
y[x < min_x] <- 0
y[x > max_x] <- 1
return(y)
}
return(f)
}
inverse <-
function(fn, min_x, max_x){
fn_inv <-
function(y){
uniroot((
function(x){
fn(x) - y}), lower=min_x, upper=max_x)[1]$root
}
return(
Vectorize(fn_inv))
}
x <-
c( 7, 8, 9, 9.5, 10, 11, 12, 13, 14)
y <-
c(0.1, 0.2, 0.5, 0.58, 0.35, 0.3, 0.1, 0.15, 0)
xlim <-
c(5, 15)
op <-
par(mfrow=
c(3, 1))
fn <-
spline_function(x, y)
curve(fn, xlim[1], xlim[2], n=1000, main=
"Spline Interpolation Function")
points(x, y)
cdf_fn <-
cdf(fn,
min(x),
max(x))
curve(cdf_fn, xlim[1], xlim[2], n=1000, main=
"CDF from Spline Interpolation Function")
n_random_points <- 10000
cdf_inv <-
inverse(cdf_fn,
min(x),
max(x))
random_points <-
cdf_inv(
runif(n_random_points))
hist(random_points, xlim=xlim, breaks=50, main=
"Random Points from Spline Interpolation Function")
par(op)