Bayesian model example - Native discoveries

  library(alien)
  library(rstan)
#> Loading required package: StanHeaders
#> 
#> rstan version 2.32.6 (Stan version 2.32.2)
#> For execution on a local, multicore CPU with excess RAM we recommend calling
#> options(mc.cores = parallel::detectCores()).
#> To avoid recompilation of unchanged Stan programs, we recommend calling
#> rstan_options(auto_write = TRUE)
#> For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
#> change `threads_per_chain` option:
#> rstan_options(threads_per_chain = 1)

Introduction

Here we show an example fitting of a Bayesian heirarchical model to data, using the proportion of undiscovered alien species to the total number of undiscovered species (natives and aliens) as the probability of detecting a new alien species. The model is described in full in Belmaker et al (2009), with some modifications described in Buba et al (2024).

We will demonstrate that using the code provided in the alien package:

model_file <- system.file("stan/modified_belmaker_et_al_2009_model.stan", package = "alien")
readLines(model_file)
#>  [1] "data{"                                                                               
#>  [2] "  int <lower = 1> N;  // number of rows in the data"                                 
#>  [3] "  int <lower = 1> native_total; // assumed size of native pool"                      
#>  [4] "  array[N] int <lower = 0> dI; // observed number of yearly records invasive"        
#>  [5] "  array[N] int <lower = 0> dN; // observed number of yearly records native"          
#>  [6] "  vector <lower = 0>[N] t; // time from start"                                       
#>  [7] "  real b0_mu; // prior for betas"                                                    
#>  [8] "  real b1_mu; // prior for betas"                                                    
#>  [9] "  real b0_sd; // prior for betas"                                                    
#> [10] "  real b1_sd; // prior for betas"                                                    
#> [11] "}"                                                                                   
#> [12] ""                                                                                    
#> [13] "transformed data {"                                                                  
#> [14] "  array[N] int <lower = 0> unrecorded_N;"                                            
#> [15] "  array[N] int <lower = 0> yearly_detections;"                                       
#> [16] "  array[N] int <lower = 0> recorded_I;  // cumulative recorded invasives"            
#> [17] "  array[N] int <lower = 0> recorded_N; // cumulative recorded natives"               
#> [18] ""                                                                                    
#> [19] "  recorded_I = cumulative_sum(dI);"                                                  
#> [20] "  recorded_N = cumulative_sum(dN);"                                                  
#> [21] ""                                                                                    
#> [22] "  for (i in 1:N){"                                                                   
#> [23] "    unrecorded_N[i] = native_total - recorded_N[i];"                                 
#> [24] "    yearly_detections[i] = dI[i] + dN[i];"                                           
#> [25] "  }"                                                                                 
#> [26] ""                                                                                    
#> [27] "}"                                                                                   
#> [28] ""                                                                                    
#> [29] "parameters {"                                                                        
#> [30] "  real b0;"                                                                          
#> [31] "  real b1;"                                                                          
#> [32] "}"                                                                                   
#> [33] ""                                                                                    
#> [34] "transformed parameters {"                                                            
#> [35] "  vector<lower = 0>[N] mu_t;"                                                        
#> [36] "  vector<lower = 0>[N] unrecorded_I;"                                                
#> [37] ""                                                                                    
#> [38] "  mu_t = exp(b0 + b1 .* t);"                                                         
#> [39] ""                                                                                    
#> [40] "  for (i in 1:N){"                                                                   
#> [41] "    unrecorded_I[i] = cumulative_sum(mu_t)[i] - recorded_I[i];"                      
#> [42] "  }"                                                                                 
#> [43] ""                                                                                    
#> [44] "}"                                                                                   
#> [45] ""                                                                                    
#> [46] "model{"                                                                              
#> [47] ""                                                                                    
#> [48] "  dI ~ beta_binomial(yearly_detections, unrecorded_I, unrecorded_N);"                
#> [49] ""                                                                                    
#> [50] "  //priors"                                                                          
#> [51] "  b0  ~ normal(b0_mu, b0_sd);"                                                       
#> [52] "  b1  ~ normal(b1_mu , b1_sd);"                                                      
#> [53] "}"                                                                                   
#> [54] ""                                                                                    
#> [55] "generated quantities {"                                                              
#> [56] "  array[N] int y = beta_binomial_rng(yearly_detections, unrecorded_I, unrecorded_N);"
#> [57] "}"

Data

We will the medfish data included in the alien package:

data(medfish)

head(medfish)
#>   year time natives aliens
#> 1 1927    0      52      5
#> 2 1930    3       0      1
#> 3 1931    4      12      0
#> 4 1934    7      18      0
#> 5 1935    8      17      1
#> 6 1937   10       1      1

The data has several columns: 1. year - The year of the observations. 2. time - how much time has passed from the first observation in the data until this point. 3. natives - how many natives were newly described in this year. 4. alien - how many aliens were newly described in this year.

ggplot2::ggplot(medfish)+
  ggplot2::aes(x = year) + 
  ggplot2::geom_point(ggplot2::aes(y = cumsum(natives)), shape = 21, size = 2, fill = "#377EB8") +
  ggplot2::geom_point(ggplot2::aes(y = cumsum(aliens)), shape = 21,  size = 2, fill = "#E41A1C")

Model fitting

We will use the rstan package to fit the model to our data, first we read the model script to define a stan model:

native_discoveries_model <- rstan::stan_model(model_file)

The next step is to create a data list to be used by rstan:

readLines(model_file)[1:11]
#>  [1] "data{"                                                                       
#>  [2] "  int <lower = 1> N;  // number of rows in the data"                         
#>  [3] "  int <lower = 1> native_total; // assumed size of native pool"              
#>  [4] "  array[N] int <lower = 0> dI; // observed number of yearly records invasive"
#>  [5] "  array[N] int <lower = 0> dN; // observed number of yearly records native"  
#>  [6] "  vector <lower = 0>[N] t; // time from start"                               
#>  [7] "  real b0_mu; // prior for betas"                                            
#>  [8] "  real b1_mu; // prior for betas"                                            
#>  [9] "  real b0_sd; // prior for betas"                                            
#> [10] "  real b1_sd; // prior for betas"                                            
#> [11] "}"

We can set it up using data from medfish. The only thing we need to assume is the native pool size (See Belmaker et al 2009 for more detail)

data_for_stan <- list(
  N = nrow(medfish),
  native_total = 600,
  dI = medfish$aliens,
  dN = medfish$natives,
  t = medfish$t
)

An important aspect of Bayesian modelling is the ability to use prior for the model parameters. The model uses priors for both the mean and standard error of the parameters β0 and β1:

readLines(model_file)[7:10]
#> [1] "  real b0_mu; // prior for betas" "  real b1_mu; // prior for betas"
#> [3] "  real b0_sd; // prior for betas" "  real b1_sd; // prior for betas"

We can start with a naive model of alien introduction to get an idea of how rapid is the alien introduction rate:

naive_model <- stats::glm(aliens ~ time, data = medfish, family = "poisson")
stats::summary.glm(naive_model)
#> 
#> Call:
#> stats::glm(formula = aliens ~ time, family = "poisson", data = medfish)
#> 
#> Coefficients:
#>              Estimate Std. Error z value Pr(>|z|)    
#> (Intercept) -0.319591   0.280405  -1.140 0.254391    
#> time         0.014949   0.004338   3.446 0.000568 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for poisson family taken to be 1)
#> 
#>     Null deviance: 123.07  on 59  degrees of freedom
#> Residual deviance: 110.33  on 58  degrees of freedom
#> AIC: 220.23
#> 
#> Number of Fisher Scoring iterations: 6

We will use these estimates as priors for our model.

coef_table <- summary(naive_model)$coefficients
priors <- c(
  b0_mu = coef_table[1,1],
  b0_sd = coef_table[1,2],
  b1_mu = coef_table[2,1],
  b1_sd = coef_table[2,2]
)
priors
#>        b0_mu        b0_sd        b1_mu        b1_sd 
#> -0.319590861  0.280404647  0.014949156  0.004337772

We join the priors with the rest of the data:

data_for_stan <- c(data_for_stan, priors)

We can then call the sampling function to fit the model:

model_output <- rstan::sampling(native_discoveries_model, data_for_stan)

After the sampling, we can extract the model prediction for the number of undiscovered alien species in each year - unrecorded_I:

stan_summary <- summary(model_output, pars = "unrecorded_I")$summary |>
  tibble::as_tibble() |> 
  tibble::add_column(year = medfish$year)

ggplot2::ggplot(medfish)+
  ggplot2::aes(x = year) + 
  ggplot2::geom_point(ggplot2::aes(y = cumsum(natives)), shape = 21, size = 2, fill = "#377EB8") +
  ggplot2::geom_point(ggplot2::aes(y = cumsum(aliens)), shape = 21,  size = 2, fill = "#E41A1C") +
  ggplot2::geom_ribbon(data = stan_summary, ggplot2::aes(ymin = `2.5%`, ymax = `97.5%`), alpha = 0.2) +
  ggplot2::geom_line(data = stan_summary, ggplot2::aes(y = mean)) + 
  ggplot2::labs(x = "Year", y = "Cumulative number\nof discovered species")

NOTE: This is just an example of the usage of such a model - additional tests should be performed to examine the validity of the results.