MCMC for ‘Big Data’ with Stan

Faster sampling with CmdStan using within-chain parallelization

Statistics
ML
Bayesian Modeling
Big Data
Stan
R
First Published

June 5, 2022

Summary

This post is an extension (and a translation to R) of PyMC-Labs’ benchmarking of MCMC for “Big Data”.

The Stan code was updated to use within-chain parallelization and compiler optimization for faster CPU sampling. Stan was able to achieve similar sampling speeds as PyMC’s JAX + GPU solution, purely on CPU.


Caution

By design, this post contains very few explanations.

Its goal was simply to translate PyMC’s blog post to R, and update their Stan code to use within-chain parallelization and compiler optimizations, for a fairer speed comparison with PyMC.

Feel free to read the original blog post to better understand what the code is doing.

Tip

You can check the page’s source code by clicking on the </> Code button at the top-right.

Setup


library(here)        # Working directory management
library(fs)          # File & folder manipulation
library(pipebind)    # Piping goodies
library(archive)     # Memory efficient object storage

library(cmdstanr)    # Lightweight R interface for Stan
library(posterior)   # Wrangling Stan models' output

library(data.table)  # Fast data manipulation (in-RAM)

library(tibble)      # Extending data.frames             (Tidyverse)
library(readr)       # Reading data from files           (Tidyverse)
library(dplyr)       # Manipulating data.frames - core   (Tidyverse)
library(tidyr)       # Manipulating data.frames - extras (Tidyverse)
library(stringr)     # Manipulating strings              (Tidyverse)
library(purrr)       # Manipulating lists                (Tidyverse)
library(lubridate)   # Manipulating date/time            (Tidyverse)
library(ggplot2)     # Best plotting library             (Tidyverse)

library(bayesplot)   # PPC/Diagnostic plots for Stan models
library(patchwork)   # Combining plots

options(
  mc.cores = max(1L, parallel::detectCores(logical = TRUE)),
  scipen = 999L, 
  digits = 4L,
  ggplot2.discrete.colour = \() scale_color_viridis_d(),
  ggplot2.discrete.fill = \() scale_fill_viridis_d()
)

nrows_print <- 10

setDTthreads(parallel::detectCores(logical = FALSE))
Adding custom knitr hooks
#---------------------------#
####🔺knitr custom hooks ####
#---------------------------#

library(knitr)

## Adding the `time_it` code chunk option
knitr::knit_hooks$set(time_it = local({
  assign("TIMES", list(), .GlobalEnv)
  start <- NULL
  function(before, options) {
    if (before) start <<- Sys.time()
    else TIMES[[options$label]] <<- difftime(Sys.time(), start)
  }
}))
Applying a custom theme to all ggplot objects, both light and dark versions
#---------------------------#
####🔺ggplot knit_prints ####
#---------------------------#

library(knitr)
library(ggplot2)

## Inspired by: https://debruine.github.io/quarto_demo/dark_mode.html
knit_print.ggplot <- function(x, options, ...) {
  if(any(grepl("patchwork", class(x)))) {
    plot_dark <- x & dark_addon_mar
    plot_light <- x & light_addon_mar
  } else {
    plot_dark <- x + dark_addon_mar
    plot_light <- x + light_addon_mar
  }
  
  cat('\n<div class="light-mode">\n')
  print(plot_light)
  cat('</div>\n')
  cat('<div class="dark-mode">\n')
  print(plot_dark)
  cat('</div>\n\n')
}

registerS3method("knit_print", "ggplot", knit_print.ggplot)
Applying a custom theme to all gt tables
#-----------------------#
####🔺gt knit_prints ####
#-----------------------#

library(knitr)
library(gt)

knit_print.grouped_df <- function(x, options, ...) {
  if ("grouped_df" %in% class(x)) x <- ungroup(x)
  
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

registerS3method("knit_print", "grouped_df", knit_print.grouped_df)

knit_print.data.frame <- function(x, options, ...) {
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

registerS3method("knit_print", "data.frame", knit_print.data.frame)
Functions for interactive data presentation
#----------------------------------#
####🔺knitr interactive display ####
#----------------------------------#

library(htmltools)
library(reactable)

## Getting list to display nicely in rendered documents
make_list_reactable <- function(list_dat) {
  
  list_name <- deparse(substitute(list_dat))
  
  get_list_elt_dim <- function(elt) {
    list_elt <- list_dat[[elt]]
    list_elt_dim <- if (any(c("data.frame", "matrix") %in% class(list_elt))) dim(list_elt) else length(list_elt)
    
    return(paste0(list_elt_dim, collapse = ", "))
  }
  
  dat <- data.frame(names(list_dat)) |> 
    set_names(list_name) |> 
    mutate(
      Type = unlist(pick(list_name)) |> map_chr(\(x) class(list_dat[[x]]) |> paste0(collapse = ", ")),
      Dimensions = unlist(pick(list_name)) |> map_chr(get_list_elt_dim)
    )
  
  get_list_details <- function(dat, idx, max_print = 200, max_digits = 3) {
    Element <- dat[[idx]]
    style <- "padding: 0.5rem"
    
    if (any(c("data.frame", "matrix") %in% class(Element))) {
      reactable(data.frame(Element), outlined = TRUE, striped = TRUE, highlight = TRUE, compact = TRUE) |> 
        htmltools::div(style = style) 
    }
    else if ("list" %in% class(Element)) 
      make_list_reactable(Element)
    else if (length(Element) > max_print) {
      htmltools::div(
        htmltools::p(head(Element, max_print) |> round(max_digits) |> paste0(collapse = ", ") |> paste("...", sep = ", ")),
        htmltools::p(stringr::str_glue("[ omitted {length(Element) - max_print} entries ]"), style = "font-style: italic"),
        style = style
      )
    }
    else htmltools::div(round(Element, max_digits) |> paste0(collapse = ", "), style = style)
  }
  
  reactable(
    dat
    , defaultColDef = colDef(vAlign = "center", headerVAlign = "center")
    , details = \(idx) get_list_details(list_dat, idx)
    , outlined = TRUE
    , striped = TRUE
    , highlight = TRUE
    , compact = FALSE
    , fullWidth = TRUE
    , defaultPageSize = 15
  )
}
Installing CmdStan
## Skip this step if CmdStan is already installed

cmdstanr::check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)

cpp_opts <- list(
  stan_threads = TRUE
  , STAN_CPP_OPTIMS = TRUE
  , STAN_NO_RANGE_CHECKS = TRUE # WARN: remove this if you haven't tested the model
  , PRECOMPILED_HEADERS = TRUE
  # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
  , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
  , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
)

cmdstanr::install_cmdstan(cpp_options = cpp_opts, quiet = TRUE)
Loading CmdStan (if already installed)
highest_cmdstan_version <- dir_ls(config$cmdstan_path) |> 
  path_file() |> 
  keep(\(e) str_detect(e, "cmdstan-")) |> 
  bind(x, str_split(x, '-', simplify = TRUE)[,2]) |> 
  reduce(\(x, y) ifelse(utils::compareVersion(x, y) == 1, x, y))

set_cmdstan_path(str_glue("{config$cmdstan_path}cmdstan-{highest_cmdstan_version}"))
Setting up knitr’s engine for CmdStan
## Inspired by: https://mpopov.com/blog/2020/07/30/replacing-the-knitr-engine-for-stan/

## Note: We could haved use cmdstanr::register_knitr_engine(), 
##       but it wouldn't include compiler optimizations & multi-threading by default

knitr::knit_engines$set(
  cmdstan = function(options) {
    output_var <- options$output.var
    if (!is.character(output_var) || length(output_var) != 1L) {
      stop(
        "The chunk option output.var must be a character string ",
        "providing a name for the returned `CmdStanModel` object."
      )
    }
    if (options$eval) {
      if (options$cache) {
        cache_path <- options$cache.path
        if (length(cache_path) == 0L || is.na(cache_path) || cache_path == "NA") {
          cache_path <- ""
        }
        dir <- paste0(cache_path, options$label)
      } else {
        dir <- tempdir()
      }
      file <- cmdstanr::write_stan_file(options$code, dir = dir, force_overwrite = TRUE)
      mod <- cmdstanr::cmdstan_model(
        stan_file = file, 
        cpp_options = list(
          stan_threads = TRUE
          , STAN_CPP_OPTIMS = TRUE
          , STAN_NO_RANGE_CHECKS = TRUE # The model was already tested
          , PRECOMPILED_HEADERS = TRUE
          # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
          , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
          , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
        ),
        stanc_options = list("O1"),
        force_recompile = TRUE
      )
      assign(output_var, mod, envir = knitr::knit_global())
    }
    options$engine <- "stan"
    code <- paste(options$code, collapse = "\n")
    knitr::engine_output(options, code, '')
  }
)

1 Data


1.1 Matches data

Download the matches’ data

Loading the matches’ data:

matches_data_raw <- (
  read_csv(dir_ls(matches_data_path, regexp = "atp_matches_(.*).csv"), show_col_types = FALSE)
  |> select(tourney_date, tourney_level, round, winner_id, winner_name, loser_id, loser_name, score)
  |> mutate(tourney_date = lubridate::ymd(tourney_date))
)

Filtering and cleaning the combined matches’ data (based on the original post’s data processing):

round_numbers = list(
  "R128" = 1,
  "RR" = 1,
  "R64" = 2,
  "R32" = 3,
  "R16" = 4,
  "QF" = 5,
  "SF" = 6,
  "F" = 7
)

(matches_data_clean <- matches_data_raw 
  |> filter(
    tourney_date %between% c("1968-01-01", "2021-06-20"),
    str_detect(score, "RET|W/O|DEF|nbsp|Def.", negate = TRUE),
    str_length(score) > 4,
    tourney_level != "D",
    round %in% names(round_numbers)
  )
  |> mutate(
    round_number = recode(round, !!!round_numbers),
    label = 1
  )
  |> arrange(tourney_date, round_number)
  |> select(-round, -tourney_level)
)
data.frame [160,399 x 8]
tourney_date winner_id winner_name loser_id loser_name score round_number label
1968-01-19 110 023 Richard Coulthard 107 760 Max Senior 12-10 7-5 4-6 7-5 2 1
1968-01-19 109 803 John Brown 106 964 Ernie Mccabe 6-3 6-2 6-4 2 1
1968-01-19 100 257 Ross Case 110 024 Gondo Widjojo 6-4 3-6 6-3 7-5 2 1
1968-01-19 100 105 Allan Stone 110 025 Robert Layton 6-4 6-2 6-1 2 1
1968-01-19 109 966 Warren Jacques 110 026 Bert Kearney 6-4 6-1 7-5 2 1
1968-01-19 107 759 Max Pettman 110 027 Takesji Tsujimoto 6-4 6-1 6-2 2 1
1968-01-19 100 101 Mike Belkin 110 028 M Marchment 6-2 3-6 6-4 9-7 2 1
1968-01-19 100 025 Barry Phillips Moore 108 430 Tony Dawson 6-3 6-0 6-3 2 1
1968-01-19 108 519 William Coghlan 110 029 Peter Oatey 6-0 6-2 9-11 6-3 2 1
1968-01-19 109 799 Geoff Pollard 110 030 Christian Janssens 6-4 6-2 6-4 2 1
1968-01-19 100 146 Jun Kamiwazumi 110 031 Brian Connor 8-6 6-4 6-2 2 1
1968-01-19 100 180 Bill Lloyd 110 032 H Nielson 6-2 7-5 6-4 2 1
1968-01-19 100 013 Neale Fraser 110 033 R Harvey 6-1 6-1 6-0 2 1
1968-01-19 110 034 Merv Guse 110 035 J May 6-1 6-2 6-2 2 1
1968-01-19 100 174 Manuel Orantes 110 036 L Weatherhog 6-4 6-0 6-2 2 1
[ omitted 160,384 entries ]

1.2 Player data

Download the players’ data

Loading the raw player data:

player_data_path <- here("res", "data", "tennis", "atp_players.csv")

(player_data_raw <- read_csv(player_data_path, show_col_types = FALSE) 
 |> mutate(player_name = str_c(name_first, name_last, sep = " "))
 |> select(player_id, player_name)
)
data.frame [55,649 x 2]
player_id player_name
100 001 Gardnar Mulloy
100 002 Pancho Segura
100 003 Frank Sedgman
100 004 Giuseppe Merlo
100 005 Richard Gonzalez
100 006 Grant Golden
100 007 Abe Segal
100 008 Kurt Nielsen
100 009 Istvan Gulyas
100 010 Luis Ayala
100 011 Torben Ulrich
100 012 Nicola Pietrangeli
100 013 Neale Fraser
100 014 Trevor Fancutt
100 015 Sammy Giammalva
[ omitted 55,634 entries ]

Filtering player_data to only keep the players actually present in our data, and updating their IDs:

(player_data <- with(matches_data_clean, tibble(player_id = union(winner_id, loser_id)))
 |> arrange(player_id)
 |> summarize(player_idx = cur_group_id(), .by = player_id)
 |> left_join(player_data_raw, join_by(player_id))
)
data.frame [4,830 x 3]
player_id player_idx player_name
100 001 1 Gardnar Mulloy
100 002 2 Pancho Segura
100 003 3 Frank Sedgman
100 004 4 Giuseppe Merlo
100 005 5 Richard Gonzalez
100 006 6 Grant Golden
100 007 7 Abe Segal
100 009 8 Istvan Gulyas
100 010 9 Luis Ayala
100 011 10 Torben Ulrich
100 012 11 Nicola Pietrangeli
100 013 12 Neale Fraser
100 014 13 Trevor Fancutt
100 015 14 Sammy Giammalva
100 016 15 Ken Rosewall
[ omitted 4,815 entries ]

1.3 Matches + Player data

Allocating the new player IDs (player_idx) to the winner_id and loser_id from matches_data:

(matches_data <- left_join(matches_data_clean, player_data, by = c("winner_id" = "player_id")) 
 |> rename(winner_idx = player_idx) 
 |> relocate(winner_idx, .after = winner_id) 
 |> left_join(player_data, by = c("loser_id" = "player_id")) 
 |> rename(loser_idx = player_idx) 
 |> relocate(loser_idx, .after = loser_id)
 |> drop_na(winner_idx, loser_idx)
 |> select(-matches("player_name"))
)
data.frame [160,399 x 10]
tourney_date winner_id winner_idx winner_name loser_id loser_idx loser_name score round_number label
1968-01-19 110 023 3 655 Richard Coulthard 107 760 3 129 Max Senior 12-10 7-5 4-6 7-5 2 1
1968-01-19 109 803 3 440 John Brown 106 964 2 909 Ernie Mccabe 6-3 6-2 6-4 2 1
1968-01-19 100 257 253 Ross Case 110 024 3 656 Gondo Widjojo 6-4 3-6 6-3 7-5 2 1
1968-01-19 100 105 103 Allan Stone 110 025 3 657 Robert Layton 6-4 6-2 6-1 2 1
1968-01-19 109 966 3 600 Warren Jacques 110 026 3 658 Bert Kearney 6-4 6-1 7-5 2 1
1968-01-19 107 759 3 128 Max Pettman 110 027 3 659 Takesji Tsujimoto 6-4 6-1 6-2 2 1
1968-01-19 100 101 99 Mike Belkin 110 028 3 660 M Marchment 6-2 3-6 6-4 9-7 2 1
1968-01-19 100 025 24 Barry Phillips Moore 108 430 3 325 Tony Dawson 6-3 6-0 6-3 2 1
1968-01-19 108 519 3 349 William Coghlan 110 029 3 661 Peter Oatey 6-0 6-2 9-11 6-3 2 1
1968-01-19 109 799 3 436 Geoff Pollard 110 030 3 662 Christian Janssens 6-4 6-2 6-4 2 1
1968-01-19 100 146 144 Jun Kamiwazumi 110 031 3 663 Brian Connor 8-6 6-4 6-2 2 1
1968-01-19 100 180 178 Bill Lloyd 110 032 3 664 H Nielson 6-2 7-5 6-4 2 1
1968-01-19 100 013 12 Neale Fraser 110 033 3 665 R Harvey 6-1 6-1 6-0 2 1
1968-01-19 110 034 3 666 Merv Guse 110 035 3 667 J May 6-1 6-2 6-2 2 1
1968-01-19 100 174 172 Manuel Orantes 110 036 3 668 L Weatherhog 6-4 6-0 6-2 2 1
[ omitted 160,384 entries ]

2 Model


2.1 Stan code

Updated Stan code with within-chain parallelization

tennis_model
functions {
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1 : num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }

  // Compute partial sums of the log-likelihood
  real partial_log_lik_lpmf(array[] int seq, int start, int end,
                            data array[] int labels, 
                            data array[] int winner_ids, 
                            data array[] int loser_ids, 
                            vector player_skills) {
    real ptarget = 0;
    int N = end - start + 1;

    vector[N] mu = rep_vector(0.0, N);
    for (n in 1 : N) {
      int nn = n + start - 1;
      mu[n] += player_skills[winner_ids[nn]] - player_skills[loser_ids[nn]];
    }
    ptarget += bernoulli_logit_lpmf(labels[start : end] | mu);
    return ptarget;
  }
}
data {
    int n_players;
    int n_matches;
    
    array[n_matches] int<lower=1, upper=n_players> winner_ids; // Winner of game n
    array[n_matches] int<lower=1, upper=n_players> loser_ids;  // Loser of game n
    array[n_matches] int<lower=0, upper=1> labels;             // Always 1 in this model
    
    int grainsize;
}
transformed data {
    array[n_matches] int seq = sequence(1, n_matches);
}
parameters {
    real<lower=0> player_sd;          // Scale of ability variation (hierarchical prior)
    vector[n_players] player_skills;  // Ability of player k
}
model {   
  player_sd ~ std_normal();
  player_skills ~ normal(0, player_sd);
    
  target += reduce_sum(
    partial_log_lik_lpmf, seq, grainsize, 
    labels, winner_ids, loser_ids, player_skills
  );
}

2.2 Stan data

tennis_stan_data <- list(
  n_matches = nrow(matches_data),
  n_players = with(matches_data, length(union(winner_id, loser_id))),
  winner_ids = matches_data$winner_idx,
  loser_ids = matches_data$loser_idx,
  labels = matches_data$label,
  grainsize = max(100L, round(nrow(matches_data) / 60))
)

2.3 Model fit

tennis_mod_fit <- tennis_model$sample(
  data = tennis_stan_data, seed = 256,
  iter_warmup = 1000, iter_sampling = 1000, refresh = 0,
  chains = 4, parallel_chains = 4, threads_per_chain = 7
)
Note

Sampling takes ~2.69 minutes on my CPU (Ryzen 5950X, 16 Cores/32 Threads), on WSL2 (Ubuntu 22)

data.table [4 x 2]
Chain Time
1 162.915s (~2.72 minutes)
2 162.299s (~2.7 minutes)
3 162.62s (~2.71 minutes)
4 157.238s (~2.62 minutes)

3 Model diagnostics


mcmc_neff_hist(neff_ratio(tennis_mod_fit))
mcmc_rhat_hist(rhat(tennis_mod_fit))

Plotting the traces & acf for a random subsets of players:

players_subset <- paste0("player_skills[", sample(player_data$player_idx, 5), "]")

tennis_draws_subset <- tennis_mod_fit$draws(variables = players_subset, format = "draws_list")
wrap_plots(
  mcmc_hist(tennis_draws_subset, facet_args = list(nrow = length(tennis_draws_subset[[1]]))),
  mcmc_trace(tennis_draws_subset, facet_args = list(nrow = length(tennis_draws_subset[[1]]))),
  widths = c(1, 1.5)
)
mcmc_acf(tennis_draws_subset)

Everything seems good enough.

4 Posterior Predictions


4.1 Posterior data

Getting our Posterior Predictions into long format and joining the result with player_data:

(player_skills <- tennis_mod_fit$draws(variables = "player_skills")
  |> bind(x, subset_draws(x, "player_skills", regex = TRUE, draw = sample.int(ndraws(x), size = 500)))
  |> as.data.table()
  |> _[, .(player_skills = list(value)), by = variable
  ][, let(player_idx = as.integer(str_extract(variable, "\\d{1,4}")), variable = NULL)
  ][, let(skill_mean = sapply(player_skills, mean), skill_sd = sapply(player_skills, sd))
  ][as.data.table(player_data), on = "player_idx", nomatch = NULL
  ][order(-skill_mean), .(player_name, player_id, player_idx, skill_mean, skill_sd, player_skills)]
)
data.table [4,830 x 6]
player_name player_id player_idx skill_mean skill_sd player_skills
Novak Djokovic 104 925 2 415 3.54 0.095 <numeric [500]>
Rafael Nadal 104 745 2 367 3.431 0.091 <numeric [500]>
Roger Federer 103 819 2 082 3.319 0.077 <numeric [500]>
Bjorn Borg 100 437 423 3.252 0.095 <numeric [500]>
Ivan Lendl 100 656 613 3.229 0.081 <numeric [500]>
John McEnroe 100 581 553 3.183 0.091 <numeric [500]>
Jimmy Connors 100 284 280 3.162 0.071 <numeric [500]>
Rod Laver 100 029 28 3.022 0.114 <numeric [500]>
Andy Murray 104 918 2 413 2.926 0.094 <numeric [500]>
Pete Sampras 101 948 1 408 2.922 0.085 <numeric [500]>
Boris Becker 101 414 1 127 2.832 0.093 <numeric [500]>
Andre Agassi 101 736 1 310 2.777 0.082 <numeric [500]>
Stefan Edberg 101 222 993 2.74 0.083 <numeric [500]>
Andy Roddick 104 053 2 157 2.704 0.096 <numeric [500]>
Juan Martin del Potro 105 223 2 503 2.678 0.104 <numeric [500]>
[ omitted 4,815 entries ]

4.2 Posterior plots

ridgeline_plot
ridgeline_plot <- function(dat, var) {
  dat <- dat[, .(player_skills = unlist(player_skills)), by = setdiff(names(dat), 'player_skills')
     ][, player_name := factor(player_name, levels = rev(unique(player_name)))]
  
  return(
    ggplot(dat, aes(player_skills, y = {{ var }}, fill = {{ var }}))
    + geom_ribbon(
      stat = "density", outline.type = "upper", color = "grey30",
      aes(
        fill = stage({{ var }}, after_scale = alpha(fill, 0.5)),
        ymin = after_stat(group),
        ymax = after_stat(group + ndensity * 1.6)
      )
    )
    * ggblend::blend("multiply")
    + labs(x = "Player Skills", y = "")
    + scale_y_discrete(position = "right", labels = \(x) str_replace_all(x, "\\s", "\n"))
    + theme(legend.position = "none", axis.line.y = element_blank())
  )
}

Plotting the player_skills posteriors of the top 10 players:

head(player_skills, 10) |> ridgeline_plot(player_name)

─ Session info ───────────────────────────────────────────────────────────────
 setting        value
 version        R version 4.3.1 (2023-06-16)
 os             Ubuntu 22.04.3 LTS
 system         x86_64, linux-gnu
 ui             X11
 language       (EN)
 collate        C.UTF-8
 ctype          C.UTF-8
 tz             Europe/Paris
 date           2024-02-07
 pandoc         3.1.11
 Quarto         1.5.9
 Stan (CmdStan) 2.33.1

─ Packages ───────────────────────────────────────────────────────────────────
 ! package    * version date (UTC) lib source
 P archive    * 1.1.6   2023-09-18 [?] CRAN (R 4.3.1)
 P bayesplot  * 1.10.0  2022-11-16 [?] CRAN (R 4.3.0)
 P cmdstanr   * 0.6.1   2023-09-13 [?] local
 P crayon     * 1.5.2   2022-09-29 [?] CRAN (R 4.3.0)
 P data.table * 1.15.0  2024-01-30 [?] CRAN (R 4.3.1)
 P dplyr      * 1.1.4   2023-11-17 [?] CRAN (R 4.3.1)
 P fs         * 1.6.3   2023-07-20 [?] CRAN (R 4.3.0)
 P ggplot2    * 3.4.4   2023-10-12 [?] CRAN (R 4.3.1)
 P gt         * 0.10.0  2023-10-07 [?] CRAN (R 4.3.1)
 P here       * 1.0.1   2020-12-13 [?] CRAN (R 4.3.0)
 P htmltools  * 0.5.6.1 2023-10-06 [?] CRAN (R 4.3.1)
 P knitr      * 1.44    2023-09-11 [?] CRAN (R 4.3.0)
 P lubridate  * 1.9.3   2023-09-27 [?] CRAN (R 4.3.1)
 P patchwork  * 1.1.3   2023-08-14 [?] CRAN (R 4.3.0)
 P pipebind   * 0.1.2   2023-08-30 [?] CRAN (R 4.3.0)
 P posterior  * 1.4.1   2023-03-14 [?] CRAN (R 4.3.0)
 P purrr      * 1.0.2   2023-08-10 [?] CRAN (R 4.3.0)
 P reactable  * 0.4.4   2023-03-12 [?] CRAN (R 4.3.0)
 P readr      * 2.1.4   2023-02-10 [?] CRAN (R 4.3.0)
 P stringr    * 1.5.0   2022-12-02 [?] CRAN (R 4.3.0)
 P tibble     * 3.2.1   2023-03-20 [?] CRAN (R 4.3.0)
 P tidyr      * 1.3.0   2023-01-24 [?] CRAN (R 4.3.0)

 [1] /home/mar/Dev/Projects/R/ma-riviere.com/renv/library/R-4.3/x86_64-pc-linux-gnu
 [2] /home/mar/.cache/R/renv/sandbox/R-4.3/x86_64-pc-linux-gnu/9a444a72

 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────
Back to top

Citation

BibTeX citation:
@online{rivière2022,
  author = {Rivière, Marc-Aurèle},
  title = {MCMC for “{Big} {Data}” with {Stan}},
  date = {2022-06-05},
  url = {https://ma-riviere.com/content/code/posts/big-bayes},
  langid = {en},
  abstract = {This post is an extension (and a translation to R) of
    {[}PyMC-Labs’
    benchmarking{]}(https://www.pymc-labs.io/blog-posts/pymc-stan-benchmark/)
    of MCMC for “Big Data”. The Stan code was updated to use
    {[}within-chain
    parallelization{]}(https://mc-stan.org/docs/2\_30/stan-users-guide/reduce-sum.html)
    and {[}compiler
    optimization{]}(https://mc-stan.org/docs/2\_30/stan-users-guide/optimization.html)
    for faster CPU sampling. Stan was able to achieve similar sampling
    speeds as PyMC’s JAX + GPU solution, purely on CPU.}
}
For attribution, please cite this work as:
Rivière, M.-A. (2022, June 5). MCMC for “Big Data” with Stan. https://ma-riviere.com/content/code/posts/big-bayes