Summary

This post is a transposition to R of Ethan Rosenthal’s blog post on modeling Rock Climbing route difficulty using a Bayesian IRT (Item Response Theory) model.

The original Stan code was updated to use within-chain parallelization and compiler optimization for faster CPU sampling.

Several data processing solutions are showcased, using either data.table or dbplyr (with a DuckDB backend), with timings to compare their speed.

Caution

By design, this post contains very few explanations.

Its goal was simply to translate Ethan’s blog post to R, and update his Stan code to use within-chain parallelization and compiler optimizations, for faster sampling.

Feel free to read the original blog post to better understand what the code is doing.

Tip

You can check the page’s source code by clicking on the </> Code button at the top-right.

Setup

library(here)        # Working directory management
library(fs)          # File & folder manipulation
library(pipebind)    # Piping goodies
library(archive)     # Memory efficient object storage

library(cmdstanr)    # Lightweight R interface for Stan
library(posterior)   # Wrangling Stan models' output

library(data.table)  # Fast data manipulation (in-RAM)
library(duckdb)      # DuckDB R interface

library(dplyr)       # Manipulating data.frames - core   (Tidyverse)
library(tidyr)       # Manipulating data.frames - extras (Tidyverse)
library(dbplyr)      # DB/SQL backend for dplyr/tidyr    (Tidyverse)
library(stringr)     # Manipulating strings              (Tidyverse)
library(purrr)       # Manipulating lists                (Tidyverse)
library(lubridate)   # Manipulating date/time            (Tidyverse)
library(ggplot2)     # Best plotting library             (Tidyverse)

library(plotly)      # Interactive plots
library(bayesplot)   # PPC/Diagnostic plots for Stan models
library(patchwork)   # Combining plots

options(
  mc.cores = max(1L, parallel::detectCores(logical = TRUE)),
  scipen = 999L, 
  digits = 4L,
  ggplot2.discrete.colour = \() scale_color_viridis_d(),
  ggplot2.discrete.fill = \() scale_fill_viridis_d()
)

nrows_print <- 10

setDTthreads(parallel::detectCores(logical = FALSE))

Quarto/knitr setup

Adding custom knitr hooks

#---------------------------#
####🔺knitr custom hooks ####
#---------------------------#

library(knitr)

## Adding the `time_it` code chunk option
knitr::knit_hooks$set(time_it = local({
  assign("TIMES", list(), .GlobalEnv)
  start <- NULL
  function(before, options) {
    if (before) start <<- Sys.time()
    else TIMES[[options$label]] <<- difftime(Sys.time(), start)
  }
}))

Applying a custom theme to all ggplot objects, both light and dark versions

#---------------------------#
####🔺ggplot knit_prints ####
#---------------------------#

library(knitr)
library(ggplot2)

## Inspired by: https://debruine.github.io/quarto_demo/dark_mode.html
knit_print.ggplot <- function(x, options, ...) {
  if(any(grepl("patchwork", class(x)))) {
    plot_dark <- x & dark_addon_mar
    plot_light <- x & light_addon_mar
  } else {
    plot_dark <- x + dark_addon_mar
    plot_light <- x + light_addon_mar
  }
  
  cat('\n<div class="light-mode">\n')
  print(plot_light)
  cat('</div>\n')
  cat('<div class="dark-mode">\n')
  print(plot_dark)
  cat('</div>\n\n')
}

registerS3method("knit_print", "ggplot", knit_print.ggplot)

Applying a custom theme to all gt tables

#-----------------------#
####🔺gt knit_prints ####
#-----------------------#

library(knitr)
library(gt)

knit_print.grouped_df <- function(x, options, ...) {
  if ("grouped_df" %in% class(x)) x <- ungroup(x)
  
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

registerS3method("knit_print", "grouped_df", knit_print.grouped_df)

knit_print.data.frame <- function(x, options, ...) {
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

registerS3method("knit_print", "data.frame", knit_print.data.frame)

Functions for interactive data presentation

#----------------------------------#
####🔺knitr interactive display ####
#----------------------------------#

library(htmltools)
library(reactable)

## Getting list to display nicely in rendered documents
make_list_reactable <- function(list_dat) {
  
  list_name <- deparse(substitute(list_dat))
  
  get_list_elt_dim <- function(elt) {
    list_elt <- list_dat[[elt]]
    list_elt_dim <- if (any(c("data.frame", "matrix") %in% class(list_elt))) dim(list_elt) else length(list_elt)
    
    return(paste0(list_elt_dim, collapse = ", "))
  }
  
  dat <- data.frame(names(list_dat)) |> 
    set_names(list_name) |> 
    mutate(
      Type = unlist(pick(list_name)) |> map_chr(\(x) class(list_dat[[x]]) |> paste0(collapse = ", ")),
      Dimensions = unlist(pick(list_name)) |> map_chr(get_list_elt_dim)
    )
  
  get_list_details <- function(dat, idx, max_print = 200, max_digits = 3) {
    Element <- dat[[idx]]
    style <- "padding: 0.5rem"
    
    if (any(c("data.frame", "matrix") %in% class(Element))) {
      reactable(data.frame(Element), outlined = TRUE, striped = TRUE, highlight = TRUE, compact = TRUE) |> 
        htmltools::div(style = style) 
    }
    else if ("list" %in% class(Element)) 
      make_list_reactable(Element)
    else if (length(Element) > max_print) {
      htmltools::div(
        htmltools::p(head(Element, max_print) |> round(max_digits) |> paste0(collapse = ", ") |> paste("...", sep = ", ")),
        htmltools::p(stringr::str_glue("[ omitted {length(Element) - max_print} entries ]"), style = "font-style: italic"),
        style = style
      )
    }
    else htmltools::div(round(Element, max_digits) |> paste0(collapse = ", "), style = style)
  }
  
  reactable(
    dat
    , defaultColDef = colDef(vAlign = "center", headerVAlign = "center")
    , details = \(idx) get_list_details(list_dat, idx)
    , outlined = TRUE
    , striped = TRUE
    , highlight = TRUE
    , compact = FALSE
    , fullWidth = TRUE
    , defaultPageSize = 15
  )
}

Stan setup

Installing CmdStan

## Skip this step if CmdStan is already installed

cmdstanr::check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)

cpp_opts <- list(
  stan_threads = TRUE
  , STAN_CPP_OPTIMS = TRUE
  , STAN_NO_RANGE_CHECKS = TRUE # WARN: remove this if you haven't tested the model
  , PRECOMPILED_HEADERS = TRUE
  # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
  , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
  , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
)

cmdstanr::install_cmdstan(cpp_options = cpp_opts, quiet = TRUE)

Loading CmdStan (if already installed)

highest_cmdstan_version <- dir_ls(config$cmdstan_path) |> 
  path_file() |> 
  keep(\(e) str_detect(e, "cmdstan-")) |> 
  bind(x, str_split(x, '-', simplify = TRUE)[,2]) |> 
  reduce(\(x, y) ifelse(utils::compareVersion(x, y) == 1, x, y))

set_cmdstan_path(str_glue("{config$cmdstan_path}cmdstan-{highest_cmdstan_version}"))

Setting up knitr’s engine for CmdStan

## Inspired by: https://mpopov.com/blog/2020/07/30/replacing-the-knitr-engine-for-stan/

## Note: We could haved use cmdstanr::register_knitr_engine(), 
##       but it wouldn't include compiler optimizations & multi-threading by default

knitr::knit_engines$set(
  cmdstan = function(options) {
    output_var <- options$output.var
    if (!is.character(output_var) || length(output_var) != 1L) {
      stop(
        "The chunk option output.var must be a character string ",
        "providing a name for the returned `CmdStanModel` object."
      )
    }
    if (options$eval) {
      if (options$cache) {
        cache_path <- options$cache.path
        if (length(cache_path) == 0L || is.na(cache_path) || cache_path == "NA") {
          cache_path <- ""
        }
        dir <- paste0(cache_path, options$label)
      } else {
        dir <- tempdir()
      }
      file <- cmdstanr::write_stan_file(options$code, dir = dir, force_overwrite = TRUE)
      mod <- cmdstanr::cmdstan_model(
        stan_file = file, 
        cpp_options = list(
          stan_threads = TRUE
          , STAN_CPP_OPTIMS = TRUE
          , STAN_NO_RANGE_CHECKS = TRUE # The model was already tested
          , PRECOMPILED_HEADERS = TRUE
          # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
          , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
          , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
        ),
        stanc_options = list("O1"),
        force_recompile = TRUE
      )
      assign(output_var, mod, envir = knitr::knit_global())
    }
    options$engine <- "stan"
    code <- paste(options$code, collapse = "\n")
    knitr::engine_output(options, code, '')
  }
)

1 Data

1.1 Extracting the data

Connecting to the .sqlite DB (using DuckDB instead of SQLite):

con <- dbConnect(duckdb(), dbdir = ":memory:")

db_path <- here("res", "data", "climbers.sqlite")

INSTALL sqlite;
LOAD sqlite;
CALL sqlite_attach(?db_path);

dbplyr automatically translates dplyr/tidyr code into SQL !

dbplyr
SQL

(list(
    ## Table 1: ascent
    tbl(con, "ascent") 
    |> filter(country %like% "USA") 
    |> mutate(
        route_id = str_c(
          str_replace_all(crag, ' ', '_'), "__", 
          str_replace_all(name, ' ', '_'), "__", 
          if_else(climb_type == 1, 'boulder', 'rope')
        ),
        ascent_date = to_timestamp(date)
    ) 
    |> select(user_id, route_id, climb_type, grade_id, method_id, ascent_date),
    ## Table 2: grade
    tbl(con, "grade") |> select(grade_id = id, usa_routes, usa_boulders),
    ## Table 3: method
    tbl(con, "method") |> select(method_id = id, method_name = name)
  )
  |> reduce(left_join)
  |> select(-grade_id, -method_id)
  |> compute("climbs", overwrite = TRUE)
)

Time difference of 0.5235 secs

SELECT
  ascent.user_id
  , REPLACE(ascent.crag, ' ', '_')
      || '__' || REPLACE(ascent.name, ' ', '_')
      || '__' || CASE WHEN ascent.climb_type = 1 THEN 'boulder' ELSE 'rope' END
      AS route_id
  , ascent.climb_type as climb_type
  , to_timestamp(ascent.date) AS ascent_date
  , grade.usa_routes
  , grade.usa_boulders
  , method.name AS method_name
FROM ascent
JOIN grade ON grade.id = ascent.grade_id
JOIN method ON method.id = ascent.method_id
WHERE ascent.country = 'USA'

Time difference of 0.487 secs

data.frame [658,822 x 7]

user_id	route_id	climb_type	ascent_date	usa_routes	usa_boulders	method_name
13 733	Vantage__Black_Belt_Internet_Spray_Masters__rope	0	2010-05-07 22:00:00	5.10a	V3	Flash
13 733	Vantage__Choreography__rope	0	2010-05-07 22:00:00	5.10b	V3/4	Onsight
13 733	Vantage__Awareness_Drooling__rope	0	2010-05-07 22:00:00	5.10c	V4	Onsight
13 733	Vantage__Corroding_Through_Nocturnal_Silk__rope	0	2010-05-07 22:00:00	5.8	V1	Onsight
13 733	Vantage__Cornivorous__rope	0	2010-05-08 22:00:00	5.9	V2	Flash
13 733	Vantage__Clipping_the_Cornea__rope	0	2010-05-08 22:00:00	5.7	V1	Flash
27 963	Bishop__Birthday_Direct__boulder	1	2010-05-11 22:00:00	5.10a	V3	Redpoint
13 733	Vantage__Decortication_in_a_Clinical_Stupor__rope	0	2010-05-08 22:00:00	5.10b	V3/4	Flash
13 733	Vantage__A_disease_called_life__rope	0	2010-05-08 22:00:00	5.9	V2	Flash
13 733	Vantage__Think_it_rhymes_with_luck__rope	0	2010-05-08 22:00:00	5.10a	V3	Flash
13 733	Vantage__Oh_Coulee,_Oh_Coulee__rope	0	2010-05-07 22:00:00	5.8	V1	Onsight
27 963	Bishop__Nan_Bread__boulder	1	2010-05-11 22:00:00	5.10c	V4	Onsight
27 963	Bishop__Prow__boulder	1	2010-05-11 22:00:00	5.9	V2	Flash
24 475	Area_74__Backseat_Spooning__boulder	1	2010-05-09 22:00:00	5.12d	V9	Redpoint
18 656	Flagstaff__That_Flakes_It_SDS__boulder	1	2010-05-07 22:00:00	5.13a	V10	Redpoint
[ omitted 658,807 entries ]

1.2 Processing the data

route_ratings <- c(
  str_c("5.", 1:9), 
  map(str_c("5.", 10:15), \(x) str_c(x, letters[1:4])) |> unlist()
)

bouldering_grades <- str_c("V", 0:20)

## Mode for non-numerical data
mode_cat <- function(x) {
  x <- sort(na.omit(as.character(x)))
  unique(x)[which.max(tabulate(match(x, unique(x))))]
}

data.table
dbplyr

threshold_ascents_dt

climbs <- setDT(climbs_df) |> setkey(route_id, user_id)

threshold_ascents_dt <- function(current, lim = 20) {
  new <- current[, if(.N >= lim) .SD, by = user_id][, if(.N >= lim) .SD, by = route_id]
  
  if (nrow(current) != nrow(new)) threshold_ascents_dt(new, lim)
  else return(new)
}

climbs_clean <- (
  climbs[
    order(user_id, route_id, ascent_date, usa_routes, usa_boulders, method_name), .SD[1], 
    by = .(user_id, route_id)
  ][, let(
      usa_boulders = factor(usa_boulders, levels = bouldering_grades),
      usa_routes = factor(usa_routes, levels = route_ratings),
      label = as.integer(method_name %chin% c("Onsight", "Flash"))
    )
  ][, let(route_rating = mode_cat(usa_routes), bouldering_grade = mode_cat(usa_boulders)), by = route_id
  ][, threshold_ascents_dt(.SD)
  ][, let(
      route_idx = frank(route_id, ties.method = "dense"),
      user_idx = frank(user_id, ties.method = "dense")
    )
  ][order(route_idx, user_idx)
    , .(route_idx, route_id, user_idx, climb_type, ascent_date, route_rating, bouldering_grade, label)
  ]
)

Time difference of 10.82 secs

Alternative

climbs_first <- climbs[
    order(user_id, route_id, ascent_date, usa_routes, usa_boulders, method_name), .SD[1], 
    by = .(user_id, route_id)
  ]

climbs_clean <- (
  copy(climbs_first)[
    , threshold_ascents_dt(.SD)
  ][
    ## Replacing all route_ratings for a given route_id by its mode
    climbs_first[
        usa_boulders %in% bouldering_grades
      ][, .(bouldering_grade = mode_cat(usa_boulders)), by = route_id], 
    c("route_id", "bouldering_grade") := list(i.route_id, i.bouldering_grade), 
    on = "route_id"
  ][
    ## Replacing all bouldering_grades for a given route_id by its mode
    climbs_first[
        usa_routes %in% route_ratings
      ][, .(route_rating = mode_cat(usa_routes)), by = route_id], 
    c("route_id", "route_rating") := list(i.route_id, i.route_rating), 
    on = "route_id"
  ][, let(
      route_idx = frank(route_id, ties.method = "dense"),
      user_idx = frank(user_id, ties.method = "dense"),
      label = as.integer(method_name %chin% c("Onsight", "Flash"))
  )
  ][order(route_idx, user_idx)
    , .(route_idx, route_id, user_idx, climb_type, ascent_date, route_rating, bouldering_grade, label)
  ]
)

Time difference of 9.652 secs

threshold_ascents_dbp

threshold_ascents_dbp <- function(current, lim = 20) {
  new <- current |> filter(n() >= lim, .by = user_id) |> filter(n() >= lim, .by = route_id) |> collect()

  if (pull(count(current), n) != nrow(new)) threshold_ascents_dbp(new, lim)
  else {
    duckdb_register(con, "ascent_temp", new)
    return(tbl(con, "ascent_temp"))
  }
}

(tbl(con, "climbs")
  |> slice_min(
    tibble(ascent_date, usa_routes, usa_boulders, method_name), with_ties = FALSE, 
    by = c(user_id, route_id)
  )
  |> compute("climbs_first", overwrite = TRUE)
)

(tbl(con, "climbs_first")
  |> threshold_ascents_dbp()
  ## Replacing all route_ratings for a given route_id by its mode
  |> left_join(
    tbl(con, "climbs_first")
      |> filter(usa_routes %in% route_ratings)
      |> count(route_id, usa_routes)
      |> slice_max(n, by = route_id)
      |> summarize(route_rating = min(usa_routes), .by = route_id),
    by = "route_id"
  )
  ## Replacing all bouldering_grades for a given route_id by its mode
  |> left_join(
    tbl(con, "climbs_first")
      |> filter(usa_boulders %in% bouldering_grades)
      |> count(route_id, usa_boulders)
      |> slice_max(n, by = route_id)
      |> summarize(bouldering_grade = min(usa_boulders), .by = route_id),
    by = "route_id"
  )
  |> mutate(
    route_idx = dense_rank(route_id), 
    user_idx = dense_rank(user_id),
    label = as.integer(method_name %in% c("Onsight", "Flash"))
  )
  |> select(route_idx, route_id, user_idx, climb_type, ascent_date, route_rating, bouldering_grade, label)
  |> arrange(route_idx, user_idx)
  |> compute("climbs_clean", overwrite = TRUE)
)

Time difference of 1.78 secs

data.table [232,887 x 8]

route_idx	route_id	user_idx	climb_type	ascent_date	route_rating	bouldering_grade	label
1	221__Be_awesome__boulder	1 386	1	2017-06-09 22:00:00	5.12a	V7	0
1	221__Be_awesome__boulder	1 601	1	2015-03-18 23:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	1 832	1	2013-06-29 22:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 032	1	2013-12-26 23:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 075	1	2013-06-21 22:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 110	1	2013-07-16 22:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 134	1	2013-10-23 22:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 149	1	2013-06-21 22:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 157	1	2013-07-11 22:00:00	5.12a	V7	0
1	221__Be_awesome__boulder	2 170	1	2014-05-26 22:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 177	1	2013-06-28 22:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 234	1	2013-08-02 22:00:00	5.12a	V7	1
1	221__Be_awesome__boulder	2 236	1	2013-10-20 22:00:00	5.12a	V7	0
1	221__Be_awesome__boulder	2 261	1	2012-09-23 22:00:00	5.12a	V7	0
1	221__Be_awesome__boulder	2 338	1	2013-06-21 22:00:00	5.12a	V7	1
[ omitted 232,872 entries ]

🔍 Checking that both solutions give the same result

all.equal(
  collect(tbl(con, "climbs_clean")), 
  climbs_clean, 
  check.attributes = FALSE
)

[1] TRUE

2 Model

2.1 Stan code

Updated Stan code using within-chain parallelization

climbing_model

functions {
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1 : num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }

  // Compute partial sums of the log-likelihood
  real partial_log_lik_lpmf(array[] int seq, int start, int end,
                            data array[] int labels, real mean_ability,
                            data array[] int users, vector user_ability,
                            data array[] int routes, vector route_difficulty) {
    real ptarget = 0;
    int N = end - start + 1;

    vector[N] mu = mean_ability + rep_vector(0.0, N);
    for (n in 1 : N) {
      int nn = n + start - 1;
      mu[n] += user_ability[users[nn]] - route_difficulty[routes[nn]];
    }
    ptarget += bernoulli_logit_lpmf(labels[start : end] | mu);
    return ptarget;
  }
}

data {
  int<lower=1> num_ascents;
  int<lower=1> num_users;
  int<lower=1> num_routes;
  array[num_ascents] int<lower=1, upper=num_users> users;
  array[num_ascents] int<lower=1, upper=num_routes> routes;
  array[num_ascents] int<lower=0, upper=1> labels;

  int grainsize;
}

transformed data {
  array[num_ascents] int seq = sequence(1, num_ascents);
}

parameters {
  real mean_ability;
  vector[num_users] user_ability;
  vector[num_routes] route_difficulty;
}

model {
  user_ability ~ std_normal();
  route_difficulty ~ std_normal();
  mean_ability ~ std_normal();

  target += reduce_sum(
    partial_log_lik_lpmf, seq, grainsize, 
    labels, mean_ability, users, user_ability, routes, route_difficulty
  );
}

2.2 Stan data

climbing_stan_data <- list(
  num_ascents = nrow(climbs_clean),
  num_users = n_distinct(climbs_clean$user_id),
  num_routes = n_distinct(climbs_clean$route_id),
  routes = pull(climbs_clean, route_idx),
  users = pull(climbs_clean, user_idx),
  labels = pull(climbs_clean, label) |> as.integer(),
  grainsize = max(100, round(nrow(climbs_clean) / 50))
)

2.3 Model fit

climbing_mod_fit <- climbing_model$sample(
  data = climbing_stan_data, seed = 666,
  iter_warmup = 500, iter_sampling = 1000, refresh = 0,
  chains = 6, parallel_chains = 6, threads_per_chain = 5
)

Note

Sampling takes ~4.81 minutes on my CPU (Ryzen 5950X, 16 Cores/32 Threads), on WSL2 (Ubuntu 22)

data.table [6 x 2]

Chain	Time
1	282.858s (~4.71 minutes)
2	294.551s (~4.91 minutes)
3	293.979s (~4.9 minutes)
4	288.381s (~4.81 minutes)
5	285.559s (~4.76 minutes)
6	285.031s (~4.75 minutes)

3 Model diagnostics

mcmc_neff_hist(neff_ratio(climbing_mod_fit))

mcmc_rhat_hist(rhat(climbing_mod_fit))

Plotting random subsets of the traces:

hist_trace_plot

hist_trace_plot <- function(mod, vars) {
  draws <- mod$draws(variables = vars, format = "draws_list")
  wrap_plots(
    mcmc_hist(draws, facet_args = list(nrow = length(vars))),
    mcmc_trace(draws, facet_args = list(nrow = length(vars))),
    widths = c(1, 1.5)
  )
}

hist_trace_plot(
  climbing_mod_fit, 
  paste0("route_difficulty[", unique(climbs_clean, by = "route_idx")[, route_idx] |> sample(5), "]")
)

hist_trace_plot(
  climbing_mod_fit, 
  paste0("user_ability[", unique(climbs_clean, by = "user_idx")[, user_idx] |> sample(5), "]")
)

Everything seems good.

4 Posterior Predictions

4.1 Posterior data

Getting our Posterior Predictions (subset of 500 draws per route) into long format:

draws <- (
  ## For each player, take a subsample of 500 draws from their posterior
  climbing_mod_fit$draws(variables = "route_difficulty")
  |> bind(x, subset_draws(x, "route_difficulty", regex = TRUE, draw = sample.int(ndraws(x), size = 500)))
  |> as.data.table()
  |> _[, .(route_difficulty = list(value)), by = variable
  ][, let(route_idx = as.integer(str_extract(variable, "\\d{1,4}")), variable = NULL)]
)

climbs_pp <- (climbs_clean[, .(route_idx, route_id, bouldering_grade, route_rating, climb_type)] 
  |> unique(by = "route_idx")
  |> _[draws, on = "route_idx", nomatch = NULL
  ][order(route_idx)]
)

data.table [4,288 x 6]

route_idx	route_id	bouldering_grade	route_rating	climb_type	route_difficulty
1	221__Be_awesome__boulder	V7	5.12a	1	<numeric [500]>
2	221__Black_Magic_Woman__boulder	V7	5.12a	1	<numeric [500]>
3	221__Doug_Reed_Roof__boulder	V5	5.11a	1	<numeric [500]>
4	221__Druid_Roof__boulder	V7	5.12a	1	<numeric [500]>
5	221__Dump_Arete__boulder	V4	5.10c	1	<numeric [500]>
6	221__Iceberg__boulder	V5	5.11a	1	<numeric [500]>
7	221__Instinct__boulder	V9	5.12d	1	<numeric [500]>
8	221__M1__boulder	V3	5.10a	1	<numeric [500]>
9	221__Ominous_Roof__boulder	V9	5.12d	1	<numeric [500]>
10	221__Senderella__boulder	V8	5.12c	1	<numeric [500]>
11	221__Sign_of_the_Times__boulder	V5	5.11a	1	<numeric [500]>
12	221__The_Iceberg__boulder	V5	5.11a	1	<numeric [500]>
13	221__West_Texas__boulder	V3	5.10a	1	<numeric [500]>
14	221__What_Up_Arete__boulder	V6	5.11d	1	<numeric [500]>
15	American_Fork__39__rope	V5	5.11b	0	<numeric [500]>
[ omitted 4,273 entries ]

Time difference of 1.683 secs

## Getting the draws into DuckDB
(climbing_mod_fit$draws(variables = "route_difficulty") 
  |> bind(x, subset_draws(x, "route_difficulty", regex = TRUE, draw = sample.int(ndraws(x), size = 500))) 
  |> as_draws_df()
  |> pivot_longer(everything(), names_to = "route_idx", names_pattern = ".*\\[(\\d{1,4})\\]")
  |> duckdb_register(con, "draws", df = _)
)

## Generating out Posterior Predictions data
(tbl(con, "climbs_clean")
  |> select(route_idx, route_id, bouldering_grade, route_rating, climb_type)
  |> distinct(route_idx, .keep_all = TRUE)
  |> inner_join(tbl(con, "draws"), by = "route_idx")
  |> summarize(
    .by = route_idx,
    across(c(bouldering_grade, route_rating, climb_type), first),
    route_difficulty = list(value)
  )
  |> arrange(route_idx)
  |> compute("climbs_pp", overwrite = TRUE)
)

data.frame [4,288 x 5]

route_idx	bouldering_grade	route_rating	climb_type	route_difficulty
1	V7	5.12a	1	<numeric [500]>
2	V7	5.12a	1	<numeric [500]>
3	V5	5.11a	1	<numeric [500]>
4	V7	5.12a	1	<numeric [500]>
5	V4	5.10c	1	<numeric [500]>
6	V5	5.11a	1	<numeric [500]>
7	V9	5.12d	1	<numeric [500]>
8	V3	5.10a	1	<numeric [500]>
9	V9	5.12d	1	<numeric [500]>
10	V8	5.12c	1	<numeric [500]>
11	V5	5.11a	1	<numeric [500]>
12	V5	5.11a	1	<numeric [500]>
13	V3	5.10a	1	<numeric [500]>
14	V6	5.11d	1	<numeric [500]>
15	V5	5.11b	0	<numeric [500]>
[ omitted 4,273 entries ]

Time difference of 0.686 secs

With dplyr, we can use the rvar format to encapsulate the samples from the model, which drastically reduces the size of the samples’ data.frame

(inner_join(
    as.data.frame(climbs_clean) |> 
      select(route_idx, route_id, bouldering_grade, route_rating, climb_type) |> 
      distinct(route_idx, .keep_all = TRUE),
    tidybayes::spread_rvars(climbing_mod_fit, route_difficulty[route_idx], ndraws = 500),
    by = "route_idx"
  )
  |> arrange(route_idx)
)

data.frame [4,288 x 6]

route_idx	route_id	bouldering_grade	route_rating	climb_type	route_difficulty
1	221__Be_awesome__boulder	V7	5.12a	1	-1.9498 ± 0.44
2	221__Black_Magic_Woman__boulder	V7	5.12a	1	0.9663 ± 0.41
3	221__Doug_Reed_Roof__boulder	V5	5.11a	1	-0.0081 ± 0.33
4	221__Druid_Roof__boulder	V7	5.12a	1	0.6663 ± 0.36
5	221__Dump_Arete__boulder	V4	5.10c	1	0.9178 ± 0.63
6	221__Iceberg__boulder	V5	5.11a	1	-0.4723 ± 0.47
7	221__Instinct__boulder	V9	5.12d	1	1.4382 ± 0.45
8	221__M1__boulder	V3	5.10a	1	-0.1295 ± 0.42
9	221__Ominous_Roof__boulder	V9	5.12d	1	2.0628 ± 0.53
10	221__Senderella__boulder	V8	5.12c	1	1.3378 ± 0.53
11	221__Sign_of_the_Times__boulder	V5	5.11a	1	0.4977 ± 0.44
12	221__The_Iceberg__boulder	V5	5.11a	1	0.3689 ± 0.52
13	221__West_Texas__boulder	V3	5.10a	1	-0.8065 ± 0.48
14	221__What_Up_Arete__boulder	V6	5.11d	1	0.8361 ± 0.43
15	American_Fork__39__rope	V5	5.11b	0	-1.1915 ± 0.38
[ omitted 4,273 entries ]

Time difference of 0.947 secs

4.2 Posterior plots

4.2.1 Ridgeline plots

ridgeline_plot

ridgeline_plot <- function(dat, var, title) {
  
  ## Unlisting the route_difficulties and making sure the route_ratings/bouldering_grades are ordered properly
  dat <- dat[
    , .(route_difficulty = unlist(route_difficulty)), by = setdiff(names(dat), 'route_difficulty')
  ][, let(
      bouldering_grade = factor(bouldering_grade, levels = bouldering_grades),
      route_rating = factor(route_rating, levels = route_ratings)
    )
  ]
  
  return(
    ggplot(dat, aes(route_difficulty, y = {{ var }}, fill = {{ var }}))
    + geom_ribbon(
      aes(
        fill = stage({{ var }}, after_scale = alpha(fill, 0.5)),
        ymin = after_stat(group),
        ymax = after_stat(group + ndensity * 1.6)
      ),
      stat = "density", outline.type = "upper", color = "grey30"
    )
    * ggblend::blend("multiply")
    + geom_vline(xintercept = 0, linetype = "dashed", color = "grey50")
    + labs(title = title, x = "Route Difficulty", y = "")
    + scale_y_discrete(position = "right")
    + theme(
      legend.position = "none", 
      axis.line.y = element_blank(),
      plot.title = element_text(hjust = 0.5)
    )
  )
}

Route Ratings:

climbs_pp[climb_type == 0] |> 
  ridgeline_plot(route_rating, "Climbing Route Posteriors")

Bouldering Grades:

climbs_pp[climb_type == 1 & bouldering_grade != "V0"] |> 
  ridgeline_plot(bouldering_grade, "Bouldering Problem Posteriors")

4.2.2 Strip plots

strip_plot

strip_plot <- function(dat, var, title) {
  
  strip_plot <- (dat 
    |> separate_wider_delim(route_id, names = c("crag", "route_name", NA), delim = "__")
    |> mutate(
      route_difficulty = map_dbl(route_difficulty, mean),
      bouldering_grade = factor(bouldering_grade, levels = bouldering_grades),
      route_rating = factor(route_rating, levels = route_ratings),
      across(c(crag, route_name), \(x) str_replace_all(x, "_", " "))
    )
    |> ggplot(aes(route_difficulty, y = {{ var }}, color = {{ var }}))
      + geom_point(
        aes(group = crag, linesize = route_name), # Adding unused aesthetics to get plotly's automated tooltips
        position = position_jitter(height = 0.2), alpha = 0.6
      )
      + labs(
        title = title,
        x = "Route Difficulty", 
        y = str_to_title(str_replace_all(deparse(substitute(var)), "_", " "))
      )
      + theme(legend.position = "none")
  )
  
  return(ggplotly(strip_plot, tooltip = c("group", "linesize")))
}

Route Ratings:

climbs_pp[climb_type == 0] |> 
  strip_plot(route_rating, "Climbing Route Difficulties")

Bouldering Grades:

climbs_pp[climb_type == 1 & bouldering_grade != "V0"] |> 
  strip_plot(bouldering_grade, "Bouldering Problem Difficulties")

💻 Expand for Session Info

─ Session info ───────────────────────────────────────────────────────────────
 setting        value
 version        R version 4.3.1 (2023-06-16)
 os             Ubuntu 22.04.3 LTS
 system         x86_64, linux-gnu
 ui             X11
 language       (EN)
 collate        C.UTF-8
 ctype          C.UTF-8
 tz             Europe/Paris
 date           2024-02-07
 pandoc         3.1.11
 Quarto         1.5.9
 Stan (CmdStan) 2.33.1

─ Packages ───────────────────────────────────────────────────────────────────
 ! package    * version date (UTC) lib source
 P archive    * 1.1.6   2023-09-18 [?] CRAN (R 4.3.1)
 P bayesplot  * 1.10.0  2022-11-16 [?] CRAN (R 4.3.0)
 P cmdstanr   * 0.6.1   2023-09-13 [?] local
 P crayon     * 1.5.2   2022-09-29 [?] CRAN (R 4.3.0)
 P data.table * 1.15.0  2024-01-30 [?] CRAN (R 4.3.1)
 P DBI        * 1.1.3   2022-06-18 [?] CRAN (R 4.3.0)
 P dbplyr     * 2.4.0   2023-10-26 [?] CRAN (R 4.3.1)
 P dplyr      * 1.1.4   2023-11-17 [?] CRAN (R 4.3.1)
 P duckdb     * 0.9.2-1 2023-11-28 [?] CRAN (R 4.3.1)
 P fs         * 1.6.3   2023-07-20 [?] CRAN (R 4.3.0)
 P ggplot2    * 3.4.4   2023-10-12 [?] CRAN (R 4.3.1)
 P gt         * 0.10.0  2023-10-07 [?] CRAN (R 4.3.1)
 P here       * 1.0.1   2020-12-13 [?] CRAN (R 4.3.0)
 P htmltools  * 0.5.6.1 2023-10-06 [?] CRAN (R 4.3.1)
 P knitr      * 1.44    2023-09-11 [?] CRAN (R 4.3.0)
 P lubridate  * 1.9.3   2023-09-27 [?] CRAN (R 4.3.1)
 P patchwork  * 1.1.3   2023-08-14 [?] CRAN (R 4.3.0)
 P pipebind   * 0.1.2   2023-08-30 [?] CRAN (R 4.3.0)
 P plotly     * 4.10.2  2023-06-03 [?] CRAN (R 4.3.0)
 P posterior  * 1.4.1   2023-03-14 [?] CRAN (R 4.3.0)
 P purrr      * 1.0.2   2023-08-10 [?] CRAN (R 4.3.0)
 P reactable  * 0.4.4   2023-03-12 [?] CRAN (R 4.3.0)
 P stringr    * 1.5.0   2022-12-02 [?] CRAN (R 4.3.0)
 P tidyr      * 1.3.0   2023-01-24 [?] CRAN (R 4.3.0)

 [1] /home/mar/Dev/Projects/R/ma-riviere.com/renv/library/R-4.3/x86_64-pc-linux-gnu
 [2] /home/mar/.cache/R/renv/sandbox/R-4.3/x86_64-pc-linux-gnu/9a444a72

 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────

Citation

BibTeX citation:

@online{rivière2022,
  author = {Rivière, Marc-Aurèle},
  title = {Bayesian {Rock} {Climbing} {Rankings}},
  date = {2022-04-19},
  url = {https://ma-riviere.com/content/code/posts/climbing},
  langid = {en},
  abstract = {This post is a transposition to R of Ethan Rosenthal’s
    {[}blog
    post{]}(https://www.ethanrosenthal.com/2022/04/15/bayesian-rock-climbing/)
    on modeling Rock Climbing route difficulty using a Bayesian IRT
    (Item Response Theory) model. The original Stan code was updated to
    use {[}within-chain
    parallelization{]}(https://mc-stan.org/docs/2\_30/stan-users-guide/reduce-sum.html)
    and {[}compiler
    optimization{]}(https://mc-stan.org/docs/2\_30/stan-users-guide/optimization.html)
    for faster CPU sampling. Several data processing solutions are
    showcased, using either `data.table` or `dbplyr` (with a `DuckDB`
    backend), with timings to compare their speed.}
}

For attribution, please cite this work as:

Rivière, M.-A. (2022, April 19). Bayesian Rock Climbing Rankings. https://ma-riviere.com/content/code/posts/climbing