Bayesian Rock Climbing Rankings

With R and Stan

Statistics
ML
Bayesian Modeling
Stan
R
First Published

April 19, 2022

Summary

This post is a transposition to R of Ethan Rosenthal’s blog post on modeling Rock Climbing route difficulty using a Bayesian IRT (Item Response Theory) model.

The original Stan code was updated to use within-chain parallelization and compiler optimization for faster CPU sampling.

Several data processing solutions are showcased, using either data.table or dbplyr (with a DuckDB backend), with timings to compare their speed.


Caution

By design, this post contains very few explanations.

Its goal was simply to translate Ethan’s blog post to R, and update his Stan code to use within-chain parallelization and compiler optimizations, for faster sampling.

Feel free to read the original blog post to better understand what the code is doing.

Tip

You can check the page’s source code by clicking on the </> Code button at the top-right.

Setup


library(here)        # Working directory management
library(fs)          # File & folder manipulation
library(pipebind)    # Piping goodies
library(archive)     # Memory efficient object storage

library(cmdstanr)    # Lightweight R interface for Stan
library(posterior)   # Wrangling Stan models' output

library(data.table)  # Fast data manipulation (in-RAM)
library(duckdb)      # DuckDB R interface

library(dplyr)       # Manipulating data.frames - core   (Tidyverse)
library(tidyr)       # Manipulating data.frames - extras (Tidyverse)
library(dbplyr)      # DB/SQL backend for dplyr/tidyr    (Tidyverse)
library(stringr)     # Manipulating strings              (Tidyverse)
library(purrr)       # Manipulating lists                (Tidyverse)
library(lubridate)   # Manipulating date/time            (Tidyverse)
library(ggplot2)     # Best plotting library             (Tidyverse)

library(plotly)      # Interactive plots
library(bayesplot)   # PPC/Diagnostic plots for Stan models
library(patchwork)   # Combining plots

options(
  mc.cores = max(1L, parallel::detectCores(logical = TRUE)),
  scipen = 999L, 
  digits = 4L,
  ggplot2.discrete.colour = \() scale_color_viridis_d(),
  ggplot2.discrete.fill = \() scale_fill_viridis_d()
)

nrows_print <- 10

setDTthreads(parallel::detectCores(logical = FALSE))
Adding custom knitr hooks
#---------------------------#
####🔺knitr custom hooks ####
#---------------------------#

library(knitr)

## Adding the `time_it` code chunk option
knitr::knit_hooks$set(time_it = local({
  assign("TIMES", list(), .GlobalEnv)
  start <- NULL
  function(before, options) {
    if (before) start <<- Sys.time()
    else TIMES[[options$label]] <<- difftime(Sys.time(), start)
  }
}))
Applying a custom theme to all ggplot objects, both light and dark versions
#---------------------------#
####🔺ggplot knit_prints ####
#---------------------------#

library(knitr)
library(ggplot2)

## Inspired by: https://debruine.github.io/quarto_demo/dark_mode.html
knit_print.ggplot <- function(x, options, ...) {
  if(any(grepl("patchwork", class(x)))) {
    plot_dark <- x & dark_addon_mar
    plot_light <- x & light_addon_mar
  } else {
    plot_dark <- x + dark_addon_mar
    plot_light <- x + light_addon_mar
  }
  
  cat('\n<div class="light-mode">\n')
  print(plot_light)
  cat('</div>\n')
  cat('<div class="dark-mode">\n')
  print(plot_dark)
  cat('</div>\n\n')
}

registerS3method("knit_print", "ggplot", knit_print.ggplot)
Applying a custom theme to all gt tables
#-----------------------#
####🔺gt knit_prints ####
#-----------------------#

library(knitr)
library(gt)

knit_print.grouped_df <- function(x, options, ...) {
  if ("grouped_df" %in% class(x)) x <- ungroup(x)
  
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

registerS3method("knit_print", "grouped_df", knit_print.grouped_df)

knit_print.data.frame <- function(x, options, ...) {
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

registerS3method("knit_print", "data.frame", knit_print.data.frame)
Functions for interactive data presentation
#----------------------------------#
####🔺knitr interactive display ####
#----------------------------------#

library(htmltools)
library(reactable)

## Getting list to display nicely in rendered documents
make_list_reactable <- function(list_dat) {
  
  list_name <- deparse(substitute(list_dat))
  
  get_list_elt_dim <- function(elt) {
    list_elt <- list_dat[[elt]]
    list_elt_dim <- if (any(c("data.frame", "matrix") %in% class(list_elt))) dim(list_elt) else length(list_elt)
    
    return(paste0(list_elt_dim, collapse = ", "))
  }
  
  dat <- data.frame(names(list_dat)) |> 
    set_names(list_name) |> 
    mutate(
      Type = unlist(pick(list_name)) |> map_chr(\(x) class(list_dat[[x]]) |> paste0(collapse = ", ")),
      Dimensions = unlist(pick(list_name)) |> map_chr(get_list_elt_dim)
    )
  
  get_list_details <- function(dat, idx, max_print = 200, max_digits = 3) {
    Element <- dat[[idx]]
    style <- "padding: 0.5rem"
    
    if (any(c("data.frame", "matrix") %in% class(Element))) {
      reactable(data.frame(Element), outlined = TRUE, striped = TRUE, highlight = TRUE, compact = TRUE) |> 
        htmltools::div(style = style) 
    }
    else if ("list" %in% class(Element)) 
      make_list_reactable(Element)
    else if (length(Element) > max_print) {
      htmltools::div(
        htmltools::p(head(Element, max_print) |> round(max_digits) |> paste0(collapse = ", ") |> paste("...", sep = ", ")),
        htmltools::p(stringr::str_glue("[ omitted {length(Element) - max_print} entries ]"), style = "font-style: italic"),
        style = style
      )
    }
    else htmltools::div(round(Element, max_digits) |> paste0(collapse = ", "), style = style)
  }
  
  reactable(
    dat
    , defaultColDef = colDef(vAlign = "center", headerVAlign = "center")
    , details = \(idx) get_list_details(list_dat, idx)
    , outlined = TRUE
    , striped = TRUE
    , highlight = TRUE
    , compact = FALSE
    , fullWidth = TRUE
    , defaultPageSize = 15
  )
}
Installing CmdStan
## Skip this step if CmdStan is already installed

cmdstanr::check_cmdstan_toolchain(fix = TRUE, quiet = TRUE)

cpp_opts <- list(
  stan_threads = TRUE
  , STAN_CPP_OPTIMS = TRUE
  , STAN_NO_RANGE_CHECKS = TRUE # WARN: remove this if you haven't tested the model
  , PRECOMPILED_HEADERS = TRUE
  # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
  , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
  , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
)

cmdstanr::install_cmdstan(cpp_options = cpp_opts, quiet = TRUE)
Loading CmdStan (if already installed)
highest_cmdstan_version <- dir_ls(config$cmdstan_path) |> 
  path_file() |> 
  keep(\(e) str_detect(e, "cmdstan-")) |> 
  bind(x, str_split(x, '-', simplify = TRUE)[,2]) |> 
  reduce(\(x, y) ifelse(utils::compareVersion(x, y) == 1, x, y))

set_cmdstan_path(str_glue("{config$cmdstan_path}cmdstan-{highest_cmdstan_version}"))
Setting up knitr’s engine for CmdStan
## Inspired by: https://mpopov.com/blog/2020/07/30/replacing-the-knitr-engine-for-stan/

## Note: We could haved use cmdstanr::register_knitr_engine(), 
##       but it wouldn't include compiler optimizations & multi-threading by default

knitr::knit_engines$set(
  cmdstan = function(options) {
    output_var <- options$output.var
    if (!is.character(output_var) || length(output_var) != 1L) {
      stop(
        "The chunk option output.var must be a character string ",
        "providing a name for the returned `CmdStanModel` object."
      )
    }
    if (options$eval) {
      if (options$cache) {
        cache_path <- options$cache.path
        if (length(cache_path) == 0L || is.na(cache_path) || cache_path == "NA") {
          cache_path <- ""
        }
        dir <- paste0(cache_path, options$label)
      } else {
        dir <- tempdir()
      }
      file <- cmdstanr::write_stan_file(options$code, dir = dir, force_overwrite = TRUE)
      mod <- cmdstanr::cmdstan_model(
        stan_file = file, 
        cpp_options = list(
          stan_threads = TRUE
          , STAN_CPP_OPTIMS = TRUE
          , STAN_NO_RANGE_CHECKS = TRUE # The model was already tested
          , PRECOMPILED_HEADERS = TRUE
          # , CXXFLAGS_OPTIM = "-march=native -mtune=native"
          , CXXFLAGS_OPTIM_TBB = "-mtune=native -march=native"
          , CXXFLAGS_OPTIM_SUNDIALS = "-mtune=native -march=native"
        ),
        stanc_options = list("O1"),
        force_recompile = TRUE
      )
      assign(output_var, mod, envir = knitr::knit_global())
    }
    options$engine <- "stan"
    code <- paste(options$code, collapse = "\n")
    knitr::engine_output(options, code, '')
  }
)

1 Data


1.1 Extracting the data

Connecting to the .sqlite DB (using DuckDB instead of SQLite):

con <- dbConnect(duckdb(), dbdir = ":memory:")

db_path <- here("res", "data", "climbers.sqlite")
INSTALL sqlite;
LOAD sqlite;
CALL sqlite_attach(?db_path);

dbplyr automatically translates dplyr/tidyr code into SQL !

(list(
    ## Table 1: ascent
    tbl(con, "ascent") 
    |> filter(country %like% "USA") 
    |> mutate(
        route_id = str_c(
          str_replace_all(crag, ' ', '_'), "__", 
          str_replace_all(name, ' ', '_'), "__", 
          if_else(climb_type == 1, 'boulder', 'rope')
        ),
        ascent_date = to_timestamp(date)
    ) 
    |> select(user_id, route_id, climb_type, grade_id, method_id, ascent_date),
    ## Table 2: grade
    tbl(con, "grade") |> select(grade_id = id, usa_routes, usa_boulders),
    ## Table 3: method
    tbl(con, "method") |> select(method_id = id, method_name = name)
  )
  |> reduce(left_join)
  |> select(-grade_id, -method_id)
  |> compute("climbs", overwrite = TRUE)
)

Time difference of 0.5235 secs

SELECT
  ascent.user_id
  , REPLACE(ascent.crag, ' ', '_')
      || '__' || REPLACE(ascent.name, ' ', '_')
      || '__' || CASE WHEN ascent.climb_type = 1 THEN 'boulder' ELSE 'rope' END
      AS route_id
  , ascent.climb_type as climb_type
  , to_timestamp(ascent.date) AS ascent_date
  , grade.usa_routes
  , grade.usa_boulders
  , method.name AS method_name
FROM ascent
JOIN grade ON grade.id = ascent.grade_id
JOIN method ON method.id = ascent.method_id
WHERE ascent.country = 'USA'

Time difference of 0.487 secs

data.frame [658,822 x 7]
user_id route_id climb_type ascent_date usa_routes usa_boulders method_name
13 733 Vantage__Black_Belt_Internet_Spray_Masters__rope 0 2010-05-07 22:00:00 5.10a V3 Flash
13 733 Vantage__Choreography__rope 0 2010-05-07 22:00:00 5.10b V3/4 Onsight
13 733 Vantage__Awareness_Drooling__rope 0 2010-05-07 22:00:00 5.10c V4 Onsight
13 733 Vantage__Corroding_Through_Nocturnal_Silk__rope 0 2010-05-07 22:00:00 5.8 V1 Onsight
13 733 Vantage__Cornivorous__rope 0 2010-05-08 22:00:00 5.9 V2 Flash
13 733 Vantage__Clipping_the_Cornea__rope 0 2010-05-08 22:00:00 5.7 V1 Flash
27 963 Bishop__Birthday_Direct__boulder 1 2010-05-11 22:00:00 5.10a V3 Redpoint
13 733 Vantage__Decortication_in_a_Clinical_Stupor__rope 0 2010-05-08 22:00:00 5.10b V3/4 Flash
13 733 Vantage__A_disease_called_life__rope 0 2010-05-08 22:00:00 5.9 V2 Flash
13 733 Vantage__Think_it_rhymes_with_luck__rope 0 2010-05-08 22:00:00 5.10a V3 Flash
13 733 Vantage__Oh_Coulee,_Oh_Coulee__rope 0 2010-05-07 22:00:00 5.8 V1 Onsight
27 963 Bishop__Nan_Bread__boulder 1 2010-05-11 22:00:00 5.10c V4 Onsight
27 963 Bishop__Prow__boulder 1 2010-05-11 22:00:00 5.9 V2 Flash
24 475 Area_74__Backseat_Spooning__boulder 1 2010-05-09 22:00:00 5.12d V9 Redpoint
18 656 Flagstaff__That_Flakes_It_SDS__boulder 1 2010-05-07 22:00:00 5.13a V10 Redpoint
[ omitted 658,807 entries ]

1.2 Processing the data

route_ratings <- c(
  str_c("5.", 1:9), 
  map(str_c("5.", 10:15), \(x) str_c(x, letters[1:4])) |> unlist()
)

bouldering_grades <- str_c("V", 0:20)

## Mode for non-numerical data
mode_cat <- function(x) {
  x <- sort(na.omit(as.character(x)))
  unique(x)[which.max(tabulate(match(x, unique(x))))]
}
threshold_ascents_dt
climbs <- setDT(climbs_df) |> setkey(route_id, user_id)

threshold_ascents_dt <- function(current, lim = 20) {
  new <- current[, if(.N >= lim) .SD, by = user_id][, if(.N >= lim) .SD, by = route_id]
  
  if (nrow(current) != nrow(new)) threshold_ascents_dt(new, lim)
  else return(new)
}
climbs_clean <- (
  climbs[
    order(user_id, route_id, ascent_date, usa_routes, usa_boulders, method_name), .SD[1], 
    by = .(user_id, route_id)
  ][, let(
      usa_boulders = factor(usa_boulders, levels = bouldering_grades),
      usa_routes = factor(usa_routes, levels = route_ratings),
      label = as.integer(method_name %chin% c("Onsight", "Flash"))
    )
  ][, let(route_rating = mode_cat(usa_routes), bouldering_grade = mode_cat(usa_boulders)), by = route_id
  ][, threshold_ascents_dt(.SD)
  ][, let(
      route_idx = frank(route_id, ties.method = "dense"),
      user_idx = frank(user_id, ties.method = "dense")
    )
  ][order(route_idx, user_idx)
    , .(route_idx, route_id, user_idx, climb_type, ascent_date, route_rating, bouldering_grade, label)
  ]
)
Time difference of 10.82 secs
climbs_first <- climbs[
    order(user_id, route_id, ascent_date, usa_routes, usa_boulders, method_name), .SD[1], 
    by = .(user_id, route_id)
  ]

climbs_clean <- (
  copy(climbs_first)[
    , threshold_ascents_dt(.SD)
  ][
    ## Replacing all route_ratings for a given route_id by its mode
    climbs_first[
        usa_boulders %in% bouldering_grades
      ][, .(bouldering_grade = mode_cat(usa_boulders)), by = route_id], 
    c("route_id", "bouldering_grade") := list(i.route_id, i.bouldering_grade), 
    on = "route_id"
  ][
    ## Replacing all bouldering_grades for a given route_id by its mode
    climbs_first[
        usa_routes %in% route_ratings
      ][, .(route_rating = mode_cat(usa_routes)), by = route_id], 
    c("route_id", "route_rating") := list(i.route_id, i.route_rating), 
    on = "route_id"
  ][, let(
      route_idx = frank(route_id, ties.method = "dense"),
      user_idx = frank(user_id, ties.method = "dense"),
      label = as.integer(method_name %chin% c("Onsight", "Flash"))
  )
  ][order(route_idx, user_idx)
    , .(route_idx, route_id, user_idx, climb_type, ascent_date, route_rating, bouldering_grade, label)
  ]
)
Time difference of 9.652 secs
threshold_ascents_dbp
threshold_ascents_dbp <- function(current, lim = 20) {
  new <- current |> filter(n() >= lim, .by = user_id) |> filter(n() >= lim, .by = route_id) |> collect()

  if (pull(count(current), n) != nrow(new)) threshold_ascents_dbp(new, lim)
  else {
    duckdb_register(con, "ascent_temp", new)
    return(tbl(con, "ascent_temp"))
  }
}
(tbl(con, "climbs")
  |> slice_min(
    tibble(ascent_date, usa_routes, usa_boulders, method_name), with_ties = FALSE, 
    by = c(user_id, route_id)
  )
  |> compute("climbs_first", overwrite = TRUE)
)
(tbl(con, "climbs_first")
  |> threshold_ascents_dbp()
  ## Replacing all route_ratings for a given route_id by its mode
  |> left_join(
    tbl(con, "climbs_first")
      |> filter(usa_routes %in% route_ratings)
      |> count(route_id, usa_routes)
      |> slice_max(n, by = route_id)
      |> summarize(route_rating = min(usa_routes), .by = route_id),
    by = "route_id"
  )
  ## Replacing all bouldering_grades for a given route_id by its mode
  |> left_join(
    tbl(con, "climbs_first")
      |> filter(usa_boulders %in% bouldering_grades)
      |> count(route_id, usa_boulders)
      |> slice_max(n, by = route_id)
      |> summarize(bouldering_grade = min(usa_boulders), .by = route_id),
    by = "route_id"
  )
  |> mutate(
    route_idx = dense_rank(route_id), 
    user_idx = dense_rank(user_id),
    label = as.integer(method_name %in% c("Onsight", "Flash"))
  )
  |> select(route_idx, route_id, user_idx, climb_type, ascent_date, route_rating, bouldering_grade, label)
  |> arrange(route_idx, user_idx)
  |> compute("climbs_clean", overwrite = TRUE)
)
Time difference of 1.78 secs
data.table [232,887 x 8]
route_idx route_id user_idx climb_type ascent_date route_rating bouldering_grade label
1 221__Be_awesome__boulder 1 386 1 2017-06-09 22:00:00 5.12a V7 0
1 221__Be_awesome__boulder 1 601 1 2015-03-18 23:00:00 5.12a V7 1
1 221__Be_awesome__boulder 1 832 1 2013-06-29 22:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 032 1 2013-12-26 23:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 075 1 2013-06-21 22:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 110 1 2013-07-16 22:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 134 1 2013-10-23 22:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 149 1 2013-06-21 22:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 157 1 2013-07-11 22:00:00 5.12a V7 0
1 221__Be_awesome__boulder 2 170 1 2014-05-26 22:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 177 1 2013-06-28 22:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 234 1 2013-08-02 22:00:00 5.12a V7 1
1 221__Be_awesome__boulder 2 236 1 2013-10-20 22:00:00 5.12a V7 0
1 221__Be_awesome__boulder 2 261 1 2012-09-23 22:00:00 5.12a V7 0
1 221__Be_awesome__boulder 2 338 1 2013-06-21 22:00:00 5.12a V7 1
[ omitted 232,872 entries ]
all.equal(
  collect(tbl(con, "climbs_clean")), 
  climbs_clean, 
  check.attributes = FALSE
)
[1] TRUE

2 Model


2.1 Stan code

Updated Stan code using within-chain parallelization

climbing_model
functions {
  array[] int sequence(int start, int end) {
    array[end - start + 1] int seq;
    for (n in 1 : num_elements(seq)) {
      seq[n] = n + start - 1;
    }
    return seq;
  }

  // Compute partial sums of the log-likelihood
  real partial_log_lik_lpmf(array[] int seq, int start, int end,
                            data array[] int labels, real mean_ability,
                            data array[] int users, vector user_ability,
                            data array[] int routes, vector route_difficulty) {
    real ptarget = 0;
    int N = end - start + 1;

    vector[N] mu = mean_ability + rep_vector(0.0, N);
    for (n in 1 : N) {
      int nn = n + start - 1;
      mu[n] += user_ability[users[nn]] - route_difficulty[routes[nn]];
    }
    ptarget += bernoulli_logit_lpmf(labels[start : end] | mu);
    return ptarget;
  }
}

data {
  int<lower=1> num_ascents;
  int<lower=1> num_users;
  int<lower=1> num_routes;
  array[num_ascents] int<lower=1, upper=num_users> users;
  array[num_ascents] int<lower=1, upper=num_routes> routes;
  array[num_ascents] int<lower=0, upper=1> labels;

  int grainsize;
}

transformed data {
  array[num_ascents] int seq = sequence(1, num_ascents);
}

parameters {
  real mean_ability;
  vector[num_users] user_ability;
  vector[num_routes] route_difficulty;
}

model {
  user_ability ~ std_normal();
  route_difficulty ~ std_normal();
  mean_ability ~ std_normal();

  target += reduce_sum(
    partial_log_lik_lpmf, seq, grainsize, 
    labels, mean_ability, users, user_ability, routes, route_difficulty
  );
}

2.2 Stan data

climbing_stan_data <- list(
  num_ascents = nrow(climbs_clean),
  num_users = n_distinct(climbs_clean$user_id),
  num_routes = n_distinct(climbs_clean$route_id),
  routes = pull(climbs_clean, route_idx),
  users = pull(climbs_clean, user_idx),
  labels = pull(climbs_clean, label) |> as.integer(),
  grainsize = max(100, round(nrow(climbs_clean) / 50))
)

2.3 Model fit

climbing_mod_fit <- climbing_model$sample(
  data = climbing_stan_data, seed = 666,
  iter_warmup = 500, iter_sampling = 1000, refresh = 0,
  chains = 6, parallel_chains = 6, threads_per_chain = 5
)
Note

Sampling takes ~4.81 minutes on my CPU (Ryzen 5950X, 16 Cores/32 Threads), on WSL2 (Ubuntu 22)

data.table [6 x 2]
Chain Time
1 282.858s (~4.71 minutes)
2 294.551s (~4.91 minutes)
3 293.979s (~4.9 minutes)
4 288.381s (~4.81 minutes)
5 285.559s (~4.76 minutes)
6 285.031s (~4.75 minutes)

3 Model diagnostics


mcmc_neff_hist(neff_ratio(climbing_mod_fit))
mcmc_rhat_hist(rhat(climbing_mod_fit))

Plotting random subsets of the traces:

hist_trace_plot
hist_trace_plot <- function(mod, vars) {
  draws <- mod$draws(variables = vars, format = "draws_list")
  wrap_plots(
    mcmc_hist(draws, facet_args = list(nrow = length(vars))),
    mcmc_trace(draws, facet_args = list(nrow = length(vars))),
    widths = c(1, 1.5)
  )
}
hist_trace_plot(
  climbing_mod_fit, 
  paste0("route_difficulty[", unique(climbs_clean, by = "route_idx")[, route_idx] |> sample(5), "]")
)
hist_trace_plot(
  climbing_mod_fit, 
  paste0("user_ability[", unique(climbs_clean, by = "user_idx")[, user_idx] |> sample(5), "]")
)

Everything seems good.

4 Posterior Predictions


4.1 Posterior data

Getting our Posterior Predictions (subset of 500 draws per route) into long format:

draws <- (
  ## For each player, take a subsample of 500 draws from their posterior
  climbing_mod_fit$draws(variables = "route_difficulty")
  |> bind(x, subset_draws(x, "route_difficulty", regex = TRUE, draw = sample.int(ndraws(x), size = 500)))
  |> as.data.table()
  |> _[, .(route_difficulty = list(value)), by = variable
  ][, let(route_idx = as.integer(str_extract(variable, "\\d{1,4}")), variable = NULL)]
)

climbs_pp <- (climbs_clean[, .(route_idx, route_id, bouldering_grade, route_rating, climb_type)] 
  |> unique(by = "route_idx")
  |> _[draws, on = "route_idx", nomatch = NULL
  ][order(route_idx)]
)
data.table [4,288 x 6]
route_idx route_id bouldering_grade route_rating climb_type route_difficulty
1 221__Be_awesome__boulder V7 5.12a 1 <numeric [500]>
2 221__Black_Magic_Woman__boulder V7 5.12a 1 <numeric [500]>
3 221__Doug_Reed_Roof__boulder V5 5.11a 1 <numeric [500]>
4 221__Druid_Roof__boulder V7 5.12a 1 <numeric [500]>
5 221__Dump_Arete__boulder V4 5.10c 1 <numeric [500]>
6 221__Iceberg__boulder V5 5.11a 1 <numeric [500]>
7 221__Instinct__boulder V9 5.12d 1 <numeric [500]>
8 221__M1__boulder V3 5.10a 1 <numeric [500]>
9 221__Ominous_Roof__boulder V9 5.12d 1 <numeric [500]>
10 221__Senderella__boulder V8 5.12c 1 <numeric [500]>
11 221__Sign_of_the_Times__boulder V5 5.11a 1 <numeric [500]>
12 221__The_Iceberg__boulder V5 5.11a 1 <numeric [500]>
13 221__West_Texas__boulder V3 5.10a 1 <numeric [500]>
14 221__What_Up_Arete__boulder V6 5.11d 1 <numeric [500]>
15 American_Fork__39__rope V5 5.11b 0 <numeric [500]>
[ omitted 4,273 entries ]

Time difference of 1.683 secs

## Getting the draws into DuckDB
(climbing_mod_fit$draws(variables = "route_difficulty") 
  |> bind(x, subset_draws(x, "route_difficulty", regex = TRUE, draw = sample.int(ndraws(x), size = 500))) 
  |> as_draws_df()
  |> pivot_longer(everything(), names_to = "route_idx", names_pattern = ".*\\[(\\d{1,4})\\]")
  |> duckdb_register(con, "draws", df = _)
)

## Generating out Posterior Predictions data
(tbl(con, "climbs_clean")
  |> select(route_idx, route_id, bouldering_grade, route_rating, climb_type)
  |> distinct(route_idx, .keep_all = TRUE)
  |> inner_join(tbl(con, "draws"), by = "route_idx")
  |> summarize(
    .by = route_idx,
    across(c(bouldering_grade, route_rating, climb_type), first),
    route_difficulty = list(value)
  )
  |> arrange(route_idx)
  |> compute("climbs_pp", overwrite = TRUE)
)
data.frame [4,288 x 5]
route_idx bouldering_grade route_rating climb_type route_difficulty
1 V7 5.12a 1 <numeric [500]>
2 V7 5.12a 1 <numeric [500]>
3 V5 5.11a 1 <numeric [500]>
4 V7 5.12a 1 <numeric [500]>
5 V4 5.10c 1 <numeric [500]>
6 V5 5.11a 1 <numeric [500]>
7 V9 5.12d 1 <numeric [500]>
8 V3 5.10a 1 <numeric [500]>
9 V9 5.12d 1 <numeric [500]>
10 V8 5.12c 1 <numeric [500]>
11 V5 5.11a 1 <numeric [500]>
12 V5 5.11a 1 <numeric [500]>
13 V3 5.10a 1 <numeric [500]>
14 V6 5.11d 1 <numeric [500]>
15 V5 5.11b 0 <numeric [500]>
[ omitted 4,273 entries ]

Time difference of 0.686 secs

With dplyr, we can use the rvar format to encapsulate the samples from the model, which drastically reduces the size of the samples’ data.frame

(inner_join(
    as.data.frame(climbs_clean) |> 
      select(route_idx, route_id, bouldering_grade, route_rating, climb_type) |> 
      distinct(route_idx, .keep_all = TRUE),
    tidybayes::spread_rvars(climbing_mod_fit, route_difficulty[route_idx], ndraws = 500),
    by = "route_idx"
  )
  |> arrange(route_idx)
)
data.frame [4,288 x 6]
route_idx route_id bouldering_grade route_rating climb_type route_difficulty
1 221__Be_awesome__boulder V7 5.12a 1 -1.9498 ± 0.44
2 221__Black_Magic_Woman__boulder V7 5.12a 1 0.9663 ± 0.41
3 221__Doug_Reed_Roof__boulder V5 5.11a 1 -0.0081 ± 0.33
4 221__Druid_Roof__boulder V7 5.12a 1 0.6663 ± 0.36
5 221__Dump_Arete__boulder V4 5.10c 1 0.9178 ± 0.63
6 221__Iceberg__boulder V5 5.11a 1 -0.4723 ± 0.47
7 221__Instinct__boulder V9 5.12d 1 1.4382 ± 0.45
8 221__M1__boulder V3 5.10a 1 -0.1295 ± 0.42
9 221__Ominous_Roof__boulder V9 5.12d 1 2.0628 ± 0.53
10 221__Senderella__boulder V8 5.12c 1 1.3378 ± 0.53
11 221__Sign_of_the_Times__boulder V5 5.11a 1 0.4977 ± 0.44
12 221__The_Iceberg__boulder V5 5.11a 1 0.3689 ± 0.52
13 221__West_Texas__boulder V3 5.10a 1 -0.8065 ± 0.48
14 221__What_Up_Arete__boulder V6 5.11d 1 0.8361 ± 0.43
15 American_Fork__39__rope V5 5.11b 0 -1.1915 ± 0.38
[ omitted 4,273 entries ]

Time difference of 0.947 secs

4.2 Posterior plots

4.2.1 Ridgeline plots

ridgeline_plot
ridgeline_plot <- function(dat, var, title) {
  
  ## Unlisting the route_difficulties and making sure the route_ratings/bouldering_grades are ordered properly
  dat <- dat[
    , .(route_difficulty = unlist(route_difficulty)), by = setdiff(names(dat), 'route_difficulty')
  ][, let(
      bouldering_grade = factor(bouldering_grade, levels = bouldering_grades),
      route_rating = factor(route_rating, levels = route_ratings)
    )
  ]
  
  return(
    ggplot(dat, aes(route_difficulty, y = {{ var }}, fill = {{ var }}))
    + geom_ribbon(
      aes(
        fill = stage({{ var }}, after_scale = alpha(fill, 0.5)),
        ymin = after_stat(group),
        ymax = after_stat(group + ndensity * 1.6)
      ),
      stat = "density", outline.type = "upper", color = "grey30"
    )
    * ggblend::blend("multiply")
    + geom_vline(xintercept = 0, linetype = "dashed", color = "grey50")
    + labs(title = title, x = "Route Difficulty", y = "")
    + scale_y_discrete(position = "right")
    + theme(
      legend.position = "none", 
      axis.line.y = element_blank(),
      plot.title = element_text(hjust = 0.5)
    )
  )
}

Route Ratings:

climbs_pp[climb_type == 0] |> 
  ridgeline_plot(route_rating, "Climbing Route Posteriors")

Bouldering Grades:

climbs_pp[climb_type == 1 & bouldering_grade != "V0"] |> 
  ridgeline_plot(bouldering_grade, "Bouldering Problem Posteriors")

4.2.2 Strip plots

strip_plot
strip_plot <- function(dat, var, title) {
  
  strip_plot <- (dat 
    |> separate_wider_delim(route_id, names = c("crag", "route_name", NA), delim = "__")
    |> mutate(
      route_difficulty = map_dbl(route_difficulty, mean),
      bouldering_grade = factor(bouldering_grade, levels = bouldering_grades),
      route_rating = factor(route_rating, levels = route_ratings),
      across(c(crag, route_name), \(x) str_replace_all(x, "_", " "))
    )
    |> ggplot(aes(route_difficulty, y = {{ var }}, color = {{ var }}))
      + geom_point(
        aes(group = crag, linesize = route_name), # Adding unused aesthetics to get plotly's automated tooltips
        position = position_jitter(height = 0.2), alpha = 0.6
      )
      + labs(
        title = title,
        x = "Route Difficulty", 
        y = str_to_title(str_replace_all(deparse(substitute(var)), "_", " "))
      )
      + theme(legend.position = "none")
  )
  
  return(ggplotly(strip_plot, tooltip = c("group", "linesize")))
}

Route Ratings:

climbs_pp[climb_type == 0] |> 
  strip_plot(route_rating, "Climbing Route Difficulties")

Bouldering Grades:

climbs_pp[climb_type == 1 & bouldering_grade != "V0"] |> 
  strip_plot(bouldering_grade, "Bouldering Problem Difficulties")

─ Session info ───────────────────────────────────────────────────────────────
 setting        value
 version        R version 4.3.1 (2023-06-16)
 os             Ubuntu 22.04.3 LTS
 system         x86_64, linux-gnu
 ui             X11
 language       (EN)
 collate        C.UTF-8
 ctype          C.UTF-8
 tz             Europe/Paris
 date           2024-02-07
 pandoc         3.1.11
 Quarto         1.5.9
 Stan (CmdStan) 2.33.1

─ Packages ───────────────────────────────────────────────────────────────────
 ! package    * version date (UTC) lib source
 P archive    * 1.1.6   2023-09-18 [?] CRAN (R 4.3.1)
 P bayesplot  * 1.10.0  2022-11-16 [?] CRAN (R 4.3.0)
 P cmdstanr   * 0.6.1   2023-09-13 [?] local
 P crayon     * 1.5.2   2022-09-29 [?] CRAN (R 4.3.0)
 P data.table * 1.15.0  2024-01-30 [?] CRAN (R 4.3.1)
 P DBI        * 1.1.3   2022-06-18 [?] CRAN (R 4.3.0)
 P dbplyr     * 2.4.0   2023-10-26 [?] CRAN (R 4.3.1)
 P dplyr      * 1.1.4   2023-11-17 [?] CRAN (R 4.3.1)
 P duckdb     * 0.9.2-1 2023-11-28 [?] CRAN (R 4.3.1)
 P fs         * 1.6.3   2023-07-20 [?] CRAN (R 4.3.0)
 P ggplot2    * 3.4.4   2023-10-12 [?] CRAN (R 4.3.1)
 P gt         * 0.10.0  2023-10-07 [?] CRAN (R 4.3.1)
 P here       * 1.0.1   2020-12-13 [?] CRAN (R 4.3.0)
 P htmltools  * 0.5.6.1 2023-10-06 [?] CRAN (R 4.3.1)
 P knitr      * 1.44    2023-09-11 [?] CRAN (R 4.3.0)
 P lubridate  * 1.9.3   2023-09-27 [?] CRAN (R 4.3.1)
 P patchwork  * 1.1.3   2023-08-14 [?] CRAN (R 4.3.0)
 P pipebind   * 0.1.2   2023-08-30 [?] CRAN (R 4.3.0)
 P plotly     * 4.10.2  2023-06-03 [?] CRAN (R 4.3.0)
 P posterior  * 1.4.1   2023-03-14 [?] CRAN (R 4.3.0)
 P purrr      * 1.0.2   2023-08-10 [?] CRAN (R 4.3.0)
 P reactable  * 0.4.4   2023-03-12 [?] CRAN (R 4.3.0)
 P stringr    * 1.5.0   2022-12-02 [?] CRAN (R 4.3.0)
 P tidyr      * 1.3.0   2023-01-24 [?] CRAN (R 4.3.0)

 [1] /home/mar/Dev/Projects/R/ma-riviere.com/renv/library/R-4.3/x86_64-pc-linux-gnu
 [2] /home/mar/.cache/R/renv/sandbox/R-4.3/x86_64-pc-linux-gnu/9a444a72

 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────
Back to top

Citation

BibTeX citation:
@online{rivière2022,
  author = {Rivière, Marc-Aurèle},
  title = {Bayesian {Rock} {Climbing} {Rankings}},
  date = {2022-04-19},
  url = {https://ma-riviere.com/content/code/posts/climbing},
  langid = {en},
  abstract = {This post is a transposition to R of Ethan Rosenthal’s
    {[}blog
    post{]}(https://www.ethanrosenthal.com/2022/04/15/bayesian-rock-climbing/)
    on modeling Rock Climbing route difficulty using a Bayesian IRT
    (Item Response Theory) model. The original Stan code was updated to
    use {[}within-chain
    parallelization{]}(https://mc-stan.org/docs/2\_30/stan-users-guide/reduce-sum.html)
    and {[}compiler
    optimization{]}(https://mc-stan.org/docs/2\_30/stan-users-guide/optimization.html)
    for faster CPU sampling. Several data processing solutions are
    showcased, using either `data.table` or `dbplyr` (with a `DuckDB`
    backend), with timings to compare their speed.}
}
For attribution, please cite this work as:
Rivière, M.-A. (2022, April 19). Bayesian Rock Climbing Rankings. https://ma-riviere.com/content/code/posts/climbing