Fast spatial data matching in R

Exploring different solutions to match locations based on their geographical distance

Categories: Big Data, Data Manipulation, GIS, R, SQL, DuckDB, Spatial

First Published: June 18, 2022

Summary

This post showcases various solutions to efficiently match unknown locations to known ones by their geographical proximity (using lat/long coordinates), on a dataset with millions of entries.


Tip

You can check the source code by clicking on the </> Code button at the top-right.

The data is a list of bike rides from the Divvy bike-sharing service in Chicago, and the stations are the bike stations where the bikes can be picked up and dropped off. This post was prompted by a question on Reddit: how could one match the many unknown stations within the rides data (i.e. rides’ start or end stations missing their id and name) to known stations, by their geographical proximity.

This post is related to a case study of Google’s Data Analytics certificate, and the data can be found here (June 2021 to May 2022).

v1: 2022-06-18

v2: 2022-10-29

  • Added a DuckDB + arrow example for the data loading.
  • Added data cleaning section (removing implausible rides).
  • Added sf, dbplyr and data.table (manual) examples for the spatial join.

v3: 2023-03-12

  • Updated code to newest tidyverse version (.by, reframe, list_rbind, join_by, …)

v4: 2023-09-27

  • Updated the SQL/dbplyr examples with DuckDB v0.8.1:
    • Using the new PIVOT/UNPIVOT SQL statements
    • Using the new spatial DuckDB extension for spatial operations
  • Updated the data.table examples:
    • Using both sf and manual distance calculations (haversine) for spatial operations
    • Removed the dtplyr examples
  • Improved the document’s structure & explanations

Setup


library(here)        # Working directory management
library(fs)          # File & folder manipulation
library(pipebind)    # Piping goodies

library(readr)       # Reading data from files           (Tidyverse)
library(dplyr)       # Manipulating data.frames - core   (Tidyverse)
library(tidyr)       # Manipulating data.frames - extras (Tidyverse)
library(stringr)     # Manipulating strings              (Tidyverse)
library(purrr)       # Manipulating lists                (Tidyverse)
library(lubridate)   # Manipulating date/time            (Tidyverse)
library(ggplot2)     # Best plotting library             (Tidyverse)

library(dbplyr)      # SQL back-end for dplyr            (Tidyverse)
library(duckdb)      # Quack Stack

library(arrow)       # Fast and efficient data reading
library(data.table)  # Fast data manipulation (in-RAM)

library(sf)          # Spatial data manipulation
library(fuzzyjoin)   # Non-equi joins & coordinates-based joins

library(leaflet)     # Interactive map plots

options(
  dplyr.strict_sql = FALSE,
  scipen = 999L, 
  digits = 4L
)

nrows_print <- 10

data.table::setDTthreads(parallel::detectCores(logical = FALSE))
Adding custom knitr hooks
#---------------------------#
####🔺knitr custom hooks ####
#---------------------------#

library(knitr)

## Adding the `time_it` code chunk option
knitr::knit_hooks$set(time_it = local({
  assign("TIMES", list(), .GlobalEnv)
  start <- NULL
  function(before, options) {
    if (before) start <<- Sys.time()
    else TIMES[[options$label]] <<- difftime(Sys.time(), start)
  }
}))
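## Usage: set the chunk option `time_it = TRUE`; the hook then stores the chunk's
## elapsed time in TIMES[[chunk label]] (rendered as the "Time difference of …" lines below)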
Applying a custom theme to all gt tables
#-----------------------#
####🔺gt knit_prints ####
#-----------------------#

library(knitr)
library(gt)

knit_print.data.frame <- function(x, options, ...) {
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

## grouped_df tables are simply ungrouped, then printed like any other data.frame
knit_print.grouped_df <- function(x, options, ...) knit_print.data.frame(ungroup(x), options, ...)

registerS3method("knit_print", "data.frame", knit_print.data.frame)
registerS3method("knit_print", "grouped_df", knit_print.grouped_df)

1 Loading the data


data_path <- here("res", "data", "stations")

files <- dir_ls(data_path, glob = "*.csv")
rides <- lapply(files, \(f) fread(f, na.strings = "")) |> rbindlist()
Time difference of 11.95 secs
rides_tidy <- read_csv(files, show_col_types = FALSE)
Time difference of 5.122 secs
Alternatives
map(files, \(f) read_csv(f, show_col_types = FALSE)) |> list_rbind()

map_dfr(files, \(f) read_csv(f, show_col_types = FALSE))
con <- duckdb::dbConnect(
  duckdb(), 
  dbdir = ":memory:", 
  config = list(memory_limit = "30GB", temp_directory = here("_temp"), threads = 16L)
)
duckdb_read_csv(con, "rides", files)
Time difference of 3.219 secs
Importing an existing df from the R env
dbWriteTable(con, "rides", rides_tidy) # As a TABLE

copy_to(con, rides_tidy, "rides", indexes = "ride_id")

duckdb_register(con, "rides", rides_tidy) # As a VIEW
Importing while including the file names in the data
## Courtesy of Tristan Mahr (@tjmahr)

withr::with_dir(
  data_path, 
  dbSendQuery(con,"CREATE TABLE rides AS SELECT * FROM read_csv_auto('*.csv', FILENAME = TRUE)")
)
open_csv_dataset(data_path, convert_options = CsvConvertOptions$create(strings_can_be_null = TRUE)) |> 
  to_duckdb(con, "rides_arrow")
Time difference of 0.2388 secs

Output:

data.table [5,860,776 x 13]
ride_id rideable_type started_at ended_at start_station_name start_station_id end_station_name end_station_id start_lat start_lng end_lat end_lng member_casual
99FEC93BA843FB20 electric_bike 2021-06-13 14:31:28 2021-06-13 14:34:11 NA NA NA NA 41.8 −87.59 41.8 −87.6 member
06048DCFC8520CAF electric_bike 2021-06-04 11:18:02 2021-06-04 11:24:19 NA NA NA NA 41.79 −87.59 41.8 −87.6 member
9598066F68045DF2 electric_bike 2021-06-04 09:49:35 2021-06-04 09:55:34 NA NA NA NA 41.8 −87.6 41.79 −87.59 member
B03C0FE48C412214 electric_bike 2021-06-03 19:56:05 2021-06-03 20:21:55 NA NA NA NA 41.78 −87.58 41.8 −87.6 member
B9EEA89F8FEE73B7 electric_bike 2021-06-04 14:05:51 2021-06-04 14:09:59 NA NA NA NA 41.8 −87.59 41.79 −87.59 member
62B943CEAAA420BA electric_bike 2021-06-03 19:32:01 2021-06-03 19:38:46 NA NA NA NA 41.78 −87.58 41.78 −87.58 member
7E2546FBA79C46EE electric_bike 2021-06-10 16:30:10 2021-06-10 16:36:21 NA NA NA NA 41.79 −87.6 41.79 −87.59 member
3DDF3BBF6C4C3C89 electric_bike 2021-06-10 17:00:30 2021-06-10 17:06:48 NA NA NA NA 41.79 −87.59 41.8 −87.59 member
2608805637155AB6 electric_bike 2021-06-10 12:46:16 2021-06-10 12:55:02 NA NA NA NA 41.93 −87.67 41.94 −87.68 member
AF529C946F28ED42 electric_bike 2021-06-23 17:57:29 2021-06-23 18:06:40 NA NA Michigan Ave & Oak St 13042 41.88 −87.61 41.901 −87.624 member
E6010941FB92E4A6 electric_bike 2021-06-22 19:28:02 2021-06-22 19:39:48 NA NA NA NA 41.87 −87.62 41.87 −87.64 member
1149C0723F7AFFD5 electric_bike 2021-06-29 17:35:49 2021-06-29 17:55:11 NA NA NA NA 41.9 −87.63 41.9 −87.68 member
8762DB62099E6011 electric_bike 2021-06-05 14:55:05 2021-06-05 15:13:29 NA NA NA NA 41.89 −87.62 41.88 −87.62 member
BE3AC77CBFF17E6A electric_bike 2021-06-05 14:05:00 2021-06-05 14:09:01 NA NA NA NA 41.89 −87.62 41.89 −87.62 member
8E9F2CB0893B96A0 electric_bike 2021-06-05 13:39:04 2021-06-05 13:57:21 NA NA NA NA 41.88 −87.62 41.89 −87.62 member
[ omitted 5,860,761 entries ]
✦ For each step of the process, there will be two solutions:
  1. An In-RAM solution using data.table to manipulate the data. Spatial operations will be done with both precise (using the sf package) and approximate (e.g. the haversine distance) methods.

  2. An Out-of-RAM solution using DuckDB to manipulate the data. Spatial operations will be shown using both precise (using the spatial DuckDB extension) and approximate (e.g. haversine distance) methods. Code examples will be provided in both SQL and dbplyr (which translates Tidyverse code to SQL behind the scenes).

2 Cleaning & reshaping the data


2.1 Removing rides with missing coordinates

First, since our ultimate goal is to use geographic information (lat/lng coordinates) to identify stations, we’ll get rid of rides that have missing geographic information:

rides <- rides[!is.na(start_lat) & !is.na(start_lng) & !is.na(end_lat) & !is.na(end_lng), ]
(tbl(con, "rides")
  |> mutate(across(ends_with("_at"), \(x) strptime(x, "%Y-%m-%d %H:%M:%S"))) # String -> timestamp
  |> filter(if_all(matches("_lat$|_lng$"), \(x) !is.na(x)))
  |> copy_to(con, df = _, "rides", overwrite = TRUE)
)

2.2 Removing implausible rides

Then, we are going to filter out rides that are implausible (i.e. too short, too long, completed too quickly, etc.). This requires computing distances (and speeds), which we can do in one of two ways:

  1. An accurate way, where we turn our lat/lng coordinates into 2D points on a projected coordinate system, i.e. a flattened, distortion-corrected representation of (a portion of) the globe, on which areas and distances can be computed in meters.

  2. A quick and dirty way, using the haversine function to compute the “distance on a sphere” between the two sets of coordinates.

N.B.: The distance/time/speed thresholds I use here are arbitrary. Choose what makes sense based on your domain knowledge.
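To make the difference concrete, here is a minimal sketch (with made-up coordinates near Chicago; the exact values are purely illustrative) computing a single ride’s distance both ways:

p1 <- c(-87.63, 41.88)  # lng/lat of a hypothetical start
p2 <- c(-87.62, 41.89)  # lng/lat of a hypothetical end

## 1) Accurate: project both points to EPSG:5069 (units in meters), then take the planar distance
pts <- st_sfc(st_point(p1), st_point(p2), crs = 4326) |> st_transform(5069)
st_distance(pts[1], pts[2])        # ~1,390 m

## 2) Quick & dirty: great-circle distance on a sphere
geosphere::distHaversine(p1, p2)   # also ~1,390 m; the two agree to within a few meters here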

Precise distance with sf::st_distance:

First, let’s create a new dataset with a POINT geometry for both start and end stations’ positions:

rides_geom <- (
  rides
  |> st_as_sf(coords = c("start_lng", "start_lat"), remove = FALSE, crs = 4326)
  |> st_set_geometry("start_pos")
  |> mutate(end_pos = map2(end_lng, end_lat, \(x, y) st_point(c(x, y))) |> st_sfc(crs = 4326))
  |> as.data.table()
)

Then, use those two points to compute an accurate distance between each ride’s start_pos and end_pos, and use that distance to filter implausible rides:

rides_clean_geom <- (
  rides_geom[
    , ride_dist_m := st_distance(st_transform(start_pos, 5069), st_transform(end_pos, 5069), by_element = TRUE)
  ## Keep rides between two different stations, or whose distance is plausible (100 m to 30 km)
  ][start_station_id != end_station_id | as.numeric(ride_dist_m) %between% c(100, 30000)
  ## Add ride duration and speed info
  ][, ride_dur_min := difftime(ended_at, started_at, units = "mins") |> abs() |> as.numeric()
  ][, ride_speed_km_h := fifelse(ride_dur_min == 0, NA, (as.numeric(ride_dist_m) / 1000) / (ride_dur_min / 60))
  ## Remove rides that are too brief (< 5 min) or too long (> 1 day)
  ][ride_dur_min %between% c(5, 1440)
  ## Remove rides that have implausible speeds (> 40 km/h for regular bikes, and 60 km/h for electric ones)
  ][ (rideable_type == "classic_bike" & ride_speed_km_h <= 40)
     | (rideable_type == "electric_bike" & ride_speed_km_h <= 60)
  ## Removing the new columns now that the cleaning is done
  ][, let(ride_dist_m = NULL, ride_dur_min = NULL, ride_speed_km_h = NULL)]
)
Time difference of 2.779 mins

Approximate distance with the haversine:

The haversine function computes the distance between two points on a sphere from their coordinates. But the Earth isn’t a sphere, so the estimated distance will only be approximate. However, for small distances like these, it’s accurate enough for our purposes. And it’s much faster!
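For reference, the formula itself fits in a few lines of R. A minimal sketch, mirroring what geosphere::distHaversine computes (it uses the same 6,378,137 m radius by default):

haversine_m <- function(lng1, lat1, lng2, lat2, r = 6378137) {
  to_rad <- pi / 180
  dlat <- (lat2 - lat1) * to_rad
  dlng <- (lng2 - lng1) * to_rad
  a <- sin(dlat / 2)^2 + cos(lat1 * to_rad) * cos(lat2 * to_rad) * sin(dlng / 2)^2
  2 * r * asin(sqrt(a))  # great-circle distance, in meters
}

haversine_m(-87.63, 41.88, -87.62, 41.89)                     # ~1,390 m
geosphere::distHaversine(c(-87.63, 41.88), c(-87.62, 41.89))  # same value, up to floating-point error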

rides_clean_haversine <- (
  rides[
    , ride_dist_m := geosphere::distHaversine(.SD[, .(start_lng, start_lat)], .SD[, .(end_lng, end_lat)])
  ## Keep rides between two different stations, or whose distance is plausible (100 m to 30 km)
  ][start_station_id != end_station_id | ride_dist_m %between% c(100, 30000)
  ## Add ride duration and speed info
  ][, ride_dur_min := difftime(ended_at, started_at, units = "mins") |> abs() |> as.numeric()
  ][, ride_speed_km_h := fifelse(ride_dur_min == 0, NA, (ride_dist_m / 1000) / (ride_dur_min / 60))
  ## Remove rides that are too brief (< 5 min) or too long (> 1 day)
  ][ride_dur_min %between% c(5, 1440)
  ## Remove rides that have implausible speeds (> 40 km/h for regular bikes, and 60 km/h for electric ones)
  ][ (rideable_type == "classic_bike" & ride_speed_km_h <= 40)
     | (rideable_type == "electric_bike" & ride_speed_km_h <= 60)
  ## Removing the new columns now that the cleaning is done
  ][, let(ride_dist_m = NULL, ride_dur_min = NULL, ride_speed_km_h = NULL)]
)
Time difference of 4.401 secs

The haversine solution is 38 times faster!

Precise distance with st_distance from the spatial extension:

INSTALL spatial;
LOAD spatial;
(tbl(con, "rides")
  |> mutate(
    start_pos = st_point(start_lat, start_lng) |> st_transform("EPSG:4326", "EPSG:5069"), 
    end_pos = st_point(end_lat, end_lng) |> st_transform("EPSG:4326", "EPSG:5069"),
    ride_dist_m = st_distance(start_pos, end_pos)
  )
  ## Remove rides that are too short (< 100 m) or too long (> 30 km)
  |> filter(between(ride_dist_m, 100, 30000))
  ## Add ride duration and speed info
  |> mutate(
    ride_dur_min = abs(date_sub("second", started_at, ended_at) / 60),
    ride_speed_km_h = if_else(ride_dur_min == 0, NA, (ride_dist_m / 1000) / (ride_dur_min / 60))
  )
  ## Remove rides that are too brief (< 5 min) or too long (> 1 day)
  |> filter(between(ride_dur_min, 5, 1440))
  ## Rides with implausible speeds
  |> filter(
    (rideable_type == "classic_bike" & ride_speed_km_h <= 40)
    | (rideable_type == "electric_bike" & ride_speed_km_h <= 60)
  )
  ## Removing the new columns now that the cleaning is done
  |> select(-ride_dist_m, -ride_dur_min, -ride_speed_km_h)
  ## Generating a new table from the result of the cleaning
  |> compute("rides_clean_geom", overwrite = TRUE)
)
Time difference of 4.928 mins

Approximate distance with the haversine:

CREATE OR REPLACE FUNCTION haversine(lng1, lat1, lng2, lat2) 
  AS 6378137 * acos( 
    cos(radians(lat1)) * cos(radians(lat2)) * cos(radians(lng2) - radians(lng1)) +
    sin(radians(lat1)) * sin(radians(lat2))
  );
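N.B.: 6378137 m is the WGS84 equatorial radius. Strictly speaking, the formula above is the spherical law of cosines rather than the haversine proper, but both compute the same great-circle distance on a sphere.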
(tbl(con, "rides")
  |> mutate(ride_dist_m = haversine(start_lng, start_lat, end_lng, end_lat))
  ## Remove rides that are too short (< 100 m) or too long (> 30 km)
  |> filter(between(ride_dist_m, 100, 30000))
  ## Add ride duration and speed info
  |> mutate(
    ride_dur_min = abs(date_sub("second", started_at, ended_at) / 60),
    ride_speed_km_h = if_else(ride_dur_min == 0, NA, (ride_dist_m / 1000) / (ride_dur_min / 60))
  )
  ## Remove rides that are too brief (< 5 min) or too long (> 1 day)
  |> filter(between(ride_dur_min, 5, 1440))
  ## Rides with implausible speeds
  |> filter(
    (rideable_type == "classic_bike" & ride_speed_km_h <= 40)
    | (rideable_type == "electric_bike" & ride_speed_km_h <= 60)
  )
  ## Removing the new columns now that the cleaning is done
  |> select(-ride_dist_m, -ride_dur_min, -ride_speed_km_h)
  ## Generating a new table from the result of the cleaning
  |> compute("rides_clean_haversine", overwrite = TRUE)
)
Time difference of 1.846 secs

The haversine solution is 160 times faster!

Output:

data.table [4,383,020 x 15]
ride_id rideable_type started_at ended_at start_station_name start_station_id end_station_name end_station_id start_lat start_lng end_lat end_lng member_casual start_pos end_pos
06048DCFC8520CAF electric_bike 2021-06-04 11:18:02 2021-06-04 11:24:19 NA NA NA NA 41.79 −87.59 41.8 −87.6 member c(-87.59, 41.79) c(-87.6, 41.8)
9598066F68045DF2 electric_bike 2021-06-04 09:49:35 2021-06-04 09:55:34 NA NA NA NA 41.8 −87.6 41.79 −87.59 member c(-87.6, 41.8) c(-87.59, 41.79)
B03C0FE48C412214 electric_bike 2021-06-03 19:56:05 2021-06-03 20:21:55 NA NA NA NA 41.78 −87.58 41.8 −87.6 member c(-87.58, 41.78) c(-87.6, 41.8)
7E2546FBA79C46EE electric_bike 2021-06-10 16:30:10 2021-06-10 16:36:21 NA NA NA NA 41.79 −87.6 41.79 −87.59 member c(-87.6, 41.79) c(-87.59, 41.79)
3DDF3BBF6C4C3C89 electric_bike 2021-06-10 17:00:30 2021-06-10 17:06:48 NA NA NA NA 41.79 −87.59 41.8 −87.59 member c(-87.59, 41.79) c(-87.59, 41.8)
2608805637155AB6 electric_bike 2021-06-10 12:46:16 2021-06-10 12:55:02 NA NA NA NA 41.93 −87.67 41.94 −87.68 member c(-87.67, 41.93) c(-87.68, 41.94)
AF529C946F28ED42 electric_bike 2021-06-23 17:57:29 2021-06-23 18:06:40 NA NA Michigan Ave & Oak St 13042 41.88 −87.61 41.901 −87.624 member c(-87.61, 41.88) c(-87.623698, 41.9010521666667)
E6010941FB92E4A6 electric_bike 2021-06-22 19:28:02 2021-06-22 19:39:48 NA NA NA NA 41.87 −87.62 41.87 −87.64 member c(-87.62, 41.87) c(-87.64, 41.87)
1149C0723F7AFFD5 electric_bike 2021-06-29 17:35:49 2021-06-29 17:55:11 NA NA NA NA 41.9 −87.63 41.9 −87.68 member c(-87.63, 41.9) c(-87.68, 41.9)
8762DB62099E6011 electric_bike 2021-06-05 14:55:05 2021-06-05 15:13:29 NA NA NA NA 41.89 −87.62 41.88 −87.62 member c(-87.62, 41.89) c(-87.62, 41.88)
8E9F2CB0893B96A0 electric_bike 2021-06-05 13:39:04 2021-06-05 13:57:21 NA NA NA NA 41.88 −87.62 41.89 −87.62 member c(-87.62, 41.88) c(-87.62, 41.89)
6344B71B7BB6E09E electric_bike 2021-06-22 18:52:53 2021-06-22 18:59:13 NA NA NA NA 41.79 −87.59 41.8 −87.6 member c(-87.59, 41.79) c(-87.6, 41.8)
59CE9444E2ED2530 electric_bike 2021-06-02 10:30:11 2021-06-02 10:37:03 NA NA NA NA 41.79 −87.6 41.8 −87.59 member c(-87.6, 41.79) c(-87.59, 41.8)
F7107122A837A50B electric_bike 2021-06-08 18:31:31 2021-06-08 18:38:25 NA NA NA NA 41.78 −87.6 41.8 −87.6 member c(-87.6, 41.78) c(-87.6, 41.8)
45ABF9231CC02E3C electric_bike 2021-06-07 22:24:08 2021-06-07 22:35:25 NA NA NA NA 41.79 −87.58 41.78 −87.6 member c(-87.58, 41.79) c(-87.6, 41.78)
[ omitted 4,383,005 entries ]

Our cleaning removed 1,472,720 entries!

Code
(rides_clean_haversine[
    , .(ride_id, dist_haversine = geosphere::distHaversine(.SD[, .(start_lng, start_lat)], .SD[, .(end_lng, end_lat)]))
  ][rides_clean_geom[
    , .(ride_id, dist_geom = st_distance(st_transform(start_pos, st_crs(5069)), st_transform(end_pos, st_crs(5069)), by_element = TRUE))]
    , on = "ride_id"
  ][, .(distance_diff = abs(dist_haversine - as.numeric(dist_geom))), by = ride_id]
  |> ggplot(aes(x = distance_diff))
      + geom_histogram(bins = 100, color = NA, fill = "lightblue")
      + labs(
        title = "Distribution of the differences in distance",
        subtitle = "Between **sf::st_distance** and **haversine**",
        x = "Difference in distance (in meters)", y = ""
      )
      + theme(axis.text.y = element_blank())
)

2.3 Pivoting/melting the data

Since we are interested in filling in missing station information, we will first pivot the data so that each row contains a single station, which will facilitate the subsequent coordinate-based operations.
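Concretely, each ride row splits into two station rows (one for the start, one for the end). A toy illustration on a single made-up ride:

ride <- data.table(
  ride_id = "XYZ",
  started_at = "11:18", ended_at = "11:24",
  start_lat = 41.79, start_lng = -87.59,
  end_lat = 41.80, end_lng = -87.60
)

melt(ride, measure = measure(way, value.name, pattern = "(end|start).*_(at|lat|lng)"))
## -> two rows for ride XYZ: one with way == "start" (its at/lat/lng taken from the start_* columns),
##    one with way == "end" (taken from the end_* columns)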

rides_clean_geom_long <- (
  rides_clean_geom
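  ## measure() parses the column names: regex group 1 ("start"/"end") fills the new `way`
  ## column, and group 2 ("at", "name", "id", …) names the resulting value columns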
  |> melt(measure = measure(way, value.name, pattern = "(end|start).*_(at|name|id|lat|lng|pos)"))
)
Time difference of 0.4001 secs
(tbl(con, "rides_clean_geom")
  |> pivot_longer(
    cols = matches("^end|^start"), names_pattern = "(end|start).*_(at|name|id|lat|lng|pos)", 
    names_to = c("way", ".value")
  )
  |> compute("rides_clean_geom_long", overwrite = TRUE)
)
Time difference of 0.5486 secs
CREATE OR REPLACE TABLE rides_clean_geom_long AS 
SELECT * FROM (
  UNPIVOT rides_clean_geom
  ON (started_at, start_station_name, start_station_id, start_lat, start_lng, start_pos) as 'start', 
     (ended_at, end_station_name, end_station_id, end_lat, end_lng, end_pos) as 'end'
  INTO
    NAME way
    VALUE at, name, id, lat, lng, pos
);
Time difference of 4.001 secs

Output:

data.frame [8,765,038 x 10]
ride_id rideable_type member_casual way at name id lat lng pos
60F97090AC85F55E classic_bike member start 2021-06-27 12:26:58 Clark St & Grace St TA1307000127 41.951 −87.659 <raw [32]>
AAD9B0718A051AFE classic_bike casual start 2021-06-11 19:38:31 Dearborn St & Monroe St TA1305000006 41.881 −87.63 <raw [32]>
38D67BCCF380A714 classic_bike casual start 2021-06-14 15:20:00 Clark St & Grace St TA1307000127 41.951 −87.659 <raw [32]>
E4930F461D7CAC3C classic_bike casual start 2021-06-23 17:24:39 Wells St & Walton St TA1306000011 41.9 −87.634 <raw [32]>
7CCAD714D89C9D69 classic_bike casual start 2021-06-05 16:57:13 Wells St & Walton St TA1306000011 41.9 −87.634 <raw [32]>
61B8AB46091738BE classic_bike casual start 2021-06-04 22:11:15 Clark St & Leland Ave TA1309000014 41.967 −87.667 <raw [32]>
06983E079ECF6A1C classic_bike casual start 2021-06-05 00:47:35 Clark St & Grace St TA1307000127 41.951 −87.659 <raw [32]>
BFAFDECF376F4AC3 classic_bike casual start 2021-06-16 20:48:52 Michigan Ave & Oak St 13042 41.901 −87.624 <raw [32]>
3EF3C3675F6029B2 classic_bike casual start 2021-06-16 19:26:25 Michigan Ave & Oak St 13042 41.901 −87.624 <raw [32]>
B6E524DF38B2085B classic_bike casual start 2021-06-13 13:21:38 Larrabee St & Armitage Ave TA1309000006 41.918 −87.644 <raw [32]>
[ omitted 8,765,023 entries ]

3 Which stations are unidentified?


Our ultimate goal is to fill in the missing stations’ name and id properties based on their geographical proximity to other known stations.

So let’s see which stations miss either their name or id after the cleaning we’ve done:

rides_clean_geom_long_unk <- rides_clean_geom_long[is.na(id) | is.na(name), ]
(tbl(con, "rides_clean_geom_long") 
  |> filter(if_any(matches("id$|name$"), is.na))
  |> compute("rides_clean_geom_long_unk", overwrite = TRUE)
)
CREATE OR REPLACE TABLE rides_clean_geom_long_unk AS
SELECT * FROM rides_clean_geom_long
WHERE (name IS NULL) OR (id IS NULL) 

Output:

data.frame [1,199,219 x 10]
ride_id rideable_type member_casual way at name id lat lng pos
00000B4F1F71F9C2 electric_bike member end 2021-09-08 16:37:54 NA NA 41.91 −87.7 <raw [32]>
00000B4F1F71F9C2 electric_bike member start 2021-09-08 16:31:38 NA NA 41.91 −87.69 <raw [32]>
00000E22FBA89D81 electric_bike member start 2022-05-19 14:42:55 NA NA 41.9 −87.62 <raw [32]>
000018B1D040DB44 electric_bike member end 2022-04-25 10:44:19 NA NA 41.79 −87.59 <raw [32]>
000018B1D040DB44 electric_bike member start 2022-04-25 10:37:22 NA NA 41.79 −87.6 <raw [32]>
00002E385DB2888C electric_bike casual end 2022-05-07 16:40:28 NA NA 41.87 −87.75 <raw [32]>
00002E385DB2888C electric_bike casual start 2022-05-07 16:28:53 NA NA 41.89 −87.75 <raw [32]>
00004B000220B299 electric_bike member end 2022-05-06 17:53:31 NA NA 41.98 −87.67 <raw [32]>
0000664E22910718 electric_bike casual end 2021-08-09 16:01:27 NA NA 42.02 −87.67 <raw [32]>
000095C5DDB7E97A electric_bike member end 2021-09-06 13:48:39 NA NA 41.91 −87.63 <raw [32]>
[ omitted 1,199,204 entries ]

There are 1,199,219 stations missing an id or name, even after our cleaning!

But why are so many stations unknown/unidentified?

Could it be an issue of location accuracy? That is, was the GPS information too poor for the ride app to properly locate where the user was when arriving at or leaving a station?

If this is the case, then most of the unknown stations should have poor coordinate accuracy (i.e. their lat/lng values should have too few decimal places).

Let’s find out!

4 A question of location accuracy?


First, a quick recap on how the precision of lat/lng coordinates (i.e. their number of decimal places) relates to positioning accuracy:

Decimals | Distance at the equator (m) | Distance at the 45th parallel (N or S) (m)
0        | 111,120                     | 78,710
1        | 11,112                      | 7,871
2        | 1,111.2                     | 787.1
3        | 111.12                      | 78.71
4        | 11.112                      | 7.871
5        | 1.1112                      | 0.7871

See this thread for more details.
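These figures are easy to re-derive, assuming 1 degree ≈ 111,120 m at the equator (60 nautical miles) and scaling longitude by the cosine of the latitude. A quick sketch (small deviations from the table above come from the exact Earth model used):

## Positional uncertainty (in meters) of a coordinate with `d` decimal places
coord_uncertainty_m <- function(d, lat = 0) 111120 * cos(lat * pi / 180) / 10^d

coord_uncertainty_m(0:5)            # equator column: 111,120 m down to ~1.11 m
coord_uncertainty_m(0:5, lat = 45)  # 45th parallel: ~78,570 m down to ~0.79 m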

A radius of uncertainty of roughly 100 meters, when you only have 3 decimal places in your coordinates, is quite large for locating a station. It seems like a good threshold for us, so let’s remove entries where either lat or lng has 3 or fewer decimals, since those entries would not have sufficient precision to be reliably matched to a known station by their position alone.

Let’s define a function to only keep stations with 4 or more decimals in their lat or lng coordinate (which we will call the accuracy-4 or filtered data henceforth):

has_acc4 <- function(x) str_length(str_remove(as.character(abs(x)), ".*\\.")) >= 4
CREATE FUNCTION has_acc4(x) AS length(str_split(CAST(abs(x) AS VARCHAR(16)), '.')[2]) >= 4
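A few sanity checks on the R version (note that has_acc4 counts digits in the number’s printed representation, so trailing zeros don’t count: 41.6300 prints as "41.63"):

has_acc4(41.9)      # FALSE -- 1 decimal digit
has_acc4(-87.6411)  # TRUE  -- 4 decimal digits
has_acc4(41)        # FALSE -- no decimal part at all
has_acc4(41.6300)   # FALSE -- stored/printed as 41.63, i.e. 2 decimal digits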

And now, let’s apply it to our table of unknown stations (rides_clean_geom_long_unk), and see how many remain afterwards:

rides_clean_geom_long_unk_acc4 <- rides_clean_geom_long_unk[has_acc4(lat) & has_acc4(lng), ]
Time difference of 1.773 secs
(tbl(con, "rides_clean_geom_long_unk") 
  |> filter(if_all(c(lat, lng), has_acc4))
  |> compute("rides_clean_geom_long_unk_acc4", overwrite = TRUE)
)
Time difference of 0.08684 secs
CREATE OR REPLACE TABLE rides_clean_geom_long_unk_acc4 AS
SELECT * FROM rides_clean_geom_long_unk
WHERE has_acc4(lat) AND has_acc4(lng)
Time difference of 0.02707 secs

Output:

data.table [2 x 10]
ride_id rideable_type member_casual way at name id lat lng pos
176105D1F8A1216B electric_bike casual start 2021-07-18 03:44:22 NA 13221 41.908 −87.673 c(-87.672552, 41.907655)
EE197EDA4CF8CFE5 electric_bike casual start 2021-09-22 07:14:42 NA WL-008 41.867 −87.641 c(-87.641087, 41.867117)

After filtering out imprecise coordinates, there are only 2 stations missing either their id or name!

The fact that most of the unknown stations got removed by the coordinate-accuracy filtering gives credence to our previous hypothesis: most of the previous 1,199,219 unidentified stations were missing their id/name because their location was too imprecise to be matched to any known station by the ride app.

Which means that attempting to do such matching ourselves would be an exercise in futility …

But let’s attempt it anyway, since it is the purpose of this blog post !

5 Location-based station identification


First, we’ll need to assemble a reference/look-up table for stations. This table will link each station (identified by a unique combination of id and name) to a unique set of coordinates (i.e. the location of that station). This will allow us to later match unknown stations from the rides data to those in this reference table, by comparing their locations.

5.1 Station reference table

Our rides data contains a small number of stations that each appear multiple times, either as the start or end of a ride. For each unique station, we will aggregate all the rides that start or end at that station, and set the station’s coordinates to the center of all the existing coordinates for that station. We have two ways to go about this aggregation:

  1. An accurate method, such as st_centroid, to find the “center” of this set of coordinates (using sf / the spatial extension).

  2. An approximate method, such as taking the average of all the lat/lng values we have in our rides data for each station. This solution can be imprecise since it does not take the precision of each coordinate into account (e.g. by giving more weight to more accurate coordinates), and the result will have a misleading level of accuracy due to the averaging process.

Let’s do both and see how they compare !

5.1.1 Using the accuracy-4 data

Here, since we want to establish our “stations” lookup table from the “accuracy-4” data, we need to apply our has_acc4 function to rides_clean_geom_long to only keep entries (i.e. rides) with 4 or more decimals in both their lat and lng coordinates:

rides_clean_geom_long_acc4 <- rides_clean_geom_long[has_acc4(lat) & has_acc4(lng), ]
Time difference of 17.62 secs
(tbl(con, "rides_clean_geom_long") 
  |> filter(if_all(c(lat, lng), has_acc4))
  |> compute("rides_clean_geom_long_acc4", overwrite = TRUE)
)
Time difference of 2.883 secs

Now, let’s use the resulting table (rides_clean_geom_long_acc4) to build our stations lookup table:

Precise location with sf::st_centroid:

stations_clean_acc4_geom <- (
  rides_clean_geom_long_acc4
  ## Remove unknown stations
  |> na.omit(cols = c("id", "name"))
  ## For each id + name unique combination, find the centroid of the "pos" POINT
  |> _[, .(pos = st_centroid(st_combine(pos))), by = .(id, name)]
  ## Order the result by station id
  |> setorder(id)
)
Time difference of 18.97 secs

Approximate location by averaging the coordinates:

stations_clean_acc4_mean <- (
  rides_clean_geom_long_acc4
  ## Remove unknown stations
  |> na.omit(cols = c("id", "name"))
  ## For each id + name unique combination, average lat and lng values
  |> dcast(id + name ~ ., fun.aggregate = mean, value.var = c("lat", "lng"))
  ## Order the result by station id
  |> setorder(id)
)
Time difference of 0.681 secs

Precise location using st_centroid from the spatial extension:

(tbl(con, "rides_clean_geom_long_acc4")
  |> filter(!is.na(id), !is.na(name))
  |> summarize(
      .by = c(id, name),
      pos = list(pos) |> st_collect() |> st_centroid()
  )
  |> arrange(id)
  |> compute("stations_clean_acc4_geom", overwrite = TRUE)
)
Time difference of 0.6707 secs
CREATE OR REPLACE TABLE stations_clean_acc4_geom AS
FROM rides_clean_geom_long_acc4
SELECT id, name, st_centroid(st_collect(list(pos))) AS pos_centroid 
WHERE (id IS NOT NULL) AND (name IS NOT NULL)
GROUP BY id, name
ORDER BY id;
Time difference of 0.5262 secs

Approximate location by averaging the coordinates:

(tbl(con, "rides_clean_geom_long_acc4")
  |> filter(!is.na(id), !is.na(name))
  |> summarize(
      .by = c(id, name),
      across(c(lat, lng), mean)
  )
  |> arrange(id)
  |> compute("stations_clean_acc4_mean", overwrite = TRUE)
)
Time difference of 0.1491 secs


Geometry

data.table [700 x 3]
id name pos
13001 Michigan Ave & Washington St c(-87.624608303352, 41.8839739921181)
13006 LaSalle St & Washington St c(-87.6325570579559, 41.8827048196346)
13008 Millennium Park c(-87.6240839243461, 41.8810511820254)
13011 Canal St & Adams St c(-87.6399264828786, 41.8793312329564)
13016 St. Clair St & Erie St c(-87.6228155199715, 41.8943253700403)
13017 Franklin St & Chicago Ave c(-87.635691270916, 41.8967491893866)
13021 Clinton St & Lake St c(-87.6418271678616, 41.8856048981661)
13022 Streeter Dr & Grand Ave c(-87.6120624843549, 41.8922700930906)
13028 900 W Harrison St c(-87.6498031272599, 41.8747486017636)
13029 Field Museum c(-87.6179053613466, 41.8653188860571)
13033 Milwaukee Ave & Grand Ave c(-87.6483560756473, 41.8915921258399)
13034 Michigan Ave & Pearson St c(-87.6235541871001, 41.8975481429342)
13036 Michigan Ave & Madison St c(-87.6251647130149, 41.8821223263018)
13037 Clinton St & Tilden St c(-87.6407754094505, 41.8758825068496)
13042 Michigan Ave & Oak St c(-87.6237787792645, 41.9009749388655)
[ omitted 685 entries ]


Average

data.table [700 x 4]
id name lat lng
13001 Michigan Ave & Washington St 41.8839739908871 -87.6246083065155
13006 LaSalle St & Washington St 41.8827048071804 -87.6325570732024
13008 Millennium Park 41.8810511786992 -87.6240839338728
13011 Canal St & Adams St 41.8793312323952 -87.6399264829988
13016 St. Clair St & Erie St 41.8943253676363 -87.622815520661
13017 Franklin St & Chicago Ave 41.89674918882 -87.6356912707959
13021 Clinton St & Lake St 41.8856048974089 -87.6418271677671
13022 Streeter Dr & Grand Ave 41.892270090806 -87.6120624881201
13028 900 W Harrison St 41.8747486007635 -87.6498031288495
13029 Field Museum 41.865318885576 -87.6179053629858
13033 Milwaukee Ave & Grand Ave 41.8915921241123 -87.6483560767347
13034 Michigan Ave & Pearson St 41.8975481425484 -87.6235541868204
13036 Michigan Ave & Madison St 41.8821223254923 -87.6251647148276
13037 Clinton St & Tilden St 41.8758825065679 -87.6407754093102
13042 Michigan Ave & Oak St 41.9009749374364 -87.6237787872266
[ omitted 685 entries ]

5.1.2 Using the unfiltered data

Precise location with sf::st_centroid:

stations_clean_geom <- (
  rides_clean_geom_long
  ## Remove unknown stations
  |> na.omit(cols = c("id", "name"))
  ## For each id + name unique combination, find the centroid of the "pos" POINT
  |> _[, .(pos = st_centroid(st_combine(pos))), by = .(id, name)]
  ## Order the result by station id
  |> setorder(id)
)
Time difference of 20.4 secs

Approximate location by averaging the coordinates:

stations_clean_mean <- (
  rides_clean_geom_long
  ## Remove unknown stations
  |> na.omit(cols = c("id", "name"))
  ## For each id + name unique combination, average lat and lng values
  |> dcast(id + name ~ ., fun.aggregate = mean, value.var = c("lat", "lng"))
  ## Order the result by station id
  |> setorder(id)
)
Time difference of 1.045 secs

Precise location using st_centroid from the spatial extension:

(tbl(con, "rides_clean_geom_long")
  |> filter(!is.na(id), !is.na(name))
  |> summarize(
      .by = c(id, name),
      pos = list(pos) |> st_collect() |> st_centroid()
  )
  |> arrange(id)
  |> compute("stations_clean_geom", overwrite = TRUE)
)
Time difference of 0.6858 secs
CREATE OR REPLACE TABLE stations_clean_geom AS
FROM rides_clean_geom_long
SELECT id, name, st_centroid(st_collect(list(pos))) AS pos_centroid 
WHERE (id IS NOT NULL) AND (name IS NOT NULL)
GROUP BY id, name
ORDER BY id;
Time difference of 0.5395 secs

Approximate location by averaging the coordinates:

(tbl(con, "rides_clean_geom_long")
  |> filter(!is.na(id), !is.na(name))
  |> summarize(
      .by = c(id, name),
      across(c(lat, lng), mean)
  )
  |> arrange(id)
  |> compute("stations_clean_mean", overwrite = TRUE)
)
Time difference of 0.1496 secs


Geometry

data.table [1,124 x 3]
id name pos
021320 MTV Hubbard St c(-87.68, 41.89)
13001 Michigan Ave & Washington St c(-87.624608297604, 41.8839739897761)
13006 LaSalle St & Washington St c(-87.632557037407, 41.8827048386161)
13008 Millennium Park c(-87.6240839190639, 41.8810511819097)
13011 Canal St & Adams St c(-87.6399264899551, 41.8793312145491)
13016 St. Clair St & Erie St c(-87.6228155045237, 41.8943253499102)
13017 Franklin St & Chicago Ave c(-87.6356912705203, 41.8967492007012)
13021 Clinton St & Lake St c(-87.6418271802395, 41.8856048845063)
13022 Streeter Dr & Grand Ave c(-87.6120624804387, 41.8922700897914)
13028 900 W Harrison St c(-87.6498031272599, 41.8747486017636)
13029 Field Museum c(-87.6179053695639, 41.8653188837371)
13033 Milwaukee Ave & Grand Ave c(-87.6483560756473, 41.8915921258399)
13034 Michigan Ave & Pearson St c(-87.6235541982785, 41.8975481149106)
13036 Michigan Ave & Madison St c(-87.6251647026189, 41.882122307852)
13037 Clinton St & Tilden St c(-87.6407754553424, 41.8758825305761)
[ omitted 1,109 entries ]


Average

data.table [1,124 x 4]
id name lat lng
021320 MTV Hubbard St 41.89 -87.68
13001 Michigan Ave & Washington St 41.8839739885453 -87.6246083007669
13006 LaSalle St & Washington St 41.8827048261626 -87.6325570526523
13008 Millennium Park 41.8810511785838 -87.62408392859
13011 Canal St & Adams St 41.879331213988 -87.6399264900752
13016 St. Clair St & Erie St 41.8943253475063 -87.6228155052132
13017 Franklin St & Chicago Ave 41.8967492001346 -87.6356912704003
13021 Clinton St & Lake St 41.8856048837492 -87.641827180145
13022 Streeter Dr & Grand Ave 41.8922700875069 -87.6120624842036
13028 900 W Harrison St 41.8747486007635 -87.6498031288495
13029 Field Museum 41.865318883256 -87.617905371203
13033 Milwaukee Ave & Grand Ave 41.8915921241123 -87.6483560767347
13034 Michigan Ave & Pearson St 41.8975481145247 -87.6235541979986
13036 Michigan Ave & Madison St 41.8821223070425 -87.6251647044313
13037 Clinton St & Tilden St 41.8758825302944 -87.6407754552023
[ omitted 1,109 entries ]

It would seem that keeping the unfiltered data yields more stations in the lookup table.

But is this additional data any good? Or did it lower the overall accuracy of the stations’ locations by introducing lower-precision coordinates into the mix?

Let’s compare both sources of data (filtered or not) and methods of combining locations (i.e. st_centroid vs simple average of lat/lng coordinates) by visualizing where they place the stations on a map:

Code
stations_locs <- (
  left_join(
    ## Combining the results of geom & haversine stations from the unfiltered data
    inner_join(tbl(con, "stations_clean_geom"), tbl(con, "stations_clean_mean"), join_by(id, name))
    |> mutate(
      lat_unfiltered_mean = lat, lng_unfiltered_mean = lng,
      lat_unfiltered_centroid = st_x(st_transform(pos, "EPSG:5069", "EPSG:4326")),
      lng_unfiltered_centroid = st_y(st_transform(pos, "EPSG:5069", "EPSG:4326"))
    )
    |> select(id, name, lat_unfiltered_mean, lng_unfiltered_mean, lat_unfiltered_centroid, lng_unfiltered_centroid),
    ## Combining the centroid (geom) & mean station locations from the filtered (accuracy-4) data
    inner_join(tbl(con, "stations_clean_acc4_geom"), tbl(con, "stations_clean_acc4_mean"), join_by(id, name)) 
    |> mutate(
      lat_filtered_mean = lat, lng_filtered_mean = lng,
      lat_filtered_centroid = st_x(st_transform(pos, "EPSG:5069", "EPSG:4326")), 
      lng_filtered_centroid = st_y(st_transform(pos, "EPSG:5069", "EPSG:4326"))
    )
    |> select(id, name, lat_filtered_mean, lng_filtered_mean, lat_filtered_centroid, lng_filtered_centroid),
    join_by(id, name)
  )
  |> pivot_longer(cols = -c(id, name), names_sep = "_", names_to = c(".value", "data_source", "method"))
  |> collect()
  |> filter(!is.na(lat) & !is.na(lng))
  |> mutate(group = str_c(data_source, method, sep = "_"))
)

my_markers <- iconList(
  filtered_centroid = makeIcon("C_green.png", iconWidth = 45, iconHeight = 50),
  filtered_mean = makeIcon("M_green.png", iconWidth = 45, iconHeight = 50),
  unfiltered_centroid = makeIcon("C_orange.png", iconWidth = 45, iconHeight = 50),
  unfiltered_mean = makeIcon("M_orange.png", iconWidth = 45, iconHeight = 50)
)

(stations_locs
  |> st_as_sf(coords = c("lng", "lat"), crs = 4326)
  |> leaflet()
  |> addTiles()
  |> fitBounds(-88.6, 41.8, -87, 42.1)
  |> addMarkers(
    popup = ~sprintf("%s (id: %s)", name, id),
    icon = ~my_markers[group],
    clusterOptions = markerClusterOptions()
  )
)