Summary

This post showcases various solutions to efficiently match unknown locations to known ones by their geographical proximity (using lat/long coordinates), on a dataset with millions of entries.

Tip

You can check the source code by clicking on the </> Code button at the top-right.

The data is a list of bike rides from the Divvy bike-sharing service in Chicago, and the stations are the bike stations where the bikes can be picked up and dropped off. This post was prompted by a question on Reddit: how could one identify the many unknown stations within the rides data (i.e. rides’ start or end stations missing their id and name) to known stations, by their geographical proximity.

This post is related to a case study of Google’s Data Analytics certificate, and the data can be found here (June 2021 to May 2022).

🆕 Expand for Version History

v1: 2022-06-18

v2: 2022-10-29

Added a DuckDB + arrow example for the data loading.
Added data cleaning section (removing implausible rides).
Added sf, dbplyr and data.table (manual) examples for the spatial join.

v3: 2023-03-12

Updated code to newest tidyverse version (.by, reframe, list_rbind, join_by, …)

v4: 2023-09-27

Updated the SQL/dbplyr examples with DuckDB v0.8.1:
- Using the new PIVOT/UNPIVOT SQL statements
- Using the new spatial DuckDB extension for spatial operations
Updates the data.table examples:
- Using both sf and manual distances calculations (haversine) for spatial operations
- Removed the dtplyr examples
Improved the document’s structure & explanations

Setup

library(here)        # Working directory management
library(fs)          # File & folder manipulation
library(pipebind)    # Piping goodies

library(readr)       # Reading data from files           (Tidyverse)
library(dplyr)       # Manipulating data.frames - core   (Tidyverse)
library(tidyr)       # Manipulating data.frames - extras (Tidyverse)
library(stringr)     # Manipulating strings              (Tidyverse)
library(purrr)       # Manipulating lists                (Tidyverse)
library(lubridate)   # Manipulating date/time            (Tidyverse)
library(ggplot2)     # Best plotting library             (Tidyverse)

library(dbplyr)      # SQL back-end for dplyr            (Tidyverse)
library(duckdb)      # Quack Stack

library(arrow)       # Fast and efficient data reading
library(data.table)  # Fast data manipulation (in-RAM)

library(sf)          # Spatial data manipulation
library(fuzzyjoin)   # Non-equi joins & coordinates-based joins

library(leaflet)     # Interative map plots

options(
  dplyr.strict_sql = FALSE,
  scipen = 999L, 
  digits = 4L
)

nrows_print <- 10

data.table::setDTthreads(parallel::detectCores(logical = FALSE))

Quarto/knitr setup

Adding custom knitr hooks

#---------------------------#
####🔺knitr custom hooks ####
#---------------------------#

library(knitr)

## Adding the `time_it` code chunk option
knitr::knit_hooks$set(time_it = local({
  assign("TIMES", list(), .GlobalEnv)
  start <- NULL
  function(before, options) {
    if (before) start <<- Sys.time()
    else TIMES[[options$label]] <<- difftime(Sys.time(), start)
  }
}))

Applying a custom theme to all gt tables

#-----------------------#
####🔺gt knit_prints ####
#-----------------------#

library(knitr)
library(gt)

knit_print.grouped_df <- function(x, options, ...) {
  if ("grouped_df" %in% class(x)) x <- ungroup(x)
  
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

registerS3method("knit_print", "grouped_df", knit_print.grouped_df)

knit_print.data.frame <- function(x, options, ...) {
  cl <- intersect(class(x), c("data.table", "data.frame"))[1]
  nrows <- ifelse(!is.null(options$total_rows), as.numeric(options$total_rows), dim(x)[1])
  is_open <- ifelse(!is.null(options[["details-open"]]), as.logical(options[["details-open"]]), FALSE)
  
  cat(str_glue("\n<details{ifelse(is_open, ' open', '')}>\n"))
  cat("<summary>\n")
  cat(str_glue("\n*{cl} [{scales::label_comma()(nrows)} x {dim(x)[2]}]*\n"))
  cat("</summary>\n<br>\n")
  print(gt::as_raw_html(style_table(x, nrows)))
  cat("</details>\n\n")
}

registerS3method("knit_print", "data.frame", knit_print.data.frame)

1 Loading the data

data_path <- here("res", "data", "stations")

files <- dir_ls(data_path, glob = "*.csv")

rides <- lapply(files, \(f) fread(f, na.strings = "")) |> rbindlist()

Time difference of 11.95 secs

rides_tidy <- read_csv(files, show_col_types = FALSE)

Time difference of 5.122 secs

Alternatives

map(files, \(f) read_csv(f, show_col_types = FALSE)) |> list_rbind()

map_dfr(files, \(f) read_csv(f, show_col_types = FALSE))

con <- duckdb::dbConnect(
  duckdb(), 
  dbdir = ":memory:", 
  config = list(memory_limit = "30GB", temp_directory = here("_temp"), threads = 16L)
)

duckdb_read_csv(con, "rides", files)

Time difference of 3.219 secs

Importing an existing df from the R env

dbWriteTable(con, "rides", rides_tidy) # As a TABLE

copy_to(con, rides_tidy, "rides", indexes = "ride_id")

duckdb_register(con, "rides", rides_tidy) # As a VIEW

Importing while including the file names in the data

## Courtesy of Tristan Mahr (@tjmahr)

withr::with_dir(
  data_path, 
  dbSendQuery(con,"CREATE TABLE rides AS SELECT * FROM read_csv_auto('*.csv', FILENAME = TRUE)")
)

open_csv_dataset(data_path, convert_options = CsvConvertOptions$create(strings_can_be_null = TRUE)) |> 
  to_duckdb(con, "rides_arrow")

Time difference of 0.2388 secs

Output:

data.table [5,860,776 x 13]

ride_id	rideable_type	started_at	ended_at	start_station_name	start_station_id	end_station_name	end_station_id	start_lat	start_lng	end_lat	end_lng	member_casual
99FEC93BA843FB20	electric_bike	2021-06-13 14:31:28	2021-06-13 14:34:11	NA	NA	NA	NA	41.8	−87.59	41.8	−87.6	member
06048DCFC8520CAF	electric_bike	2021-06-04 11:18:02	2021-06-04 11:24:19	NA	NA	NA	NA	41.79	−87.59	41.8	−87.6	member
9598066F68045DF2	electric_bike	2021-06-04 09:49:35	2021-06-04 09:55:34	NA	NA	NA	NA	41.8	−87.6	41.79	−87.59	member
B03C0FE48C412214	electric_bike	2021-06-03 19:56:05	2021-06-03 20:21:55	NA	NA	NA	NA	41.78	−87.58	41.8	−87.6	member
B9EEA89F8FEE73B7	electric_bike	2021-06-04 14:05:51	2021-06-04 14:09:59	NA	NA	NA	NA	41.8	−87.59	41.79	−87.59	member
62B943CEAAA420BA	electric_bike	2021-06-03 19:32:01	2021-06-03 19:38:46	NA	NA	NA	NA	41.78	−87.58	41.78	−87.58	member
7E2546FBA79C46EE	electric_bike	2021-06-10 16:30:10	2021-06-10 16:36:21	NA	NA	NA	NA	41.79	−87.6	41.79	−87.59	member
3DDF3BBF6C4C3C89	electric_bike	2021-06-10 17:00:30	2021-06-10 17:06:48	NA	NA	NA	NA	41.79	−87.59	41.8	−87.59	member
2608805637155AB6	electric_bike	2021-06-10 12:46:16	2021-06-10 12:55:02	NA	NA	NA	NA	41.93	−87.67	41.94	−87.68	member
AF529C946F28ED42	electric_bike	2021-06-23 17:57:29	2021-06-23 18:06:40	NA	NA	Michigan Ave & Oak St	13042	41.88	−87.61	41.901	−87.624	member
E6010941FB92E4A6	electric_bike	2021-06-22 19:28:02	2021-06-22 19:39:48	NA	NA	NA	NA	41.87	−87.62	41.87	−87.64	member
1149C0723F7AFFD5	electric_bike	2021-06-29 17:35:49	2021-06-29 17:55:11	NA	NA	NA	NA	41.9	−87.63	41.9	−87.68	member
8762DB62099E6011	electric_bike	2021-06-05 14:55:05	2021-06-05 15:13:29	NA	NA	NA	NA	41.89	−87.62	41.88	−87.62	member
BE3AC77CBFF17E6A	electric_bike	2021-06-05 14:05:00	2021-06-05 14:09:01	NA	NA	NA	NA	41.89	−87.62	41.89	−87.62	member
8E9F2CB0893B96A0	electric_bike	2021-06-05 13:39:04	2021-06-05 13:57:21	NA	NA	NA	NA	41.88	−87.62	41.89	−87.62	member
[ omitted 5,860,761 entries ]

✦ For each step of the process, there will be two solutions:

An In-RAM solution using data.table to manipulate the data. Spatial operations will be done with both precise (using the sf package) and approximate (e.g. the haversine distance) methods.
An Out-of-RAM solution using DuckDB to manipulate the data. Spatial operations will be shown using both precise (using the spatial DuckDB extension) and approximate (e.g. haversine distance) methods. Code examples will be provided in both SQL and dbplyr (which translates Tidyverse code to SQL behind the scenes).

2 Cleaning & reshaping the data

2.1 Removing rides with missing coordinates

First, since our ultimate goal is to use geographic information (lat/lng coordinates) to identify stations, we’ll get rid of rides that have missing geographic information:

In-RAM
Out-of-RAM

rides <- rides[!is.na(start_lat) & !is.na(start_lng) & !is.na(end_lat) & !is.na(end_lng), ]

(tbl(con, "rides")
  |> mutate(across(ends_with("_at"), \(x) strptime(x, "%Y-%m-%d %H:%M:%S"))) # String -> Date object
  |> filter(if_all(matches("_lat$|_lng$"), \(x) !is.na(x)))
  |> copy_to(con, df = _, "rides", overwrite = TRUE)
)

2.2 Removing implausible rides

Then, we are going to filter out rides that are implausible (i.e. too short, too long, made too quickly, etc). We will compute distances (and speeds), which we can do in one of two ways:

An accurate way where we turn our lat/lng coordinates into two 2D points on a projected coordinate system, which is a small version of the globe (flattened and corrected for distortion) that can be used for area and distance calculations.
A quick and dirty way using the haversine function to compute the “distance on a sphere” between the two sets of coordinates.

N.B.: The distances/time/speed thresholds I use here are arbitrary. Choose what makes sense depending on your domain knowledge.

In-RAM
Out-of-RAM

Precise distance with sf::st_distance:

First, let’s create a new dataset with a POINT geometry for both start and end stations’ positions:

rides_geom <- (
  rides
  |> st_as_sf(coords = c("start_lng", "start_lat"), remove = FALSE, crs = 4326)
  |> st_set_geometry("start_pos")
  |> mutate(end_pos = map2(end_lng, end_lat, \(x, y) st_point(c(x, y))) |> st_sfc(crs = 4326))
  |> as.data.table()
)

Then, use those two points to compute an accurate distance between each rides’ start_pos and end_pos, and use that distance to filter implausible rides:

rides_clean_geom <- (
  rides_geom[
    , ride_dist_m := st_distance(st_transform(start_pos, 5069), st_transform(end_pos, 5069), by_element = TRUE)
  ## Remove rides that loop on themselves, are too short (<= 100 m), or too long (>= 30 km)
  ][start_station_id != end_station_id | as.numeric(ride_dist_m) %between% c(100, 30000)
  ## Add ride duration and speed info
  ][, ride_dur_min := difftime(ended_at, started_at, units = "mins") |> abs() |> as.numeric()
  ][, ride_speed_km_h := fifelse(ride_dur_min == 0, NA, (as.numeric(ride_dist_m) / 1000) / (ride_dur_min / 60))
  ## Remove rides that are too fast (< 5 min) or too long (> 1 day)
  ][ride_dur_min %between% c(5, 1440)
  ## Remove rides that have implausible speeds (> 40 km/h for regular bikes, and 60 km/h for electric ones)
  ][ (rideable_type == "classic_bike" & ride_speed_km_h <= 40)
     | (rideable_type == "electric_bike" & ride_speed_km_h <= 60)
  ## Removing the new columns now that the cleaning is done
  ][, let(ride_dist_m = NULL, ride_dur_min = NULL, ride_speed_km_h = NULL)]
)

Time difference of 2.779 mins

Approximate distance with the haversine:

The haversine function computes the distance between two points on a sphere, based on their coordinates. But the Earth isn’t a sphere, so the estimated distance will only be approximate. However, for small distances, it’s accurate enough for our goals. And it’s much faster !

rides_clean_haversine <- (
  rides[
    , ride_dist_m := geosphere::distHaversine(.SD[, .(start_lng, start_lat)], .SD[, .(end_lng, end_lat)])
  ## Remove rides that loop on themselves, are too short (<= 100 m), or too long (>= 30 km)
  ][start_station_id != end_station_id | ride_dist_m %between% c(100, 30000)
  ## Add ride duration and speed info
  ][, ride_dur_min := difftime(ended_at, started_at, units = "mins") |> abs() |> as.numeric()
  ][, ride_speed_km_h := fifelse(ride_dur_min == 0, NA, (ride_dist_m / 1000) / (ride_dur_min / 60))
  ## Remove rides that are too fast (< 5 min) or too long (> 1 day)
  ][ride_dur_min %between% c(5, 1440)
  ## Remove rides that have implausible speeds (> 40 km/h for regular bikes, and 60 km/h for electric ones)
  ][ (rideable_type == "classic_bike" & ride_speed_km_h <= 40)
     | (rideable_type == "electric_bike" & ride_speed_km_h <= 60)
  ## Removing the new columns now that the cleaning is done
  ][, let(ride_dist_m = NULL, ride_dur_min = NULL, ride_speed_km_h = NULL)]
)

Time difference of 4.401 secs

The haversine solution is 38 times faster !

Precise distance with st_distance of the spatial extension:

INSTALL spatial;
LOAD spatial;

(tbl(con, "rides")
  |> mutate(
    start_pos = st_point(start_lat, start_lng) |> st_transform("EPSG:4326", "EPSG:5069"), 
    end_pos = st_point(end_lat, end_lng) |> st_transform("EPSG:4326", "EPSG:5069"),
    ride_dist_m = st_distance(start_pos, end_pos)
  )
  ## Remove rides that loop on themselves, are too short (<= 100 m), or too long (>= 30 km)
  |> filter(between(ride_dist_m, 100, 30000))
  ## Add ride duration and speed info
  |> mutate(
    ride_dur_min = abs(date_sub("second", started_at, ended_at) / 60),
    ride_speed_km_h = if_else(ride_dur_min == 0, NA, (ride_dist_m / 1000) / (ride_dur_min / 60))
  )
  ## Remove rides that are too fast (< 5 min) or too long (> 1 day)
  |> filter(between(ride_dur_min, 5, 1440))
  ## Rides with implausible speeds
  |> filter(
    (rideable_type == "classic_bike" & ride_speed_km_h <= 40)
    | (rideable_type == "electric_bike" & ride_speed_km_h <= 60)
  )
  ## Removing the new columns now that the cleaning is done
  |> select(-ride_dist_m, -ride_dur_min, -ride_speed_km_h)
  ## Generating a new table from the result of the cleaning
  |> compute("rides_clean_geom", overwrite = TRUE)
)

Time difference of 4.928 mins

Approximate distance with the haversine:

CREATE OR REPLACE FUNCTION haversine(lng1, lat1, lng2, lat2) 
  AS 6378137 * acos( 
    cos(radians(lat1)) * cos(radians(lat2)) * cos(radians(lng2) - radians(lng1)) +
    sin(radians(lat1)) * sin(radians(lat2))
  );

(tbl(con, "rides")
  |> mutate(ride_dist_m = haversine(start_lng, start_lat, end_lng, end_lat))
  ## Remove rides that loop on themselves, are too short (<= 100 m), or too long (>= 30 km)
  |> filter(between(ride_dist_m, 100, 30000))
  ## Add ride duration and speed info
  |> mutate(
    ride_dur_min = abs(date_sub("second", started_at, ended_at) / 60),
    ride_speed_km_h = if_else(ride_dur_min == 0, NA, (ride_dist_m / 1000) / (ride_dur_min / 60))
  )
  ## Remove rides that are too fast (< 5 min) or too long (> 1 day)
  |> filter(between(ride_dur_min, 5, 1440))
  ## Rides with implausible speeds
  |> filter(
    (rideable_type == "classic_bike" & ride_speed_km_h <= 40)
    | (rideable_type == "electric_bike" & ride_speed_km_h <= 60)
  )
  ## Removing the new columns now that the cleaning is done
  |> select(-ride_dist_m, -ride_dur_min, -ride_speed_km_h)
  ## Generating a new table from the result of the cleaning
  |> compute("rides_clean_haversine", overwrite = TRUE)
)

Time difference of 1.846 secs

The haversine solution is 160 times faster !

Output:

data.table [4,383,020 x 15]

ride_id	rideable_type	started_at	ended_at	start_station_name	start_station_id	end_station_name	end_station_id	start_lat	start_lng	end_lat	end_lng	member_casual	start_pos	end_pos
06048DCFC8520CAF	electric_bike	2021-06-04 11:18:02	2021-06-04 11:24:19	NA	NA	NA	NA	41.79	−87.59	41.8	−87.6	member	c(-87.59, 41.79)	c(-87.6, 41.8)
9598066F68045DF2	electric_bike	2021-06-04 09:49:35	2021-06-04 09:55:34	NA	NA	NA	NA	41.8	−87.6	41.79	−87.59	member	c(-87.6, 41.8)	c(-87.59, 41.79)
B03C0FE48C412214	electric_bike	2021-06-03 19:56:05	2021-06-03 20:21:55	NA	NA	NA	NA	41.78	−87.58	41.8	−87.6	member	c(-87.58, 41.78)	c(-87.6, 41.8)
7E2546FBA79C46EE	electric_bike	2021-06-10 16:30:10	2021-06-10 16:36:21	NA	NA	NA	NA	41.79	−87.6	41.79	−87.59	member	c(-87.6, 41.79)	c(-87.59, 41.79)
3DDF3BBF6C4C3C89	electric_bike	2021-06-10 17:00:30	2021-06-10 17:06:48	NA	NA	NA	NA	41.79	−87.59	41.8	−87.59	member	c(-87.59, 41.79)	c(-87.59, 41.8)
2608805637155AB6	electric_bike	2021-06-10 12:46:16	2021-06-10 12:55:02	NA	NA	NA	NA	41.93	−87.67	41.94	−87.68	member	c(-87.67, 41.93)	c(-87.68, 41.94)
AF529C946F28ED42	electric_bike	2021-06-23 17:57:29	2021-06-23 18:06:40	NA	NA	Michigan Ave & Oak St	13042	41.88	−87.61	41.901	−87.624	member	c(-87.61, 41.88)	c(-87.623698, 41.9010521666667)
E6010941FB92E4A6	electric_bike	2021-06-22 19:28:02	2021-06-22 19:39:48	NA	NA	NA	NA	41.87	−87.62	41.87	−87.64	member	c(-87.62, 41.87)	c(-87.64, 41.87)
1149C0723F7AFFD5	electric_bike	2021-06-29 17:35:49	2021-06-29 17:55:11	NA	NA	NA	NA	41.9	−87.63	41.9	−87.68	member	c(-87.63, 41.9)	c(-87.68, 41.9)
8762DB62099E6011	electric_bike	2021-06-05 14:55:05	2021-06-05 15:13:29	NA	NA	NA	NA	41.89	−87.62	41.88	−87.62	member	c(-87.62, 41.89)	c(-87.62, 41.88)
8E9F2CB0893B96A0	electric_bike	2021-06-05 13:39:04	2021-06-05 13:57:21	NA	NA	NA	NA	41.88	−87.62	41.89	−87.62	member	c(-87.62, 41.88)	c(-87.62, 41.89)
6344B71B7BB6E09E	electric_bike	2021-06-22 18:52:53	2021-06-22 18:59:13	NA	NA	NA	NA	41.79	−87.59	41.8	−87.6	member	c(-87.59, 41.79)	c(-87.6, 41.8)
59CE9444E2ED2530	electric_bike	2021-06-02 10:30:11	2021-06-02 10:37:03	NA	NA	NA	NA	41.79	−87.6	41.8	−87.59	member	c(-87.6, 41.79)	c(-87.59, 41.8)
F7107122A837A50B	electric_bike	2021-06-08 18:31:31	2021-06-08 18:38:25	NA	NA	NA	NA	41.78	−87.6	41.8	−87.6	member	c(-87.6, 41.78)	c(-87.6, 41.8)
45ABF9231CC02E3C	electric_bike	2021-06-07 22:24:08	2021-06-07 22:35:25	NA	NA	NA	NA	41.79	−87.58	41.78	−87.6	member	c(-87.58, 41.79)	c(-87.6, 41.78)
[ omitted 4,383,005 entries ]

Our cleaning removed 1,472,720 entries !

🔍 How much do distances from haversine & st_distance differ ?

Code

(rides_clean_haversine[
    , .(ride_id, dist_haversine = geosphere::distHaversine(.SD[, .(start_lng, start_lat)], .SD[, .(end_lng, end_lat)]))
  ][rides_clean_geom[
    , .(ride_id, dist_geom = st_distance(st_transform(start_pos, st_crs(5069)), st_transform(end_pos, st_crs(5069)), by_element = TRUE))]
    , on = "ride_id"
  ][, .(distance_diff = abs(dist_haversine - as.numeric(dist_geom))), by = ride_id]
  |> ggplot(aes(x = distance_diff))
      + geom_histogram(bins = 100, color = NA, fill = "lightblue")
      + labs(
        title = "Distribution of the differences in distance",
        subtitle = "Between **sf::st_distance** and **haversine**",
        x = "Difference in distance (in meters)", y = ""
      )
      + theme(axis.text.y = element_blank())
)

2.3 Pivoting/melting the data

Since we are interested in filling missing station information, we will first pivot the data in order to only have one station per row, to facilitate subsequent operations based on their coordinates.

In-RAM
Out-of-RAM

rides_clean_geom_long <- (
  rides_clean_geom
  |> melt(measure = measure(way, value.name, pattern = "(end|start).*_(at|name|id|lat|lng|pos)"))
)

Time difference of 0.4001 secs

(tbl(con, "rides_clean_geom")
  |> pivot_longer(
    cols = matches("^end|^start"), names_pattern = "(end|start).*_(at|name|id|lat|lng|pos)", 
    names_to = c("way", ".value")
  )
  |> compute("rides_clean_geom_long", overwrite = TRUE)
)

Time difference of 0.5486 secs

┕ SQL

CREATE OR REPLACE TABLE rides_clean_geom_long AS 
SELECT * FROM (
  UNPIVOT rides_clean_geom
  ON (started_at, start_station_name, start_station_id, start_lat, start_lng, start_pos) as 'start', 
     (ended_at, end_station_name, end_station_id, end_lat, end_lng, end_pos) as 'end'
  INTO
    NAME way
    VALUE at, name, id, lat, lng, pos
);

Time difference of 4.001 secs

Output:

data.frame [8,765,038 x 10]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng	pos
60F97090AC85F55E	classic_bike	member	start	2021-06-27 12:26:58	Clark St & Grace St	TA1307000127	41.951	−87.659	<raw [32]>
AAD9B0718A051AFE	classic_bike	casual	start	2021-06-11 19:38:31	Dearborn St & Monroe St	TA1305000006	41.881	−87.63	<raw [32]>
38D67BCCF380A714	classic_bike	casual	start	2021-06-14 15:20:00	Clark St & Grace St	TA1307000127	41.951	−87.659	<raw [32]>
E4930F461D7CAC3C	classic_bike	casual	start	2021-06-23 17:24:39	Wells St & Walton St	TA1306000011	41.9	−87.634	<raw [32]>
7CCAD714D89C9D69	classic_bike	casual	start	2021-06-05 16:57:13	Wells St & Walton St	TA1306000011	41.9	−87.634	<raw [32]>
61B8AB46091738BE	classic_bike	casual	start	2021-06-04 22:11:15	Clark St & Leland Ave	TA1309000014	41.967	−87.667	<raw [32]>
06983E079ECF6A1C	classic_bike	casual	start	2021-06-05 00:47:35	Clark St & Grace St	TA1307000127	41.951	−87.659	<raw [32]>
BFAFDECF376F4AC3	classic_bike	casual	start	2021-06-16 20:48:52	Michigan Ave & Oak St	13042	41.901	−87.624	<raw [32]>
3EF3C3675F6029B2	classic_bike	casual	start	2021-06-16 19:26:25	Michigan Ave & Oak St	13042	41.901	−87.624	<raw [32]>
B6E524DF38B2085B	classic_bike	casual	start	2021-06-13 13:21:38	Larrabee St & Armitage Ave	TA1309000006	41.918	−87.644	<raw [32]>
[ omitted 8,765,023 entries ]

3 Which stations are unidentified ?

Our ultimate goal is to fill missing stations’ name and id properties based on their geographical proximity to other know stations.

So let’s see which stations miss either their name or id after the cleaning we’ve done:

In-RAM
Out-of-RAM

rides_clean_geom_long_unk <- rides_clean_geom_long[is.na(id) | is.na(name), ]

(tbl(con, "rides_clean_geom_long") 
  |> filter(if_any(matches("id$|name$"), is.na))
  |> compute("rides_clean_geom_long_unk", overwrite = TRUE)
)

┕ SQL

CREATE OR REPLACE TABLE rides_clean_geom_long_unk AS
SELECT * FROM rides_clean_geom_long
WHERE (name IS NULL) OR (id IS NULL)

Output:

data.frame [1,199,219 x 10]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng	pos
00000B4F1F71F9C2	electric_bike	member	end	2021-09-08 16:37:54	NA	NA	41.91	−87.7	<raw [32]>
00000B4F1F71F9C2	electric_bike	member	start	2021-09-08 16:31:38	NA	NA	41.91	−87.69	<raw [32]>
00000E22FBA89D81	electric_bike	member	start	2022-05-19 14:42:55	NA	NA	41.9	−87.62	<raw [32]>
000018B1D040DB44	electric_bike	member	end	2022-04-25 10:44:19	NA	NA	41.79	−87.59	<raw [32]>
000018B1D040DB44	electric_bike	member	start	2022-04-25 10:37:22	NA	NA	41.79	−87.6	<raw [32]>
00002E385DB2888C	electric_bike	casual	end	2022-05-07 16:40:28	NA	NA	41.87	−87.75	<raw [32]>
00002E385DB2888C	electric_bike	casual	start	2022-05-07 16:28:53	NA	NA	41.89	−87.75	<raw [32]>
00004B000220B299	electric_bike	member	end	2022-05-06 17:53:31	NA	NA	41.98	−87.67	<raw [32]>
0000664E22910718	electric_bike	casual	end	2021-08-09 16:01:27	NA	NA	42.02	−87.67	<raw [32]>
000095C5DDB7E97A	electric_bike	member	end	2021-09-06 13:48:39	NA	NA	41.91	−87.63	<raw [32]>
[ omitted 1,199,204 entries ]

There are 1,199,219 stations missing an id or name in the original data !

But why are so many stations unknown/unidentified ?

Could it be an issue of location accuracy ? I.e. the GPS information was too bad for the ride app to properly locate where the user was when arriving or leaving a station ?

If this is the case, then most of the unknown stations should have poor coordinate accuracy (i.e. their lat/lng values should have too few decimal points).

Let’s find out !

4 A question of location accuracy ?

First, a quick recap on how lat/lng coordinates precision (i.e. the number of decimals) relates to positioning accuracy:

Decimal	Distance at the equator (m)	Distance at the 45th (N or S) parallel (m)
0	111,120	78,710
1	11,112	7,871
2	1,111.2	787.1
3	111.12	78.71
4	11.112	7.871
5	1.1112	0.7871

See this thread for more details.

Roughly a 100 meters radius of uncertainty to locate a station when you only have 3 decimal points in your coordinates is quite large. It seems like a good threshold for us, so let’s remove entries where either lat or lng have 3 or less decimals, since those entries would not have sufficient precision to be reliably matched to a known station by their position alone.

Let’s define a function to only keep stations with 4 or more decimals in their lat or lng coordinate (which we will call the accuracy-4 or filtered data henceforth):

has_acc4 <- function(x) str_length(str_remove(as.character(abs(x)), ".*\\.")) >= 4

CREATE FUNCTION has_acc4(x) AS length(str_split(CAST(abs(x) AS VARCHAR(16)), '.')[2]) >= 4

And now, let’s apply it to our table of unknown stations (rides_clean_geom_long_unk), and see how many remain afterwards:

In-RAM
Out-of-RAM

rides_clean_geom_long_unk_acc4 <- rides_clean_geom_long_unk[has_acc4(lat) & has_acc4(lng), ]

Time difference of 1.773 secs

(tbl(con, "rides_clean_geom_long_unk") 
  |> filter(if_all(c(lat, lng), has_acc4))
  |> compute("rides_clean_geom_long_unk_acc4", overwrite = TRUE)
)

Time difference of 0.08684 secs

┕ SQL

CREATE OR REPLACE TABLE rides_clean_geom_long_unk_acc4 AS
SELECT * FROM rides_clean_geom_long_unk
WHERE has_acc4(lat) AND has_acc4(lng)

Time difference of 0.02707 secs

Output:

data.table [2 x 10]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng	pos
176105D1F8A1216B	electric_bike	casual	start	2021-07-18 03:44:22	NA	13221	41.908	−87.673	c(-87.672552, 41.907655)
EE197EDA4CF8CFE5	electric_bike	casual	start	2021-09-22 07:14:42	NA	WL-008	41.867	−87.641	c(-87.641087, 41.867117)

After filtering imprecise coordinates, there are only 2 stations missing either their id or name !

The fact that most of the unknown stations got removed by the coordinate-accuracy filtering gives credence to our previous hypothesis: most of the previous 1,199,219 unidentified stations were missing their id/name because their location was too imprecise to be matched to any known station by the ride app.

Which means that attempting to do such matching ourselves would be an exercise in futility …

But let’s attempt it anyway, since it is the purpose of this blog post !

5 Location-based station identification

First, we’ll need to assemble a reference/look-up table for stations. This table will link each station (identified by a unique combination of id and name) to a unique set of coordinates (i.e. the location of that station). This will allow us to later match unknown stations from the rides data to those in this reference table, by comparing their locations.

5.1 Station reference table

Our rides data contains a small number of stations that each appear multiple times, either as the start or end of a ride. For each unique station, we will aggregate all the rides that start or end at that station, and set the station’s coordinates at the center of all the existing coordinates for that station. We have to way to go about this aggregation:

An accurate method such as st_centroid to find out the “center” of this set coordinates (using sf / the spatial extension).
An approximate method such as taking the average of all the lat/lng values we have in our rides data, for each station. This solution can be imprecise since it does not take the precision of each coordinate into account (e.g. giving more weight to more accurate coordinates), and the result will have a misleading level of accuracy due to the averaging process.

Let’s do both and see how they compare !

5.1.1 Using the accuracy-4 data

Here, since we want to establish our “stations” lookup table from the “accuracy-4” data, we need to apply our has_acc4 function to rides_clean_geom_long to only keep entries (i.e. rides) with 4 or more decimals in their lat or lng coordinate:

In-RAM
Out-of-RAM

rides_clean_geom_long_acc4 <- rides_clean_geom_long[has_acc4(lat) & has_acc4(lng), ]

Time difference of 17.62 secs

(tbl(con, "rides_clean_geom_long") 
  |> filter(if_all(c(lat, lng), has_acc4))
  |> compute("rides_clean_geom_long_acc4", overwrite = TRUE)
)

Time difference of 2.883 secs

Now, let’s use the resulting table (rides_clean_geom_long_acc4) to build our stations lookup table:

In-RAM
Out-of-RAM

Precise location with sf::st_centroid:

stations_clean_acc4_geom <- (
  rides_clean_geom_long_acc4
  ## Remove unknown stations
  |> na.omit(cols = c("id", "name"))
  ## For each id + name unique combination, find the centroid of the "pos" POINT
  |> _[, .(pos = st_centroid(st_combine(pos))), by = .(id, name)]
  ## Order the result by station id
  |> setorder(id)
)

Time difference of 18.97 secs

Approximate location by averaging the coordinates:

stations_clean_acc4_mean <- (
  rides_clean_geom_long_acc4
  ## Remove unknown stations
  |> na.omit(cols = c("id", "name"))
  ## For each id + name unique combination, average lat and lng values
  |> dcast(id + name ~ ., fun.aggregate = mean, value.var = c("lat", "lng"))
  ## Order the result by station id
  |> setorder(id)
)

Time difference of 0.681 secs

Precise location using st_centroid from the spatial extension:

(tbl(con, "rides_clean_geom_long_acc4")
  |> filter(!is.na(id), !is.na(name))
  |> summarize(
      .by = c(id, name),
      pos = list(pos) |> st_collect() |> st_centroid()
  )
  |> arrange(id)
  |> compute("stations_clean_acc4_geom", overwrite = TRUE)
)

Time difference of 0.6707 secs

┕ SQL

CREATE OR REPLACE TABLE stations_clean_acc4_geom AS
FROM rides_clean_geom_long_acc4
SELECT id, name, st_centroid(st_collect(list(pos))) AS pos_centroid 
WHERE (id IS NOT NULL) AND (name IS NOT NULL)
GROUP BY id, name
ORDER BY id;

Time difference of 0.5262 secs

Approximate location by averaging the coordinates:

(tbl(con, "rides_clean_geom_long_acc4")
  |> filter(!is.na(id), !is.na(name))
  |> summarize(
      .by = c(id, name),
      across(c(lat, lng), mean)
  )
  |> arrange(id)
  |> compute("stations_clean_acc4_mean", overwrite = TRUE)
)

Time difference of 0.1491 secs

👀 Output

Geometry

data.table [700 x 3]

id	name	pos
13001	Michigan Ave & Washington St	c(-87.624608303352, 41.8839739921181)
13006	LaSalle St & Washington St	c(-87.6325570579559, 41.8827048196346)
13008	Millennium Park	c(-87.6240839243461, 41.8810511820254)
13011	Canal St & Adams St	c(-87.6399264828786, 41.8793312329564)
13016	St. Clair St & Erie St	c(-87.6228155199715, 41.8943253700403)
13017	Franklin St & Chicago Ave	c(-87.635691270916, 41.8967491893866)
13021	Clinton St & Lake St	c(-87.6418271678616, 41.8856048981661)
13022	Streeter Dr & Grand Ave	c(-87.6120624843549, 41.8922700930906)
13028	900 W Harrison St	c(-87.6498031272599, 41.8747486017636)
13029	Field Museum	c(-87.6179053613466, 41.8653188860571)
13033	Milwaukee Ave & Grand Ave	c(-87.6483560756473, 41.8915921258399)
13034	Michigan Ave & Pearson St	c(-87.6235541871001, 41.8975481429342)
13036	Michigan Ave & Madison St	c(-87.6251647130149, 41.8821223263018)
13037	Clinton St & Tilden St	c(-87.6407754094505, 41.8758825068496)
13042	Michigan Ave & Oak St	c(-87.6237787792645, 41.9009749388655)
[ omitted 685 entries ]

Average

data.table [700 x 4]

id	name	lat	lng
13001	Michigan Ave & Washington St	41.8839739908871	-87.6246083065155
13006	LaSalle St & Washington St	41.8827048071804	-87.6325570732024
13008	Millennium Park	41.8810511786992	-87.6240839338728
13011	Canal St & Adams St	41.8793312323952	-87.6399264829988
13016	St. Clair St & Erie St	41.8943253676363	-87.622815520661
13017	Franklin St & Chicago Ave	41.89674918882	-87.6356912707959
13021	Clinton St & Lake St	41.8856048974089	-87.6418271677671
13022	Streeter Dr & Grand Ave	41.892270090806	-87.6120624881201
13028	900 W Harrison St	41.8747486007635	-87.6498031288495
13029	Field Museum	41.865318885576	-87.6179053629858
13033	Milwaukee Ave & Grand Ave	41.8915921241123	-87.6483560767347
13034	Michigan Ave & Pearson St	41.8975481425484	-87.6235541868204
13036	Michigan Ave & Madison St	41.8821223254923	-87.6251647148276
13037	Clinton St & Tilden St	41.8758825065679	-87.6407754093102
13042	Michigan Ave & Oak St	41.9009749374364	-87.6237787872266
[ omitted 685 entries ]

Precise location with sf::st_centroid:

stations_clean_geom <- (
  rides_clean_geom_long
  ## Remove unknown stations
  |> na.omit(cols = c("id", "name"))
  ## For each id + name unique combination, find the centroid of the "pos" POINT
  |> _[, .(pos = st_centroid(st_combine(pos))), by = .(id, name)]
  ## Order the result by station id
  |> setorder(id)
)

Time difference of 20.4 secs

Approximate location by averaging the coordinates:

stations_clean_mean <- (
  rides_clean_geom_long
  ## Remove unknown stations
  |> na.omit(cols = c("id", "name"))
  ## For each id + name unique combination, average lat and lng values
  |> dcast(id + name ~ ., fun.agg = mean, value.var = c("lat", "lng"))
  ## Order the result by station id
  |> setorder(id)
)

Time difference of 1.045 secs

Precise location using st_centroid from the spatial extension:

(tbl(con, "rides_clean_geom_long")
  |> filter(!is.na(id), !is.na(name))
  |> summarize(
      .by = c(id, name),
      pos = list(pos) |> st_collect() |> st_centroid()
  )
  |> arrange(id)
  |> compute("stations_clean_geom", overwrite = TRUE)
)

Time difference of 0.6858 secs

┕ SQL

CREATE OR REPLACE TABLE stations_clean_geom AS
FROM rides_clean_geom_long
SELECT id, name, st_centroid(st_collect(list(pos))) AS pos_centroid 
WHERE (id IS NOT NULL) AND (name IS NOT NULL)
GROUP BY id, name
ORDER BY id;

Time difference of 0.5395 secs

Approximate location by averaging the coordinates:

(tbl(con, "rides_clean_geom_long")
  |> filter(!is.na(id), !is.na(name))
  |> summarize(
      .by = c(id, name),
      across(c(lat, lng), mean)
  )
  |> arrange(id)
  |> compute("stations_clean_mean", overwrite = TRUE)
)

Time difference of 0.1496 secs

👀 Output

Geometry

data.table [1,124 x 3]

id	name	pos
021320	MTV Hubbard St	c(-87.68, 41.89)
13001	Michigan Ave & Washington St	c(-87.624608297604, 41.8839739897761)
13006	LaSalle St & Washington St	c(-87.632557037407, 41.8827048386161)
13008	Millennium Park	c(-87.6240839190639, 41.8810511819097)
13011	Canal St & Adams St	c(-87.6399264899551, 41.8793312145491)
13016	St. Clair St & Erie St	c(-87.6228155045237, 41.8943253499102)
13017	Franklin St & Chicago Ave	c(-87.6356912705203, 41.8967492007012)
13021	Clinton St & Lake St	c(-87.6418271802395, 41.8856048845063)
13022	Streeter Dr & Grand Ave	c(-87.6120624804387, 41.8922700897914)
13028	900 W Harrison St	c(-87.6498031272599, 41.8747486017636)
13029	Field Museum	c(-87.6179053695639, 41.8653188837371)
13033	Milwaukee Ave & Grand Ave	c(-87.6483560756473, 41.8915921258399)
13034	Michigan Ave & Pearson St	c(-87.6235541982785, 41.8975481149106)
13036	Michigan Ave & Madison St	c(-87.6251647026189, 41.882122307852)
13037	Clinton St & Tilden St	c(-87.6407754553424, 41.8758825305761)
[ omitted 1,109 entries ]

Average

data.table [1,124 x 4]

id	name	lat	lng
021320	MTV Hubbard St	41.89	-87.68
13001	Michigan Ave & Washington St	41.8839739885453	-87.6246083007669
13006	LaSalle St & Washington St	41.8827048261626	-87.6325570526523
13008	Millennium Park	41.8810511785838	-87.62408392859
13011	Canal St & Adams St	41.879331213988	-87.6399264900752
13016	St. Clair St & Erie St	41.8943253475063	-87.6228155052132
13017	Franklin St & Chicago Ave	41.8967492001346	-87.6356912704003
13021	Clinton St & Lake St	41.8856048837492	-87.641827180145
13022	Streeter Dr & Grand Ave	41.8922700875069	-87.6120624842036
13028	900 W Harrison St	41.8747486007635	-87.6498031288495
13029	Field Museum	41.865318883256	-87.617905371203
13033	Milwaukee Ave & Grand Ave	41.8915921241123	-87.6483560767347
13034	Michigan Ave & Pearson St	41.8975481145247	-87.6235541979986
13036	Michigan Ave & Madison St	41.8821223070425	-87.6251647044313
13037	Clinton St & Tilden St	41.8758825302944	-87.6407754552023
[ omitted 1,109 entries ]

It would seem that keeping the unfiltered data yields more stations in the lookup table.

But is this additional data any good ? Or did it lower the overall accuracy of the stations’ locations by introducing lower-precision coordinates in the mix ?

🔍 Comparing the location of each unique station

Let’s compare both sources of data (filtered or not) and methods of combining locations (i.e. st_centroid vs simple average of lat/lng coordinates) by visualizing where they place the stations on a map:

Code

stations_locs <- (
  left_join(
    ## Combining the results of geom & haversine stations from the unfiltered data
    inner_join(tbl(con, "stations_clean_geom"), tbl(con, "stations_clean_mean"), join_by(id, name))
    |> mutate(
      lat_unfiltered_mean = lat, lng_unfiltered_mean = lng,
      lat_unfiltered_centroid = st_x(st_transform(pos, "EPSG:5069", "EPSG:4326")),
      lng_unfiltered_centroid = st_y(st_transform(pos, "EPSG:5069", "EPSG:4326"))
    )
    |> select(id, name, lat_unfiltered_mean, lng_unfiltered_mean, lat_unfiltered_centroid, lng_unfiltered_centroid),
    ## Combining the results of geom & haversine stations from the filtered data
    inner_join(tbl(con, "stations_clean_acc4_geom"), tbl(con, "stations_clean_acc4_mean"), join_by(id, name)) 
    |> mutate(
      lat_filtered_mean = lat, lng_filtered_mean = lng,
      lat_filtered_centroid = st_x(st_transform(pos, "EPSG:5069", "EPSG:4326")), 
      lng_filtered_centroid = st_y(st_transform(pos, "EPSG:5069", "EPSG:4326"))
    )
    |> select(id, name, lat_filtered_mean, lng_filtered_mean, lat_filtered_centroid, lng_filtered_centroid),
    join_by(id, name)
  )
  |> pivot_longer(cols = -c(id, name), names_sep = "_", names_to = c(".value", "data_source", "method"))
  |> collect()
  |> filter(!is.na(lat) & !is.na(lng))
  |> mutate(group = str_c(data_source, method, sep = "_"))
)

my_markers <- iconList(
  filtered_centroid = makeIcon("C_green.png", iconWidth = 45, iconHeight = 50),
  filtered_mean = makeIcon("M_green.png", iconWidth = 45, iconHeight = 50),
  unfiltered_centroid = makeIcon("C_orange.png", iconWidth = 45, iconHeight = 50),
  unfiltered_mean = makeIcon("M_orange.png", iconWidth = 45, iconHeight = 50)
)

(stations_locs
  |> st_as_sf(coords = c("lng", "lat"), crs = 4326)
  |> leaflet()
  |> addTiles()
  |> fitBounds(-88.6, 41.8, -87, 42.1)
  |> addMarkers(
    popup = ~sprintf("%s (id: %s)", name, id),
    icon = ~my_markers[group],
    clusterOptions = markerClusterOptions()
  )
)

If you zoom on a specific station and click on its marker, you’ll see either 2 or 4 pins displayed. A station having only 2 pins is a station that was only present in the unfiltered data (i.e. only orange markers). For stations having 4 pins, they should have 2 orange and 2 green markers. In orange are the coordinates from the unfiltered data, while the filtered ones are in green. The M marker is for the mean method (average of lat/lng), while the C marker is for the centroid method.

First, we can see that many of the stations with 4 pins line up pretty closely with a bike icon on the map, which should share the same name as the station closest to it. However, stations with only 2 orange pins are often much further away from any identifiable bike icon, or sometimes in the middle of a housing block. This suggest that stations only present in the unfiltered stations lookup table have unreliable locations.

As for stations present in both lookup tables, there seems to be virtually no differences in the 4 pins’ position. If we look at the lat/lng coordinates themselves, they only start differing at around the 8th or 9th digit after the decimal point, which is a difference in the range of a millimeter.

Taken together, this means we should only use the filtered (accuracy-4) stations lookup table going forwards, but we use the average of lat/lng instead of the st_centroid if we want to save time on computations.

PS: Abetter solution here would have been to query OpenStreetMap or Google Maps for the coordinates of each station, and build our lookup table with those.

5.2 Distance-based join

Now that we have our station lookup table, let’s try matching the unknown stations from the rides data to the closest (by proximity) station in our reference table, using a distance-based join. Here, we will set the distance threshold to 11 meters, meaning that the match will only happen if both stations are within 11 meters of each other.

N.B.: This is an arbitrary distance threshold, feel free to use one that makes sense to you.

5.2.1 On the accuracy-4 data

There are 2 entries from rides_clean that could potentially be proximity-matched to a known station from our lookup table.

In-RAM
Out-of-RAM

Precise geom-distance join using sf geometries:

matched_clean_acc4_geom <- (
   st_join(
    st_sf(rides_clean_geom_long_unk_acc4) |> mutate(pos = st_transform(pos, 5069)),
    st_sf(stations_clean_acc4_geom) |> mutate(pos = st_transform(pos, 5069)),
    join = st_is_within_distance,
    dist = 11, # In meters
    left = FALSE # Does an inner_join
  )
  |> as.data.frame()
  |> mutate(
    name = coalesce(name.x, name.y), 
    id = coalesce(id.x, id.y)
  )
  ## Getting the distance info between each unknown station and the known ones matched to it
  |> left_join(stations_clean_acc4_geom, join_by(id, name))
  |> mutate(
    dist = st_distance(
      st_transform(pos.x, 5069), 
      st_transform(pos.y, 5069), 
      by_element = TRUE
    ) |> as.numeric(),
    pos = pos.x
  )
  ## If multiple stations from the lookup table match a given unknown station, 
  ##  only keep the match with the shortest distance
  |> slice_min(dist, by = c(ride_id, way), with_ties = FALSE)
  |> select(colnames(rides_clean_geom_long_unk_acc4))
  |> arrange(ride_id)
  |> as.data.table()
)

Time difference of 0.06641 secs

Approximate haversine-distance join using fuzzyjoin:

matched_clean_acc4_haversine <- (
   fuzzyjoin::geo_inner_join(
    rides_clean_geom_long_unk_acc4,
    stations_clean_acc4_mean,
    by = c("lng", "lat"),
    method = "haversine",
    unit = "km",
    max_dist = 0.011, # 11 meters
    distance_col = "dist"
  )
  ## If multiple stations from the lookup table match a given unknown station, 
  ##  only keep the match with the shortest distance
  |> slice_min(dist, by = c(ride_id, way), with_ties = FALSE)
  |> mutate(
    name = coalesce(name.x, name.y), 
    id = coalesce(id.x, id.y), 
    lat = lat.x, lng = lng.x
  )
  |> select(colnames(rides_clean_geom_long_unk_acc4))
  |> arrange(ride_id)
  |> as.data.table()
)

Time difference of 0.106 secs

Precise geom-distance join using spatial geometries:

(inner_join(
    tbl(con, "rides_clean_geom_long_unk_acc4"),
    tbl(con, "stations_clean_acc4_geom"),
    sql_on = "st_dwithin(RHS.pos, LHS.pos, 11)"
  )
  ## Getting the distance info between each unknown station and the known ones matched to it
  |> mutate(
    name = coalesce(name.x, name.y),
    id = coalesce(id.x, id.y),
    dist = st_distance(pos.x, pos.y) |> as.numeric(),
    pos = pos.x
  )
  ## If multiple stations from the lookup table match a given unknown station, 
  ##  only keep the match with the shortest distance
  |> slice_min(dist, by = c(ride_id, way), with_ties = FALSE)
  |> select(any_of(colnames(tbl(con, "rides_clean_geom_long_unk_acc4"))))
  |> arrange(ride_id)
  |> compute("matched_clean_acc4_geom", overwrite = TRUE)
)

Time difference of 0.1583 secs

┕ SQL

CREATE OR REPLACE TABLE matched_clean_acc4_geom AS
SELECT COLUMNS(c -> c NOT SIMILAR TO '(.*\.x|.*\.y|dist.*)')
FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY ride_id, way ORDER BY dist) AS dist_rank
  FROM (
    SELECT
      *
      , COALESCE("name.x", "name.y") AS "name"
      , COALESCE("id.x", "id.y") AS id
      , CAST(st_distance("pos.x", "pos.y") AS NUMERIC) AS dist
      , "pos.x" AS pos
    FROM (
      SELECT
        ride_id, rideable_type, member_casual, way
        , r.name AS "name.x"
        , r.id AS "id.x"
        , at
        , lat
        , lng
        , r.pos AS "pos.x"
        , s.id AS "id.y"
        , s.name AS "name.y"
        , s.pos AS "pos.y"
      FROM (
        SELECT *, st_transform(pos, 'EPSG:4326', 'EPSG:5069') AS pos
        FROM rides_clean_geom_long_unk_acc4
      ) r
      INNER JOIN (
        SELECT *, st_transform(pos, 'EPSG:4326', 'EPSG:5069') AS pos
        FROM stations_clean_acc4_geom
      ) s
      ON st_dwithin(s.pos, r.pos, 11)
    )
  )
)
WHERE (dist_rank <= 1)
ORDER BY ride_id;

Time difference of 0.01295 secs

Approximate haversine-distance join:

(inner_join(
    tbl(con, "rides_clean_geom_long_unk_acc4") |> select(-pos),
    tbl(con, "stations_clean_acc4_mean"),
    sql_on = "haversine(LHS.lng, LHS.lat, RHS.lng, RHS.lat) <= 11"
  )
  ## Getting the distance info between each unknown station and the known ones matched to it
  |> mutate(
    dist = haversine(lng.x, lat.x, lng.y, lat.y),
    name = coalesce(name.x, name.y),
    id = coalesce(id.x, id.y),
    lat = lat.x, lng = lng.x
  )
  ## If multiple stations from the lookup table match a given unknown station, 
  ##  only keep the match with the shortest distance
  |> slice_min(dist, by = c(ride_id, way), with_ties = FALSE)
  |> select(any_of(colnames(tbl(con, "rides_clean_geom_long_unk_acc4"))))
  |> arrange(ride_id)
  |> compute("matched_clean_acc4_haversine", overwrite = TRUE)
)

Time difference of 0.1729 secs

👀 Output

From our 2 rows missing an id or name, 2 got matched !

Geometry join:

data.frame [2 x 10]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng	pos
176105D1F8A1216B	electric_bike	casual	start	2021-07-18 03:44:22	Wood St & Milwaukee Ave	13221	41.908	−87.673	<raw [32]>
EE197EDA4CF8CFE5	electric_bike	casual	start	2021-09-22 07:14:42	Clinton St & Roosevelt Rd	WL-008	41.867	−87.641	<raw [32]>

Haversine join:

data.frame [2 x 9]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng
176105D1F8A1216B	electric_bike	casual	start	2021-07-18 03:44:22	Wood St & Milwaukee Ave	13221	41.908	−87.673
EE197EDA4CF8CFE5	electric_bike	casual	start	2021-09-22 07:14:42	Clinton St & Roosevelt Rd	WL-008	41.867	−87.641

And it seems both solution agree on the names of the 2 unidentified stations !

🔍 But did we really need a proximity join here ?

If we look at the unidentified stations before the join (i.e. rides_clean_geom_long_unk_acc4), we can see that the two unknown stations had an id but no name:

data.table [2 x 10]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng	pos
176105D1F8A1216B	electric_bike	casual	start	2021-07-18 03:44:22	NA	13221	41.908	−87.673	c(-87.672552, 41.907655)
EE197EDA4CF8CFE5	electric_bike	casual	start	2021-09-22 07:14:42	NA	WL-008	41.867	−87.641	c(-87.641087, 41.867117)

Since they were only missing one of the two identifiers, we could have filled the other missing information with a regular join to stations_clean_acc4_geom, instead of a proximity-based join, which is more resource intensive and less accurate.

Let’s check that both methods yield the same results:

stations_clean_acc4_mean[
    matched_clean_acc4_haversine, on = .(id)
  ][, .(id, name_from_id = name, name_from_distance = i.name, lat, lng)]

data.table [2 x 5]

id	name_from_id	name_from_distance	lat	lng
13221	Wood St & Milwaukee Ave	Wood St & Milwaukee Ave	41.908	−87.673
WL-008	Clinton St & Roosevelt Rd	Clinton St & Roosevelt Rd	41.867	−87.641

At least, we can see that the proximity-matched name and the id-matched one are the same, meaning our proximity-matching method works well !

5.2.2 On the unfiltered data

There are 1,199,219 entries from rides_clean that could potentially be proximity-matched to a known station from our lookup table.

In-RAM
Out-of-RAM

Precise geom-distance join using sf geometries:

matched_clean_geom <- (
   st_join(
    st_sf(rides_clean_geom_long_unk) |> mutate(pos = st_transform(pos, 5069)),
    st_sf(stations_clean_acc4_geom) |> mutate(pos = st_transform(pos, 5069)),
    join = st_is_within_distance,
    dist = 11, # In meters
    left = FALSE # Does an inner_join
  )
  |> as.data.frame()
  |> mutate(
    name = coalesce(name.x, name.y), 
    id = coalesce(id.x, id.y)
  )
  ## Getting the distance info between each unknown station and the known ones matched to it
  |> left_join(stations_clean_acc4_geom, join_by(id, name))
  |> mutate(
    dist = st_distance(
      st_transform(pos.x, 5069), 
      st_transform(pos.y, 5069), 
      by_element = TRUE
    ) |> as.numeric(),
    pos = pos.x
  )
  ## If multiple stations from the lookup table match a given unknown station, 
  ##  only keep the match with the shortest distance
  |> slice_min(dist, by = c(ride_id, way), with_ties = FALSE)
  |> select(colnames(rides_clean_geom_long_unk))
  |> arrange(ride_id)
  |> as.data.table()
)

Time difference of 2.183 mins

Approximate haversine-distance join using fuzzyjoin:

matched_clean_haversine <- (
   fuzzyjoin::geo_inner_join(
    rides_clean_geom_long_unk,
    stations_clean_acc4_mean,
    by = c("lng", "lat"),
    method = "haversine",
    unit = "km",
    max_dist = 0.011, # 11 meters
    distance_col = "dist"
  )
  ## If multiple stations from the lookup table match a given unknown station, 
  ##  only keep the match with the shortest distance
  |> slice_min(dist, by = c(ride_id, way), with_ties = FALSE)
  |> mutate(
    name = coalesce(name.x, name.y), 
    id = coalesce(id.x, id.y), 
    lat = lat.x, lng = lng.x
  )
  |> select(colnames(rides_clean_geom_long_unk))
  |> arrange(ride_id)
  |> as.data.table()
)

Time difference of 0.9995 secs

The haversine solution is 131 times faster !

Precise geom-distance join using spatial geometries:

(inner_join(
    tbl(con, "rides_clean_geom_long_unk"),
    tbl(con, "stations_clean_acc4_geom"),
    sql_on = "st_dwithin(RHS.pos, LHS.pos, 11)"
  )
  ## Getting the distance info between each unknown station and the known ones matched to it
  |> mutate(
    name = coalesce(name.x, name.y),
    id = coalesce(id.x, id.y),
    dist = st_distance(pos.x, pos.y) |> as.numeric(),
    pos = pos.x
  )
  ## If multiple stations from the lookup table match a given unknown station, 
  ##  only keep the match with the shortest distance
  |> slice_min(dist, by = c(ride_id, way), with_ties = FALSE)
  |> select(any_of(colnames(tbl(con, "rides_clean_geom_long_unk"))))
  |> arrange(ride_id)
  |> compute("matched_clean_geom", overwrite = TRUE)
)

Time difference of 30.81 secs

┕ SQL

CREATE OR REPLACE TABLE matched_clean_geom AS
SELECT COLUMNS(c -> c NOT SIMILAR TO '(.*\.x|.*\.y|dist.*)')
FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY ride_id, way ORDER BY dist) AS dist_rank
  FROM (
    SELECT
      *
      , COALESCE("name.x", "name.y") AS "name"
      , COALESCE("id.x", "id.y") AS id
      , CAST(st_distance("pos.x", "pos.y") AS NUMERIC) AS dist
      , "pos.x" AS pos
    FROM (
      SELECT
        ride_id, rideable_type, member_casual, way
        , r.name AS "name.x"
        , r.id AS "id.x"
        , at
        , lat
        , lng
        , r.pos AS "pos.x"
        , s.id AS "id.y"
        , s.name AS "name.y"
        , s.pos AS "pos.y"
      FROM (
        SELECT *, st_transform(pos, 'EPSG:4326', 'EPSG:5069') AS pos
        FROM rides_clean_geom_long_unk
      ) r
      INNER JOIN (
        SELECT *, st_transform(pos, 'EPSG:4326', 'EPSG:5069') AS pos
        FROM stations_clean_acc4_geom
      ) s
      ON st_dwithin(s.pos, r.pos, 11)
    )
  )
)
WHERE (dist_rank <= 1)
ORDER BY ride_id;

Time difference of 31.83 secs

Approximate haversine-distance join:

(inner_join(
    tbl(con, "rides_clean_geom_long_unk"),
    tbl(con, "stations_clean_acc4_mean"),
    sql_on = "haversine(LHS.lng, LHS.lat, RHS.lng, RHS.lat) <= 11"
  )
  ## Getting the distance info between each unknown station and the known ones matched to it
  |> mutate(
    dist = haversine(lng.x, lat.x, lng.y, lat.y),
    name = coalesce(name.x, name.y),
    id = coalesce(id.x, id.y),
    lat = lat.x, lng = lng.x
  )
  ## If multiple stations from the lookup table match a given unknown station, 
  ##  only keep the match with the shortest distance
  |> slice_min(dist, by = c(ride_id, way), with_ties = FALSE)
  |> select(any_of(colnames(tbl(con, "rides_clean_geom_long_unk"))))
  |> arrange(ride_id)
  |> compute("matched_clean_haversine", overwrite = TRUE)
)

Time difference of 4.938 secs

The haversine solution is 6 times faster !

👀 Output

From our 1,199,219 unknown entries from rides_clean unfiltered table, 9,372 got matched to a station in the unfiltered lookup table.

Geometry join:

data.frame [9,372 x 10]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng	pos
00153AFFF2694897	electric_bike	member	end	2021-10-01 17:59:01	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
001C1860110767A5	electric_bike	member	end	2021-12-10 12:02:18	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
0020CC9D9A259263	electric_bike	member	end	2021-10-28 11:26:55	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
0025F9D0884ABA24	electric_bike	casual	end	2021-06-14 13:20:10	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
0029A0DEC0285B5B	electric_bike	member	start	2022-03-05 13:28:11	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
002AA979F480C8F4	electric_bike	member	start	2021-06-01 17:03:28	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
002D9A9ABE21C1F9	electric_bike	member	end	2021-07-20 19:02:23	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
003719B5F2312D44	electric_bike	casual	start	2021-10-14 02:03:06	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
003C8916AC164249	electric_bike	casual	end	2021-11-15 08:57:42	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
00401080376FACDB	electric_bike	casual	start	2022-01-09 18:12:05	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
[ omitted 9,357 entries ]

Haversine join:

data.frame [9,372 x 10]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng	pos
00153AFFF2694897	electric_bike	member	end	2021-10-01 17:59:01	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
001C1860110767A5	electric_bike	member	end	2021-12-10 12:02:18	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
0020CC9D9A259263	electric_bike	member	end	2021-10-28 11:26:55	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
0025F9D0884ABA24	electric_bike	casual	end	2021-06-14 13:20:10	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
0029A0DEC0285B5B	electric_bike	member	start	2022-03-05 13:28:11	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
002AA979F480C8F4	electric_bike	member	start	2021-06-01 17:03:28	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
002D9A9ABE21C1F9	electric_bike	member	end	2021-07-20 19:02:23	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
003719B5F2312D44	electric_bike	casual	start	2021-10-14 02:03:06	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
003C8916AC164249	electric_bike	casual	end	2021-11-15 08:57:42	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
00401080376FACDB	electric_bike	casual	start	2022-01-09 18:12:05	Paulina Ave & North Ave	TA1305000037	41.91	−87.67	<raw [32]>
[ omitted 9,357 entries ]

We have filled many more rides here, but surprisingly, all those rides only correspond to 3 stations:

(tbl(con, "matched_clean_haversine")
  |> select(name, id, lat, lng)
  |> distinct()
  |> collect()
)

data.frame [3 x 4]

name	id	lat	lng
Paulina Ave & North Ave	TA1305000037	41.91	−87.67
Wood St & Milwaukee Ave	13221	41.908	−87.673
Clinton St & Roosevelt Rd	WL-008	41.867	−87.641

And 2 of those are the same as the ones matched from the accuracy-4 data.

🔍 Comparing the results of both types of joins:

Are some stations identified differently between the haversine and geom-based join ?

Code

bind_rows(
  anti_join(
    tbl(con, "matched_clean_haversine"),
    tbl(con, "matched_clean_geom"),
    by = tbl(con, "matched_clean_haversine") |> select(-pos, -name, -id) |> colnames()
  )
  |> rename(any_of(c(
        name_from_geom = "name.x", id_from_geom = "id.x", 
        name_from_haversine = "name.y", id_from_haversine = "id.y"
      )
    )
  )
  |> collect(),
  anti_join(
    tbl(con, "matched_clean_geom"),
    tbl(con, "matched_clean_haversine"),
    by = tbl(con, "matched_clean_haversine") |> select(-pos, -name, -id) |> colnames()
  )
  |> rename(any_of(c(
        name_from_geom = "name.y", id_from_geom = "id.y", 
        name_from_haversine = "name.x", id_from_haversine = "id.x"
      )
    )
  )
  |> collect()
)

data.frame [0 x 10]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng	pos

The resulting table is empty, meaning that both solutions give the exact same matches here. But if you look at the distances both solutions compute (not shown here), you’ll notice they are slightly different. However, that difference is so small it does not impact the identification of the unknown stations at all.

6 Updating the original dataset

Finally, we can update the original dataset with the entries that were position-matched (i.e. replace the missing station name/id).

We will do it in two steps:

Updating the clean dataset in its long format (i.e. updating rides_acc4_clean_geom_long with matched_acc4_clean_geom)
Pivoting it back to the original wide format

6.1 Merging the two datasets

In-RAM
Out-of-RAM

rides_clean_geom_long_filled <- (
  matched_clean_acc4_geom[
    rides_clean_geom_long, on = setdiff(colnames(rides_clean_geom_long), c("id", "name", "pos"))
  ][, let(name = fcoalesce(name, i.name), id = fcoalesce(id, i.id))
  ][, nms, env = list(nms = I(setdiff(colnames(rides_clean_geom_long), c("pos"))))]
)

Time difference of 4.632 secs

(right_join(
    tbl(con, "matched_clean_acc4_geom"),
    tbl(con, "rides_clean_geom_long"),
    by = tbl(con, "rides_clean_geom_long") |> select(-name, -id, -pos) |> colnames()
  ) 
  |> mutate(name = coalesce(name.x, name.y), id = coalesce(id.x, id.y))
  |> select(-matches("\\.x|\\.y"))
  |> compute("rides_clean_geom_long_filled", overwrite = TRUE)
)

Time difference of 0.405 secs

┕ SQL

CREATE OR REPLACE TABLE rides_clean_geom_long_filled AS
SELECT
  COLUMNS(c -> c NOT SIMILAR TO '(.*\.x|.*\.y)'),
  COALESCE("name.x", "name.y") AS "name",
  COALESCE("id.x", "id.y") AS id
FROM (
  SELECT
    r.ride_id AS ride_id
    , r.rideable_type AS rideable_type
    , r.at AS at
    , r.member_casual AS member_casual
    , r.way AS way
    , m.name AS "name.x"
    , m.id AS "id.x"
    , r.lat AS lat
    , r.lng AS lng
    , r.name AS "name.y"
    , r.id AS "id.y"
  FROM matched_clean_acc4_geom m
  RIGHT JOIN rides_clean_geom_long r
    ON m.ride_id = r.ride_id AND m.way = r.way
)

Time difference of 0.4677 secs

Output:

data.table [8,766,040 x 9]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng
06048DCFC8520CAF	electric_bike	member	start	2021-06-04 11:18:02	NA	NA	41.79	−87.59
9598066F68045DF2	electric_bike	member	start	2021-06-04 09:49:35	NA	NA	41.8	−87.6
B03C0FE48C412214	electric_bike	member	start	2021-06-03 19:56:05	NA	NA	41.78	−87.58
7E2546FBA79C46EE	electric_bike	member	start	2021-06-10 16:30:10	NA	NA	41.79	−87.6
3DDF3BBF6C4C3C89	electric_bike	member	start	2021-06-10 17:00:30	NA	NA	41.79	−87.59
2608805637155AB6	electric_bike	member	start	2021-06-10 12:46:16	NA	NA	41.93	−87.67
AF529C946F28ED42	electric_bike	member	start	2021-06-23 17:57:29	NA	NA	41.88	−87.61
E6010941FB92E4A6	electric_bike	member	start	2021-06-22 19:28:02	NA	NA	41.87	−87.62
1149C0723F7AFFD5	electric_bike	member	start	2021-06-29 17:35:49	NA	NA	41.9	−87.63
8762DB62099E6011	electric_bike	member	start	2021-06-05 14:55:05	NA	NA	41.89	−87.62
8E9F2CB0893B96A0	electric_bike	member	start	2021-06-05 13:39:04	NA	NA	41.88	−87.62
6344B71B7BB6E09E	electric_bike	member	start	2021-06-22 18:52:53	NA	NA	41.79	−87.59
59CE9444E2ED2530	electric_bike	member	start	2021-06-02 10:30:11	NA	NA	41.79	−87.6
F7107122A837A50B	electric_bike	member	start	2021-06-08 18:31:31	NA	NA	41.78	−87.6
45ABF9231CC02E3C	electric_bike	member	start	2021-06-07 22:24:08	NA	NA	41.79	−87.58
[ omitted 8,766,025 entries ]

6.2 Validating the merge

rides_clean_geom_long_filled[has_acc4(lat) & has_acc4(lng) & (is.na(id) | is.na(name)),]

data.table [0 x 9]

ride_id	rideable_type	member_casual	way	at	name	id	lat	lng

We can see that the resulting dataset no longer has any entries that miss either their name or id (at least among the entries that have 4 or more decimals on their coordinates), whereas there were 2 before. We have successfully updated them !

A lot of effort for very little gains in data if you ask me …

6.3 Pivoting back to the original (wide) format

To finish, let’s pivot the resulting data back into the wider format it was originally in:

In-RAM
Out-of-RAM

rides_clean_geom_filled <- (
  rides_clean_geom_long_filled
  |> dcast(... ~ way, sep = "_station_", value.var = c("at", "name", "id", "lat", "lng"))
)

Time difference of 9.746 secs

(tbl(con, "rides_clean_geom_long_filled")
  |> pivot_wider(names_from = "way", names_glue = "{way}_station_{.value}", values_from = c("at", "name", "id", "lat", "lng"))
  |> compute("rides_clean_geom_filled", overwrite = TRUE)
)

Time difference of 14.13 secs

┕ SQL

CREATE OR REPLACE TABLE rides_clean_geom_filled AS
SELECT ride_id, rideable_type, member_casual, COLUMNS("^start_"), COLUMNS("^end_")
FROM (
  PIVOT rides_clean_geom_long_filled
  ON way || '_station' 
  USING MAX(at) as at, MAX(name) as name, MAX(id) as id, MAX(lat) as lat, MAX(lng) as lng
  GROUP BY ride_id, rideable_type, member_casual
)

Time difference of 8.516 secs

Output:

data.table [4,383,020 x 13]

ride_id	rideable_type	member_casual	at_station_end	at_station_start	name_station_end	name_station_start	id_station_end	id_station_start	lat_station_end	lat_station_start	lng_station_end	lng_station_start
000002EBE159AE82	electric_bike	member	2021-06-22 17:31:34	2021-06-22 17:25:15	Milwaukee Ave & Grand Ave	Clinton St & Jackson Blvd	13033	638	41.891	41.878	−87.648	−87.641
0000080D43BAA9E4	classic_bike	casual	2021-08-29 16:24:03	2021-08-29 15:38:05	Federal St & Polk St	Dearborn St & Van Buren St	SL-008	624	41.872	41.876	−87.63	−87.629
00000B4F1F71F9C2	electric_bike	member	2021-09-08 16:37:54	2021-09-08 16:31:38	NA	NA	NA	NA	41.91	41.91	−87.7	−87.69
00000CAE95438C9D	classic_bike	casual	2021-07-20 17:38:17	2021-07-20 15:40:46	Fairbanks Ct & Grand Ave	Streeter Dr & Grand Ave	TA1305000003	13022	41.892	41.892	−87.621	−87.612
00000E22FBA89D81	electric_bike	member	2022-05-19 14:54:03	2022-05-19 14:42:55	Clark St & Armitage Ave	NA	13146	NA	41.918	41.9	−87.636	−87.62
00000EBBC119168C	classic_bike	member	2021-10-31 11:39:27	2021-10-31 11:30:37	Kimbark Ave & 53rd St	Dorchester Ave & 49th St	TA1309000037	KA1503000069	41.8	41.806	−87.595	−87.592
000018B1D040DB44	electric_bike	member	2022-04-25 10:44:19	2022-04-25 10:37:22	NA	NA	NA	NA	41.79	41.79	−87.59	−87.6
000019B7F053D461	classic_bike	member	2021-08-13 20:02:56	2021-08-13 19:57:28	Sheffield Ave & Webster Ave	Larrabee St & Webster Ave	TA1309000033	13193	41.922	41.922	−87.654	−87.644
00001B4F79D102B5	classic_bike	casual	2021-07-28 08:05:00	2021-07-28 07:58:27	Broadway & Waveland Ave	DuSable Lake Shore Dr & Belmont Ave	13325	TA1309000049	41.949	41.941	−87.649	−87.639
00001BEE76AB24E0	electric_bike	member	2021-11-30 17:08:53	2021-11-30 16:55:38	Ashland Ave & Division St	Daley Center Plaza	13061	TA1306000010	41.903	41.884	−87.668	−87.629
000020C92AA9D6F7	classic_bike	casual	2021-09-12 10:12:52	2021-09-12 09:53:00	Dusable Harbor	Clark St & North Ave	KA1503000064	13128	41.887	41.912	−87.613	−87.632
0000228A4B430869	electric_bike	member	2021-10-18 10:47:58	2021-10-18 10:42:20	Calumet Ave & 18th St	MLK Jr Dr & 29th St	13102	TA1307000139	41.857	41.842	−87.619	−87.617
000022C3D3CE7DD5	classic_bike	casual	2022-04-30 10:03:12	2022-04-30 09:57:39	Sheffield Ave & Willow St	Halsted St & Clybourn Ave	TA1306000032	331	41.914	41.91	−87.653	−87.648
0000278F02EFFEF9	classic_bike	member	2021-09-18 16:32:06	2021-09-18 16:09:39	Michigan Ave & Washington St	Burnham Harbor	13001	15545	41.884	41.856	−87.625	−87.613
00002D2793DB63D4	classic_bike	member	2022-04-04 07:56:11	2022-04-04 07:42:18	Halsted St & Clybourn Ave	Clark St & Wrightwood Ave	331	TA1305000014	41.91	41.93	−87.648	−87.643
[ omitted 4,383,005 entries ]

💻 Expand for Session Info

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.1 (2023-06-16)
 os       Ubuntu 22.04.3 LTS
 system   x86_64, linux-gnu
 ui       X11
 language (EN)
 collate  C.UTF-8
 ctype    C.UTF-8
 tz       Europe/Paris
 date     2024-02-07
 pandoc   3.1.11
 Quarto   1.5.9

─ Packages ───────────────────────────────────────────────────────────────────
 ! package    * version  date (UTC) lib source
 P arrow      * 13.0.0.1 2023-09-22 [?] CRAN (R 4.3.1)
 P crayon     * 1.5.2    2022-09-29 [?] CRAN (R 4.3.0)
 P data.table * 1.15.0   2024-01-30 [?] CRAN (R 4.3.1)
 P DBI        * 1.1.3    2022-06-18 [?] CRAN (R 4.3.0)
 P dbplyr     * 2.4.0    2023-10-26 [?] CRAN (R 4.3.1)
 P dplyr      * 1.1.4    2023-11-17 [?] CRAN (R 4.3.1)
 P duckdb     * 0.9.2-1  2023-11-28 [?] CRAN (R 4.3.1)
 P fs         * 1.6.3    2023-07-20 [?] CRAN (R 4.3.0)
 P fuzzyjoin  * 0.1.6    2020-05-15 [?] CRAN (R 4.3.0)
 P ggplot2    * 3.4.4    2023-10-12 [?] CRAN (R 4.3.1)
 P gt         * 0.10.0   2023-10-07 [?] CRAN (R 4.3.1)
 P here       * 1.0.1    2020-12-13 [?] CRAN (R 4.3.0)
 P knitr      * 1.44     2023-09-11 [?] CRAN (R 4.3.0)
 P leaflet    * 2.2.0    2023-08-31 [?] CRAN (R 4.3.0)
 P lubridate  * 1.9.3    2023-09-27 [?] CRAN (R 4.3.1)
 P pipebind   * 0.1.2    2023-08-30 [?] CRAN (R 4.3.0)
 P purrr      * 1.0.2    2023-08-10 [?] CRAN (R 4.3.0)
 P readr      * 2.1.4    2023-02-10 [?] CRAN (R 4.3.0)
 P sf         * 1.0-14   2023-07-11 [?] CRAN (R 4.3.0)
 P stringr    * 1.5.0    2022-12-02 [?] CRAN (R 4.3.0)
 P tidyr      * 1.3.0    2023-01-24 [?] CRAN (R 4.3.0)

 [1] /home/mar/Dev/Projects/R/ma-riviere.com/renv/library/R-4.3/x86_64-pc-linux-gnu
 [2] /home/mar/.cache/R/renv/sandbox/R-4.3/x86_64-pc-linux-gnu/9a444a72

 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────

Citation

BibTeX citation:

@online{rivière2022,
  author = {Rivière, Marc-Aurèle},
  title = {Fast Spatial Data Matching in {R}},
  date = {2022-06-18},
  url = {https://ma-riviere.com/content/code/posts/spatial},
  langid = {en},
  abstract = {This post showcases various solutions to efficiently match
    unknown locations to known ones by their geographical proximity
    (using lat/long coordinates), on a dataset with millions of
    entries.}
}

For attribution, please cite this work as:

Rivière, M.-A. (2022, June 18). Fast spatial data matching in R. https://ma-riviere.com/content/code/posts/spatial