Appendix

Final Scripts

On this page you will find the complete versions of final scripts as completed in the exercises

Exercise 1: Health Categories Maps final scripts

# Load Libraries ----
library(sf)
library(dplyr)
library(assertr)
library(ggplot2)
library(colorspace)
library(ggpubr)
library(furrr)

# Source function ----
source(here::here("health_data", "R", "map-function.R"))

# Prepare output directory ----
out_dir <- here::here("health_data", "outputs", "maps")
fs::dir_create(out_dir)

# Load data ----
health_data <- readr::read_csv(
  here::here(
    "health_data", "data",
    "lsoa-general_health.csv"
  )
)

look_up <- readr::read_csv(
  here::here(
    "health_data", "data",
    "output_area_lookup.csv"
  )
)

boundaries <- read_sf(
  here::here(
    "health_data", "data",
    "lsoa_boundaries.geojson"
  )
)

# Merge data and validate ----
all_data <- left_join(health_data, look_up,
                      relationship = "many-to-one"
) %>%
  left_join(boundaries, relationship = "many-to-one") %>%
  st_as_sf() %>%
  assert(not_na, lsoa_code, lsoa_name, lad_name, lad_code, geometry) %>%
  verify(inherits(., "sf"))

# Process data ----
# Get ordered health category levels
health_cat_levels <- health_data %>%
  select(gen_health_code, gen_health_cat) %>%
  distinct() %>%
  arrange(gen_health_code) %>%
  pull(gen_health_cat)

# Create addition variables obs_perc & z_score, cast gen_health_cat to factor
all_data <- all_data %>%
  mutate(gen_health_cat = factor(gen_health_cat,
                                 levels = health_cat_levels
  )) %>%
  group_by(lsoa_code) %>%
  mutate(obs_perc = observation / sum(observation) * 100) %>%
  ungroup() %>%
  group_by(gen_health_cat) %>%
  mutate(z_score = (obs_perc - mean(obs_perc)) / sd(obs_perc)) %>%
  ungroup()

# Create iteration indexes ----
idx_start <- NULL
idx_end <- NULL
# Collect command line arguments if present
args <- commandArgs(trailingOnly = TRUE)

# If idx_* values have already been set (i.e. are NULL) do nothing.
# If a command line argument is supplied (not NA) and no corresponding idx value
# has been set, convert it to integer and assign it.
# If any command arg is missing and idx_* value has not been set, set to 1 or
# total number of LADs.
lad_n <- length(unique(all_data$lad_code))
if (is.na(args[1])) {
  if (is.null(idx_start)) {
    idx_start <- 1L
  }
} else {
  idx_start <- as.integer(args[1])
}
if (is.na(args[2])) {
  if (is.null(idx_start)) {
    idx_end <- lad_n
  }
} else {
  idx_end <- as.integer(args[2])
}
idx <- idx_start:idx_end

# If running within a slurm array, split idx into chunks and subset the idxs for the
# particular task id.
array_id <- Sys.getenv('SLURM_ARRAY_TASK_ID')
array_task_n <- Sys.getenv('SLURM_ARRAY_TASK_COUNT')
if (array_id != "") {
  array_task_n <- as.integer(array_task_n)
  array_id <- as.integer(array_id)
  idx <- idx[furrr:::make_chunks(length(idx), array_task_n)[[array_id]]]
}


# Split and subset data ----
split_data <- split(all_data, f = all_data$lad_code)[idx]

# Create maps ----
# Iterate plotting function over each LAD plot data chunk in idx
tictoc::tic()
plan(multisession)
furrr::future_walk(
  split_data,
  ~ plot_lad_map(.x, out_dir),
  .progress = TRUE,
  .options = furrr_options(seed = TRUE)
)

cli::cli_h1("Job Complete")
cli::cli_alert_success("{length(idx)} map{?s} written to {.path {out_dir}}.")
cli::cli_h2("Total time elapsed: {.val {tictoc::toc(quiet = TRUE)$callback_msg}}")

Exercise 2: NBA Playoffs final scripts

#' Play all rounds for an individual conference
#'
#' @param qualified a character vector of team slugs of conference qualifiers
#' @param nba_stats data.frame of nba stats data
#'
#' @return a list of length 1 containg the name slug of the round winner. Match logs
#' for each round are also compiled and assigned to attribute `match_logs` of the output.
play_conference <- function(qualified, nba_stats) {
  round_1 <- play_round(
    qualified,
    nba_stats
  )
  semis <- play_round(
    round_1,
    nba_stats
  )
  finals <- play_round(
    semis,
    nba_stats
  )
  return(finals)
}
#' Play a round of matches. Should either be used for matches of teams in a single
#' conference or for the play-offs final.
#'
#' @param teams list of character strings of team slugs of all teams competing in round.
#' @param nba_stats data.frame of nba stats data
#' @param round_name Round name. If `NULL` (default), round name is auto-detected
#' by number of competing teams. For play-off final should be `"Play off Finals"`.
#' @param seed seed to set for lapply function.
#'
#' @return returns a list of the round winners of each match. Match logs are also
#' compiled and appended to the input's logs as attribute `match_logs` of the output
play_round <- function(teams, nba_stats, round_name = NULL, seed = TRUE) {

  # Detect round name from number of teams
  if (is.null(round_name)) {
    round_id <- as.character(length(teams))

    round_name <- switch(round_id,
                         "8" = "Conference Round 1",
                         "4" = "Conference Semi finals",
                         "2" = "Conference finals"
    )
  }
  if (round_name == "Play off Finals") {
    conf <- NA
    conf_msg <- ""
  } else {
    conf <- unique(nba_stats[nba_stats$slug_team %in% unlist(teams), ]$id_conference)
    conf_msg <- paste0("(conf ", conf, ") ")
  }
  # Signal start of round
  cli::cli_h2("{round_name} {conf_msg}started!")

  # Create list of round pair match ups
  round_pairs <- draw_match_pairs(unlist(teams))

  # Use future_lapply to play each match
  round_winners <- future.apply::future_lapply(
    X = round_pairs,
    FUN = play_match,
    nba_stats,
    round_name,
    future.seed = seed
  )

  # Compile match logs and append them to input match logs attribute
  attr(round_winners, "match_logs") <- rbind(
    attr(teams, "match_logs"),
    compile_match_logs(round_winners)
  )

  # Signal round completion and return results
  cli::cli_h2("{round_name} {conf_msg}COMPLETE!")

  return(round_winners)
}


#' Play a single match
#'
#' @param matchup a character vector of length 2 containing the name slugs of teams
#' competing.
#' @param nba_stats data.frame of nba stats data
#' @param round_name Character string. Round name.
#'
#' @return a list of length 1 containg the name slug of the round winner. Match logs are also
#' appended as attribute `match_logs`.
play_match <- function(matchup, nba_stats, round_name) {
  if (round_name == "Play off Finals") {
    conf <- NA
    conf_msg <- ""
  } else {
    conf <- unique(nba_stats[nba_stats$slug_team %in% matchup, ]$id_conference)
    conf_msg <- paste0("(conf ", conf, ") ")
  }


  # Create list of probabilities for sampling game length
  probs <- list(
    c(0.9452, 0.0481, 0.0057, 6e-04, 4e-04),
    c(0.7007, 0.1452, 0.0902, 0.0413, 0.0225)
  )
  # Assign higher probabilities for longer games to conference 2
  if (length(conf) > 1) {
    prob <- probs[[1]]
  } else {
    prob <- probs[[conf]]
  }

  pid <- Sys.getpid()
  node <- replace_ip(system2("hostname", stdout = TRUE))

  # play game
  cli::cli_h3("Playing {round_name} {conf_msg}game: {.var {matchup[1]}} VS {.var {matchup[2]}}")
  cli::cli_alert_info("Game location: {.val {pid}} ({node})")

  # Sample game length
  game_length <- sample(c(2.40, 2.65, 2.90, 3.15, 3.40), 1,
                        prob = prob
  )
  # Send system to sleep to simulate playing match
  Sys.sleep(game_length)

  # SAMPLE WINNER from probability of winning stats
  # subset nba_stats to only team matchup data
  match_df <- nba_stats[nba_stats$slug_team %in% matchup, ]
  # sample winner
  winner <- sample(match_df$slug_team, 1, prob = match_df$prop_win)


  # print messages
  cli::cli_alert_info("{matchup[1]} VS {matchup[2]} match complete in {game_length * 50} minutes")
  cli::cli_alert_success("Winner: {winner}")

  # Compile match information into match logs data.frame and append as attribute.
  match_logs <- data.frame(
    winner = winner,
    team_1 = matchup[1],
    team_2 = matchup[2],
    pid = pid,
    node = node,
    game_length = game_length * 50,
    date = Sys.time(),
    conf = conf,
    round_name = round_name
  )
  attr(winner, "match_logs") <- match_logs

  return(winner)
}
#' Play Conference qualifiers
#'
#' @param teams a character vector of team slugs of teams competing in qualifiers
#'
#' @return a character vector of team slugs of qualified teams.
play_qualifiers <- function(teams) {
  conf <- unique(teams$id_conference)
  # play season
  cli::cli_h2("Playing qualifiers for conference {.val {conf}} on {.var {Sys.getpid()}}")
  Sys.sleep(5)
  # sample qualifiers
  qualified <- sample(teams$slug_team, 8, prob = teams$prop_win)

  cli::cli_alert_success("Conference {.val {conf}} qualifying round complete")

  return(qualified)
}
#' Draw pair matches.
#'
#' @param teams a character vector of team slugs of teams competing in round
#'
#' @return a list half the size of `teams` containing match pairs.
draw_match_pairs <- function(teams) {
  n_teams <- length(teams)
  matches <- sample(rep(1:(n_teams / 2), each = 2), size = n_teams)
  match_pairs <- split(teams, matches)
  return(match_pairs)
}
# Compile and order match_logs from lists of results
compile_match_logs <- function(x) {
  logs_list <- lapply(x, function(x) {
    attr(x, "match_logs")
  })
  logs <- do.call(rbind, logs_list)
  logs[order(logs$date), ]
}
# Function replaces local IP addresses with fake IP address
replace_ip <- function(hostname) {
  # Regular expression for IPv4
  ipv4_pattern <- "\\b(?:[0-9]{1,3}\\.){3}[0-9]{1,3}\\b"
  # Regular expression for IPv6
  ipv6_pattern <- "\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b"

  # Replace detected IPv4 addresses with a fake address
  hostname <- gsub(ipv4_pattern, "192.0.2.0", hostname)
  # Replace detected IPv6 addresses with a fake address
  hostname <- gsub(ipv6_pattern, "2001:db8::", hostname)

  hostname
}
# Load Libraries ----
library(future.apply)

# Source function ----
source(here::here("nba", "R", "playoff-future_apply-functions.R"))

# Load data ----
nba_stats <- readr::read_csv(
  here::here(
    "nba", "data",
    "nba_stats_summaries_19-21.csv"
  )
)


# Play qualifiers ----
tictoc::tic(msg = "Total Play-offs Duration")

cli::cli_h1("Qualifiers have begun!")
split_confs <- split(nba_stats,
  f = nba_stats$id_conference
)

plan(multisession)
qualified_confs <- future_lapply(
  split_confs,
  play_qualifiers,
  future.seed = 5
)

cli::cli_h2("ALL Qualifying matches COMPLETE!")

# Play Conference Rounds ----
cli::cli_h1("Conference Rounds have begun!")

# Set up parallel plan
outer_cores <- 2L
inner_cores <- parallelly::availableCores(
  omit = outer_cores
) %/% outer_cores
plan(list(
  tweak(multisession, workers = outer_cores),
  tweak(multisession, workers = I(inner_cores))
))

conf_winners <- future_lapply(
  X = qualified_confs,
  FUN = play_conference,
  nba_stats,
  future.seed = 8
)

attr(conf_winners, "match_logs") <- compile_match_logs(conf_winners)
cli::cli_h2("ALL Conference matches COMPLETE!")

# Play Play-offs FINAL ----
cli::cli_h1("Overall Playoff final has begun!")

plan(multisession)
playoff_winner <- play_round(conf_winners,
  nba_stats,
  round_name = "Play off Finals",
  seed = 7
)

# Announce Winner!
winner_stats <- nba_stats[nba_stats$slug_team == playoff_winner, ]
cli::cli_h1("Playoffs complete!")
cli::cli_alert_success("Winner: {.field {winner_stats$name_team}} ({winner_stats$slug_team})")
tictoc::toc()

# Write match logs to csv ----
fs::dir_create(here::here("nba", "outputs"))
write.csv(attr(playoff_winner, "match_logs"),
  file = here::here("nba", "outputs", "playoff_results.csv"),
  row.names = FALSE
)
library(future.apply)

# Source function ----
source(here::here("nba", "R", "playoff-future_apply-functions.R"))


nba_stats <- readr::read_csv(
  here::here(
    "nba", "data",
    "nba_stats_summaries_19-21.csv"
  )
)


# Play qualifiers ----
tictoc::tic(msg = "Total Play-offs Duration")
cli::cli_h1("Qualifiers have begun!")

split_confs <- split(nba_stats,
  f = nba_stats$id_conference
)

plan(multisession)
qualified_confs <- future.apply::future_lapply(
  split_confs,
  play_qualifiers,
  future.seed = 5
)

cli::cli_h2("ALL Qualifying matches COMPLETE!")

# Play Conference Rounds ----
cli::cli_h1("Conference Rounds have begun!")
# Set up nested parallel plan
plan(list(
  tweak(future.batchtools::batchtools_slurm,
    template = here::here("nba", "slurm", "batchtools.slurm.tmpl"),
    resources = list(
      ncpus = 4,
      memory = "1GB",
      walltime = 180
    )
  ),
  multisession
))

conf_winners <- future_lapply(
  X = qualified_confs,
  FUN = play_conference,
  nba_stats,
  future.seed = 8
)

attr(conf_winners, "match_logs") <- compile_match_logs(conf_winners)
cli::cli_h2("ALL Conference matches COMPLETE!")

# Play Play-offs FINAL ----
cli::cli_h1("Overall Playoff final has begun!")

plan(multisession)
playoff_winner <- play_round(conf_winners,
  nba_stats,
  round_name = "Play off Finals",
  seed = 7
)

# Announce Winner!
winner_stats <- nba_stats[nba_stats$slug_team == playoff_winner, ]
cli::cli_h1("Playoffs complete!")
cli::cli_alert_success("Winner: {.field {winner_stats$name_team}} ({winner_stats$slug_team})")
tictoc::toc()

# Write match logs to csv ----
fs::dir_create(here::here("nba", "outputs"))
write.csv(attr(playoff_winner, "match_logs"),
  file = here::here("nba", "outputs", "playoff_results.csv"),
  row.names = FALSE
)
Back to top