
Prediction Analysis for WSL

Author

Lukas Graz

Published

February 13, 2025

In [1]:
source("R/data_prep.R")
Number of matches per filter criteria (not disjoint)
  Headphone  PRS_all_NA    Distance Activity_NA    Duration  HMNoise_NA 
        303         226         221         102          96          96 
JourneyTime 
         20 
Keep  1494 of 2206 observations
Warning: package 'missForest' was built under R version 4.5.1
Imputing PRS_orig_vars
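
The imputation itself happens inside R/data_prep.R, which is not shown here. Going by the log line above, the step presumably looks roughly like the following hypothetical sketch (PRS_orig_vars and D are the object names used elsewhere in this document; the actual call in the sourced script may differ):

library(missForest)
# Hypothetical reconstruction of the imputation step: missForest fits a
# random forest per variable to fill in missing PRS items; $ximp holds
# the completed data frame
imp <- missForest(D[, PRS_orig_vars])
D[, PRS_orig_vars] <- imp$ximp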

Setup

In [2]:
Benchmark helper function
library(mlr3verse, quietly = TRUE)
Warning: package 'mlr3verse' was built under R version 4.5.1
mse <- msrs(c("regr.mse"))

if (!interactive())
  lgr::get_logger("mlr3")$set_threshold("warn")

get_benchi_table <- function(tasks, nfolds = 5) {
  set.seed(123)
  learners <- lrns(c("regr.featureless", "regr.lm", "regr.xgboost", "regr.ranger"))
  # Fixed XGBoost hyperparameters (cf. the grid search under "Legacy Code")
  learners$regr.xgboost$param_set$set_values(
    eta = 0.03, 
    nrounds = 300, 
    max_depth = 2
  )

  benchi <- xfun::cache_rds({
    benchmark(benchmark_grid(
      tasks, 
      learners, 
      rsmp("cv", folds = nfolds)
    ))
  }, 
  file = "benchmark.rds", 
  dir = "cache/",
  hash = list(tasks, nfolds)
  )
  
  # One row per task, one column per learner; cells are CV mean squared errors
  res <- tidyr::pivot_wider(benchi$aggregate(mse), 
    id_cols = task_id,
    names_from = learner_id,
    values_from = regr.mse
  ) |> as.data.frame()
  
  rownames(res) <- res$task_id
  res <- res[, -1]
  colnames(res) <- gsub("regr.", "", colnames(res))
  stopifnot(any(colnames(res) == "featureless"))
  # Proportional MSE reduction relative to the featureless baseline
  # (an out-of-sample R^2); then drop the baseline column itself
  res <- 1 - res / res$featureless
  res[, -1, drop = FALSE] |> round(3)
}
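
Each table entry reported below is 1 - MSE_model / MSE_featureless: the proportional reduction in cross-validated MSE over the intercept-only baseline, i.e. an out-of-sample R^2. A worked example of the arithmetic (numbers are illustrative, not taken from the data):

# If the featureless CV-MSE is 1.25 and a model's CV-MSE is 0.96:
1 - 0.96 / 1.25  # = 0.232, reported as 0.232 in the tables below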

Testing the prediction quality along the chain GIS_vars -> Mediators -> PRS_vars using:

  • Linear models
  • Random forests (default parameters)
  • XGBoost (with parameter tuning)
  • LASSO (not shown since inferior; a minimal sketch follows this list)
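
The LASSO results were dropped from the tables, but for completeness this is roughly how they could be reproduced. A minimal sketch, assuming the glmnet backend of mlr3learners (bundled with mlr3verse) is installed; regr.cv_glmnet with alpha = 1 is the standard mlr3 LASSO learner with internally cross-validated lambda, and the task construction mirrors the cells below:

lasso <- lrn("regr.cv_glmnet", alpha = 1)  # alpha = 1: pure L1 penalty
task_ES <- as_task_regr(
  subset(Dmlr, select = c("ES", GIS_vars)),
  target = "ES", id = "ES"
)
rr <- resample(task_ES, lasso, rsmp("cv", folds = 5))
rr$aggregate(msr("regr.mse"))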

GIS Variables:

In [3]:
GIS_vars
 [1] "LCARTIF_sqrt"   "LCFOREST_sqrt"  "HETER"          "OVDIST_sqrt"   
 [5] "VIS5K_sqrt"     "RL_NDVI"        "RL_NOISE"       "DISTKM_sqrt"   
 [9] "JNYTIME_sqrt"   "STRIMP123_sqrt" "STRIMP999_sqrt"

Mediators:

In [4]:
Mediator_vars
[1] "FEELNAT"  "LNOISE"   "LOC_SENS" "LOC_SOUN" "LOC_SCEN" "LOC_VISE" "LOC_VEGE"
[8] "LOC_FAUN"

PRS ~ GIS

In [5]:
Code
tasks_GIS <- lapply(PRS_vars, \(y) 
  as_task_regr(
    subset(Dmlr, select = c(y, GIS_vars)),
    target = y,
    id = y
  ))
get_benchi_table(tasks_GIS) 
         lm xgboost ranger
MEAN -0.002  -0.005 -0.035
FA   -0.002   0.014  0.005
BA   -0.006  -0.009 -0.021
EC    0.002  -0.024 -0.035
ES    0.046   0.044  0.026

GIS variables alone show essentially no predictive power: no learner beats the featureless baseline by more than about 5% for any PRS outcome.
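
One way to probe whether any single GIS variable carries signal at all is permutation importance from the random forest. A minimal sketch, not part of the original analysis (importance = "permutation" is a standard ranger option exposed by mlr3; ES is chosen only because it shows the largest, still modest, improvement above):

lrn_rf <- lrn("regr.ranger", importance = "permutation")
lrn_rf$train(tasks_GIS[[which(PRS_vars == "ES")]])
# Permutation importance per GIS variable, largest first
sort(lrn_rf$importance(), decreasing = TRUE)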

PRS ~ GIS + Mediators

In [6]:
Code
tasks_GIS_MED <- lapply(PRS_vars, \(y) 
  as_task_regr(
    subset(Dmlr, select = c(y, Mediator_vars, GIS_vars)),
    target = y,
    id = y
  ))
get_benchi_table(tasks_GIS_MED) 
        lm xgboost ranger
MEAN 0.228   0.244  0.231
FA   0.236   0.262  0.251
BA   0.123   0.135  0.131
EC   0.044   0.018  0.026
ES   0.151   0.170  0.153
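
Since both helpers return data frames with identical row and column layout, the mediators' added value over GIS alone can be quantified by differencing the two tables directly (a convenience snippet, not part of the original analysis; both calls are served from the cache written above):

round(get_benchi_table(tasks_GIS_MED) - get_benchi_table(tasks_GIS), 3)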

PRS ~ Mediators

In [7]:
Code
tasks_MED <- lapply(PRS_vars, \(y) 
  as_task_regr(
    subset(Dmlr, select = c(y, Mediator_vars)),
    target = y,
    id = y
  ))
get_benchi_table(tasks_MED) 
        lm xgboost ranger
MEAN 0.227   0.243  0.198
FA   0.227   0.255  0.229
BA   0.135   0.150  0.112
EC   0.036   0.029 -0.019
ES   0.132   0.136  0.082

Mediators ~ GIS

In [8]:
Code
tasks_MED_by_GIS <- lapply(Mediator_vars, \(y) 
  as_task_regr(
    subset(Dmlr, select = c(y, GIS_vars)),
    target = y,
    id = y
  ))
get_benchi_table(tasks_MED_by_GIS)
            lm xgboost ranger
FEELNAT  0.127   0.136  0.112
LNOISE   0.096   0.068  0.081
LOC_SENS 0.020  -0.003 -0.024
LOC_SOUN 0.041   0.010 -0.001
LOC_SCEN 0.040   0.051  0.024
LOC_VISE 0.008  -0.029 -0.043
LOC_VEGE 0.052   0.031  0.036
LOC_FAUN 0.057   0.062  0.049

Legacy Code

In [9]:
XGBoost - Parameter Tuning

# Find good XGBoost hyperparameters (tuned on one representative task)
t <- as_task_regr(
  subset(Dmlr, select = c("FEELNAT", GIS_vars)),
  target = "FEELNAT"
)

l <- lrn("regr.xgboost",
  nrounds = 500  # More iterations due to lower learning rate
)

# Create search space
ps <- ps(
  max_depth = p_int(2, 3),
  eta = p_dbl(0.001, 0.3, tags = "logscale")
)

# Setup tuning
instance <- ti(
  task = t,
  learner = l,
  resampling = rsmp("cv", folds = 3),
  measure = msr("regr.mse"),
  terminator = trm("none"),
  search_space = ps
)

# Grid search
tuner <- mlr3tuning::tnr("grid_search")
tuner$optimize(instance)
instance$result
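
If one wanted to carry the tuned configuration forward, mlr3tuning exposes it as result_learner_param_vals; a minimal sketch of applying it to the learner and refitting on the full task (nothing in the original indicates this step was run):

# Copy the tuned hyperparameters onto the learner and refit on all data
l$param_set$set_values(.values = instance$result_learner_param_vals)
l$train(t)
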
In [10]:
Setup superseded by mlr3

library(randomForest)

fit <- lm(as.formula(paste0(
    "cbind(", paste(PRS_vars, collapse = ", "), ")",
    " ~ ",
    paste(Mediator_vars, collapse = " + ")
  )), 
  D)
coef(fit) |> round(2)

# Per-response R^2 from the multi-response lm (summary() returns one
# summary.lm per response)
rsq.lm <- sapply(summary(fit), \(x) x$r.squared)
rsq.rf <- sapply(PRS_vars, \(x) {
  rf <- randomForest(as.formula(paste0(
    x, " ~ ", paste(Mediator_vars, collapse = " + ")
  )), 
  D, na.action = na.omit
  ) 
  rf$rsq[500]  # OOB R^2 after the last of the default 500 trees
})

cbind(lm = rsq.lm, rf = rsq.rf) |> round(2)
In [11]:
First basic setup with mlr3

mytsk1 <- as_task_regr(
  subset(Dmlr, select = c("FA", Mediator_vars, GIS_vars)),
  target = "FA",  # all non-target columns become features
  id = "bla"
)
autoplot(mytsk1, type = "pairs")  # plot after the task exists

lrn_xgb <- lrn("regr.xgboost")
lrn_avg <- lrn("regr.featureless")
splits <- partition(mytsk1)
lrn_xgb$train(mytsk1, splits$train)$predict(mytsk1, splits$test)$score(mse)
lrn_avg$train(mytsk1, splits$train)$predict(mytsk1, splits$test)$score(mse)
rr <- resample(mytsk1, lrn_xgb, rsmp("cv", folds = 3))
rr$aggregate(mse)

learners <- lrns(c("regr.featureless", "regr.lm", "regr.xgboost", "regr.ranger"))
learners$regr.xgboost$param_set$set_values(eta = 0.03, nrounds = 300, max_depth = 2)
# learners <- c("regr.featureless", "regr.lm")  # stray leftover; would overwrite the learner objects above