Number of matches per filter criterion (not disjoint)
Headphone PRS_all_NA Distance Activity_NA Duration HMNoise_NA JourneyTime
      303        226      221         102       96         96          20
Keep 1494 of 2206 observations
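For reference, a minimal sketch of how such overlapping counts can be tallied from per-criterion exclusion flags; the flag definitions below are hypothetical stand-ins, not the actual filter rules:
# Hypothetical logical exclusion flags, one per criterion (illustrative only)
excl <- list(
  Headphone   = D$HEADPHONE == "yes",                            # assumed column
  PRS_all_NA  = rowSums(is.na(D[PRS_vars])) == length(PRS_vars),
  JourneyTime = D$JNYTIME > 60                                   # assumed cutoff
)
sort(sapply(excl, sum), decreasing = TRUE) # matches per criterion (not disjoint)
keep <- !Reduce(`|`, excl)                 # drop rows hit by any criterion
cat("Keep", sum(keep), "of", nrow(D), "observations\n")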
Setup
Benchmark helper function
library(mlr3verse, quietly = TRUE)
mse <- msrs(c("regr.mse"))
if (!interactive()) {
  lgr::get_logger("mlr3")$set_threshold("warn")
}

get_benchi_table <- function(tasks, nfolds = 5) {
  set.seed(123)
  learners <- lrns(c("regr.featureless", "regr.lm", "regr.xgboost", "regr.ranger"))
  learners$regr.xgboost$param_set$set_values(
    eta = 0.03,
    nrounds = 300,
    max_depth = 2
  )
  # Cache the benchmark; recompute only when tasks or nfolds change
  benchi <- xfun::cache_rds(
    {
      benchmark(benchmark_grid(
        tasks,
        learners,
        rsmp("cv", folds = nfolds)
      ))
    },
    file = "benchmark.rds",
    dir = "cache/",
    hash = list(tasks, nfolds)
  )
  # One row per task, one column per learner, cells = aggregated CV MSE
  res <- tidyr::pivot_wider(benchi$aggregate(mse),
    id_cols = task_id,
    names_from = learner_id,
    values_from = regr.mse
  ) |> as.data.frame()
  rownames(res) <- res$task_id
  res <- res[, -1]
  colnames(res) <- gsub("regr.", "", colnames(res), fixed = TRUE)
  stopifnot(any(colnames(res) == "featureless"))
  # Proportional MSE reduction relative to the featureless baseline
  # (an out-of-sample pseudo-R^2); then drop the baseline column itself
  res <- 1 - res / res$featureless
  res[, -1, drop = FALSE] |> round(3)
}
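The returned scores are thus proportional reductions in cross-validated MSE relative to the featureless (mean-predicting) baseline: 0 means no better than the mean, negative means worse. A quick check of the arithmetic with made-up MSEs:
# Made-up numbers: featureless MSE = 1.00, lm MSE = 0.95
1 - 0.95 / 1.00 # = 0.05, i.e. lm cuts CV error by 5% relative to the mean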
Testing the prediction quality along the chain GIS_vars -> Mediators -> PRS_vars, using:
- Linear models
- Random forests (default parameters)
- XGBoost (with parameter tuning)
- LASSO (not shown since inferior)
GIS Variables:
[1] "LCARTIF_sqrt" "LCFOREST_sqrt" "HETER" "OVDIST_sqrt"
[5] "VIS5K_sqrt" "RL_NDVI" "RL_NOISE" "DISTKM_sqrt"
[9] "JNYTIME_sqrt" "STRIMP123_sqrt" "STRIMP999_sqrt"
Mediators:
[1] "FEELNAT" "LNOISE" "LOC_SENS" "LOC_SOUN" "LOC_SCEN" "LOC_VISE" "LOC_VEGE"
[8] "LOC_FAUN"
PRS ~ GIS
Code
tasks_GIS <- lapply(PRS_vars, \(y)
  as_task_regr(
    subset(Dmlr, select = c(y, GIS_vars)),
    target = y,
    id = y
  ))
get_benchi_table(tasks_GIS)
         lm xgboost ranger
MEAN -0.001  -0.006 -0.035
FA   -0.002   0.012  0.002
BA   -0.006  -0.010 -0.022
EC    0.003  -0.027 -0.037
ES    0.047   0.041  0.025
GIS variables alone show poor predictive performance: the best score is about a 5% error reduction over the featureless baseline (ES), and most scores are negative, i.e. worse than the baseline.
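The other two legs of the chain can be benchmarked the same way; a sketch reusing the helper and variable sets defined above:
# PRS ~ Mediators
tasks_Med <- lapply(PRS_vars, \(y)
  as_task_regr(subset(Dmlr, select = c(y, Mediator_vars)), target = y, id = y))
get_benchi_table(tasks_Med)

# Mediators ~ GIS
tasks_GIS_Med <- lapply(Mediator_vars, \(y)
  as_task_regr(subset(Dmlr, select = c(y, GIS_vars)), target = y, id = y))
get_benchi_table(tasks_GIS_Med)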
Legacy Code
XGBoost - Parameter Tuning
# Get parameter estimates for XGBoost
t <- as_task_regr(
  subset(Dmlr, select = c("FEELNAT", GIS_vars)),
  target = "FEELNAT"
)
l <- lrn("regr.xgboost",
  nrounds = 500 # More iterations due to lower learning rate
)

# Create search space (eta searched on a log scale)
ps <- ps(
  max_depth = p_int(2, 3),
  eta = p_dbl(0.001, 0.3, logscale = TRUE)
)

# Setup tuning
instance <- ti(
  task = t,
  learner = l,
  resampling = rsmp("cv", folds = 3),
  measures = msr("regr.mse"),
  terminator = trm("none"), # grid search terminates on its own
  search_space = ps
)

# Grid search
tuner <- mlr3tuning::tnr("grid_search")
tuner$optimize(instance)
instance$result
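A possible follow-up, assuming the tuning above has completed: copy the best configuration back onto the learner before a final fit.
# Sketch: apply the tuned hyperparameters and refit on the full task
l$param_set$set_values(.values = instance$result_learner_param_vals)
l$train(t)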
Setup superseded by mlr3
library(randomForest)

# Multivariate linear model: all PRS scores regressed on the mediators
fit <- lm(as.formula(paste0(
  "cbind(", paste(PRS_vars, collapse = ", "), ")",
  " ~ ",
  paste(Mediator_vars, collapse = " + ")
)), D)
coef(fit) |> round(2)
rsq.lm <- sapply(summary(fit), \(x) x$r.squared)

# Random forest per PRS score; rsq[500] is the OOB R^2 after all 500 trees
rsq.rf <- sapply(PRS_vars, \(x) {
  rf <- randomForest(as.formula(paste0(
    x, " ~ ", paste(Mediator_vars, collapse = " + ")
  )),
  D, na.action = na.omit
  )
  rf$rsq[500]
})
cbind(lm = rsq.lm, rf = rsq.rf) |> round(2)
First basic setup with mlr3
mytsk1 <- as_task_regr(
  subset(Dmlr, select = c("FA", Mediator_vars, GIS_vars)),
  target = "FA",
  id = "FA"
)
autoplot(mytsk1, type = "pairs")
lrn_xgb <- lrn("regr.xgboost")
lrn_avg <- lrn("regr.featureless")
splits <- partition(mytsk1)
lrn_xgb$train(mytsk1, splits$train)$predict(mytsk1, splits$test)$score(mse)
lrn_avg$train(mytsk1, splits$train)$predict(mytsk1, splits$test)$score(mse)
rr <- resample(mytsk1, lrn_xgb, rsmp("cv", folds = 3))
rr$aggregate(mse)
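For a side-by-side comparison, the same resampling can be run for the featureless baseline; a minimal sketch:
# Identical 3-fold CV for the baseline learner
rr_avg <- resample(mytsk1, lrn_avg, rsmp("cv", folds = 3))
c(xgboost = rr$aggregate(mse), featureless = rr_avg$aggregate(mse))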
# Learner list and XGBoost settings, later folded into get_benchi_table()
learners <- lrns(c("regr.featureless", "regr.lm", "regr.xgboost", "regr.ranger"))
learners$regr.xgboost$param_set$set_values(eta = 0.03, nrounds = 300, max_depth = 2)