Setup + Preprocessing
suppressPackageStartupMessages ({
library (dplyr)
library (ggplot2)
library (egg)
library (kernlab)
library (mgcv) # for GAM models - smoothing
library (gridExtra) # for combining plots
library (tidyr) # for pivoting data
})
theme_set (theme_minimal ())
options (digits = 3 )
source ("R/data_prep.R" )
Number of matches per filter criteria (not disjoint)
Headphone PRS_all_NA Distance Activity_NA Duration HMNoise_NA
303 226 221 102 96 96
JourneyTime
20
Keep 1494 of 2206 observations
Code
D$ RL_NDVI <- pmax (0 , D$ RL_NDVI)
D_trn$ RL_NDVI <- pmax (0 , D_trn$ RL_NDVI)
D_tst$ RL_NDVI <- pmax (0 , D_tst$ RL_NDVI)
D$ LANG <- as.factor (D$ LANG)
D_trn$ LANG <- as.factor (D_trn$ LANG)
D_tst$ LANG <- as.factor (D_tst$ LANG)
D$ SEX <- as.factor (D$ SEX)
D_trn$ SEX <- as.factor (D_trn$ SEX)
D_tst$ SEX <- as.factor (D_tst$ SEX)
D$ JNYTIME <- D$ JNYTIME + quantile (D$ JNYTIME[D$ JNYTIME > 0 ], 0.05 , na.rm= TRUE )/ 2
D_trn$ JNYTIME <- D_trn$ JNYTIME + quantile (D_trn$ JNYTIME[D_trn$ JNYTIME > 0 ], 0.05 , na.rm= TRUE )/ 2
D_tst$ JNYTIME <- D_tst$ JNYTIME + quantile (D_tst$ JNYTIME[D_tst$ JNYTIME > 0 ], 0.05 , na.rm= TRUE )/ 2
D$ DISTKM <- D$ DISTKM + quantile (D$ DISTKM[D$ DISTKM > 0 ], 0.05 , na.rm= TRUE )/ 2
D_trn$ DISTKM <- D_trn$ DISTKM + quantile (D_trn$ DISTKM[D_trn$ DISTKM > 0 ], 0.05 , na.rm= TRUE )/ 2
D_tst$ DISTKM <- D_tst$ DISTKM + quantile (D_tst$ DISTKM[D_tst$ DISTKM > 0 ], 0.05 , na.rm= TRUE )/ 2
D$ SPEED_log <- log (D$ DISTKM / D$ JNYTIME)
D_trn$ SPEED_log <- log (D_trn$ DISTKM / D_trn$ JNYTIME)
D_tst$ SPEED_log <- log (D_tst$ DISTKM / D_tst$ JNYTIME)
# logical_vars <- c("ALONE","WITH_DOG","WITH_KID","WITH_PAR","WITH_PNT","WITH_FND")
# D[logical_vars] <- lapply(D[logical_vars], as.numeric)
# D_trn[logical_vars] <- lapply(D_trn[logical_vars], as.numeric)
# D_tst[logical_vars] <- lapply(D_tst[logical_vars], as.numeric)
Train/test split
nams <- names (D)
nams[grep ("HM|RL" , nams)]
[1] "HM_NOISELVL" "HM_NDVI" "HM_NOISE" "RL_NDVI" "RL_NOISE"
[6] "HM_COORDX" "HM_COORDY" "RL_COORDX" "RL_COORDY" "RL_GCOORD"
[11] "RL_GCOORDN" "RL_GCOORDW"
Train/test split
vars <- c (
# "HM_NOISELVL",
# "HM_COORDX","HM_COORDY","RL_COORDX","RL_COORDY","RL_GCOORD",
# "RL_GCOORDN","RL_GCOORDW",
"HM_NDVI" ,"HM_NOISE" ,"RL_NDVI" ,"RL_NOISE" ,
"ALONE" ,"WITH_DOG" ,"WITH_KID" ,
"WITH_PAR" ,"WITH_PNT" ,"WITH_FND" ,
"LANG" ,"AGE" ,"SEX" ,
"DISTKM_sqrt" , "JNYTIME_sqrt" , "SPEED_log"
)
D_trn <- D_trn[vars]
D_tst <- D_tst[vars]
D_trn <- D_trn[complete.cases (D_trn), ]
D_tst <- D_tst[complete.cases (D_tst), ]
D_trn[] <- lapply (D_trn[], \(x) if (is.numeric (x)) scale (x) else x)
D_tst[] <- lapply (D_tst[], \(x) if (is.numeric (x)) scale (x) else x)
# summary(D[vars])
cor (cbind (D_trn[c ("DISTKM_sqrt" , "JNYTIME_sqrt" , "SPEED_log" )]))
DISTKM_sqrt JNYTIME_sqrt SPEED_log
DISTKM_sqrt 1.000 0.4894 0.7145
JNYTIME_sqrt 0.489 1.0000 -0.0835
SPEED_log 0.715 -0.0835 1.0000
Remove DIST_sqrt, as it is highly correlated with the other two.
RL_NDVI
lm_ndvi <- lm (RL_NDVI ~ (HM_NDVI + HM_NOISE +
# ALONE + WITH_DOG + WITH_KID + WITH_PAR + WITH_PNT + WITH_FND +
LANG + AGE + SEX +
SPEED_log + JNYTIME_sqrt)^ 2 , D_trn)
step_ndvi <- step (lm_ndvi, trace = FALSE , k = log (nrow (D_trn)))
summary (fit <- lm (formula (step_ndvi), D_tst))
Call:
lm(formula = formula(step_ndvi), data = D_tst)
Residuals:
Min 1Q Median 3Q Max
-3.371 -0.430 0.182 0.698 1.989
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.1690 0.0946 -1.79 0.07452 .
HM_NDVI 0.1581 0.0413 3.83 0.00015 ***
LANGGerman 0.2232 0.1064 2.10 0.03633 *
LANGItalian 0.0028 0.1965 0.01 0.98866
SPEED_log -0.0720 0.0414 -1.74 0.08277 .
JNYTIME_sqrt 0.1272 0.0415 3.07 0.00226 **
HM_NDVI:SPEED_log -0.1848 0.0422 -4.38 1.4e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.958 on 536 degrees of freedom
Multiple R-squared: 0.0917, Adjusted R-squared: 0.0815
F-statistic: 9.02 on 6 and 536 DF, p-value: 2.09e-09
R² = 0.08
Higher HM_NDVI corresponds to slightly higher RL-NDVI
higher JNYTIME_sqrt corresponds to slightly higher RL-NDVI
The faster (or further) you travel to RL, the more the RL_NDVI differs from HM_NDVI (negative interaction effect)
ggplot (D_tst, aes (x = JNYTIME_sqrt, y= HM_NOISE, col = RL_NOISE)) +
geom_jitter (width= 0.07 , height = 0.1 )
RL_NOISE
lm_noise <- lm (RL_NOISE ~ (HM_NDVI + HM_NOISE +
# ALONE + WITH_DOG + WITH_KID + WITH_PAR + WITH_PNT + WITH_FND +
LANG + AGE + SEX +
SPEED_log + JNYTIME_sqrt)^ 2 , D_trn)
step_noise <- step (lm_noise, trace = FALSE , k = log (nrow (D_trn)))
summary (lm (formula (step_noise), D_tst))
Call:
lm(formula = formula(step_noise), data = D_tst)
Residuals:
Min 1Q Median 3Q Max
-2.5097 -0.7495 -0.0467 0.6473 2.8683
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.03379 0.08896 -0.38 0.70426
HM_NOISE 0.23846 0.03916 6.09 2.2e-09 ***
LANGGerman -0.00494 0.09968 -0.05 0.96047
LANGItalian 0.62198 0.18667 3.33 0.00092 ***
SPEED_log -0.06326 0.03902 -1.62 0.10552
JNYTIME_sqrt -0.31956 0.03936 -8.12 3.2e-15 ***
HM_NOISE:JNYTIME_sqrt -0.03423 0.04070 -0.84 0.40061
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.904 on 536 degrees of freedom
Multiple R-squared: 0.193, Adjusted R-squared: 0.184
F-statistic: 21.3 on 6 and 536 DF, p-value: <2e-16
R² = 0.184
Participants can’t completely escape HM_NOISE (HM_NOISE positive predictor)
LANGItalians have it louder (than LANG de/fr)
Longer JNYTIME_sqrt leads to lower NOISE
Fit Gaussian Process for smooth plot
# Fit Gaussian Process
X <- as.matrix (D_tst[, c ("JNYTIME_sqrt" , "HM_NOISE" )])
y <- D_tst$ RL_NOISE
# Fit GP with RBF kernel
gp_model <- gausspr (X, y, kernel = "rbfdot" , kpar = "automatic" )
Using automatic sigma estimation (sigest) for RBF or laplace kernel
Fit Gaussian Process for smooth plot
# Create prediction grid
x_range <- range (D_tst$ JNYTIME_sqrt)
y_range <- range (D_tst$ HM_NOISE)
grid <- expand.grid (
JNYTIME_sqrt = seq (x_range[1 ], x_range[2 ], length.out = 50 ),
HM_NOISE = seq (y_range[1 ], y_range[2 ], length.out = 50 )
)
# Predict on grid
grid$ predicted_RL_NOISE <- predict (gp_model, as.matrix (grid[, 1 : 2 ]))
# Get combined range for both scales
combined_range <- range (c (D_tst$ RL_NOISE, grid$ predicted_RL_NOISE))
Plot predicted RL_NOISE with GP
# Plot with matching color scales
ggplot () +
geom_raster (data = grid, aes (x = JNYTIME_sqrt, y = HM_NOISE, fill = predicted_RL_NOISE)) +
geom_jitter (data = D_tst, aes (x = JNYTIME_sqrt, y = HM_NOISE, col = RL_NOISE, shape = LANG),
width = 0.07 , height = 0.1 , alpha = 0.7 ) +
scale_fill_viridis_c (name = "Predicted \n RL_NOISE" , limits = combined_range) +
scale_color_viridis_c (name = "Actual \n RL_NOISE" , limits = combined_range)
D$ PRS <- rowMeans (cbind (D$ FA,D$ BA,D$ EC,D$ ES))
summary_fun <- function (x) c (mean = mean (x, na.rm = TRUE ),
sd = sd (x, na.rm = TRUE ),
median = median (x, na.rm = TRUE ))
aggregate (PRS~ HM_NOISELVL, D, summary_fun)
HM_NOISELVL PRS.mean PRS.sd PRS.median
1 1 4.987 0.885 5.000
2 2 4.932 0.913 4.917
3 3 4.862 0.924 4.792
aggregate (FA ~ HM_NOISELVL, D, summary_fun)
HM_NOISELVL FA.mean FA.sd FA.median
1 1 5.25 1.10 5.33
2 2 5.16 1.21 5.21
3 3 5.18 1.13 5.33
aggregate (BA ~ HM_NOISELVL, D, summary_fun)
HM_NOISELVL BA.mean BA.sd BA.median
1 1 5.11 1.20 5.00
2 2 5.04 1.17 5.00
3 3 4.95 1.20 5.00
aggregate (EC ~ HM_NOISELVL, D, summary_fun)
HM_NOISELVL EC.mean EC.sd EC.median
1 1 4.52 1.31 4.33
2 2 4.48 1.33 4.33
3 3 4.47 1.29 4.33
aggregate (ES ~ HM_NOISELVL, D, summary_fun)
HM_NOISELVL ES.mean ES.sd ES.median
1 1 5.07 1.41 5.23
2 2 5.05 1.38 5.00
3 3 4.85 1.55 5.00