Back to Article
Hypothesis Testing
Download Source

Hypothesis Testing

Author

Lukas Graz

Published

February 13, 2025

In [1]:
Code
source("R/data_prep.R")
Number of matches per filter criteria (not disjoint)
  Headphone  PRS_all_NA    Distance Activity_NA    Duration  HMNoise_NA 
        303         226         221         102          96          96 
JourneyTime 
         20 
Keep  1494 of 2206 observations
Imputing PRS_orig_vars
Code
options(digits = 3)

# interactive <- function() FALSE
# D_trn$HM_NOISE_nrm <- scale(D_trn$HM_NOISE)
# D_tst$HM_NOISE_nrm <- scale(D_tst$HM_NOISE)
## No suspicious patterns in missing data
# mice::md.pattern(D[c(Mediator_vars, GIS_vars)[nNAs>0]], plot = FALSE)
# mice::md.pattern(D[PRS_orig_vars])

Linear Modeling

Imputation with MissForest on Training Data

In [2]:
Number of NAs in Mediators and GIS variables
sapply(D[Mediator_vars], \(x) sum(is.na(x)))
 FEELNAT   LNOISE LOC_SENS LOC_SOUN LOC_SCEN LOC_VISE LOC_VEGE LOC_FAUN 
      16      291       28       30       36       62       69       88 
Number of NAs in Mediators and GIS variables
sapply(D[GIS_vars], \(x) sum(is.na(x)))
  LCARTIF_sqrt  LCFOREST_sqrt          HETER    OVDIST_sqrt     VIS5K_sqrt 
             0              0              0              0              0 
       RL_NDVI       RL_NOISE    DISTKM_sqrt   JNYTIME_sqrt STRIMP123_sqrt 
             0              0              0             86              0 
STRIMP999_sqrt 
             0 
In [3]:
Impute missing values using MissForest
# Mediator imputation
D_trn[Mediator_vars] <- xfun::cache_rds({
  missForest(as.matrix(D_trn[Mediator_vars]))
  }, 
  file = "Mediator_imputation.rds", 
  dir = "cache/",
  hash = list(as.matrix(D_trn[Mediator_vars]))
)$ximp |> as.data.frame()

# GIS imputation (missForest)
D_trn[GIS_vars] <- xfun::cache_rds({
  missForest(as.matrix(D_trn[GIS_vars]))
  }, 
  file = "GIS_imputation.rds", 
  dir = "cache/",
  hash = list(as.matrix(D_trn[GIS_vars]))
)$ximp |> as.data.frame()

Scaling Test Data

In [4]:
Scaling variables and show old scale
all_vars <- c(Mediator_vars, GIS_vars, PRS_vars)
old_scale <- t(sapply(D_tst[c(all_vars)], \(x) 
  c(mean = mean(x, na.rm = TRUE), sd = sd(x, na.rm = TRUE))))

D_tst[c(all_vars)] <- lapply(D_tst[c(all_vars)], scale)

old_scale
                 mean     sd
FEELNAT         6.142  1.055
LNOISE          4.210  0.747
LOC_SENS        4.098  1.016
LOC_SOUN        4.296  0.947
LOC_SCEN        3.967  1.056
LOC_VISE        4.080  1.027
LOC_VEGE        4.343  0.859
LOC_FAUN        3.298  1.365
LCARTIF_sqrt    0.271  0.269
LCFOREST_sqrt   0.454  0.311
HETER           1.305  0.402
OVDIST_sqrt    21.797 10.144
VIS5K_sqrt      3.323  1.620
RL_NDVI         0.635  0.202
RL_NOISE       41.615  9.261
DISTKM_sqrt     1.473  1.156
JNYTIME_sqrt    3.830  2.247
STRIMP123_sqrt  6.555 10.739
STRIMP999_sqrt 47.557 13.162
MEAN            4.987  0.882
FA              5.266  1.111
BA              5.141  1.157
EC              4.542  1.290
ES              5.006  1.430

Testing VIF

In [5]:
VIF: PRS ~ Mediators + GIS_vars (NO interaction)
car::vif(fit_PRS_MED <- lm(as.formula(paste0(
  PRS_vars[1], 
  " ~ ", 
  paste(Mediator_vars, collapse = " + "), " + ",
  paste(GIS_vars,      collapse = " + ")
)), D_trn)) |> summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.21    1.46    1.94    2.03    2.24    4.99 
In [6]:
VIF: PRS ~ (Mediators + GIS_vars)^2 (WITH interaction)
suppressMessages(
car::vif(fit_PRS_MED <- lm(as.formula(paste0(
  PRS_vars[1], 
  " ~ ", 
  "(", paste(Mediator_vars, collapse = " + "), 
     " + ", paste(GIS_vars, collapse = " + "), 
  ")^2"
)), D_trn))) |> summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   11.9    81.4   147.6   204.9   270.7  1391.7 

Since we model with interactions later, the latter VIF are relevant for us. Given that they are very high (c.f. median and max), we would have no hope of finding any significant results in the full interaction model. Therefore, we will first perfom a variable selection, to reduce the VIF and enable us to find significant effects.

All Interactions: Mediators ~ (GIS)^2

In [7]:
Code: Print Coef Table

# Elegant function to create coefficient tables from model summaries
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Code: Print Coef Table

library(tidyr)
library(knitr)
library(purrr)

#' Create a formatted coefficient table from model summary list
#' 
#' @param model_summaries List of model summaries (e.g., output from lapply(models, summary))
#' @param sig_threshold Significance threshold for bold formatting (default: 0.001)
#' @param covariate_order Optional vector specifying order of covariates
#' @return Formatted table with models as columns and covariates as rows
create_coef_table <- function(model_summaries, sig_threshold = 0.001, covariate_order = NULL) {
  
  # Extract and format coefficients for all models
  format_model_coef <- function(coef_matrix, model_name) {
    estimates <- coef_matrix[, "Estimate"]
    p_values <- coef_matrix[, "Pr(>|t|)"]
    
    # Format with significance stars (common notation)
    formatted_coef <- sapply(seq_along(estimates), function(i) {
      est_str <- sprintf("%.3f", estimates[i])
      stars <- case_when(
        p_values[i] < 0.001 ~ "***",
        p_values[i] < 0.01 ~ "**",
        p_values[i] < 0.05 ~ "*",
        p_values[i] < 0.1 ~ ".",
        TRUE ~ ""
      )
      paste0(est_str, stars)
    })
    
    tibble(
      Model = model_name,
      Covariate = rownames(coef_matrix),
      Coefficient = formatted_coef
    )
  }
  
  # Process all models and create wide table
  coef_list <- map(model_summaries, coef)
  
  results_table <- map2_dfr(coef_list, names(coef_list), format_model_coef) %>%
    pivot_wider(names_from = Model, values_from = Coefficient, values_fill = "")
  
  # Apply covariate ordering
  if (is.null(covariate_order)) {
    # Default: Intercept first, then alphabetical
    all_covariates <- unique(results_table$Covariate)
    covariate_order <- c("(Intercept)", sort(all_covariates[all_covariates != "(Intercept)"]))
  }
  
  # Filter and reorder covariates
  results_table <- results_table %>%
    filter(Covariate %in% covariate_order) %>%
    slice(match(covariate_order, Covariate))
  
  kable(results_table, 
        format = "pipe",
        align = c("l", rep("c", ncol(results_table) - 1)))
}
In [8]:
Res3 <- list()
for (mediator in Mediator_vars) {
  intercept_model <- lm(as.formula(paste0(
    mediator, " ~ 1")), D_trn)
  step_model <- step(intercept_model, 
    scope = as.formula(paste0(
      mediator, " ~ ", 
      "(", paste(GIS_vars, collapse = " + "), ")^2"
    )),
    trace = FALSE, k = log(nrow(D_trn))
  )
  Res3[[mediator]] <- lm(formula(step_model), D_tst)
}
(ResSum3 <- lapply(Res3, summary))
$FEELNAT

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-4.959 -0.391  0.264  0.607  1.685 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)            0.0618     0.0410    1.51  0.13260    
LCARTIF_sqrt          -0.1524     0.0570   -2.67  0.00770 ** 
RL_NDVI                0.1498     0.0436    3.43  0.00063 ***
OVDIST_sqrt            0.0270     0.0452    0.60  0.55112    
LCARTIF_sqrt:RL_NDVI   0.1146     0.0402    2.85  0.00446 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.948 on 733 degrees of freedom
  (9 observations deleted due to missingness)
Multiple R-squared:  0.106, Adjusted R-squared:  0.101 
F-statistic: 21.8 on 4 and 733 DF,  p-value: <2e-16


$LNOISE

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-3.999 -0.527  0.012  0.676  1.622 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.00097    0.03862   -0.03    0.980    
LCARTIF_sqrt -0.12357    0.04841   -2.55    0.011 *  
RL_NOISE     -0.24203    0.04890   -4.95  9.7e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.945 on 596 degrees of freedom
  (148 observations deleted due to missingness)
Multiple R-squared:  0.11,  Adjusted R-squared:  0.107 
F-statistic: 36.6 on 2 and 596 DF,  p-value: 9.79e-16


$LOC_SENS

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-3.319 -0.217 -0.007  0.831  1.390 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -0.000148   0.036675    0.00  0.99678    
HETER           0.129769   0.038233    3.39  0.00073 ***
STRIMP999_sqrt -0.072664   0.038328   -1.90  0.05837 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.993 on 730 degrees of freedom
  (14 observations deleted due to missingness)
Multiple R-squared:  0.0168,    Adjusted R-squared:  0.0141 
F-statistic: 6.22 on 2 and 730 DF,  p-value: 0.00209


$LOC_SOUN

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-3.623 -0.402  0.482  0.698  1.512 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.000403   0.036322    0.01   0.9912    
LCARTIF_sqrt -0.175213   0.037180   -4.71  2.9e-06 ***
HETER         0.109010   0.037186    2.93   0.0035 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.984 on 731 degrees of freedom
  (13 observations deleted due to missingness)
Multiple R-squared:  0.0344,    Adjusted R-squared:  0.0317 
F-statistic:   13 on 2 and 731 DF,  p-value: 2.82e-06


$LOC_SCEN

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.0383 -0.2135  0.0669  0.8101  1.8029 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.00114    0.03618   -0.03     0.97    
RL_NDVI      0.21701    0.03629    5.98  3.5e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.977 on 727 degrees of freedom
  (18 observations deleted due to missingness)
Multiple R-squared:  0.0469,    Adjusted R-squared:  0.0456 
F-statistic: 35.8 on 1 and 727 DF,  p-value: 3.48e-09


$LOC_VISE

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.0692 -0.1488 -0.0235  0.8511  1.0861 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)  
(Intercept)  -0.000356   0.037305   -0.01    0.992  
LCARTIF_sqrt -0.071106   0.037580   -1.89    0.059 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.998 on 714 degrees of freedom
  (31 observations deleted due to missingness)
Multiple R-squared:  0.00499,   Adjusted R-squared:  0.0036 
F-statistic: 3.58 on 1 and 714 DF,  p-value: 0.0589


$LOC_VEGE

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-3.971 -0.492  0.443  0.706  1.599 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -0.0192     0.0378   -0.51   0.6111    
RL_NDVI        0.2195     0.0382    5.75  1.4e-08 ***
JNYTIME_sqrt  -0.1139     0.0379   -3.01   0.0027 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.98 on 671 degrees of freedom
  (73 observations deleted due to missingness)
Multiple R-squared:  0.0549,    Adjusted R-squared:  0.052 
F-statistic: 19.5 on 2 and 671 DF,  p-value: 6.03e-09


$LOC_FAUN

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-1.898 -0.914  0.300  0.796  1.744 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.00127    0.03685   -0.03     0.97    
LCARTIF_sqrt -0.21409    0.03697   -5.79  1.1e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.978 on 702 degrees of freedom
  (43 observations deleted due to missingness)
Multiple R-squared:  0.0456,    Adjusted R-squared:  0.0442 
F-statistic: 33.5 on 1 and 702 DF,  p-value: 1.05e-08

All Interactions: PRS ~ (Mediators + GIS)^2

In [9]:
Res4 <- list()
for (prs in PRS_vars) {
  intercept_model <- lm(as.formula(paste0(
    prs, " ~ 1")), D_trn)
  step_model <- step(intercept_model, 
    scope = as.formula(paste0(
      prs, " ~ ", 
      "(", paste(GIS_vars, collapse = " + "), " + ", 
      paste(Mediator_vars, collapse = " + "), ")^2"
    )),
    trace = FALSE, k = log(nrow(D_trn))
  )
  Res4[[prs]] <- lm(formula(step_model), D_tst)
}
(ResSum4 <- lapply(Res4, summary))
$MEAN

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.8609 -0.5631 -0.0386  0.5998  2.7647 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -0.00762    0.03787   -0.20    0.841    
LOC_VISE          0.17314    0.04201    4.12  4.3e-05 ***
FEELNAT           0.20198    0.04293    4.70  3.2e-06 ***
LOC_SENS          0.10408    0.04234    2.46    0.014 *  
LNOISE            0.17667    0.04049    4.36  1.5e-05 ***
FEELNAT:LOC_SENS  0.05398    0.02793    1.93    0.054 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.893 on 567 degrees of freedom
  (174 observations deleted due to missingness)
Multiple R-squared:  0.192, Adjusted R-squared:  0.184 
F-statistic: 26.9 on 5 and 567 DF,  p-value: <2e-16


$FA

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-3.391 -0.518  0.093  0.592  2.567 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -0.00324    0.03796   -0.09  0.93195    
LOC_VISE          0.12823    0.04125    3.11  0.00197 ** 
LOC_FAUN          0.17631    0.04167    4.23  2.7e-05 ***
LNOISE            0.13307    0.04017    3.31  0.00098 ***
RL_NDVI          -0.13265    0.03906   -3.40  0.00073 ***
FEELNAT           0.16853    0.04528    3.72  0.00022 ***
LOC_SCEN          0.16380    0.04567    3.59  0.00036 ***
FEELNAT:LOC_SCEN -0.00170    0.03148   -0.05  0.95699    
RL_NDVI:LOC_SCEN  0.02416    0.03405    0.71  0.47821    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.865 on 556 degrees of freedom
  (182 observations deleted due to missingness)
Multiple R-squared:  0.256, Adjusted R-squared:  0.245 
F-statistic: 23.9 on 8 and 556 DF,  p-value: <2e-16


$BA

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-3.965 -0.507  0.070  0.647  2.410 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.00835    0.03572   -0.23   0.8152    
LOC_VISE     0.12213    0.04045    3.02   0.0026 ** 
FEELNAT      0.18778    0.03684    5.10  4.4e-07 ***
LOC_SENS     0.14743    0.04055    3.64   0.0003 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.948 on 700 degrees of freedom
  (43 observations deleted due to missingness)
Multiple R-squared:  0.115, Adjusted R-squared:  0.111 
F-statistic: 30.2 on 3 and 700 DF,  p-value: <2e-16


$EC

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9221 -0.5444 -0.0364  0.6855  2.3639 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -0.00858    0.03666   -0.23  0.81510    
LOC_SENS       0.14234    0.04085    3.48  0.00052 ***
LOC_SCEN       0.00370    0.04121    0.09  0.92855    
LCFOREST_sqrt -0.08955    0.03752   -2.39  0.01724 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.986 on 719 degrees of freedom
  (24 observations deleted due to missingness)
Multiple R-squared:  0.0297,    Adjusted R-squared:  0.0257 
F-statistic: 7.34 on 3 and 719 DF,  p-value: 7.55e-05


$ES

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-3.492 -0.541  0.145  0.689  2.296 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)     0.03146    0.04159    0.76   0.4497    
LNOISE          0.13295    0.04315    3.08   0.0022 ** 
LOC_SENS        0.09603    0.04085    2.35   0.0191 *  
DISTKM_sqrt     0.08081    0.04039    2.00   0.0459 *  
FEELNAT         0.25812    0.04639    5.56    4e-08 ***
LNOISE:FEELNAT -0.00623    0.03425   -0.18   0.8558    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.95 on 576 degrees of freedom
  (165 observations deleted due to missingness)
Multiple R-squared:  0.141, Adjusted R-squared:  0.133 
F-statistic: 18.8 on 5 and 576 DF,  p-value: <2e-16

Table Summarizing Coefficients

Significant codes as usual: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

All Interactions: Mediators ~ (GIS)^2

In [10]:
create_coef_table(ResSum3)
Covariate FEELNAT LNOISE LOC_SENS LOC_SOUN LOC_SCEN LOC_VISE LOC_VEGE LOC_FAUN
(Intercept) 0.062 -0.001 -0.000 0.000 -0.001 -0.000 -0.019 -0.001
HETER 0.130*** 0.109**
JNYTIME_sqrt -0.114**
LCARTIF_sqrt -0.152** -0.124* -0.175*** -0.071. -0.214***
LCARTIF_sqrt:RL_NDVI 0.115**
OVDIST_sqrt 0.027
RL_NDVI 0.150*** 0.217*** 0.219***
RL_NOISE -0.242***
STRIMP999_sqrt -0.073.

PRS ~ (Mediators + GIS)^2

In [11]:
create_coef_table(ResSum4)
Covariate MEAN FA BA EC ES
(Intercept) -0.008 -0.003 -0.008 -0.009 0.031
DISTKM_sqrt 0.081*
FEELNAT 0.202*** 0.169*** 0.188*** 0.258***
FEELNAT:LOC_SCEN -0.002
FEELNAT:LOC_SENS 0.054.
LCFOREST_sqrt -0.090*
LNOISE 0.177*** 0.133*** 0.133**
LNOISE:FEELNAT -0.006
LOC_FAUN 0.176***
LOC_SCEN 0.164*** 0.004
LOC_SENS 0.104* 0.147*** 0.142*** 0.096*
LOC_VISE 0.173*** 0.128** 0.122**
RL_NDVI -0.133***
RL_NDVI:LOC_SCEN 0.024

Legacy Code

In [12]:
Variable selection over multiple y (not desired)

# library(grpreg)
# fit <- cv.grpreg(X = model.matrix(fit_MED_GIS)[,-1], y = D_trn[Mediator_vars[1:2]])
# coef(fit) |> t()
# fit$beta
# plot(fit)
In [13]:
Linear models with mice

Y <- D[]

library(mice, quietly = TRUE)
library(car, quietly = TRUE)
library(miceadds, quietly = TRUE)
data(nhanes2, package = "mice")
set.seed(9090)

mi.res <- miceadds::mice.1chain(nhanes2, burnin = 4, iter = 20, Nimp = 8)
an2a <- miceadds::mi.anova(mi.res = mi.res, formula = "bmi ~ age * chl")

mod1 <- with(mi.res, stats::lm(bmi ~ age * chl))
mod0 <- with(mi.res, stats::lm(bmi ~ age + chl))

mitml::testModels(model = mod1$analyses, null.model = mod0$analyses, method = "D1")
mitml::testModels(model = mod1$analyses, null.model = mod0$analyses, method = "D2")

an2b <- miceadds::mi.anova(mi.res = mi.res, formula = "bmi ~ age * chl", type = 3)
In [14]:
Mediators ~ GIS
stop("this should not run")
Res1 <- list()
for (mediator in Mediator_vars) {
  full_model <- lm(as.formula(paste0(
    mediator, " ~ ", 
    "HM_NOISE_nrm * (", paste(GIS_vars, collapse = " + "), ")"
  )), D_trn)
  small_model <- step(full_model, trace = FALSE, k = log(nrow(D_trn)))
  Res1[[mediator]] <- lm(formula(small_model), D_tst)
}
lapply(Res1, summary)
In [15]:
PRS ~ Mediators
Res2 <- list()
for (mediator in Mediator_vars) {
  full_model <- lm(as.formula(paste0(
    mediator, " ~ ", 
    "HM_NOISE_nrm * (", paste(GIS_vars, collapse = " + "), ")"
  )), D_trn)
  small_model <- step(full_model, trace = FALSE, k = log(nrow(D_trn)))
  Res2[[mediator]] <- lm(formula(small_model), D_tst)
}
lapply(Res2, summary)
library(knitr)
library(kableExtra)

# Function to extract coefficients with significance indicators
extract_coef_data <- function(model_name, coef_data) {
  # Extract coefficient names, estimates, and p-values
  coef_names <- rownames(coef_data)
  estimates <- coef_data[, "Estimate"]
  p_values <- coef_data[, "Pr(>|t|)"]
  
  # Format estimates with significance stars
  formatted_coef <- sapply(1:length(estimates), function(i) {
    est_str <- sprintf("%.3f", estimates[i])
    if (p_values[i] < 0.001) {
      paste0("**", est_str, "**")  # Bold for p < 0.001
    } else {
      est_str
    }
  })
  
  # Create data frame
  data.frame(
    Model = model_name,
    Covariate = coef_names,
    Coefficient = formatted_coef,
    stringsAsFactors = FALSE
  )
}

# Manual data entry based on your results
# You would typically extract this from your model objects, but since you have text output:

model_results <- list()

# FEELNAT model
feelnat_coef <- data.frame(
  row.names = c("(Intercept)", "LCARTIF_sqrt", "RL_NDVI", "OVDIST_sqrt", "LCARTIF_sqrt:RL_NDVI"),
  Estimate = c(0.0618, -0.1524, 0.1498, 0.0270, 0.1146),
  `Pr(>|t|)` = c(0.13260, 0.00770, 0.00063, 0.55112, 0.00446),
  check.names = FALSE
)
model_results[["FEELNAT"]] <- extract_coef_data("FEELNAT", feelnat_coef)

# LNOISE model
lnoise_coef <- data.frame(
  row.names = c("(Intercept)", "LCARTIF_sqrt", "RL_NOISE"),
  Estimate = c(-0.00097, -0.12357, -0.24203),
  `Pr(>|t|)` = c(0.980, 0.011, 9.7e-07),
  check.names = FALSE
)
model_results[["LNOISE"]] <- extract_coef_data("LNOISE", lnoise_coef)

# LOC_SENS model
loc_sens_coef <- data.frame(
  row.names = c("(Intercept)", "HETER", "STRIMP999_sqrt"),
  Estimate = c(-0.000148, 0.129769, -0.072664),
  `Pr(>|t|)` = c(0.99678, 0.00073, 0.05837),
  check.names = FALSE
)
model_results[["LOC_SENS"]] <- extract_coef_data("LOC_SENS", loc_sens_coef)

# LOC_SOUN model
loc_soun_coef <- data.frame(
  row.names = c("(Intercept)", "LCARTIF_sqrt", "HETER"),
  Estimate = c(0.000403, -0.175213, 0.109010),
  `Pr(>|t|)` = c(0.9912, 2.9e-06, 0.0035),
  check.names = FALSE
)
model_results[["LOC_SOUN"]] <- extract_coef_data("LOC_SOUN", loc_soun_coef)

# LOC_SCEN model
loc_scen_coef <- data.frame(
  row.names = c("(Intercept)", "RL_NDVI"),
  Estimate = c(-0.00114, 0.21701),
  `Pr(>|t|)` = c(0.97, 3.5e-09),
  check.names = FALSE
)
model_results[["LOC_SCEN"]] <- extract_coef_data("LOC_SCEN", loc_scen_coef)

# LOC_VISE model
loc_vise_coef <- data.frame(
  row.names = c("(Intercept)", "LCARTIF_sqrt"),
  Estimate = c(-0.000356, -0.071106),
  `Pr(>|t|)` = c(0.992, 0.059),
  check.names = FALSE
)
model_results[["LOC_VISE"]] <- extract_coef_data("LOC_VISE", loc_vise_coef)

# LOC_VEGE model
loc_vege_coef <- data.frame(
  row.names = c("(Intercept)", "RL_NDVI", "JNYTIME_sqrt"),
  Estimate = c(-0.0192, 0.2195, -0.1139),
  `Pr(>|t|)` = c(0.6111, 1.4e-08, 0.0027),
  check.names = FALSE
)
model_results[["LOC_VEGE"]] <- extract_coef_data("LOC_VEGE", loc_vege_coef)

# LOC_FAUN model
loc_faun_coef <- data.frame(
  row.names = c("(Intercept)", "LCARTIF_sqrt"),
  Estimate = c(-0.00127, -0.21409),
  `Pr(>|t|)` = c(0.97, 1.1e-08),
  check.names = FALSE
)
model_results[["LOC_FAUN"]] <- extract_coef_data("LOC_FAUN", loc_faun_coef)

# Combine all results
all_results <- do.call(rbind, model_results)

# Create wide format table
results_wide <- all_results %>%
  select(Model, Covariate, Coefficient) %>%
  tidyr::pivot_wider(names_from = Model, values_from = Coefficient, values_fill = "")

# Get all unique covariates and sort them (intercept first)
all_covariates <- unique(all_results$Covariate)
covariate_order <- c("(Intercept)", sort(all_covariates[all_covariates != "(Intercept)"]))

# Reorder rows
results_wide <- results_wide %>%
  slice(match(covariate_order, Covariate))

# Create the final table
print("Linear Model Results Summary")
print("Bold coefficients indicate p < 0.001")
print("")

# Display as a nice table
kable(results_wide, 
      format = "pipe",
      align = c("l", rep("c", ncol(results_wide)-1))) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

# Alternative: Simple data frame for viewing
print(results_wide)