The (interactive) correlation heatmap reveals very high correlation among aminoacids and TG compounds.
load("data/data.Rdata")
X <- df[df$Diagnosis == "Probable AD", 1:230]
C <- round(cor(X), 2)
heatmaply_cor(C, color = "magma", plot_method = "plotly", dendrogram = F, reorderfun = sort.default(d, w), main = "Correlation Heatmap", file = "heatmap.html", colorbar_thickness = 15, colorbar_len = 0.5)
Performing a Global Test of ApoE4 dose-effects, on serum metabolites of AD patients, correcting for sex (Ho: ApoE4 dose has no effect on mean metabolite levels, Ha: it has an effect), yields a significant difference (p = 0.0171). The most significantly affected metabolites are triglycerides and diglycerides (FDR-adjusted p-value <0.05).
AD <- subset(df, Diagnosis == "Probable AD")
AD$E4dose <- as.factor(AD$E4dose)
gt <- globaltest::gt(E4dose ~ 1, E4dose ~ . - E4 - Diagnosis - target, data = AD, standardize = TRUE)
gt
p-value Statistic Expected Std.dev #Cov
1 0.0171 1.23 0.8 0.168 232
covariates <- covariates(gt, sort = T)
res.gt <- extract(covariates) %>%
sort() %>%
p.adjust(, method = "FDR") %>%
globaltest::result() %>%
mutate_if(is.numeric, round, digits = 3)
res.gt$direction <- as.factor(res.gt$direction)
levels(res.gt$direction) <- c("no ApoE4", "1 ApoE4", "2 ApoE4")
res.gt$alias <- NULL
res.gt <- res.gt[, 1:4]
names(res.gt) <- c("Inheritance", "Assoc. with", "FDR", "p-value")
res.gt <- res.gt[, c(1:(ncol(res.gt) - 2), ncol(res.gt), (ncol(res.gt) - 1))]
filter(res.gt, `p-value` < 0.05)
Inheritance Assoc. with p-value FDR
Lip.TG.56.2. 0.046 1 ApoE4 0.000 0.016
Lip.TG.58.1. 0.117 1 ApoE4 0.000 0.021
Lip.DG.36.3. 0.190 1 ApoE4 0.000 0.021
Lip.TG.56.3. 0.244 1 ApoE4 0.000 0.021
Lip.TG.52.3. 0.424 1 ApoE4 0.001 0.031
Lip.TG.54.5. 0.480 1 ApoE4 0.001 0.027
Lip.TG.58.2. 0.722 1 ApoE4 0.001 0.027
Lip.TG.54.4. 0.761 1 ApoE4 0.002 0.033
Lip.TG.58.9. 1.000 1 ApoE4 0.001 0.027
Lip.TG.56.7. 1.000 1 ApoE4 0.001 0.027
Lip.TG.58.8. 1.000 1 ApoE4 0.001 0.032
Lip.TG.54.6. 1.000 1 ApoE4 0.002 0.035
Lip.TG.56.8. 1.000 1 ApoE4 0.002 0.037
Lip.TG.52.4. 1.000 1 ApoE4 0.003 0.055
Lip.TG.54.3. 1.000 1 ApoE4 0.004 0.055
Lip.TG.56.6. 1.000 1 ApoE4 0.004 0.059
Lip.TG.56.1. 1.000 1 ApoE4 0.004 0.059
Lip.TG.52.2. 1.000 1 ApoE4 0.005 0.063
Lip.TG.54.2. 1.000 1 ApoE4 0.005 0.063
Lip.TG.60.2. 1.000 1 ApoE4 0.007 0.077
Lip.TG.58.10. 1.000 1 ApoE4 0.011 0.124
Lip.TG.51.3. 1.000 1 ApoE4 0.015 0.154
Lip.SM.d18.1.18.1. 1.000 2 ApoE4 0.018 0.169
OA.OA09...2.ketoglutaric.acid 1.000 2 ApoE4 0.019 0.169
Lip.TG.52.0. 1.000 1 ApoE4 0.019 0.169
Lip.TG.50.3. 1.000 2 ApoE4 0.020 0.169
OA.OA29...Uracil 1.000 no ApoE4 0.020 0.169
Lip.TG.52.5. 1.000 1 ApoE4 0.021 0.174
Lip.TG.54.0. 1.000 1 ApoE4 0.022 0.174
Lip.TG.50.2. 1.000 2 ApoE4 0.022 0.174
Lip.TG.51.2. 1.000 1 ApoE4 0.023 0.174
Am.L.Glutamine 1.000 no ApoE4 0.024 0.174
Lip.TG.50.1. 1.000 2 ApoE4 0.025 0.174
Lip.TG.50.4. 1.000 1 ApoE4 0.026 0.176
Lip.TG.52.1. 1.000 1 ApoE4 0.027 0.176
Lip.TG.54.1. 1.000 1 ApoE4 0.031 0.203
Lip.TG.50.0. 1.000 1 ApoE4 0.037 0.229
OA.OA08...Malic.acid 1.000 2 ApoE4 0.038 0.234
Lip.DG.36.2. 1.000 1 ApoE4 0.039 0.235
OS.HpH.PAF.C16.0 1.000 2 ApoE4 0.049 0.283
Metabolite-level models to test for ApoE4 dose effects, correcting for age at diagnosis, sex, smoking status, alcohol consumption, hypertension (and medication), hyperlipidemia (and medication), anticoagulant medication, anti-depressants, mean arterial pressure (MAP) and body mass index (BMI). The F-tests yielded p-values < 0.05, however none of them survived FDR correction. Interestingly, in the SCD group, aminoacids are mostly affected and no TGs or DGs compared to the AD group.
nested <- function(Y, x, ...) {
### Analysis of Variance (ANOVA)
x <- as.factor(x)
df <- cbind(Y, x, ...)
ncol <- ncol(Y)
covariates <- names(...)
F_tests <- furrr::future_map(df[, 1:ncol], ~ {
frm <- as.formula(paste0(".x ~", paste(covariates, collapse = "+")))
rm <- lm(frm, data = df)
ffm <- as.formula(paste0(".x ~ x +", paste(covariates, collapse = "+")))
fm <- lm(ffm, data = df)
anova(rm, fm)
})
# Correction for Multiple Testing
# Create a list to store the p-values
# Extract the p-values of the F-tests from the anova summaries list and store them in p_values
p_values <- F_tests %>%
future_map(~ .x[["Pr(>F)"]][[2]])
# Coerce p_values to dataframe and transpose it
p_values <- as.data.frame(p_values) %>%
t() %>%
data.frame(row.names = colnames(Y))
# Calculate the FDR-adjusted p-values
p_values$p_adj <- p.adjust(p_values[, ], method = "fdr", n = 230)
p_values <- round(p_values, 3)
# Filter out the non-significant (a=0.05) FDR-adjusted p-values
sig <- as.data.frame(dplyr::filter(p_values, `.` < 0.05))
sig <- sig[order(sig$p_adj), ]
names(sig) <- c("P(>F)", "FDR")
cat("Significant F-tests: \n")
print(sig)
cat("\nDose effects on metabolites:")
summaries <- furrr::future_map(df[, rownames(sig)], ~ {
f <- as.formula(paste0(".x ~ x +", paste(covariates, collapse = "+")))
mdl <- lm(f, data = df)
summary(mdl)$coefficients[1:length(table(x)), ]
})
summaries <- lapply(summaries, function(x) {
x <- x[, -c(2, 3)]
x <- t(x)
})
summaries <- plyr::ldply(summaries, id = 1:2) %>%
mutate(across(where(is.numeric), round, digits = 3))
coeffs <- summaries[c(TRUE, FALSE), ]
pvalues <- summaries[c(FALSE, TRUE), ]
t_tests <- cbind.data.frame(coeffs[, 1], coeffs[, 2], pvalues[, 2], coeffs[, 3], pvalues[, 3], coeffs[, 4], pvalues[, 4])
names(t_tests) <- c("Metabolite", "No ApoE4", "P(>t)", "ApoE4x1", "P(>t)", "ApoE4x2", "P(>t)")
return(t_tests)
}
Y <- df[, 1:230]
df$E4dose <- as.numeric(df$E4dose) - 1
df$E4dose[df$E4dose == 0] <- "No ApoE4"
df$E4dose[df$E4dose == 1] <- "1 ApoE4"
df$E4dose[df$E4dose == 2] <- "2 ApoE4"
E4dose <- df$E4dose <- as.factor(df$E4dose)
Among AD patients, ApoE4 dose seems to have positive effect on several triglycerides, diglycerides, putrescine, 2-ketoglutraric acid, lysophosphatidylcholin, HpH.PAF.-C16.0 and -C18.0
load("data/clinical.Rdata")
AD <- subset(df, Diagnosis == "Probable AD")
ADY <- AD[, 1:230]
ADE4dose <- AD$E4dose
ADclinical <- clinical[df$Diagnosis == "Probable AD", ]
## Testing for effects of ApoE4 dose
nested(ADY, relevel(ADE4dose, ref = "No ApoE4"), ADclinical)
Significant F-tests:
P(>F) FDR
Lip.TG.52.3. 0.001 0.156
Lip.TG.52.4. 0.003 0.156
Lip.DG.36.3. 0.002 0.156
OS.HpH.PAF.C16.0 0.002 0.156
Lip.TG.52.2. 0.006 0.255
Lip.TG.54.5. 0.010 0.373
Lip.TG.50.2. 0.012 0.398
Lip.TG.50.1. 0.018 0.450
Lip.TG.54.4. 0.020 0.450
Lip.TG.54.6. 0.017 0.450
Am.Putrescine 0.029 0.515
OA.OA09...2.ketoglutaric.acid 0.026 0.515
Lip.TG.50.3. 0.028 0.515
Lip.TG.52.5. 0.042 0.538
Lip.TG.56.7. 0.042 0.538
Lip.TG.56.8. 0.044 0.538
Lip.TG.58.9. 0.042 0.538
Lip.LPC.16.0. 0.033 0.538
OS.HpH.LPA.C18.0 0.044 0.538
Dose effects on metabolites:
Metabolite No ApoE4 P(>t) ApoE4x1 P(>t) ApoE4x2 P(>t)
1 Lip.TG.52.3. -1.443 0.818 2.568 0.004 3.719 0.001
2 Lip.TG.52.4. 0.620 0.888 1.628 0.008 2.392 0.002
3 Lip.DG.36.3. 0.026 0.631 0.019 0.012 0.031 0.001
4 OS.HpH.PAF.C16.0 34.751 0.000 2.593 0.017 4.645 0.001
5 Lip.TG.52.2. -5.080 0.469 2.124 0.030 3.744 0.002
6 Lip.TG.54.5. 1.863 0.496 0.928 0.016 1.280 0.006
7 Lip.TG.50.2. -4.499 0.297 0.882 0.141 2.193 0.003
8 Lip.TG.50.1. -2.324 0.539 0.706 0.179 1.837 0.005
9 Lip.TG.54.4. 3.457 0.318 1.178 0.015 1.369 0.020
10 Lip.TG.54.6. -0.434 0.795 0.515 0.027 0.746 0.009
11 Am.Putrescine 0.004 0.000 0.000 0.035 0.000 0.017
12 OA.OA09...2.ketoglutaric.acid 0.008 0.366 0.000 0.953 0.003 0.014
13 Lip.TG.50.3. -1.203 0.668 0.594 0.127 1.268 0.008
14 Lip.TG.52.5. 0.459 0.789 0.358 0.134 0.725 0.014
15 Lip.TG.56.7. -2.394 0.095 0.487 0.015 0.392 0.105
16 Lip.TG.56.8. -1.393 0.055 0.248 0.014 0.175 0.151
17 Lip.TG.58.9. -0.447 0.068 0.085 0.013 0.034 0.410
18 Lip.LPC.16.0. 4.220 0.028 0.312 0.237 0.850 0.009
19 OS.HpH.LPA.C18.0 0.498 0.038 0.044 0.178 0.101 0.013
Among individuals with SCD, lipid metabolites were not affected as much as in the AD group, with only two sphingomyelin species showing a difference. Aminoacids L-serine, tryptophan, glycine,trytptophan, L-homoserine, putrescine were affected in this group.
SCD <- df[df$Diagnosis == "Subjectieve klachten", ]
SCDY <- SCD[, 1:230]
SCDE4dose <- SCD$E4dose
SCDclinical <- clinical[df$Diagnosis == "Subjectieve klachten", ]
## Testing for effects of ApoE4 dose
nested(SCDY, relevel(SCDE4dose, ref = "No ApoE4"), SCDclinical)
Significant F-tests:
P(>F) FDR
Am.L.Serine 0.002 0.285
Am.L.Tryptophan 0.002 0.285
Am.Glycine 0.006 0.450
Am.L.homoserine 0.047 0.931
Am.Putrescine 0.045 0.931
Lip.SM.d18.1.22.0. 0.043 0.931
Lip.SM.d18.1.23.0. 0.029 0.931
Dose effects on metabolites:
Metabolite No ApoE4 P(>t) ApoE4x1 P(>t) ApoE4x2 P(>t)
1 Am.L.Serine 4.669 0.000 0.191 0.115 -1.058 0.003
2 Am.L.Tryptophan 3.747 0.004 -0.307 0.047 -1.364 0.002
3 Am.Glycine 4.332 0.001 0.382 0.015 -0.873 0.052
4 Am.L.homoserine 0.003 0.001 0.000 0.723 -0.001 0.014
5 Am.Putrescine 0.000 0.969 0.001 0.013 0.000 0.927
6 Lip.SM.d18.1.22.0. 3.337 0.002 0.309 0.015 0.272 0.448
7 Lip.SM.d18.1.23.0. 1.241 0.010 0.144 0.012 0.176 0.277
The discriminative potential of the metabolites, correcting for clinical background features, on AD and ApoE4 or not (4-class response ADE4, AD, SCDE4, SCD) is assessed by first fitting a Multi-nomial Logistic Regression (benchmark model). The benchmark model fits only the clinical features. The model’s AUC is compared with the same model (with altered hyperparameters) fitting clinical variables + 230 metabolites. The metabolites are then projected to 6 latent ML-estimated factors and 3 more models are fitted: Multi-nomial Logistic Regression, Decision Tree and XGBoost fitting clinical variables + 6 factors. The models’ multi-class classification performance is compared using confusion matrices and AUCs.
multifit <- function(X,
y,
model,
ctrl = NULL,
grid = NULL,
seed = 87654, ...) {
set.seed(seed)
# Merge X and y into df
df <- cbind.data.frame(X, y)
# Train the model
model <- caret::train(df[, 1:ncol(X)], df$y,
method = model,
tuneGrid = grid,
trControl = ctrl,
metric = "logLoss",
# importance=TRUE,
...
)
obs <- model$pred$obs
preds <- model$pred$pred
# Get the multi-class ROC curves
ys <- as.numeric(obs) - 1
yhats <- as.numeric(preds) - 1
roc <- multiclass.roc(
response = ys,
predictor = yhats, quiet = T, legacy.axes = T
)
print(roc$auc)
names <- c(
"ADE4 vs. AD", "ADE4 vs. SCDE4",
"ADE4 vs. SCD", "AD vs SCDE4", "AD vs. SCD", "SCDE4 vs. SCD"
)
for (i in 1:length(names)) {
names[[i]] <- paste0(
names[[i]], ", AUC=",
paste(round(auc(roc$rocs[[i]]), 2))
)
}
names(roc$rocs) <- names
cat("\n")
# Create a confusion matrix and get performance metrics from caret
cm <- confusionMatrix(reference = obs, data = preds, mode = "everything")
print(cm)
out <- list("model"= model, "roc"= roc)
return(out)
}
load("data/data.Rdata")
X <- scale(df[, 1:230])
y <- df$target
Xclin <- cbind.data.frame(X, clin_dummy)
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 100,
savePredictions = "final",
classProbs = T,
summaryFunction = multiClassSummary,
selectionFunction = best,
search = "random",
sampling = "smote"
)
benchmark <- multifit(
X = clin_dummy,
y = y,
model = "multinom",
ctrl = ctrl,
grid = expand.grid(decay = 0),
trace = F
)
Multi-class area under the curve: 0.8143
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 4756 2251 2 102
ADE4 3736 904 31 119
SCD 34 199 1892 4158
SCDE4 74 46 2075 4321
Overall Statistics
Accuracy : 0.4807
95% CI : (0.4744, 0.4869)
No Information Rate : 0.3522
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.2972
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.5530 0.2659 0.4730 0.4967
Specificity 0.8537 0.8176 0.7879 0.8628
Pos Pred Value 0.6688 0.1887 0.3011 0.6631
Neg Pred Value 0.7815 0.8746 0.8855 0.7592
Precision 0.6688 0.1887 0.3011 0.6631
Recall 0.5530 0.2659 0.4730 0.4967
F1 0.6054 0.2208 0.3680 0.5680
Prevalence 0.3482 0.1377 0.1619 0.3522
Detection Rate 0.1926 0.0366 0.0766 0.1749
Detection Prevalence 0.2879 0.1939 0.2544 0.2638
Balanced Accuracy 0.7034 0.5417 0.6304 0.6797
g <- ggroc(benchmark$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
mlr <- multifit(
X = Xclin,
y = y,
model = "multinom",
ctrl = ctrl,
grid = expand.grid(decay = 9.187724),
trace = F,
importance=T
)
Multi-class area under the curve: 0.8338
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 5763 2371 0 6
ADE4 2829 963 1 44
SCD 4 6 1662 2827
SCDE4 4 60 2337 5823
Overall Statistics
Accuracy : 0.5753
95% CI : (0.5692, 0.5815)
No Information Rate : 0.3522
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4078
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.6701 0.28324 0.41550 0.6693
Specificity 0.8524 0.86507 0.86295 0.8499
Pos Pred Value 0.7080 0.25098 0.36942 0.7080
Neg Pred Value 0.8287 0.88319 0.88426 0.8254
Precision 0.7080 0.25098 0.36942 0.7080
Recall 0.6701 0.28324 0.41550 0.6693
F1 0.6885 0.26613 0.39110 0.6881
Prevalence 0.3482 0.13765 0.16194 0.3522
Detection Rate 0.2333 0.03899 0.06729 0.2357
Detection Prevalence 0.3296 0.15534 0.18215 0.3330
Balanced Accuracy 0.7612 0.57415 0.63922 0.7596
g <- ggroc(mlr$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
# Get the feature importance scores
fimp <- varImp(mlr$model)
fimp$importance %>%
arrange(desc(Overall)) %>%
top_n(20)
Overall
Am.Putrescine 100.00000
OS.HpH.Spha.1.P.C18.0 91.91669
OA.OA29...Uracil 86.75761
OA.OA17...3.Hydroxybutyric.acid 84.77845
Am.Sarcosine 83.65953
med_chol_nee 82.97303
Lip.TG.56.0. 80.42543
Lip.PE.O.38.5. 77.63905
Am.L.Tryptophan 72.10589
OS.HpH.LPA.C20.5 70.13465
Am.Glutathione 70.12092
Lip.SM.d18.1.20.1. 69.84502
Am.Histamine 67.85096
Am.L.Histidine 66.69940
Am.L.Glutamine 66.52812
OS.HpH.LPA.C18.0 65.53134
Am.L.Serine 64.49744
Am.L.Threonine 64.45528
OS.HpH.PAF.C16.0 63.14493
Lip.SM.d18.1.24.1. 62.78610
project <- function(X, m, seed) {
set.seed(seed)
X <- scale(as.matrix(df[, 1:230]))
cov <- cor(X)
# Find redundant features
filter <- RF(cov)
# Filter out redundant features
filtered <- subSet(X, filter)
# Regularized correlation matrix estimation
M <- regcor(filtered)
# Get the regularized correlation matrix of the filtered dataset
R <- M$optCor
mlfa <- mlFA(R, m = 6)
thomson <- facScore(filtered, mlfa$Loadings, mlfa$Uniqueness)
return(thomson)
}
thomson <- project(X, 6, seed = 1234)
X <- cbind(clin_dummy, thomson)
mlrf <- multifit(
X = X,
y = y,
model = "multinom",
ctrl = ctrl,
trace = FALSE,
grid = expand.grid(decay = 0)
)
Multi-class area under the curve: 0.8178
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 5047 2100 4 91
ADE4 3344 941 60 202
SCD 115 215 1998 3271
SCDE4 94 144 1938 5136
Overall Statistics
Accuracy : 0.5313
95% CI : (0.525, 0.5375)
No Information Rate : 0.3522
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3593
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.5869 0.2768 0.49950 0.5903
Specificity 0.8637 0.8307 0.82604 0.8640
Pos Pred Value 0.6969 0.2069 0.35685 0.7024
Neg Pred Value 0.7965 0.8780 0.89519 0.7950
Precision 0.6969 0.2069 0.35685 0.7024
Recall 0.5869 0.2768 0.49950 0.5903
F1 0.6372 0.2368 0.41629 0.6415
Prevalence 0.3482 0.1377 0.16194 0.3522
Detection Rate 0.2043 0.0381 0.08089 0.2079
Detection Prevalence 0.2932 0.1841 0.22668 0.2960
Balanced Accuracy 0.7253 0.5537 0.66277 0.7272
g <- ggroc(mlrf$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
tree <- multifit(
X = X,
y = y,
model = "rpart2",
ctrl = ctrl,
grid = expand.grid(maxdepth = 3)
)
Multi-class area under the curve: 0.8176
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 5578 1904 56 95
ADE4 2692 1241 58 155
SCD 182 153 2704 5420
SCDE4 148 102 1182 3030
Overall Statistics
Accuracy : 0.5082
95% CI : (0.502, 0.5145)
No Information Rate : 0.3522
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3445
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.6486 0.36500 0.6760 0.3483
Specificity 0.8724 0.86362 0.7220 0.9105
Pos Pred Value 0.7308 0.29932 0.3197 0.6791
Neg Pred Value 0.8229 0.89496 0.9202 0.7198
Precision 0.7308 0.29932 0.3197 0.6791
Recall 0.6486 0.36500 0.6760 0.3483
F1 0.6872 0.32892 0.4341 0.4604
Prevalence 0.3482 0.13765 0.1619 0.3522
Detection Rate 0.2258 0.05024 0.1095 0.1227
Detection Prevalence 0.3090 0.16785 0.3425 0.1806
Balanced Accuracy 0.7605 0.61431 0.6990 0.6294
g <- ggroc(tree$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
xgb <- multifit(
X = X,
y = y,
model = "xgbTree",
ctrl = ctrl,
grid = xgb.grid
)
Multi-class area under the curve: 0.8355
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 6432 2000 19 3
ADE4 2079 1263 35 184
SCD 24 6 1559 3086
SCDE4 65 131 2387 5427
Overall Statistics
Accuracy : 0.5944
95% CI : (0.5882, 0.6005)
No Information Rate : 0.3522
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4336
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.7479 0.37147 0.38975 0.6238
Specificity 0.8744 0.89211 0.84947 0.8386
Pos Pred Value 0.7608 0.35468 0.33348 0.6775
Neg Pred Value 0.8666 0.89891 0.87810 0.8039
Precision 0.7608 0.35468 0.33348 0.6775
Recall 0.7479 0.37147 0.38975 0.6238
F1 0.7543 0.36288 0.35942 0.6496
Prevalence 0.3482 0.13765 0.16194 0.3522
Detection Rate 0.2604 0.05113 0.06312 0.2197
Detection Prevalence 0.3423 0.14417 0.18927 0.3243
Balanced Accuracy 0.8112 0.63179 0.61961 0.7312
g <- ggroc(xgb$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
# cvms::plot_confusion_matrix(cmxgb, palette = "Oranges" , theme_fn = ggthemes::theme_tufte)
aucs <- data.frame(AUC = c("Clinical features only" = benchmark$roc$auc,
"Clinical features + 230 metabolites" = mlr$roc$auc,
"Clinical features + 6 latent factors" = mlrf$roc$auc,
"Decision Tree" = tree$roc$auc, "XGBoost" = xgb$roc$auc))
aucs
AUC
Clinical features only 0.8143379
Clinical features + 230 metabolites 0.8337657
Clinical features + 6 latent factors 0.8177738
Decision Tree 0.8176124
XGBoost 0.8355013
Observations:
Adding serum metabolite information (either the full 230-metabolite matrix or its 6-factor projection) seems to slightly increase the discriminatory power of the models.
Fitting 6 ML-estimated factors obtained by the FMradio package (cummulatively explaining 30% of variace) yields slightly increased classification performance.
Looking at the confusion matrices and individual ROC curves, all models were able to discriminate better among certain classes (AD+E4/SCD+E4, AD+E4/SCD, AD-E4/SCD+E4 and AD-E4/SCD-E4) compared to others (AD+E4/AD-E4 and SCD+E4/SCD-E4).
The covariance network analysis shows distinct metabolic “images” among ApoE4 carriers and non-carriers in AD. The network vertices and edges are distinct.
# Store all observations of ApoE4 non-carriers in C1
C1 <- scale(AD[AD$E4 == 0, 1:230])
# Store all observations of ApoE4 carriers in C2
C2 <- scale(AD[AD$E4 == 1, 1:230])
# Get the covariance matrices of C1 and C2
S1 <- covML(C1)
S2 <- covML(C2)
# Store them in a list
S <- list(S1 = S1, S2 = S2)
# Get the total number of samples
n <- c(nrow(S1), nrow(S2))
# Create a list of fused covariance matrices T
Ts <- default.target.fused(Slist = S, ns = n, type = "DUPV")
# Get the optimal lambdas per class and fused with 10-fold CV
# set.seed(8910)
# optf <- optPenalty.fused(
# Ylist = Ys,
# Tlist = Ts,
# lambda = default.penalty(Ys),
# cv.method = "kCV",
# k = 10,
# verbose = FALSE
# )
# save(optf, file= "data/optf.Rdata")
i = 1 | max diff = 8.0963463215e-01
i = 2 | max diff = 7.3872853063e-29
Converged in 2 iterations, max diff < 1.49e-08.
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
Step 5... prepare for plotting
- Retained elements: 55
- Corresponding to 0.21 % of possible edges
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
Step 5... prepare for plotting
- Retained elements: 205
- Corresponding to 0.78 % of possible edges
The Gaussian graphical modelling between ApoE4 carriers and non- carriers with AD unveiled distinct correlations between the metabolites. The ApoE4 carriers’ network was more sparse, having less edges compared to the non-carriers’. Moreover, the metabolites could be clearly separated in communities based on their covariances in ApoE4 carriers, as shown in The communities found contain similar compounds (phosphatidylcholines, triglycerides, sphin- gomyelines, prostaglandines) or molecules that share metabolic pathways (amines, amines-organic acids, aspartic-glutamic acid, oxidative stress compounds-lipids). The ApoE4 non-carrier’s network appeared less cohesive, with only a few small aminic and organic acid communities.
# Merge the sparse high precision matrices
TST <- Union(P0s$S1$sparseParCor, P0s$S2$sparseParCor)
PCE4NO <- TST$M1subset
PCE4YES <- TST$M2subset
# Create a color map per metabolite class
Colors <- rownames(PCE4YES)
Colors[grep("Am", rownames(PCE4YES))] <- "pink"
Colors[grep("OA", rownames(PCE4YES))] <- "lightblue"
Colors[grep("Lip", rownames(PCE4YES))] <- "yellow"
Colors[grep("OS", rownames(PCE4YES))] <- "grey"
# httpgd::hgd( width = getOption("httpgd.width", 700),
# height = getOption("httpgd.height", 700),)
set.seed(111213)
# Plot he sparsified ridge matrix of ApoE4 carriers with AD
Coords <- Ugraph(PCE4YES,
type = "fancy", lay = "layout_with_fr",
Vcolor = Colors, prune = FALSE, Vsize = 5, Vcex = 0.3, cut = 0.5,
main = "ApoE4 carriers with AD"
)
# Plot the sparsified ridge matrix of ApoE4 non-carriers with AD
Ugraph(PCE4NO,
type = "fancy", lay = NULL, coords = Coords,
Vcolor = Colors, prune = FALSE, Vsize = 5, Vcex = 0.3, cut = 0.5,
main = "ApoE4 non-carriers with AD"
)
# Plot the differential network
diffgraph <- DiffGraph(PCE4NO, PCE4YES,
lay = NULL, coords = Coords,
Vcolor = Colors, Vsize = 5, Vcex = 0.3, main = "Differential Network"
)
Distinct metabolite communities among ApoE4 carriers/non carriers
# Get the communities per class
set.seed(141516)
CommC1 <- Communities(PCE4NO,
Vcolor = Colors, Vsize = 5, Vcex = 0.3,
main = "ApoE4 non-carriers with AD"
)
CommC2 <- Communities(PCE4YES,
Vcolor = Colors, Vsize = 5, Vcex = 0.3,
main = "ApoE4 carriers with AD"
)
PC0list <- list(PCE4NO = PCE4NO, PCE4YES = PCE4YES)
# Get the network statistics
NetStats <- GGMnetworkStats.fused(PC0list)
NetStatsE4yes <- NetStats[, 10:18]
NetStatsE4no <- NetStats[, 1:9]
NetStatsE4yes[, c(3, 9)] <- NetStatsE4no[, c(3, 9)] <- NULL
DegreesE4yes <- as.data.frame(NetStatsE4yes[, 1], rownames(NetStats))
DegreesE4no <- as.data.frame(NetStatsE4no[, 1], row.names = row.names(NetStatsE4no))
head(DegreesE4yes[order(DegreesE4yes[, 1], decreasing = TRUE), , drop = FALSE], 20)
NetStatsE4yes[, 1]
Am.Dopamine 7
Am.ADMA 5
Am.Citrulline 5
Am.L.Kynurenine 5
OA.OA27...3.hydroxyisovaleric.acid 5
OS.HpH.PAF.C16.0 5
Am.X1.Methylhistidine 4
Am.X3.Methoxytyramine 4
Am.DL.3.aminoisobutyric.acid 4
Am.Ethanolamine 4
Am.L.2.aminoadipic.acid 4
Am.L.4.hydroxy.proline 4
Am.SDMA 4
OA.OA06...Glycolic.acid 4
OA.OA09...2.ketoglutaric.acid 4
OA.OA12...Fumaric.acid 4
OA.OA17...3.Hydroxybutyric.acid 4
OA.OA20...Aspartic.acid 4
OA.OA28...Glyceric.acid 4
Lip.TG.56.8. 4
head(DegreesE4no[order(DegreesE4no[, 1], decreasing = TRUE), , drop = FALSE], 20)
NetStatsE4no[, 1]
Am.X3.Methoxytyrosine 5
OS.LpH.NO2.OA 4
Am.Cysteine 3
Am.Dopamine 3
Am.Glutathione 3
Am.L.Aspartic.acid 3
Am.X3.Methylhistidine 2
Am.Hydroxylysine 2
Am.L.4.hydroxy.proline 2
Am.L.carnosine 2
Am.Methyldopa 2
OA.OA03...Citric.acid 2
OA.OA13...Pyruvic.acid 2
OA.OA20...Aspartic.acid 2
OA.OA23...Iminodiacetate 2
OA.OA29...Uracil 2
Lip.TG.60.2. 2
Lip.SM.d18.1.24.0. 2
OS.LpH.NO2.LA 2
OS.LpH.NO2.aLA 2
The Wilcoxon Signed Rank tests revealed most network statistics to be larger in ApoE4 carriers compared to non-carriers (p-values <0.05) with AD (Table 7). In the ApoE4 positive group, top central metabolites were amines (dopamine and citrulline) (Table 8). ApoE4 non-carriers had 3-methoxytyrosine as top central metabolite, followed by the oxidative stress compound nitro fatty acid (C18:3)
# Perform a Wilcoxon signed rank test
w <- furrr::future_map2(NetStatsE4yes[, 1:7], NetStatsE4no[, 1:7], ~ {
wilcox.test(.x, .y, paired = TRUE, alternative = "greater")
})
p.values <- names <- list(1:length(w))
for (i in 1:length(w)) {
names[[i]] <- names(w[[i]])[7:length(names(w[[i]]))]
p.values[[i]] <- w[[i]][["p.value"]]
}
names(p.values) <- names(w)
d <- as.data.frame(p.values)
d <- t(d)
d <- cbind.data.frame(d, p.adjust(d, method = "fdr"))
rownames(d) <- sub("PCE4YES.", "", row.names(d))
names(d) <- c("p-value", "FDR")
d
p-value FDR
degree 7.905289e-27 1.844567e-26
betweenness 5.439808e-18 6.346442e-18
eigenCentrality 3.626367e-25 6.346142e-25
nNeg 8.837198e-06 8.837198e-06
nPos 1.436859e-24 2.011603e-24
mutualInfo 1.773939e-30 6.411006e-30
variance 1.831716e-30 6.411006e-30