Skip to contents

Overview

This vignette demonstrates the application of PathwayEmbed in a beta-catenin perturbed system where the Ctnnb1 upstream enhancer is depleted in intestinal epithelia. Because the KO genotype is known by design, this dataset provides a clean biological ground truth for a direct AUROC benchmark against PROGENy and AddModuleScore.

Reference: Hua et al., eLife 13: RP98238 (2024). https://doi.org/10.7554/eLife.98238


Helper: Annotation Label Builder

make_label <- function(stats, group1, group2) {
  d    <- round(stats$cohens_d[1], 3)
  p    <- formatC(stats$p_value[1], format = "e", digits = 2)
  on1  <- stats$percentage_on[stats$group  == group1]
  off1 <- stats$percentage_off[stats$group == group1]
  on2  <- stats$percentage_on[stats$group  == group2]
  off2 <- stats$percentage_off[stats$group == group2]
  paste0(
    group1, ": ON=", on1,  "%, OFF=", off1, "%\n",
    group2, ": ON=", on2,  "%, OFF=", off2, "%\n",
    "Cohen's d = ", d, "\n",
    "p = ", p
  )
}

Download Data

url_ko <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE233nnn/GSE233978/suppl/GSE233978_KO_filtered_feature_bc_matrix.h5"
url_wt <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE233nnn/GSE233978/suppl/GSE233978_WT_filtered_feature_bc_matrix.h5"

download.file(url_ko, destfile = "GSE233978_KO_filtered_feature_bc_matrix.h5", mode = "wb")
download.file(url_wt, destfile = "GSE233978_WT_filtered_feature_bc_matrix.h5", mode = "wb")

Data Preparation and Preprocessing

ko_data <- Read10X_h5("GSE233978_KO_filtered_feature_bc_matrix.h5")
wt_data <- Read10X_h5("GSE233978_WT_filtered_feature_bc_matrix.h5")

ko <- CreateSeuratObject(counts = ko_data, project = "KO",
                          min.cells = 3, min.features = 200)
wt <- CreateSeuratObject(counts = wt_data, project = "WT",
                          min.cells = 3, min.features = 200)
ko$sample <- "KO"
wt$sample <- "WT"

merged          <- merge(ko, wt)
merged[["RNA"]] <- JoinLayers(merged[["RNA"]])

merged <- NormalizeData(merged, normalization.method = "LogNormalize",
                         scale.factor = 10000)
merged <- FindVariableFeatures(merged, selection.method = "vst",
                                nfeatures = 2000)
merged <- ScaleData(merged, features = VariableFeatures(merged))

Wnt Pathway Scoring

Wnt_12h   <- LoadPathway("WNT3A_12H_ACTIVATION",  "mouse")
Wnt_24h   <- LoadPathway("WNT3A_24H_ACTIVATION",  "mouse")
Wnt_48h   <- LoadPathway("WNT3A_48H_ACTIVATION",  "mouse")
Wnt_slope <- LoadPathway("WNT3A_SLOPE_ACTIVATION", "mouse")

matrix_12h   <- DataPreProcess(merged, Wnt_12h,   Seurat.object = TRUE)
matrix_24h   <- DataPreProcess(merged, Wnt_24h,   Seurat.object = TRUE)
matrix_48h   <- DataPreProcess(merged, Wnt_48h,   Seurat.object = TRUE)
matrix_slope <- DataPreProcess(merged, Wnt_slope, Seurat.object = TRUE)

pathwaystat_12h   <- PathwayMaxMin(matrix_12h,   Wnt_12h)
pathwaystat_24h   <- PathwayMaxMin(matrix_24h,   Wnt_24h)
pathwaystat_48h   <- PathwayMaxMin(matrix_48h,   Wnt_48h)
pathwaystat_slope <- PathwayMaxMin(matrix_slope, Wnt_slope)

score_12h   <- ComputeCellData(matrix_12h,   pathwaystat_12h)
score_24h   <- ComputeCellData(matrix_24h,   pathwaystat_24h)
score_48h   <- ComputeCellData(matrix_48h,   pathwaystat_48h)
score_slope <- ComputeCellData(matrix_slope, pathwaystat_slope)

Visualization

plot_data_12h   <- PreparePlotData(merged, score_12h,   "sample", Seurat.object = TRUE)
plot_data_24h   <- PreparePlotData(merged, score_24h,   "sample", Seurat.object = TRUE)
plot_data_48h   <- PreparePlotData(merged, score_48h,   "sample", Seurat.object = TRUE)
plot_data_slope <- PreparePlotData(merged, score_slope, "sample", Seurat.object = TRUE)

pct_12h   <- CalculatePercentage(plot_data_12h,   "sample")
pct_24h   <- CalculatePercentage(plot_data_24h,   "sample")
pct_48h   <- CalculatePercentage(plot_data_48h,   "sample")
pct_slope <- CalculatePercentage(plot_data_slope, "sample")

pct_12h; pct_24h; pct_48h; pct_slope
#> # A tibble: 2 × 5
#>   group percentage_on percentage_off cohens_d  p_value
#>   <chr>         <dbl>          <dbl>    <dbl>    <dbl>
#> 1 KO             50.5           49.5  -0.0728 2.50e-12
#> 2 WT             53.6           46.4  -0.0728 2.50e-12
#> # A tibble: 2 × 5
#>   group percentage_on percentage_off cohens_d  p_value
#>   <chr>         <dbl>          <dbl>    <dbl>    <dbl>
#> 1 KO             57.7           42.3   0.0748 5.27e-12
#> 2 WT             51.4           48.6   0.0748 5.27e-12
#> # A tibble: 2 × 5
#>   group percentage_on percentage_off cohens_d  p_value
#>   <chr>         <dbl>          <dbl>    <dbl>    <dbl>
#> 1 KO             41.7           58.3   -0.136 3.52e-36
#> 2 WT             50.8           49.2   -0.136 3.52e-36
#> # A tibble: 2 × 5
#>   group percentage_on percentage_off cohens_d   p_value
#>   <chr>         <dbl>          <dbl>    <dbl>     <dbl>
#> 1 KO             38.0           62.0   -0.301 6.49e-147
#> 2 WT             52.0           48.0   -0.301 6.49e-147

p_12h <- PlotPathway(plot_data_12h,   "Wnt 12h",    "sample", c("#ae282c", "#2066a8")) +
  annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = make_label(pct_12h, "KO", "WT"), size = 3.5, color = "black")

p_24h <- PlotPathway(plot_data_24h,   "Wnt 24h",    "sample", c("#ae282c", "#2066a8")) +
  annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = make_label(pct_24h, "KO", "WT"), size = 3.5, color = "black")

p_48h <- PlotPathway(plot_data_48h,   "Wnt 48h",    "sample", c("#ae282c", "#2066a8")) +
  annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = make_label(pct_48h, "KO", "WT"), size = 3.5, color = "black")

p_slope <- PlotPathway(plot_data_slope, "Wnt (slope)", "sample", c("#ae282c", "#2066a8")) +
  annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
           label = make_label(pct_slope, "KO", "WT"), size = 3.5, color = "black")

p_12h; p_24h; p_48h; p_slope


Technical Confounders

merged[["percent.mt"]] <- PercentageFeatureSet(merged, pattern = "^mt-")
seurat_meta <- merged@meta.data

make_scatter <- function(df, x_col, x_label) {
  r <- cor(df$embed_score, df[[x_col]], method = "spearman", use = "complete.obs")
  ggplot(df, aes(x = .data[[x_col]], y = embed_score)) +
    geom_point(alpha = 0.3, size = 0.8, color = "#534AB7") +
    geom_smooth(method = "lm", color = "#E24B4A", linewidth = 0.8, se = TRUE) +
    labs(title = sprintf("%s (rho = %.3f)", x_label, r),
         x = x_label, y = "PathwayEmbed score") +
    theme_minimal()
}

confounder_df <- data.frame(
  cell         = names(score_slope),
  embed_score  = as.numeric(score_slope),
  nCount_RNA   = seurat_meta[names(score_slope), "nCount_RNA"],
  nFeature_RNA = seurat_meta[names(score_slope), "nFeature_RNA"],
  percent_mt   = seurat_meta[names(score_slope), "percent.mt"]
)

for (cov in c("nCount_RNA", "nFeature_RNA", "percent_mt")) {
  r <- cor(confounder_df$embed_score, confounder_df[[cov]],
           method = "spearman", use = "complete.obs")
  cat(sprintf("Spearman vs %-15s : %.3f\n", cov, r))
}
#> Spearman vs nCount_RNA      : 0.188
#> Spearman vs nFeature_RNA    : 0.262
#> Spearman vs percent_mt      : -0.119

(make_scatter(confounder_df, "nCount_RNA",   "nCount_RNA") +
 make_scatter(confounder_df, "nFeature_RNA", "nFeature_RNA") +
 make_scatter(confounder_df, "percent_mt",   "percent.mt"))


Benchmark Against PROGENy and AddModuleScore

normalized_mat       <- GetAssayData(merged, assay = "RNA", layer = "data")
normalized_mat_dense <- as.matrix(normalized_mat)

common_genes         <- intersect(rownames(normalized_mat_dense), Wnt_slope$Gene_Symbol)
gene_matrix          <- normalized_mat_dense[common_genes, ]
mean_expr            <- colMeans(gene_matrix, na.rm = TRUE)
zscore_mean_per_cell <- colMeans(t(scale(t(gene_matrix))), na.rm = TRUE)

progeny_scores <- progeny(
  normalized_mat_dense,
  scale    = TRUE,
  organism = "Mouse",
  top      = 100,
  perm     = 1
)
progeny_wnt <- setNames(as.numeric(progeny_scores[, "WNT"]),
                         rownames(progeny_scores))

wnt_features <- rownames(matrix_slope)
merged <- AddModuleScore(
  object   = merged,
  features = list(wnt_features),
  name     = "WNT_ModuleScore",
  ctrl     = 5,
  nbin     = 10
)
module_score <- setNames(
  merged@meta.data$WNT_ModuleScore1,
  rownames(merged@meta.data)
)

cells <- names(score_slope)
benchmark_df <- data.frame(
  PathwayEmbed   = as.numeric(score_slope),
  PROGENy        = as.numeric(progeny_wnt[cells]),
  AddModuleScore = as.numeric(module_score[cells]),
  mean_expr      = as.numeric(mean_expr[cells]),
  zscore_expr    = as.numeric(zscore_mean_per_cell[cells]),
  row.names      = cells
)

benchmark_cor <- cor(benchmark_df, method = "spearman",
                     use = "pairwise.complete.obs")
print(round(benchmark_cor, 3))
#>                PathwayEmbed PROGENy AddModuleScore mean_expr zscore_expr
#> PathwayEmbed          1.000   0.095          0.106     0.359       0.356
#> PROGENy               0.095   1.000         -0.180     0.367       0.362
#> AddModuleScore        0.106  -0.180          1.000     0.146       0.139
#> mean_expr             0.359   0.367          0.146     1.000       0.972
#> zscore_expr           0.356   0.362          0.139     0.972       1.000

p_cor <- pheatmap(
  benchmark_cor,
  color           = colorRampPalette(c("#185FA5", "white", "#993C1D"))(100),
  breaks          = seq(-1, 1, length.out = 101),
  display_numbers = TRUE, number_format = "%.2f",
  fontsize_number = 9,
  main            = "Method Benchmark — Spearman Correlation (Ctnnb1 KO dataset)"
)
p_cor


ko_label <- ifelse(merged@meta.data[cells, "sample"] == "KO", 1L, 0L)

compute_auroc <- function(scores, labels) {
  r       <- roc(labels, scores, quiet = TRUE)
  auc_val <- as.numeric(auc(r))
  if (auc_val < 0.5) auc_val <- 1 - auc_val
  auc_val
}

auroc_results <- data.frame(
  Method = colnames(benchmark_df),
  AUROC  = sapply(benchmark_df, compute_auroc, labels = ko_label)
)
auroc_results <- auroc_results[order(-auroc_results$AUROC), ]
print(auroc_results)
#>                        Method     AUROC
#> AddModuleScore AddModuleScore 0.6827026
#> mean_expr           mean_expr 0.6482098
#> zscore_expr       zscore_expr 0.6425566
#> PathwayEmbed     PathwayEmbed 0.5897700
#> PROGENy               PROGENy 0.5106677

compute_cohens_d_method <- function(scores_vec, seurat_obj, group_col) {
  pd  <- PreparePlotData(seurat_obj, scores_vec, group_col, Seurat.object = TRUE)
  pct <- CalculatePercentage(pd, group_var = group_col)
  abs(pct$cohens_d[1])
}

cohens_d_results <- data.frame(
  Method   = colnames(benchmark_df),
  Cohens_d = sapply(
    colnames(benchmark_df),
    function(m) {
      sv <- setNames(benchmark_df[[m]], rownames(benchmark_df))
      compute_cohens_d_method(sv, merged, "sample")
    }
  )
)
cohens_d_results <- cohens_d_results[order(-cohens_d_results$Cohens_d), ]
print(cohens_d_results)
#>                        Method   Cohens_d
#> AddModuleScore AddModuleScore 0.64575350
#> mean_expr           mean_expr 0.54020251
#> zscore_expr       zscore_expr 0.51565732
#> PathwayEmbed     PathwayEmbed 0.30056596
#> PROGENy               PROGENy 0.03129493

method_colors <- c(
  "PathwayEmbed"   = "#534AB7",
  "PROGENy"        = "#E24B4A",
  "AddModuleScore" = "#EF9F27",
  "mean_expr"      = "#888780",
  "zscore_expr"    = "#B4B2A9"
)

p_cd <- ggplot(cohens_d_results,
       aes(x = reorder(Method, Cohens_d), y = Cohens_d, fill = Method)) +
  geom_bar(stat = "identity", width = 0.6) +
  coord_flip() +
  scale_fill_manual(values = method_colors) +
  labs(title = "Effect size (Cohen's d): KO vs. WT",
       x = NULL, y = "Cohen's d (absolute)") +
  theme_classic() +
  theme(legend.position = "none")
p_cd


p_auroc <- ggplot(auroc_results,
       aes(x = reorder(Method, AUROC), y = AUROC, fill = Method)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "black") +
  coord_flip() +
  scale_fill_manual(values = method_colors) +
  labs(title = "AUROC: KO vs. WT separation (Ctnnb1 KO dataset)",
       x = NULL, y = "AUROC") +
  ylim(0, 1) +
  theme_classic() +
  theme(legend.position = "none")
p_auroc


roc_list <- lapply(colnames(benchmark_df), function(m) {
  roc(ko_label, benchmark_df[[m]], quiet = TRUE)
})
names(roc_list) <- colnames(benchmark_df)

roc_df <- do.call(rbind, lapply(names(roc_list), function(m) {
  r <- roc_list[[m]]
  data.frame(Method = m, FPR = 1 - r$specificities, TPR = r$sensitivities)
}))

p_roc <- ggplot(roc_df, aes(x = FPR, y = TPR, color = Method)) +
  geom_line(linewidth = 0.9) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey60") +
  scale_color_manual(values = method_colors) +
  labs(title = "ROC curves: KO vs. WT (Ctnnb1 KO dataset)",
       x = "False positive rate", y = "True positive rate") +
  theme_classic()
p_roc