# Page header residue: Spaces status "Running"; file size 5,172 bytes; revision 4d886f4.
"""Pathway enrichment tables (DAVID-style exports) for Reactome and KEGG panels."""
from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd
# Third ancestor of this file's resolved path (parents[2]); the analysis
# inputs below are located relative to it.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Tab-separated enrichment tables read by load_de_re_tsv().
# NOTE(review): "de"/"re" presumably stand for dead-end / reprogramming,
# matching the labels used in build_merged_pathway_membership — confirm.
DE_TSV = REPO_ROOT / "analysis" / "de_all_48.tsv"
RE_TSV = REPO_ROOT / "analysis" / "re_all_48.tsv"
def load_de_re_tsv() -> tuple[pd.DataFrame, pd.DataFrame] | None:
    """Read the two enrichment tables from disk.

    Returns:
        ``(de_table, re_table)`` parsed from ``DE_TSV`` and ``RE_TSV`` as
        tab-separated files, or ``None`` when either path is not an existing
        regular file.
    """
    table_paths = (DE_TSV, RE_TSV)
    # Checked in order and short-circuiting, so the second path is only
    # probed when the first one exists.
    if not all(path.is_file() for path in table_paths):
        return None
    de_table, re_table = (pd.read_csv(path, sep="\t") for path in table_paths)
    return de_table, re_table
def preprocess_pathway_file(df: pd.DataFrame, splitter: str) -> pd.DataFrame:
    """Clean term names, keep significant rows and add a gene-ratio column.

    Args:
        df: Table with ``Term``, ``Benjamini``, ``Count`` and ``List Total``
            columns. It is not modified.
        splitter: Separator inside ``Term``; only the text after its last
            occurrence is kept. When it is ``"-"`` the result is additionally
            split on ``"~"`` and again only the last piece is kept.

    Returns:
        A new frame holding the rows with ``Benjamini < 0.05``, the shortened
        ``Term`` values and a ``Gene Ratio`` column equal to
        ``Count / List Total``.
    """
    short_terms = df["Term"].astype(str).str.split(splitter).str[-1]
    if splitter == "-":
        short_terms = short_terms.astype(str).str.split("~").str[-1]

    # assign() returns a new frame, so the caller's table stays untouched.
    cleaned = df.assign(Term=short_terms)
    significant = cleaned.loc[cleaned["Benjamini"] < 0.05].copy()
    significant["Gene Ratio"] = significant["Count"] / significant["List Total"]
    return significant
def merged_reactome_kegg_bubble_frames(
    de_all: pd.DataFrame, re_all: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build the bubble-plot tables for both inputs (notebook cell 31).

    For each input table the ``REACTOME_PATHWAY`` and ``KEGG_PATHWAY`` rows are
    run through ``preprocess_pathway_file`` (splitting terms on ``"~"`` and
    ``":"`` respectively), tagged with a ``Library`` column and stacked with
    Reactome rows first.

    Args:
        de_all: First enrichment table; must have a ``Category`` column.
        re_all: Second enrichment table; must have a ``Category`` column.

    Returns:
        ``(frame_for_de_all, frame_for_re_all)``, each with a fresh integer
        index.
    """
    # (Library value, Category value to select, Term splitter), in stack order.
    library_specs = (
        ("Reactome", "REACTOME_PATHWAY", "~"),
        ("KEGG", "KEGG_PATHWAY", ":"),
    )

    def _stack_libraries(table: pd.DataFrame) -> pd.DataFrame:
        pieces = []
        for library, category, splitter in library_specs:
            piece = preprocess_pathway_file(table[table["Category"] == category], splitter)
            piece["Library"] = library
            pieces.append(piece)
        return pd.concat(pieces, ignore_index=True)

    return _stack_libraries(de_all), _stack_libraries(re_all)
def _preprocess_exploded(df: pd.DataFrame, pval_threshold: float, splitter: str, label: str) -> pd.DataFrame:
    """Turn an enrichment table into one row per (term, gene) pair.

    Args:
        df: Table with ``Term``, ``Genes`` and ``Benjamini`` columns. It is
            not modified.
        pval_threshold: Rows are kept only when ``Benjamini`` is strictly
            below this value.
        splitter: Separator inside ``Term``; the text after its last
            occurrence is kept (with an extra split on ``"~"`` when the
            separator is ``"-"``).
        label: Constant written to the ``Label`` column of every row.

    Returns:
        Frame with columns ``Term``, ``Benjamini``, ``Label``, ``Genes``,
        where ``Genes`` holds a single entry of the original ``", "``-separated
        gene list per row.
    """
    names = df["Term"].astype(str).str.split(splitter).str[-1]
    if splitter == "-":
        names = names.astype(str).str.split("~").str[-1]
    # Names longer than 60 characters are cut to 60 and suffixed with "...".
    names = names.map(lambda name: f"{name[:60]}..." if len(name) > 60 else name)

    renamed = df.assign(Term=names)
    kept = renamed.loc[
        renamed["Benjamini"] < pval_threshold, ["Term", "Genes", "Benjamini"]
    ].copy()
    kept["Label"] = label

    gene_lists = kept.set_index(["Term", "Benjamini", "Label"])["Genes"].str.split(", ")
    return gene_lists.explode().reset_index()
def _binary_matrix(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
    """Cross-tabulate terms against genes and collect per-term metadata.

    Args:
        data: One row per (term, gene) pair with ``Term``, ``Genes``,
            ``Label`` and ``Benjamini`` columns.

    Returns:
        ``(matrix, labels, pvals)``: the ``Term`` x ``Genes`` crosstab (cells
        are occurrence counts), the first ``Label`` seen for each term and the
        first ``Benjamini`` seen for each term.
    """
    per_term = data.groupby("Term")
    membership = pd.crosstab(index=data["Term"], columns=data["Genes"])
    first_label = per_term["Label"].first()
    first_pval = per_term["Benjamini"].first()
    return membership, first_label, first_pval
def _sort_matrix(matrix: pd.DataFrame) -> pd.DataFrame:
    """Reorder rows and columns by descending totals.

    Args:
        matrix: Numeric table to reorder.

    Returns:
        The same cells with rows ordered by decreasing row sum and columns
        ordered by decreasing column sum.
    """

    def _labels_by_total(axis: int) -> pd.Index:
        # axis=1 sums across each row, axis=0 sums down each column.
        totals = matrix.sum(axis=axis)
        return totals.sort_values(ascending=False).index

    row_order = _labels_by_total(1)
    column_order = _labels_by_total(0)
    return matrix.loc[row_order, column_order]
def build_merged_pathway_membership(
    de_all: pd.DataFrame, re_all: pd.DataFrame, pval_threshold: float = 0.05
) -> tuple[np.ndarray, list[str], list[str]] | None:
    """
    Build the numeric grid behind the merged pathway-membership heatmap.

    Cell codes: 0=white, 1=dead-end gene, 2=reprogramming gene,
    3=Reactome library stripe, 4=KEGG library stripe (notebook cell 29).

    Args:
        de_all: Enrichment table whose rows are labelled "Dead-end"; must have
            ``Category``, ``Term``, ``Genes`` and ``Benjamini`` columns.
        re_all: Enrichment table whose rows are labelled "Reprogramming"; same
            columns as ``de_all``.
        pval_threshold: Terms are kept only when ``Benjamini`` is strictly
            below this value.

    Returns:
        ``(z, row_labels, col_labels)`` or ``None`` when no term/gene cell
        survives filtering. ``z`` has one row per term (Reactome rows first,
        then KEGG) and one column per gene plus a trailing ``"Library"``
        stripe column. ``row_labels`` may repeat a name when the same term
        text occurs in both libraries; the stripe column tells them apart.
    """
    label_code = {"Dead-end": 1, "Reprogramming": 2}
    lib_code = {"Reactome": 3, "KEGG": 4}
    # (library name, Category value to select, Term splitter), in row order.
    library_specs = (
        ("Reactome", "REACTOME_PATHWAY", "~"),
        ("KEGG", "KEGG_PATHWAY", ":"),
    )

    matrices = []
    label_codes = []
    stripe_codes = []
    for library, category, splitter in library_specs:
        dead_end = _preprocess_exploded(
            de_all[de_all["Category"] == category], pval_threshold, splitter, "Dead-end"
        )
        reprogramming = _preprocess_exploded(
            re_all[re_all["Category"] == category], pval_threshold, splitter, "Reprogramming"
        )
        combined = pd.concat([dead_end, reprogramming], ignore_index=True)
        matrix, labels, _ = _binary_matrix(combined)
        matrix = _sort_matrix(matrix)

        # Resolve each row's label code while the index is still unique
        # (within one library). After the two libraries are stacked, a term
        # name present in both would make a label-based .loc lookup return
        # several rows instead of a scalar, so the codes are carried
        # positionally from here on.
        ordered_labels = labels.reindex(matrix.index)
        label_codes.append(
            np.array([label_code.get(str(lbl), 0) for lbl in ordered_labels], dtype=float)
        )
        stripe_codes.append(np.full(len(matrix), lib_code[library], dtype=float))
        matrices.append(matrix)

    # Genes missing from one library's matrix become NaN on stacking -> 0.
    merged = pd.concat(matrices, axis=0, sort=False).fillna(0)
    if merged.empty:  # also covers the "no gene columns" case
        return None

    gene_cols = list(merged.columns)
    z = np.zeros((len(merged), len(gene_cols) + 1), dtype=float)
    # Vectorized form of "cell value * row label code".
    # NOTE(review): cells are crosstab counts, so a gene listed for the same
    # term in both inputs has count 2 and is scaled accordingly — confirm
    # whether such cells should be clamped to the plain label code.
    z[:, :-1] = merged.to_numpy(dtype=float) * np.concatenate(label_codes)[:, None]
    z[:, -1] = np.concatenate(stripe_codes)

    row_labels = [str(term) for term in merged.index]
    col_labels = gene_cols + ["Library"]
    return z, row_labels, col_labels
|