GLUE-Agent-MCP / src /tools /preprocessing.py
dmannk's picture
Upload folder using huggingface_hub
d048db9 verified
"""
GLUE preprocessing tutorial for scRNA-seq and scATAC-seq data integration.
This MCP Server provides 3 tools:
1. glue_preprocess_scrna: Preprocess scRNA-seq data with HVG selection, normalization, and PCA
2. glue_preprocess_scatac: Preprocess scATAC-seq data with LSI dimension reduction
3. glue_construct_regulatory_graph: Construct prior regulatory graph linking RNA and ATAC features
All tools extracted from `gao-lab/GLUE/blob/master/docs/preprocessing.ipynb`.
"""
import os
from datetime import datetime
from pathlib import Path
# Standard imports
from typing import Annotated, Any, Literal
import anndata as ad
# Domain-specific imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scanpy as sc
import scglue
from fastmcp import FastMCP
from matplotlib import rcParams
# Project structure
# PROJECT_ROOT is the third ancestor directory of this file, as an absolute path.
PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve()
DEFAULT_INPUT_DIR = PROJECT_ROOT / "tmp" / "inputs"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "tmp" / "outputs"
# The PREPROCESSING_INPUT_DIR / PREPROCESSING_OUTPUT_DIR environment variables,
# when set, take precedence over the defaults under PROJECT_ROOT / "tmp".
INPUT_DIR = Path(os.environ.get("PREPROCESSING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("PREPROCESSING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))
# Ensure directories exist (import-time side effect: both are created on disk)
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Timestamp for output file names.
# NOTE: evaluated once, when this module is imported, so the value stays the
# same for the whole lifetime of the process (second resolution).
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Set plotting parameters
plt.rcParams["figure.dpi"] = 300
plt.rcParams["savefig.dpi"] = 300
# NOTE(review): set_publication_params() presumably rewrites several rcParams and
# may override the dpi values set just above - TODO confirm. The figure size is
# assigned after the call, so that setting is the one that ends up in effect.
scglue.plot.set_publication_params()
rcParams["figure.figsize"] = (4, 4)
# MCP server instance; the tool functions in this module register on it
# through the @preprocessing_mcp.tool decorator.
preprocessing_mcp = FastMCP(name="preprocessing")
@preprocessing_mcp.tool
def glue_preprocess_scrna(
    rna_path: Annotated[
        str | None, "Path to scRNA-seq data file in h5ad format"
    ] = None,
    n_top_genes: Annotated[int, "Number of highly variable genes to select"] = 2000,
    flavor: Annotated[
        Literal["seurat", "cell_ranger", "seurat_v3"], "Method for HVG selection"
    ] = "seurat_v3",
    n_comps: Annotated[int, "Number of principal components"] = 100,
    svd_solver: Annotated[
        Literal["auto", "arpack", "randomized"], "SVD solver for PCA"
    ] = "auto",
    color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Preprocess scRNA-seq data with highly variable gene selection, normalization, scaling, and PCA.
    Input is scRNA-seq data in h5ad format and output is preprocessed data with PCA embedding and UMAP visualization.

    The matrix as loaded is backed up in the "counts" layer. Highly variable
    genes are selected on that unnormalized matrix for flavor "seurat_v3" and
    on the log-normalized matrix for "seurat" / "cell_ranger".

    Returns a dict with "message", "reference" and "artifacts"; each artifact
    carries a description and the absolute path of a written file (the
    preprocessed .h5ad and the UMAP .png).

    Raises:
        ValueError: if rna_path is not provided.
        FileNotFoundError: if rna_path does not exist.
        KeyError: if color_var is neither an obs column nor a variable name.
    """
    # Input validation
    if rna_path is None:
        raise ValueError("Path to scRNA-seq data file must be provided")
    # File existence validation
    rna_file = Path(rna_path)
    if not rna_file.exists():
        raise FileNotFoundError(f"RNA data file not found: {rna_path}")
    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_rna"
    # Stamp the outputs per call: a stamp taken at import time would make
    # repeated calls in one server process overwrite each other's files.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Load data
    rna = ad.read_h5ad(rna_path)
    # Fail fast on an unknown coloring key instead of discovering it at plot
    # time, after PCA / neighbors / UMAP have already been computed.
    in_raw = rna.raw is not None and color_var in rna.raw.var_names
    if (
        color_var not in rna.obs.columns
        and color_var not in rna.var_names
        and not in_raw
    ):
        raise KeyError(
            f"Could not find '{color_var}' in .obs columns or .var_names of the RNA data"
        )
    # Backup raw counts to "counts" layer
    # (assumes .X holds raw counts, as in the referenced tutorial)
    rna.layers["counts"] = rna.X.copy()
    # "seurat_v3" selects HVGs on count data; the other flavors expect
    # logarithmized data, so for them selection runs after log1p below.
    hvg_on_counts = flavor == "seurat_v3"
    if hvg_on_counts:
        sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor)
    # Normalize and log-transform
    sc.pp.normalize_total(rna)
    sc.pp.log1p(rna)
    if not hvg_on_counts:
        sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor)
    # Scale
    sc.pp.scale(rna)
    # Perform PCA
    sc.tl.pca(rna, n_comps=n_comps, svd_solver=svd_solver)
    # Generate UMAP visualization
    sc.pp.neighbors(rna, metric="cosine")
    sc.tl.umap(rna)
    # Save UMAP plot
    fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{run_stamp}.png"
    sc.pl.umap(rna, color=color_var, show=False)
    try:
        plt.savefig(fig_output, dpi=300, bbox_inches="tight")
    finally:
        # Release the figure even when saving fails.
        plt.close()
    # Save preprocessed data
    rna_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{run_stamp}.h5ad"
    rna.write(str(rna_output), compression="gzip")
    return {
        "message": f"Preprocessed RNA data: {n_top_genes} HVGs, {n_comps} PCs, UMAP generated",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {"description": "Preprocessed RNA data", "path": str(rna_output.resolve())},
            {
                "description": "RNA UMAP visualization",
                "path": str(fig_output.resolve()),
            },
        ],
    }
@preprocessing_mcp.tool
def glue_preprocess_scatac(
    atac_path: Annotated[
        str | None, "Path to scATAC-seq data file in h5ad format"
    ] = None,
    n_components: Annotated[int, "Number of LSI components"] = 100,
    n_iter: Annotated[int, "Number of iterations for randomized SVD in LSI"] = 15,
    color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Preprocess scATAC-seq data with latent semantic indexing (LSI) dimension reduction.
    Input is scATAC-seq data in h5ad format and output is preprocessed data with LSI embedding and UMAP visualization.

    Returns a dict with "message", "reference" and "artifacts"; each artifact
    carries a description and the absolute path of a written file (the
    preprocessed .h5ad and the UMAP .png).

    Raises:
        ValueError: if atac_path is not provided.
        FileNotFoundError: if atac_path does not exist.
        KeyError: if color_var is neither an obs column nor a variable name.
    """
    # Input validation
    if atac_path is None:
        raise ValueError("Path to scATAC-seq data file must be provided")
    # File existence validation
    atac_file = Path(atac_path)
    if not atac_file.exists():
        raise FileNotFoundError(f"ATAC data file not found: {atac_path}")
    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_atac"
    # Stamp the outputs per call: a stamp taken at import time would make
    # repeated calls in one server process overwrite each other's files.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Load data
    atac = ad.read_h5ad(atac_path)
    # Fail fast on an unknown coloring key instead of discovering it at plot
    # time, after LSI / neighbors / UMAP have already been computed.
    in_raw = atac.raw is not None and color_var in atac.raw.var_names
    if (
        color_var not in atac.obs.columns
        and color_var not in atac.var_names
        and not in_raw
    ):
        raise KeyError(
            f"Could not find '{color_var}' in .obs columns or .var_names of the ATAC data"
        )
    # Perform LSI dimension reduction
    scglue.data.lsi(atac, n_components=n_components, n_iter=n_iter)
    # Generate UMAP visualization from the "X_lsi" representation
    sc.pp.neighbors(atac, use_rep="X_lsi", metric="cosine")
    sc.tl.umap(atac)
    # Save UMAP plot
    fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{run_stamp}.png"
    sc.pl.umap(atac, color=color_var, show=False)
    try:
        plt.savefig(fig_output, dpi=300, bbox_inches="tight")
    finally:
        # Release the figure even when saving fails.
        plt.close()
    # Save preprocessed data
    atac_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{run_stamp}.h5ad"
    atac.write(str(atac_output), compression="gzip")
    return {
        "message": f"Preprocessed ATAC data: {n_components} LSI components, UMAP generated",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {
                "description": "Preprocessed ATAC data",
                "path": str(atac_output.resolve()),
            },
            {
                "description": "ATAC UMAP visualization",
                "path": str(fig_output.resolve()),
            },
        ],
    }
@preprocessing_mcp.tool
def glue_construct_regulatory_graph(
    rna_path: Annotated[
        str | None, "Path to preprocessed scRNA-seq data file in h5ad format"
    ] = None,
    atac_path: Annotated[
        str | None, "Path to preprocessed scATAC-seq data file in h5ad format"
    ] = None,
    gtf_path: Annotated[
        str | None, "Path to GTF annotation file for gene coordinates"
    ] = None,
    gtf_by: Annotated[str, "GTF attribute to match gene names"] = "gene_name",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Construct prior regulatory graph linking RNA genes and ATAC peaks via genomic proximity.
    Input is preprocessed RNA and ATAC data with GTF annotation and output is NetworkX guidance graph.

    ATAC peak coordinates are parsed from the peak names, which must look like
    "chrom:start-end" (":" and "-" are both accepted as separators); the
    trailing two integer fields are taken as start and end, so chromosome /
    contig names may themselves contain "-" or ":".

    Returns a dict with "message", "reference" and "artifacts"; each artifact
    carries a description and the absolute path of a written file (the
    guidance graph and the annotated RNA / ATAC .h5ad files).

    Raises:
        ValueError: if a required path is not provided, or if any ATAC peak
            name does not match the expected "chrom:start-end" form.
        FileNotFoundError: if any of the given paths does not exist.
    """
    # Input validation
    if rna_path is None:
        raise ValueError("Path to preprocessed scRNA-seq data file must be provided")
    if atac_path is None:
        raise ValueError("Path to preprocessed scATAC-seq data file must be provided")
    if gtf_path is None:
        raise ValueError("Path to GTF annotation file must be provided")
    # File existence validation
    rna_file = Path(rna_path)
    if not rna_file.exists():
        raise FileNotFoundError(f"RNA data file not found: {rna_path}")
    atac_file = Path(atac_path)
    if not atac_file.exists():
        raise FileNotFoundError(f"ATAC data file not found: {atac_path}")
    gtf_file = Path(gtf_path)
    if not gtf_file.exists():
        raise FileNotFoundError(f"GTF annotation file not found: {gtf_path}")
    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_guidance"
    # Stamp the outputs per call: a stamp taken at import time would make
    # repeated calls in one server process overwrite each other's files.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Load data
    rna = ad.read_h5ad(rna_path)
    atac = ad.read_h5ad(atac_path)
    # Get gene annotation from GTF
    # NOTE(review): genes absent from the GTF presumably end up without
    # coordinates, which may break graph construction below - TODO confirm
    # whether such genes should be filtered out first.
    scglue.data.get_gene_annotation(rna, gtf=gtf_path, gtf_by=gtf_by)
    # Extract ATAC peak coordinates from var_names. The pattern is anchored to
    # the end of the name so that only the last two integer fields are read as
    # start / end; a plain split on ":" or "-" would cut contig names that
    # contain those characters.
    coords = atac.var_names.str.extract(r"^(.+)[:-](\d+)[:-](\d+)$", expand=True)
    malformed = coords.isna().any(axis=1).to_numpy()
    if malformed.any():
        examples = ", ".join(map(str, atac.var_names[malformed][:5]))
        raise ValueError(
            f"{int(malformed.sum())} ATAC peak name(s) are not of the form "
            f"'chrom:start-end' (e.g. {examples})"
        )
    # Assign positionally: the extracted frame has its own default index,
    # not the peak names.
    atac.var["chrom"] = coords[0].to_numpy()
    atac.var["chromStart"] = coords[1].astype(int).to_numpy()
    atac.var["chromEnd"] = coords[2].astype(int).to_numpy()
    # Construct guidance graph
    guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
    # Verify graph compliance
    scglue.graph.check_graph(guidance, [rna, atac])
    # Save guidance graph
    graph_output = OUTPUT_DIR / f"{out_prefix}_graph_{run_stamp}.graphml.gz"
    nx.write_graphml(guidance, str(graph_output))
    # Save updated data with coordinates
    rna_output = OUTPUT_DIR / f"{out_prefix}_rna_annotated_{run_stamp}.h5ad"
    atac_output = OUTPUT_DIR / f"{out_prefix}_atac_annotated_{run_stamp}.h5ad"
    rna.write(str(rna_output), compression="gzip")
    atac.write(str(atac_output), compression="gzip")
    return {
        "message": f"Constructed guidance graph with {guidance.number_of_nodes()} nodes and {guidance.number_of_edges()} edges",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {"description": "Guidance graph", "path": str(graph_output.resolve())},
            {
                "description": "RNA data with coordinates",
                "path": str(rna_output.resolve()),
            },
            {
                "description": "ATAC data with coordinates",
                "path": str(atac_output.resolve()),
            },
        ],
    }