""" GLUE preprocessing tutorial for scRNA-seq and scATAC-seq data integration. This MCP Server provides 3 tools: 1. glue_preprocess_scrna: Preprocess scRNA-seq data with HVG selection, normalization, and PCA 2. glue_preprocess_scatac: Preprocess scATAC-seq data with LSI dimension reduction 3. glue_construct_regulatory_graph: Construct prior regulatory graph linking RNA and ATAC features All tools extracted from `gao-lab/GLUE/blob/master/docs/preprocessing.ipynb`. """ import os from datetime import datetime from pathlib import Path # Standard imports from typing import Annotated, Any, Literal import anndata as ad # Domain-specific imports import matplotlib.pyplot as plt import networkx as nx import numpy as np import pandas as pd import scanpy as sc import scglue from fastmcp import FastMCP from matplotlib import rcParams # Project structure PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve() DEFAULT_INPUT_DIR = PROJECT_ROOT / "tmp" / "inputs" DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "tmp" / "outputs" INPUT_DIR = Path(os.environ.get("PREPROCESSING_INPUT_DIR", DEFAULT_INPUT_DIR)) OUTPUT_DIR = Path(os.environ.get("PREPROCESSING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR)) # Ensure directories exist INPUT_DIR.mkdir(parents=True, exist_ok=True) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # Timestamp for unique outputs timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # Set plotting parameters plt.rcParams["figure.dpi"] = 300 plt.rcParams["savefig.dpi"] = 300 scglue.plot.set_publication_params() rcParams["figure.figsize"] = (4, 4) # MCP server instance preprocessing_mcp = FastMCP(name="preprocessing") @preprocessing_mcp.tool def glue_preprocess_scrna( rna_path: Annotated[ str | None, "Path to scRNA-seq data file in h5ad format" ] = None, n_top_genes: Annotated[int, "Number of highly variable genes to select"] = 2000, flavor: Annotated[ Literal["seurat", "cell_ranger", "seurat_v3"], "Method for HVG selection" ] = "seurat_v3", n_comps: Annotated[int, "Number of principal components"] = 100, svd_solver: Annotated[ Literal["auto", "arpack", "randomized"], "SVD solver for PCA" ] = "auto", color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type", out_prefix: Annotated[str | None, "Output file prefix"] = None, ) -> dict: """ Preprocess scRNA-seq data with highly variable gene selection, normalization, scaling, and PCA. Input is scRNA-seq data in h5ad format and output is preprocessed data with PCA embedding and UMAP visualization. """ # Input validation if rna_path is None: raise ValueError("Path to scRNA-seq data file must be provided") # File existence validation rna_file = Path(rna_path) if not rna_file.exists(): raise FileNotFoundError(f"RNA data file not found: {rna_path}") # Set output prefix if out_prefix is None: out_prefix = "glue_rna" # Load data rna = ad.read_h5ad(rna_path) # Backup raw counts to "counts" layer rna.layers["counts"] = rna.X.copy() # Select highly variable genes sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor) # Normalize, log-transform, and scale sc.pp.normalize_total(rna) sc.pp.log1p(rna) sc.pp.scale(rna) # Perform PCA sc.tl.pca(rna, n_comps=n_comps, svd_solver=svd_solver) # Generate UMAP visualization sc.pp.neighbors(rna, metric="cosine") sc.tl.umap(rna) # Save UMAP plot fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{timestamp}.png" sc.pl.umap(rna, color=color_var, show=False) plt.savefig(fig_output, dpi=300, bbox_inches="tight") plt.close() # Save preprocessed data rna_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{timestamp}.h5ad" rna.write(str(rna_output), compression="gzip") return { "message": f"Preprocessed RNA data: {n_top_genes} HVGs, {n_comps} PCs, UMAP generated", "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb", "artifacts": [ {"description": "Preprocessed RNA data", "path": str(rna_output.resolve())}, { "description": "RNA UMAP visualization", "path": str(fig_output.resolve()), }, ], } @preprocessing_mcp.tool def glue_preprocess_scatac( atac_path: Annotated[ str | None, "Path to scATAC-seq data file in h5ad format" ] = None, n_components: Annotated[int, "Number of LSI components"] = 100, n_iter: Annotated[int, "Number of iterations for randomized SVD in LSI"] = 15, color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type", out_prefix: Annotated[str | None, "Output file prefix"] = None, ) -> dict: """ Preprocess scATAC-seq data with latent semantic indexing (LSI) dimension reduction. Input is scATAC-seq data in h5ad format and output is preprocessed data with LSI embedding and UMAP visualization. """ # Input validation if atac_path is None: raise ValueError("Path to scATAC-seq data file must be provided") # File existence validation atac_file = Path(atac_path) if not atac_file.exists(): raise FileNotFoundError(f"ATAC data file not found: {atac_path}") # Set output prefix if out_prefix is None: out_prefix = "glue_atac" # Load data atac = ad.read_h5ad(atac_path) # Perform LSI dimension reduction scglue.data.lsi(atac, n_components=n_components, n_iter=n_iter) # Generate UMAP visualization sc.pp.neighbors(atac, use_rep="X_lsi", metric="cosine") sc.tl.umap(atac) # Save UMAP plot fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{timestamp}.png" sc.pl.umap(atac, color=color_var, show=False) plt.savefig(fig_output, dpi=300, bbox_inches="tight") plt.close() # Save preprocessed data atac_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{timestamp}.h5ad" atac.write(str(atac_output), compression="gzip") return { "message": f"Preprocessed ATAC data: {n_components} LSI components, UMAP generated", "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb", "artifacts": [ { "description": "Preprocessed ATAC data", "path": str(atac_output.resolve()), }, { "description": "ATAC UMAP visualization", "path": str(fig_output.resolve()), }, ], } @preprocessing_mcp.tool def glue_construct_regulatory_graph( rna_path: Annotated[ str | None, "Path to preprocessed scRNA-seq data file in h5ad format" ] = None, atac_path: Annotated[ str | None, "Path to preprocessed scATAC-seq data file in h5ad format" ] = None, gtf_path: Annotated[ str | None, "Path to GTF annotation file for gene coordinates" ] = None, gtf_by: Annotated[str, "GTF attribute to match gene names"] = "gene_name", out_prefix: Annotated[str | None, "Output file prefix"] = None, ) -> dict: """ Construct prior regulatory graph linking RNA genes and ATAC peaks via genomic proximity. Input is preprocessed RNA and ATAC data with GTF annotation and output is NetworkX guidance graph. """ # Input validation if rna_path is None: raise ValueError("Path to preprocessed scRNA-seq data file must be provided") if atac_path is None: raise ValueError("Path to preprocessed scATAC-seq data file must be provided") if gtf_path is None: raise ValueError("Path to GTF annotation file must be provided") # File existence validation rna_file = Path(rna_path) if not rna_file.exists(): raise FileNotFoundError(f"RNA data file not found: {rna_path}") atac_file = Path(atac_path) if not atac_file.exists(): raise FileNotFoundError(f"ATAC data file not found: {atac_path}") gtf_file = Path(gtf_path) if not gtf_file.exists(): raise FileNotFoundError(f"GTF annotation file not found: {gtf_path}") # Set output prefix if out_prefix is None: out_prefix = "glue_guidance" # Load data rna = ad.read_h5ad(rna_path) atac = ad.read_h5ad(atac_path) # Get gene annotation from GTF scglue.data.get_gene_annotation(rna, gtf=gtf_path, gtf_by=gtf_by) # Extract ATAC peak coordinates from var_names split = atac.var_names.str.split(r"[:-]") atac.var["chrom"] = split.map(lambda x: x[0]) atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int) atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int) # Construct guidance graph guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac) # Verify graph compliance scglue.graph.check_graph(guidance, [rna, atac]) # Save guidance graph graph_output = OUTPUT_DIR / f"{out_prefix}_graph_{timestamp}.graphml.gz" nx.write_graphml(guidance, str(graph_output)) # Save updated data with coordinates rna_output = OUTPUT_DIR / f"{out_prefix}_rna_annotated_{timestamp}.h5ad" atac_output = OUTPUT_DIR / f"{out_prefix}_atac_annotated_{timestamp}.h5ad" rna.write(str(rna_output), compression="gzip") atac.write(str(atac_output), compression="gzip") return { "message": f"Constructed guidance graph with {guidance.number_of_nodes()} nodes and {guidance.number_of_edges()} edges", "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb", "artifacts": [ {"description": "Guidance graph", "path": str(graph_output.resolve())}, { "description": "RNA data with coordinates", "path": str(rna_output.resolve()), }, { "description": "ATAC data with coordinates", "path": str(atac_output.resolve()), }, ], }