Spaces:
Running
Running
| """ | |
| GLUE preprocessing tutorial for scRNA-seq and scATAC-seq data integration. | |
| This MCP Server provides 3 tools: | |
| 1. glue_preprocess_scrna: Preprocess scRNA-seq data with HVG selection, normalization, and PCA | |
| 2. glue_preprocess_scatac: Preprocess scATAC-seq data with LSI dimension reduction | |
| 3. glue_construct_regulatory_graph: Construct prior regulatory graph linking RNA and ATAC features | |
| All tools extracted from `gao-lab/GLUE/blob/master/docs/preprocessing.ipynb`. | |
| """ | |
| import os | |
| from datetime import datetime | |
| from pathlib import Path | |
| # Standard imports | |
| from typing import Annotated, Any, Literal | |
| import anndata as ad | |
| # Domain-specific imports | |
| import matplotlib.pyplot as plt | |
| import networkx as nx | |
| import numpy as np | |
| import pandas as pd | |
| import scanpy as sc | |
| import scglue | |
| from fastmcp import FastMCP | |
| from matplotlib import rcParams | |
# Project structure: the root is taken to be three directory levels above
# this file (NOTE(review): confirm this matches the repository layout).
PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve()
DEFAULT_INPUT_DIR = PROJECT_ROOT / "tmp" / "inputs"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "tmp" / "outputs"
# The PREPROCESSING_INPUT_DIR / PREPROCESSING_OUTPUT_DIR environment variables
# override the default locations when they are set.
INPUT_DIR = Path(os.environ.get("PREPROCESSING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("PREPROCESSING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))
# Ensure directories exist (import-time side effect: both are created,
# including missing parents, as soon as this module is loaded).
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Timestamp used in output file names. It is evaluated once, when the module
# is imported, so every use of this name within one process sees the same
# second-resolution value.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Set plotting parameters (global matplotlib state, applied at import time).
plt.rcParams["figure.dpi"] = 300
plt.rcParams["savefig.dpi"] = 300
scglue.plot.set_publication_params()
# Assigned after set_publication_params() so this figure size is the value
# left in rcParams.
rcParams["figure.figsize"] = (4, 4)
# MCP server instance
preprocessing_mcp = FastMCP(name="preprocessing")
| def glue_preprocess_scrna( | |
| rna_path: Annotated[ | |
| str | None, "Path to scRNA-seq data file in h5ad format" | |
| ] = None, | |
| n_top_genes: Annotated[int, "Number of highly variable genes to select"] = 2000, | |
| flavor: Annotated[ | |
| Literal["seurat", "cell_ranger", "seurat_v3"], "Method for HVG selection" | |
| ] = "seurat_v3", | |
| n_comps: Annotated[int, "Number of principal components"] = 100, | |
| svd_solver: Annotated[ | |
| Literal["auto", "arpack", "randomized"], "SVD solver for PCA" | |
| ] = "auto", | |
| color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type", | |
| out_prefix: Annotated[str | None, "Output file prefix"] = None, | |
| ) -> dict: | |
| """ | |
| Preprocess scRNA-seq data with highly variable gene selection, normalization, scaling, and PCA. | |
| Input is scRNA-seq data in h5ad format and output is preprocessed data with PCA embedding and UMAP visualization. | |
| """ | |
| # Input validation | |
| if rna_path is None: | |
| raise ValueError("Path to scRNA-seq data file must be provided") | |
| # File existence validation | |
| rna_file = Path(rna_path) | |
| if not rna_file.exists(): | |
| raise FileNotFoundError(f"RNA data file not found: {rna_path}") | |
| # Set output prefix | |
| if out_prefix is None: | |
| out_prefix = "glue_rna" | |
| # Load data | |
| rna = ad.read_h5ad(rna_path) | |
| # Backup raw counts to "counts" layer | |
| rna.layers["counts"] = rna.X.copy() | |
| # Select highly variable genes | |
| sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor) | |
| # Normalize, log-transform, and scale | |
| sc.pp.normalize_total(rna) | |
| sc.pp.log1p(rna) | |
| sc.pp.scale(rna) | |
| # Perform PCA | |
| sc.tl.pca(rna, n_comps=n_comps, svd_solver=svd_solver) | |
| # Generate UMAP visualization | |
| sc.pp.neighbors(rna, metric="cosine") | |
| sc.tl.umap(rna) | |
| # Save UMAP plot | |
| fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{timestamp}.png" | |
| sc.pl.umap(rna, color=color_var, show=False) | |
| plt.savefig(fig_output, dpi=300, bbox_inches="tight") | |
| plt.close() | |
| # Save preprocessed data | |
| rna_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{timestamp}.h5ad" | |
| rna.write(str(rna_output), compression="gzip") | |
| return { | |
| "message": f"Preprocessed RNA data: {n_top_genes} HVGs, {n_comps} PCs, UMAP generated", | |
| "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb", | |
| "artifacts": [ | |
| {"description": "Preprocessed RNA data", "path": str(rna_output.resolve())}, | |
| { | |
| "description": "RNA UMAP visualization", | |
| "path": str(fig_output.resolve()), | |
| }, | |
| ], | |
| } | |
| def glue_preprocess_scatac( | |
| atac_path: Annotated[ | |
| str | None, "Path to scATAC-seq data file in h5ad format" | |
| ] = None, | |
| n_components: Annotated[int, "Number of LSI components"] = 100, | |
| n_iter: Annotated[int, "Number of iterations for randomized SVD in LSI"] = 15, | |
| color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type", | |
| out_prefix: Annotated[str | None, "Output file prefix"] = None, | |
| ) -> dict: | |
| """ | |
| Preprocess scATAC-seq data with latent semantic indexing (LSI) dimension reduction. | |
| Input is scATAC-seq data in h5ad format and output is preprocessed data with LSI embedding and UMAP visualization. | |
| """ | |
| # Input validation | |
| if atac_path is None: | |
| raise ValueError("Path to scATAC-seq data file must be provided") | |
| # File existence validation | |
| atac_file = Path(atac_path) | |
| if not atac_file.exists(): | |
| raise FileNotFoundError(f"ATAC data file not found: {atac_path}") | |
| # Set output prefix | |
| if out_prefix is None: | |
| out_prefix = "glue_atac" | |
| # Load data | |
| atac = ad.read_h5ad(atac_path) | |
| # Perform LSI dimension reduction | |
| scglue.data.lsi(atac, n_components=n_components, n_iter=n_iter) | |
| # Generate UMAP visualization | |
| sc.pp.neighbors(atac, use_rep="X_lsi", metric="cosine") | |
| sc.tl.umap(atac) | |
| # Save UMAP plot | |
| fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{timestamp}.png" | |
| sc.pl.umap(atac, color=color_var, show=False) | |
| plt.savefig(fig_output, dpi=300, bbox_inches="tight") | |
| plt.close() | |
| # Save preprocessed data | |
| atac_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{timestamp}.h5ad" | |
| atac.write(str(atac_output), compression="gzip") | |
| return { | |
| "message": f"Preprocessed ATAC data: {n_components} LSI components, UMAP generated", | |
| "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb", | |
| "artifacts": [ | |
| { | |
| "description": "Preprocessed ATAC data", | |
| "path": str(atac_output.resolve()), | |
| }, | |
| { | |
| "description": "ATAC UMAP visualization", | |
| "path": str(fig_output.resolve()), | |
| }, | |
| ], | |
| } | |
| def glue_construct_regulatory_graph( | |
| rna_path: Annotated[ | |
| str | None, "Path to preprocessed scRNA-seq data file in h5ad format" | |
| ] = None, | |
| atac_path: Annotated[ | |
| str | None, "Path to preprocessed scATAC-seq data file in h5ad format" | |
| ] = None, | |
| gtf_path: Annotated[ | |
| str | None, "Path to GTF annotation file for gene coordinates" | |
| ] = None, | |
| gtf_by: Annotated[str, "GTF attribute to match gene names"] = "gene_name", | |
| out_prefix: Annotated[str | None, "Output file prefix"] = None, | |
| ) -> dict: | |
| """ | |
| Construct prior regulatory graph linking RNA genes and ATAC peaks via genomic proximity. | |
| Input is preprocessed RNA and ATAC data with GTF annotation and output is NetworkX guidance graph. | |
| """ | |
| # Input validation | |
| if rna_path is None: | |
| raise ValueError("Path to preprocessed scRNA-seq data file must be provided") | |
| if atac_path is None: | |
| raise ValueError("Path to preprocessed scATAC-seq data file must be provided") | |
| if gtf_path is None: | |
| raise ValueError("Path to GTF annotation file must be provided") | |
| # File existence validation | |
| rna_file = Path(rna_path) | |
| if not rna_file.exists(): | |
| raise FileNotFoundError(f"RNA data file not found: {rna_path}") | |
| atac_file = Path(atac_path) | |
| if not atac_file.exists(): | |
| raise FileNotFoundError(f"ATAC data file not found: {atac_path}") | |
| gtf_file = Path(gtf_path) | |
| if not gtf_file.exists(): | |
| raise FileNotFoundError(f"GTF annotation file not found: {gtf_path}") | |
| # Set output prefix | |
| if out_prefix is None: | |
| out_prefix = "glue_guidance" | |
| # Load data | |
| rna = ad.read_h5ad(rna_path) | |
| atac = ad.read_h5ad(atac_path) | |
| # Get gene annotation from GTF | |
| scglue.data.get_gene_annotation(rna, gtf=gtf_path, gtf_by=gtf_by) | |
| # Extract ATAC peak coordinates from var_names | |
| split = atac.var_names.str.split(r"[:-]") | |
| atac.var["chrom"] = split.map(lambda x: x[0]) | |
| atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int) | |
| atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int) | |
| # Construct guidance graph | |
| guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac) | |
| # Verify graph compliance | |
| scglue.graph.check_graph(guidance, [rna, atac]) | |
| # Save guidance graph | |
| graph_output = OUTPUT_DIR / f"{out_prefix}_graph_{timestamp}.graphml.gz" | |
| nx.write_graphml(guidance, str(graph_output)) | |
| # Save updated data with coordinates | |
| rna_output = OUTPUT_DIR / f"{out_prefix}_rna_annotated_{timestamp}.h5ad" | |
| atac_output = OUTPUT_DIR / f"{out_prefix}_atac_annotated_{timestamp}.h5ad" | |
| rna.write(str(rna_output), compression="gzip") | |
| atac.write(str(atac_output), compression="gzip") | |
| return { | |
| "message": f"Constructed guidance graph with {guidance.number_of_nodes()} nodes and {guidance.number_of_edges()} edges", | |
| "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb", | |
| "artifacts": [ | |
| {"description": "Guidance graph", "path": str(graph_output.resolve())}, | |
| { | |
| "description": "RNA data with coordinates", | |
| "path": str(rna_output.resolve()), | |
| }, | |
| { | |
| "description": "ATAC data with coordinates", | |
| "path": str(atac_output.resolve()), | |
| }, | |
| ], | |
| } | |