# Spaces: dmannk / Paper2Agent-scglue-mcp (Sleeping)
# File size: 9,963 Bytes
# dee34fb

"""
GLUE preprocessing tutorial for scRNA-seq and scATAC-seq data integration.

This MCP Server provides 3 tools:
1. glue_preprocess_scrna: Preprocess scRNA-seq data with HVG selection, normalization, and PCA
2. glue_preprocess_scatac: Preprocess scATAC-seq data with LSI dimension reduction
3. glue_construct_regulatory_graph: Construct prior regulatory graph linking RNA and ATAC features

All tools extracted from `gao-lab/GLUE/blob/master/docs/preprocessing.ipynb`.
"""

import os
from datetime import datetime
from pathlib import Path
# Standard imports
from typing import Annotated, Any, Literal

import anndata as ad
# Domain-specific imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scanpy as sc
import scglue
from fastmcp import FastMCP
from matplotlib import rcParams

# Project structure: the root is the third ancestor directory of this file, and
# the default input/output folders live under <root>/tmp.
PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve()
DEFAULT_INPUT_DIR = PROJECT_ROOT / "tmp" / "inputs"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "tmp" / "outputs"

# The PREPROCESSING_INPUT_DIR / PREPROCESSING_OUTPUT_DIR environment variables
# take precedence over the defaults when they are set.
INPUT_DIR = Path(os.environ.get("PREPROCESSING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("PREPROCESSING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist (done as an import-time side effect; missing parent
# directories are created as well)
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs
# NOTE(review): this is evaluated once, when the module is imported, so the
# value stays the same for the lifetime of the process.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Set plotting parameters. The figure size is assigned after scglue's
# publication preset, so (4, 4) is the value left in rcParams.
plt.rcParams["figure.dpi"] = 300
plt.rcParams["savefig.dpi"] = 300
scglue.plot.set_publication_params()
rcParams["figure.figsize"] = (4, 4)

# MCP server instance; tool functions register on it via @preprocessing_mcp.tool
preprocessing_mcp = FastMCP(name="preprocessing")


@preprocessing_mcp.tool
def glue_preprocess_scrna(
    rna_path: Annotated[
        str | None, "Path to scRNA-seq data file in h5ad format"
    ] = None,
    n_top_genes: Annotated[int, "Number of highly variable genes to select"] = 2000,
    flavor: Annotated[
        Literal["seurat", "cell_ranger", "seurat_v3"], "Method for HVG selection"
    ] = "seurat_v3",
    n_comps: Annotated[int, "Number of principal components"] = 100,
    svd_solver: Annotated[
        Literal["auto", "arpack", "randomized"], "SVD solver for PCA"
    ] = "auto",
    color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Preprocess scRNA-seq data with highly variable gene selection, normalization, scaling, and PCA.
    Input is scRNA-seq data in h5ad format and output is preprocessed data with PCA embedding and UMAP visualization.

    Args:
        rna_path: Path to the input h5ad file. Required despite the ``None`` default.
        n_top_genes: Number of highly variable genes to select.
        flavor: HVG selection method passed to ``sc.pp.highly_variable_genes``.
        n_comps: Number of principal components to compute.
        svd_solver: SVD solver passed to ``sc.tl.pca``.
        color_var: Key used to color the UMAP. When it is found neither in
            ``.obs`` columns nor in the variable names, the UMAP is drawn
            without coloring instead of failing.
        out_prefix: Prefix for output file names; defaults to ``"glue_rna"``.

    Returns:
        A dict with ``message``, ``reference`` and ``artifacts`` (absolute paths
        of the preprocessed h5ad file and the UMAP PNG).

    Raises:
        ValueError: If ``rna_path`` is not provided.
        FileNotFoundError: If ``rna_path`` does not exist.
    """
    # Input validation
    if rna_path is None:
        raise ValueError("Path to scRNA-seq data file must be provided")

    # File existence validation
    rna_file = Path(rna_path)
    if not rna_file.exists():
        raise FileNotFoundError(f"RNA data file not found: {rna_path}")

    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_rna"

    # Stamp outputs per call so repeated invocations within one server process
    # do not overwrite each other's artifacts.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load data
    rna = ad.read_h5ad(rna_path)

    # Backup raw counts to "counts" layer
    rna.layers["counts"] = rna.X.copy()

    # "seurat_v3" selects HVGs from raw counts, whereas "seurat" and
    # "cell_ranger" expect log-normalized data, so the selection step has to
    # run on the matching side of normalization.
    hvg_on_counts = flavor == "seurat_v3"
    if hvg_on_counts:
        sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor)

    # Normalize and log-transform
    sc.pp.normalize_total(rna)
    sc.pp.log1p(rna)

    if not hvg_on_counts:
        sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor)

    # Scale
    sc.pp.scale(rna)

    # Perform PCA
    sc.tl.pca(rna, n_comps=n_comps, svd_solver=svd_solver)

    # Generate UMAP visualization
    sc.pp.neighbors(rna, metric="cosine")
    sc.tl.umap(rna)

    # Only color by a key the plot can resolve; an unknown key would otherwise
    # raise here, after all the computation and before anything is saved.
    color_known = color_var in rna.obs.columns or color_var in rna.var_names
    if not color_known and rna.raw is not None:
        color_known = color_var in rna.raw.var_names
    plot_color = color_var if color_known else None

    # Save UMAP plot
    fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{run_stamp}.png"
    sc.pl.umap(rna, color=plot_color, show=False)
    plt.savefig(fig_output, dpi=300, bbox_inches="tight")
    plt.close()

    # Save preprocessed data
    rna_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{run_stamp}.h5ad"
    rna.write(str(rna_output), compression="gzip")

    message = f"Preprocessed RNA data: {n_top_genes} HVGs, {n_comps} PCs, UMAP generated"
    if plot_color is None:
        message += f" ('{color_var}' not found in the data; UMAP plotted without coloring)"

    return {
        "message": message,
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {"description": "Preprocessed RNA data", "path": str(rna_output.resolve())},
            {
                "description": "RNA UMAP visualization",
                "path": str(fig_output.resolve()),
            },
        ],
    }


@preprocessing_mcp.tool
def glue_preprocess_scatac(
    atac_path: Annotated[
        str | None, "Path to scATAC-seq data file in h5ad format"
    ] = None,
    n_components: Annotated[int, "Number of LSI components"] = 100,
    n_iter: Annotated[int, "Number of iterations for randomized SVD in LSI"] = 15,
    color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Preprocess scATAC-seq data with latent semantic indexing (LSI) dimension reduction.
    Input is scATAC-seq data in h5ad format and output is preprocessed data with LSI embedding and UMAP visualization.

    Args:
        atac_path: Path to the input h5ad file. Required despite the ``None`` default.
        n_components: Number of LSI components passed to ``scglue.data.lsi``.
        n_iter: Number of iterations passed to ``scglue.data.lsi``.
        color_var: Key used to color the UMAP. When it is found neither in
            ``.obs`` columns nor in the variable names, the UMAP is drawn
            without coloring instead of failing.
        out_prefix: Prefix for output file names; defaults to ``"glue_atac"``.

    Returns:
        A dict with ``message``, ``reference`` and ``artifacts`` (absolute paths
        of the preprocessed h5ad file and the UMAP PNG).

    Raises:
        ValueError: If ``atac_path`` is not provided.
        FileNotFoundError: If ``atac_path`` does not exist.
    """
    # Input validation
    if atac_path is None:
        raise ValueError("Path to scATAC-seq data file must be provided")

    # File existence validation
    atac_file = Path(atac_path)
    if not atac_file.exists():
        raise FileNotFoundError(f"ATAC data file not found: {atac_path}")

    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_atac"

    # Stamp outputs per call so repeated invocations within one server process
    # do not overwrite each other's artifacts.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load data
    atac = ad.read_h5ad(atac_path)

    # Perform LSI dimension reduction
    scglue.data.lsi(atac, n_components=n_components, n_iter=n_iter)

    # Generate UMAP visualization on the LSI embedding
    sc.pp.neighbors(atac, use_rep="X_lsi", metric="cosine")
    sc.tl.umap(atac)

    # Only color by a key the plot can resolve; an unknown key would otherwise
    # raise here, after all the computation and before anything is saved.
    color_known = color_var in atac.obs.columns or color_var in atac.var_names
    if not color_known and atac.raw is not None:
        color_known = color_var in atac.raw.var_names
    plot_color = color_var if color_known else None

    # Save UMAP plot
    fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{run_stamp}.png"
    sc.pl.umap(atac, color=plot_color, show=False)
    plt.savefig(fig_output, dpi=300, bbox_inches="tight")
    plt.close()

    # Save preprocessed data
    atac_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{run_stamp}.h5ad"
    atac.write(str(atac_output), compression="gzip")

    message = f"Preprocessed ATAC data: {n_components} LSI components, UMAP generated"
    if plot_color is None:
        message += f" ('{color_var}' not found in the data; UMAP plotted without coloring)"

    return {
        "message": message,
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {
                "description": "Preprocessed ATAC data",
                "path": str(atac_output.resolve()),
            },
            {
                "description": "ATAC UMAP visualization",
                "path": str(fig_output.resolve()),
            },
        ],
    }


@preprocessing_mcp.tool
def glue_construct_regulatory_graph(
    rna_path: Annotated[
        str | None, "Path to preprocessed scRNA-seq data file in h5ad format"
    ] = None,
    atac_path: Annotated[
        str | None, "Path to preprocessed scATAC-seq data file in h5ad format"
    ] = None,
    gtf_path: Annotated[
        str | None, "Path to GTF annotation file for gene coordinates"
    ] = None,
    gtf_by: Annotated[str, "GTF attribute to match gene names"] = "gene_name",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Construct prior regulatory graph linking RNA genes and ATAC peaks via genomic proximity.
    Input is preprocessed RNA and ATAC data with GTF annotation and output is NetworkX guidance graph.

    Args:
        rna_path: Path to the preprocessed scRNA-seq h5ad file. Required.
        atac_path: Path to the preprocessed scATAC-seq h5ad file. Required. Its
            variable names must split on ``:`` / ``-`` into chromosome, start
            and end (for example ``chr1:100-200``).
        gtf_path: Path to the GTF annotation file. Required.
        gtf_by: GTF attribute passed to ``scglue.data.get_gene_annotation``.
        out_prefix: Prefix for output file names; defaults to ``"glue_guidance"``.

    Returns:
        A dict with ``message``, ``reference`` and ``artifacts`` (absolute paths
        of the guidance graph and of the annotated RNA and ATAC h5ad files).

    Raises:
        ValueError: If a required path is missing, or if ATAC variable names
            cannot be split into chromosome, start and end.
        FileNotFoundError: If any of the given paths does not exist.
    """
    # Input validation
    if rna_path is None:
        raise ValueError("Path to preprocessed scRNA-seq data file must be provided")
    if atac_path is None:
        raise ValueError("Path to preprocessed scATAC-seq data file must be provided")
    if gtf_path is None:
        raise ValueError("Path to GTF annotation file must be provided")

    # File existence validation
    rna_file = Path(rna_path)
    if not rna_file.exists():
        raise FileNotFoundError(f"RNA data file not found: {rna_path}")

    atac_file = Path(atac_path)
    if not atac_file.exists():
        raise FileNotFoundError(f"ATAC data file not found: {atac_path}")

    gtf_file = Path(gtf_path)
    if not gtf_file.exists():
        raise FileNotFoundError(f"GTF annotation file not found: {gtf_path}")

    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_guidance"

    # Stamp outputs per call so repeated invocations within one server process
    # do not overwrite each other's artifacts.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load data
    rna = ad.read_h5ad(rna_path)
    atac = ad.read_h5ad(atac_path)

    # Get gene annotation from GTF
    scglue.data.get_gene_annotation(rna, gtf=gtf_path, gtf_by=gtf_by)

    # Extract ATAC peak coordinates from var_names
    split = atac.var_names.str.split(r"[:-]")

    # Fail early with a readable message when peak names do not carry three
    # fields, instead of an IndexError raised from inside the mapping below.
    malformed = [
        name for name, parts in zip(atac.var_names, split) if len(parts) < 3
    ]
    if malformed:
        examples = ", ".join(str(name) for name in malformed[:5])
        raise ValueError(
            "ATAC variable names must look like 'chrom:start-end'; "
            f"{len(malformed)} name(s) could not be parsed, e.g.: {examples}"
        )

    atac.var["chrom"] = split.map(lambda x: x[0])
    atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int)
    atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int)

    # Construct guidance graph
    guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)

    # Verify graph compliance
    scglue.graph.check_graph(guidance, [rna, atac])

    # Save guidance graph
    graph_output = OUTPUT_DIR / f"{out_prefix}_graph_{run_stamp}.graphml.gz"
    nx.write_graphml(guidance, str(graph_output))

    # Save updated data with coordinates
    rna_output = OUTPUT_DIR / f"{out_prefix}_rna_annotated_{run_stamp}.h5ad"
    atac_output = OUTPUT_DIR / f"{out_prefix}_atac_annotated_{run_stamp}.h5ad"
    rna.write(str(rna_output), compression="gzip")
    atac.write(str(atac_output), compression="gzip")

    return {
        "message": f"Constructed guidance graph with {guidance.number_of_nodes()} nodes and {guidance.number_of_edges()} edges",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {"description": "Guidance graph", "path": str(graph_output.resolve())},
            {
                "description": "RNA data with coordinates",
                "path": str(rna_output.resolve()),
            },
            {
                "description": "ATAC data with coordinates",
                "path": str(atac_output.resolve()),
            },
        ],
    }