File size: 9,963 Bytes
dee34fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
"""
GLUE preprocessing tutorial for scRNA-seq and scATAC-seq data integration.

This MCP Server provides 3 tools:
1. glue_preprocess_scrna: Preprocess scRNA-seq data with HVG selection, normalization, and PCA
2. glue_preprocess_scatac: Preprocess scATAC-seq data with LSI dimension reduction
3. glue_construct_regulatory_graph: Construct prior regulatory graph linking RNA and ATAC features

All tools extracted from `gao-lab/GLUE/blob/master/docs/preprocessing.ipynb`.
"""

import os
from datetime import datetime
from pathlib import Path
# Standard imports
from typing import Annotated, Any, Literal

import anndata as ad
# Domain-specific imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scanpy as sc
import scglue
from fastmcp import FastMCP
from matplotlib import rcParams

# Project structure
# PROJECT_ROOT is the grandparent of the directory that contains this file;
# the default input/output directories live under <PROJECT_ROOT>/tmp.
PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve()
DEFAULT_INPUT_DIR = PROJECT_ROOT / "tmp" / "inputs"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "tmp" / "outputs"

# The PREPROCESSING_INPUT_DIR / PREPROCESSING_OUTPUT_DIR environment variables,
# when set, take precedence over the defaults above.
INPUT_DIR = Path(os.environ.get("PREPROCESSING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("PREPROCESSING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist
# (import-time side effect: both directories, and any missing parents, are created)
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs
# NOTE: evaluated once, when this module is imported, so this value stays fixed
# for the lifetime of the process (naive local time, one-second resolution).
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Set plotting parameters
# NOTE(review): scglue.plot.set_publication_params() runs after the two dpi
# assignments and may overwrite them; figure.figsize is assigned after it.
# Confirm the effective dpi against the scglue implementation.
plt.rcParams["figure.dpi"] = 300
plt.rcParams["savefig.dpi"] = 300
scglue.plot.set_publication_params()
rcParams["figure.figsize"] = (4, 4)

# MCP server instance
# The tool functions in this module register themselves on it via
# the @preprocessing_mcp.tool decorator.
preprocessing_mcp = FastMCP(name="preprocessing")


@preprocessing_mcp.tool
def glue_preprocess_scrna(
    rna_path: Annotated[
        str | None, "Path to scRNA-seq data file in h5ad format"
    ] = None,
    n_top_genes: Annotated[int, "Number of highly variable genes to select"] = 2000,
    flavor: Annotated[
        Literal["seurat", "cell_ranger", "seurat_v3"], "Method for HVG selection"
    ] = "seurat_v3",
    n_comps: Annotated[int, "Number of principal components"] = 100,
    svd_solver: Annotated[
        Literal["auto", "arpack", "randomized"], "SVD solver for PCA"
    ] = "auto",
    color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Preprocess scRNA-seq data with highly variable gene selection, normalization, scaling, and PCA.
    Input is scRNA-seq data in h5ad format and output is preprocessed data with PCA embedding and UMAP visualization.

    Args:
        rna_path: Path to the input ``.h5ad`` file. Required despite the ``None`` default.
        n_top_genes: Number of highly variable genes to flag.
        flavor: HVG method. ``seurat_v3`` is run on the raw counts; ``seurat`` and
            ``cell_ranger`` are run after normalization and log-transform.
        n_comps: Number of principal components to compute.
        svd_solver: Solver passed to ``sc.tl.pca``.
        color_var: ``obs`` column (or gene name) used to color the UMAP plot.
        out_prefix: Prefix for output file names; defaults to ``"glue_rna"``.

    Returns:
        Dict with a ``message``, a ``reference`` URL and an ``artifacts`` list holding
        the absolute paths of the written ``.h5ad`` file and the UMAP ``.png``.

    Raises:
        ValueError: If ``rna_path`` is missing or a count parameter is not positive.
        FileNotFoundError: If ``rna_path`` does not exist.
        KeyError: If ``color_var`` is neither an ``obs`` column nor a gene name.
    """
    # Input validation
    if rna_path is None:
        raise ValueError("Path to scRNA-seq data file must be provided")
    if n_top_genes <= 0:
        raise ValueError(f"n_top_genes must be a positive integer, got {n_top_genes}")
    if n_comps <= 0:
        raise ValueError(f"n_comps must be a positive integer, got {n_comps}")

    # File existence validation
    rna_file = Path(rna_path)
    if not rna_file.exists():
        raise FileNotFoundError(f"RNA data file not found: {rna_path}")

    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_rna"

    # Stamp outputs per call: a stamp frozen at import time would make repeated
    # calls with the same prefix silently overwrite earlier artifacts.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load data
    rna = ad.read_h5ad(rna_path)

    # Fail fast on an unknown coloring key instead of discovering it only after
    # PCA / neighbors / UMAP have been computed. The check is deliberately
    # permissive (obs columns, var_names, raw var_names).
    color_known = (
        color_var in rna.obs.columns
        or color_var in rna.var_names
        or (rna.raw is not None and color_var in rna.raw.var_names)
    )
    if not color_known:
        raise KeyError(
            f"color_var {color_var!r} not found in obs columns or gene names of {rna_path}"
        )

    # Backup raw counts to "counts" layer
    rna.layers["counts"] = rna.X.copy()

    # "seurat_v3" models raw counts, so it must run before normalization.
    if flavor == "seurat_v3":
        sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor)

    # Normalize and log-transform
    sc.pp.normalize_total(rna)
    sc.pp.log1p(rna)

    # "seurat" and "cell_ranger" expect logarithmized data, so they run here,
    # after log1p but before scaling.
    if flavor != "seurat_v3":
        sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor)

    sc.pp.scale(rna)

    # Perform PCA
    sc.tl.pca(rna, n_comps=n_comps, svd_solver=svd_solver)

    # Generate UMAP visualization
    sc.pp.neighbors(rna, metric="cosine")
    sc.tl.umap(rna)

    # Save UMAP plot; always release the figure, even if saving fails
    fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{run_stamp}.png"
    sc.pl.umap(rna, color=color_var, show=False)
    try:
        plt.savefig(fig_output, dpi=300, bbox_inches="tight")
    finally:
        plt.close()

    # Save preprocessed data
    rna_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{run_stamp}.h5ad"
    rna.write(str(rna_output), compression="gzip")

    return {
        "message": f"Preprocessed RNA data: {n_top_genes} HVGs, {n_comps} PCs, UMAP generated",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {"description": "Preprocessed RNA data", "path": str(rna_output.resolve())},
            {
                "description": "RNA UMAP visualization",
                "path": str(fig_output.resolve()),
            },
        ],
    }


@preprocessing_mcp.tool
def glue_preprocess_scatac(
    atac_path: Annotated[
        str | None, "Path to scATAC-seq data file in h5ad format"
    ] = None,
    n_components: Annotated[int, "Number of LSI components"] = 100,
    n_iter: Annotated[int, "Number of iterations for randomized SVD in LSI"] = 15,
    color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Preprocess scATAC-seq data with latent semantic indexing (LSI) dimension reduction.
    Input is scATAC-seq data in h5ad format and output is preprocessed data with LSI embedding and UMAP visualization.

    Args:
        atac_path: Path to the input ``.h5ad`` file. Required despite the ``None`` default.
        n_components: Number of LSI components passed to ``scglue.data.lsi``.
        n_iter: Number of randomized-SVD iterations passed to ``scglue.data.lsi``.
        color_var: ``obs`` column (or feature name) used to color the UMAP plot.
        out_prefix: Prefix for output file names; defaults to ``"glue_atac"``.

    Returns:
        Dict with a ``message``, a ``reference`` URL and an ``artifacts`` list holding
        the absolute paths of the written ``.h5ad`` file and the UMAP ``.png``.

    Raises:
        ValueError: If ``atac_path`` is missing or ``n_components`` is not positive.
        FileNotFoundError: If ``atac_path`` does not exist.
        KeyError: If ``color_var`` is neither an ``obs`` column nor a feature name.
    """
    # Input validation
    if atac_path is None:
        raise ValueError("Path to scATAC-seq data file must be provided")
    if n_components <= 0:
        raise ValueError(f"n_components must be a positive integer, got {n_components}")

    # File existence validation
    atac_file = Path(atac_path)
    if not atac_file.exists():
        raise FileNotFoundError(f"ATAC data file not found: {atac_path}")

    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_atac"

    # Stamp outputs per call: a stamp frozen at import time would make repeated
    # calls with the same prefix silently overwrite earlier artifacts.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load data
    atac = ad.read_h5ad(atac_path)

    # Fail fast on an unknown coloring key instead of discovering it only after
    # LSI / neighbors / UMAP have been computed. The check is deliberately
    # permissive (obs columns, var_names, raw var_names).
    color_known = (
        color_var in atac.obs.columns
        or color_var in atac.var_names
        or (atac.raw is not None and color_var in atac.raw.var_names)
    )
    if not color_known:
        raise KeyError(
            f"color_var {color_var!r} not found in obs columns or feature names of {atac_path}"
        )

    # Perform LSI dimension reduction
    scglue.data.lsi(atac, n_components=n_components, n_iter=n_iter)

    # Generate UMAP visualization on the LSI embedding
    sc.pp.neighbors(atac, use_rep="X_lsi", metric="cosine")
    sc.tl.umap(atac)

    # Save UMAP plot; always release the figure, even if saving fails
    fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{run_stamp}.png"
    sc.pl.umap(atac, color=color_var, show=False)
    try:
        plt.savefig(fig_output, dpi=300, bbox_inches="tight")
    finally:
        plt.close()

    # Save preprocessed data
    atac_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{run_stamp}.h5ad"
    atac.write(str(atac_output), compression="gzip")

    return {
        "message": f"Preprocessed ATAC data: {n_components} LSI components, UMAP generated",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {
                "description": "Preprocessed ATAC data",
                "path": str(atac_output.resolve()),
            },
            {
                "description": "ATAC UMAP visualization",
                "path": str(fig_output.resolve()),
            },
        ],
    }


@preprocessing_mcp.tool
def glue_construct_regulatory_graph(
    rna_path: Annotated[
        str | None, "Path to preprocessed scRNA-seq data file in h5ad format"
    ] = None,
    atac_path: Annotated[
        str | None, "Path to preprocessed scATAC-seq data file in h5ad format"
    ] = None,
    gtf_path: Annotated[
        str | None, "Path to GTF annotation file for gene coordinates"
    ] = None,
    gtf_by: Annotated[str, "GTF attribute to match gene names"] = "gene_name",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Construct prior regulatory graph linking RNA genes and ATAC peaks via genomic proximity.
    Input is preprocessed RNA and ATAC data with GTF annotation and output is NetworkX guidance graph.

    Args:
        rna_path: Path to the preprocessed RNA ``.h5ad`` file. Required.
        atac_path: Path to the preprocessed ATAC ``.h5ad`` file. Required. Its
            ``var_names`` must look like ``chrom:start-end`` or ``chrom-start-end``.
        gtf_path: Path to the GTF annotation file. Required.
        gtf_by: GTF attribute matched against the RNA ``var_names``.
        out_prefix: Prefix for output file names; defaults to ``"glue_guidance"``.

    Returns:
        Dict with a ``message``, a ``reference`` URL and an ``artifacts`` list holding
        the absolute paths of the graph file and the two annotated ``.h5ad`` files.

    Raises:
        ValueError: If a required path is missing or a peak name cannot be parsed.
        FileNotFoundError: If any of the three input files does not exist.
    """
    # Input validation
    if rna_path is None:
        raise ValueError("Path to preprocessed scRNA-seq data file must be provided")
    if atac_path is None:
        raise ValueError("Path to preprocessed scATAC-seq data file must be provided")
    if gtf_path is None:
        raise ValueError("Path to GTF annotation file must be provided")

    # File existence validation
    rna_file = Path(rna_path)
    if not rna_file.exists():
        raise FileNotFoundError(f"RNA data file not found: {rna_path}")

    atac_file = Path(atac_path)
    if not atac_file.exists():
        raise FileNotFoundError(f"ATAC data file not found: {atac_path}")

    gtf_file = Path(gtf_path)
    if not gtf_file.exists():
        raise FileNotFoundError(f"GTF annotation file not found: {gtf_path}")

    # Set output prefix
    if out_prefix is None:
        out_prefix = "glue_guidance"

    # Stamp outputs per call: a stamp frozen at import time would make repeated
    # calls with the same prefix silently overwrite earlier artifacts.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load data
    rna = ad.read_h5ad(rna_path)
    atac = ad.read_h5ad(atac_path)

    # Get gene annotation from GTF
    scglue.data.get_gene_annotation(rna, gtf=gtf_path, gtf_by=gtf_by)

    # Extract ATAC peak coordinates from var_names.
    # The pattern is anchored on the two trailing integers, so contig names that
    # themselves contain ":" or "-" stay intact in the chrom field.
    peak_parts = atac.var_names.str.extract(
        r"^(.+)[:-](\d+)[:-](\d+)$", expand=True
    )
    malformed = peak_parts.isna().any(axis=1).to_numpy()
    if malformed.any():
        examples = ", ".join(map(str, atac.var_names[malformed][:5]))
        raise ValueError(
            "ATAC var_names must look like 'chrom:start-end' or 'chrom-start-end'; "
            f"could not parse {int(malformed.sum())} peak name(s), e.g. {examples}"
        )
    # .to_numpy() sidesteps index alignment: the extracted frame does not carry
    # atac.var's index.
    atac.var["chrom"] = peak_parts[0].to_numpy()
    atac.var["chromStart"] = peak_parts[1].astype(int).to_numpy()
    atac.var["chromEnd"] = peak_parts[2].astype(int).to_numpy()

    # Construct guidance graph
    guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)

    # Verify graph compliance
    scglue.graph.check_graph(guidance, [rna, atac])

    # Save guidance graph
    graph_output = OUTPUT_DIR / f"{out_prefix}_graph_{run_stamp}.graphml.gz"
    nx.write_graphml(guidance, str(graph_output))

    # Save updated data with coordinates
    rna_output = OUTPUT_DIR / f"{out_prefix}_rna_annotated_{run_stamp}.h5ad"
    atac_output = OUTPUT_DIR / f"{out_prefix}_atac_annotated_{run_stamp}.h5ad"
    rna.write(str(rna_output), compression="gzip")
    atac.write(str(atac_output), compression="gzip")

    return {
        "message": f"Constructed guidance graph with {guidance.number_of_nodes()} nodes and {guidance.number_of_edges()} edges",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {"description": "Guidance graph", "path": str(graph_output.resolve())},
            {
                "description": "RNA data with coordinates",
                "path": str(rna_output.resolve()),
            },
            {
                "description": "ATAC data with coordinates",
                "path": str(atac_output.resolve()),
            },
        ],
    }