dmannk committed on
Commit
d048db9
·
verified ·
1 Parent(s): 70df03a

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

WORKDIR /app

# System packages. requirements.txt notes that pybedtools needs the bedtools
# binary; build-essential and git are presumably there so pip can build or
# fetch packages that ship no prebuilt wheel.
RUN apt-get update && apt-get install -y --no-install-recommends \
    bedtools \
    build-essential \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the sources so this layer is
# reused when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY src/GLUE_Agent_mcp.py .
COPY src/tools/ tools/

# I/O locations read by tools/preprocessing.py and tools/training.py.
ENV PREPROCESSING_INPUT_DIR=/data/inputs
ENV PREPROCESSING_OUTPUT_DIR=/data/outputs
ENV TRAINING_INPUT_DIR=/data/inputs
ENV TRAINING_OUTPUT_DIR=/data/outputs

# Hugging Face Docker Spaces run the container as UID 1000, not root.
# Directories created by root here would be read-only for the server process,
# so every tool that writes to /data/outputs would fail. Create that user,
# give it a real home directory (used for library caches), and hand it
# ownership of the data and application directories.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /data/inputs /data/outputs \
    && chown -R user:user /data /app

USER user
ENV HOME=/home/user

EXPOSE 7860

CMD ["uvicorn", "GLUE_Agent_mcp:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,40 @@
1
  ---
2
  title: GLUE Agent MCP
3
- emoji:
4
- colorFrom: yellow
5
- colorTo: indigo
6
  sdk: docker
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: GLUE Agent MCP
3
+ emoji: 🧬
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  ---
10
 
11
+ # GLUE Agent MCP Server
12
+
13
+ Remote MCP (Model Context Protocol) server for **GLUE** (Graph-Linked Unified Embedding) multi-omics data integration.
14
+
15
+ ## Tools
16
+
17
+ ### Preprocessing
18
+ - **glue_preprocess_scrna** — Preprocess scRNA-seq data with HVG selection, normalization, and PCA
19
+ - **glue_preprocess_scatac** — Preprocess scATAC-seq data with LSI dimension reduction
20
+ - **glue_construct_regulatory_graph** — Construct prior regulatory graph linking RNA and ATAC features
21
+
22
+ ### Training
23
+ - **glue_configure_datasets** — Configure RNA-seq and ATAC-seq datasets for GLUE model training
24
+ - **glue_train_model** — Train GLUE model for multi-omics integration
25
+ - **glue_check_integration_consistency** — Evaluate integration quality with consistency scores
26
+ - **glue_generate_embeddings** — Generate cell and feature embeddings from trained GLUE model
27
+
28
+ ## Usage
29
+
30
+ Add to your MCP client config:
31
+
32
+ ```json
33
+ {
34
+ "mcpServers": {
35
+ "GLUE_Agent": {
36
+ "url": "https://dmannk-glue-agent-mcp.hf.space/mcp"
37
+ }
38
+ }
39
+ }
40
+ ```
requirements.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MCP server & HTTP transport
2
+ fastmcp==2.14.5
3
+ uvicorn==0.40.0
4
+ fastapi
5
+ starlette==0.52.1
6
+
7
+ # Bioinformatics core
8
+ anndata==0.11.4
9
+ scanpy==1.11.5
10
+ scglue==0.4.0
11
+
12
+ # Graph / numerics
13
+ networkx==3.4.2
14
+ numpy==2.2.6
15
+ pandas==2.3.3
16
+ scipy==1.15.3
17
+ scikit-learn==1.7.2
18
+
19
+ # Plotting
20
+ matplotlib==3.10.8
21
+ seaborn==0.13.2
22
+
23
+ # scglue deep-learning backend
24
+ torch==2.10.0
25
+ pyro-ppl==1.9.1
26
+
27
+ # scglue genomics (requires bedtools system package)
28
+ pybedtools==0.12.0
29
+
30
+ # Utilities
31
+ tqdm==4.67.3
32
+ dill==0.4.1
src/GLUE_Agent_mcp.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model Context Protocol (MCP) for GLUE_Agent
3
+
4
+ GLUE_Agent provides comprehensive multi-omics data integration tools for single-cell RNA-seq and ATAC-seq analysis. This framework enables preprocessing, model training, and visualization of integrated multi-modal datasets.
5
+
6
+ This MCP Server contains tools extracted from the following tutorial files:
7
+ 1. preprocessing
8
+ - glue_preprocess_scrna: Preprocess scRNA-seq data with HVG selection, normalization, and PCA
9
+ - glue_preprocess_scatac: Preprocess scATAC-seq data with LSI dimension reduction
10
+ - glue_construct_regulatory_graph: Construct prior regulatory graph linking RNA and ATAC features
11
+ 2. training
12
+ - glue_configure_datasets: Configure RNA-seq and ATAC-seq datasets for GLUE model training
13
+ - glue_train_model: Train GLUE model for multi-omics integration
14
+ - glue_check_integration_consistency: Evaluate integration quality with consistency scores
15
+ - glue_generate_embeddings: Generate cell and feature embeddings from trained GLUE model
16
+ """
17
+
18
+ import os
19
+
20
+ from fastmcp import FastMCP
21
+
22
+ # Import statements (alphabetical order)
23
+ from tools.preprocessing import preprocessing_mcp
24
+ from tools.training import training_mcp
25
+
26
# Aggregate server: each tool module exposes its own FastMCP instance, and
# both are mounted onto a single top-level server.
mcp = FastMCP(name="GLUE_Agent")
mcp.mount(preprocessing_mcp)
mcp.mount(training_mcp)

# ASGI application object served by uvicorn in the remote (HTTP) deployment.
# stateless_http=True avoids the StreamableHTTPSessionManager task group
# initialization issue that causes 500 errors on HuggingFace Spaces.
app = mcp.http_app(path="/mcp", stateless_http=True)

if __name__ == "__main__":
    # Direct execution: serve over HTTP on $PORT, falling back to 7860.
    listen_port = int(os.getenv("PORT", 7860))
    mcp.run(transport="http", host="0.0.0.0", port=listen_port, path="/mcp")
src/tools/__init__.py ADDED
File without changes
src/tools/preprocessing.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GLUE preprocessing tutorial for scRNA-seq and scATAC-seq data integration.
3
+
4
+ This MCP Server provides 3 tools:
5
+ 1. glue_preprocess_scrna: Preprocess scRNA-seq data with HVG selection, normalization, and PCA
6
+ 2. glue_preprocess_scatac: Preprocess scATAC-seq data with LSI dimension reduction
7
+ 3. glue_construct_regulatory_graph: Construct prior regulatory graph linking RNA and ATAC features
8
+
9
+ All tools extracted from `gao-lab/GLUE/blob/master/docs/preprocessing.ipynb`.
10
+ """
11
+
12
+ import os
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+ # Standard imports
16
+ from typing import Annotated, Any, Literal
17
+
18
+ import anndata as ad
19
+ # Domain-specific imports
20
+ import matplotlib.pyplot as plt
21
+ import networkx as nx
22
+ import numpy as np
23
+ import pandas as pd
24
+ import scanpy as sc
25
+ import scglue
26
+ from fastmcp import FastMCP
27
+ from matplotlib import rcParams
28
+
29
# Filesystem layout: defaults live under <project root>/tmp and can be
# overridden with the PREPROCESSING_*_DIR environment variables.
PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve()
DEFAULT_INPUT_DIR = PROJECT_ROOT / "tmp" / "inputs"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "tmp" / "outputs"

INPUT_DIR = Path(os.getenv("PREPROCESSING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.getenv("PREPROCESSING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Create both directories up front so the tools can write without checking.
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Filename suffix for outputs. NOTE: evaluated once, when this module is
# imported, so it is the same for every call made by this process.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# Plot defaults. The figure size is assigned after scglue's publication
# preset, matching the original statement order.
plt.rcParams.update({"figure.dpi": 300, "savefig.dpi": 300})
scglue.plot.set_publication_params()
rcParams["figure.figsize"] = (4, 4)

# FastMCP instance that the tool functions in this module register with.
preprocessing_mcp = FastMCP(name="preprocessing")
52
+
53
+
54
@preprocessing_mcp.tool
def glue_preprocess_scrna(
    rna_path: Annotated[
        str | None, "Path to scRNA-seq data file in h5ad format"
    ] = None,
    n_top_genes: Annotated[int, "Number of highly variable genes to select"] = 2000,
    flavor: Annotated[
        Literal["seurat", "cell_ranger", "seurat_v3"], "Method for HVG selection"
    ] = "seurat_v3",
    n_comps: Annotated[int, "Number of principal components"] = 100,
    svd_solver: Annotated[
        Literal["auto", "arpack", "randomized"], "SVD solver for PCA"
    ] = "auto",
    color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Preprocess scRNA-seq data with highly variable gene selection, normalization, scaling, and PCA.
    Input is scRNA-seq data in h5ad format and output is preprocessed data with PCA embedding and UMAP visualization.

    Returns a dict with "message", "reference" and "artifacts" (absolute paths of
    the written h5ad file and UMAP PNG). Raises ValueError when rna_path is not
    given and FileNotFoundError when it does not exist.
    """
    # Input validation
    if rna_path is None:
        raise ValueError("Path to scRNA-seq data file must be provided")
    if not Path(rna_path).exists():
        raise FileNotFoundError(f"RNA data file not found: {rna_path}")

    if out_prefix is None:
        out_prefix = "glue_rna"

    # Stamp outputs per call. The module-level timestamp is fixed at import
    # time, so in a long-running server every call would reuse it and
    # overwrite the previous call's files.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    rna = ad.read_h5ad(rna_path)

    # Keep an untouched copy of X in the "counts" layer before X is
    # normalized, log-transformed and scaled in place below.
    rna.layers["counts"] = rna.X.copy()

    sc.pp.highly_variable_genes(rna, n_top_genes=n_top_genes, flavor=flavor)

    sc.pp.normalize_total(rna)
    sc.pp.log1p(rna)
    sc.pp.scale(rna)

    sc.tl.pca(rna, n_comps=n_comps, svd_solver=svd_solver)

    # Neighbour graph and UMAP, used here only for the diagnostic plot.
    sc.pp.neighbors(rna, metric="cosine")
    sc.tl.umap(rna)

    # show=False leaves the figure open so it can be saved through pyplot.
    fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{run_stamp}.png"
    sc.pl.umap(rna, color=color_var, show=False)
    plt.savefig(fig_output, dpi=300, bbox_inches="tight")
    plt.close()

    rna_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{run_stamp}.h5ad"
    rna.write(str(rna_output), compression="gzip")

    return {
        "message": f"Preprocessed RNA data: {n_top_genes} HVGs, {n_comps} PCs, UMAP generated",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {"description": "Preprocessed RNA data", "path": str(rna_output.resolve())},
            {
                "description": "RNA UMAP visualization",
                "path": str(fig_output.resolve()),
            },
        ],
    }
129
+
130
+
131
@preprocessing_mcp.tool
def glue_preprocess_scatac(
    atac_path: Annotated[
        str | None, "Path to scATAC-seq data file in h5ad format"
    ] = None,
    n_components: Annotated[int, "Number of LSI components"] = 100,
    n_iter: Annotated[int, "Number of iterations for randomized SVD in LSI"] = 15,
    color_var: Annotated[str, "Variable name for UMAP coloring"] = "cell_type",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Preprocess scATAC-seq data with latent semantic indexing (LSI) dimension reduction.
    Input is scATAC-seq data in h5ad format and output is preprocessed data with LSI embedding and UMAP visualization.

    Returns a dict with "message", "reference" and "artifacts" (absolute paths of
    the written h5ad file and UMAP PNG). Raises ValueError when atac_path is not
    given and FileNotFoundError when it does not exist.
    """
    # Input validation
    if atac_path is None:
        raise ValueError("Path to scATAC-seq data file must be provided")
    if not Path(atac_path).exists():
        raise FileNotFoundError(f"ATAC data file not found: {atac_path}")

    if out_prefix is None:
        out_prefix = "glue_atac"

    # Stamp outputs per call. The module-level timestamp is fixed at import
    # time, so in a long-running server every call would reuse it and
    # overwrite the previous call's files.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    atac = ad.read_h5ad(atac_path)

    # LSI embedding; the neighbour graph below reads it from "X_lsi".
    scglue.data.lsi(atac, n_components=n_components, n_iter=n_iter)

    # Neighbour graph and UMAP, used here only for the diagnostic plot.
    sc.pp.neighbors(atac, use_rep="X_lsi", metric="cosine")
    sc.tl.umap(atac)

    # show=False leaves the figure open so it can be saved through pyplot.
    fig_output = OUTPUT_DIR / f"{out_prefix}_umap_{run_stamp}.png"
    sc.pl.umap(atac, color=color_var, show=False)
    plt.savefig(fig_output, dpi=300, bbox_inches="tight")
    plt.close()

    atac_output = OUTPUT_DIR / f"{out_prefix}_preprocessed_{run_stamp}.h5ad"
    atac.write(str(atac_output), compression="gzip")

    return {
        "message": f"Preprocessed ATAC data: {n_components} LSI components, UMAP generated",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {
                "description": "Preprocessed ATAC data",
                "path": str(atac_output.resolve()),
            },
            {
                "description": "ATAC UMAP visualization",
                "path": str(fig_output.resolve()),
            },
        ],
    }
192
+
193
+
194
@preprocessing_mcp.tool
def glue_construct_regulatory_graph(
    rna_path: Annotated[
        str | None, "Path to preprocessed scRNA-seq data file in h5ad format"
    ] = None,
    atac_path: Annotated[
        str | None, "Path to preprocessed scATAC-seq data file in h5ad format"
    ] = None,
    gtf_path: Annotated[
        str | None, "Path to GTF annotation file for gene coordinates"
    ] = None,
    gtf_by: Annotated[str, "GTF attribute to match gene names"] = "gene_name",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Construct prior regulatory graph linking RNA genes and ATAC peaks via genomic proximity.
    Input is preprocessed RNA and ATAC data with GTF annotation and output is NetworkX guidance graph.

    Returns a dict with "message", "reference" and "artifacts" (absolute paths of
    the guidance graph and the annotated RNA/ATAC h5ad files). Raises ValueError
    when a path is not given and FileNotFoundError when a path does not exist.
    """
    # Input validation
    if rna_path is None:
        raise ValueError("Path to preprocessed scRNA-seq data file must be provided")
    if atac_path is None:
        raise ValueError("Path to preprocessed scATAC-seq data file must be provided")
    if gtf_path is None:
        raise ValueError("Path to GTF annotation file must be provided")

    # File existence validation
    if not Path(rna_path).exists():
        raise FileNotFoundError(f"RNA data file not found: {rna_path}")
    if not Path(atac_path).exists():
        raise FileNotFoundError(f"ATAC data file not found: {atac_path}")
    if not Path(gtf_path).exists():
        raise FileNotFoundError(f"GTF annotation file not found: {gtf_path}")

    if out_prefix is None:
        out_prefix = "glue_guidance"

    # Stamp outputs per call. The module-level timestamp is fixed at import
    # time, so in a long-running server every call would reuse it and
    # overwrite the previous call's files.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    rna = ad.read_h5ad(rna_path)
    atac = ad.read_h5ad(atac_path)

    # Annotate genes from the GTF file (modifies `rna` in place).
    scglue.data.get_gene_annotation(rna, gtf=gtf_path, gtf_by=gtf_by)

    # Derive peak coordinates from the peak names by splitting on ":" and "-"
    # and taking the first three fields as chrom, start, end.
    # NOTE(review): assumes var_names look like "chr1:100-200" - confirm
    # against the input data; other layouts fail or mis-parse here.
    split = atac.var_names.str.split(r"[:-]")
    atac.var["chrom"] = split.map(lambda x: x[0])
    atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int)
    atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int)

    guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)

    # Verify the graph against both datasets before anything is written.
    scglue.graph.check_graph(guidance, [rna, atac])

    graph_output = OUTPUT_DIR / f"{out_prefix}_graph_{run_stamp}.graphml.gz"
    nx.write_graphml(guidance, str(graph_output))

    # Persist both datasets with the annotations added above.
    rna_output = OUTPUT_DIR / f"{out_prefix}_rna_annotated_{run_stamp}.h5ad"
    atac_output = OUTPUT_DIR / f"{out_prefix}_atac_annotated_{run_stamp}.h5ad"
    rna.write(str(rna_output), compression="gzip")
    atac.write(str(atac_output), compression="gzip")

    return {
        "message": f"Constructed guidance graph with {guidance.number_of_nodes()} nodes and {guidance.number_of_edges()} edges",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/preprocessing.ipynb",
        "artifacts": [
            {"description": "Guidance graph", "path": str(graph_output.resolve())},
            {
                "description": "RNA data with coordinates",
                "path": str(rna_output.resolve()),
            },
            {
                "description": "ATAC data with coordinates",
                "path": str(atac_output.resolve()),
            },
        ],
    }
src/tools/training.py ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GLUE model training workflow for multi-omics data integration.
3
+
4
+ This MCP Server provides 4 tools:
5
+ 1. glue_configure_datasets: Configure RNA-seq and ATAC-seq datasets for GLUE model training
6
+ 2. glue_train_model: Train GLUE model for multi-omics integration
7
+ 3. glue_check_integration_consistency: Evaluate integration quality with consistency scores
8
+ 4. glue_generate_embeddings: Generate cell and feature embeddings from trained GLUE model
9
+
10
+ All tools extracted from `gao-lab/GLUE/docs/training.ipynb`.
11
+ """
12
+
13
+ import os
14
+ from datetime import datetime
15
+ from itertools import chain
16
+ from pathlib import Path
17
+ # Standard imports
18
+ from typing import Annotated, Any, Literal
19
+
20
+ # Domain-specific imports
21
+ import anndata as ad
22
+ import matplotlib.pyplot as plt
23
+ import networkx as nx
24
+ import numpy as np
25
+ import pandas as pd
26
+ import scanpy as sc
27
+ import scglue
28
+ import seaborn as sns
29
+ from fastmcp import FastMCP
30
+ from matplotlib import rcParams
31
+
32
# Filesystem layout: defaults live under <project root>/tmp and can be
# overridden with the TRAINING_*_DIR environment variables.
PROJECT_ROOT = Path(__file__).parent.parent.parent.resolve()
DEFAULT_INPUT_DIR = PROJECT_ROOT / "tmp" / "inputs"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "tmp" / "outputs"

INPUT_DIR = Path(os.getenv("TRAINING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.getenv("TRAINING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Create both directories up front so the tools can write without checking.
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Filename suffix for outputs. NOTE: evaluated once, when this module is
# imported, so it is the same for every call made by this process.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# FastMCP instance that the tool functions in this module register with.
training_mcp = FastMCP(name="training")

# Plot defaults. The figure size is assigned after scglue's publication
# preset, matching the original statement order.
plt.rcParams.update({"figure.dpi": 300, "savefig.dpi": 300})
scglue.plot.set_publication_params()
rcParams["figure.figsize"] = (4, 4)
55
+
56
+
57
@training_mcp.tool
def glue_configure_datasets(
    # Primary data inputs
    rna_path: Annotated[
        str | None, "Path to preprocessed RNA-seq data file with extension .h5ad"
    ] = None,
    atac_path: Annotated[
        str | None, "Path to preprocessed ATAC-seq data file with extension .h5ad"
    ] = None,
    guidance_path: Annotated[
        str | None, "Path to guidance graph file with extension .graphml.gz"
    ] = None,
    # Configuration parameters with tutorial defaults
    prob_model: Annotated[
        Literal["NB", "ZINB", "ZIP"], "Probabilistic generative model"
    ] = "NB",
    use_highly_variable: Annotated[bool, "Use only highly variable features"] = True,
    rna_use_layer: Annotated[
        str | None, "RNA data layer to use (None uses .X)"
    ] = "counts",
    rna_use_rep: Annotated[str, "RNA preprocessing embedding to use"] = "X_pca",
    atac_use_rep: Annotated[str, "ATAC preprocessing embedding to use"] = "X_lsi",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Configure RNA-seq and ATAC-seq datasets for GLUE model training.
    Input is preprocessed RNA/ATAC h5ad files and guidance graph, output is configured h5ad files and HVF-filtered guidance graph.

    Returns a dict with "message", "reference" and "artifacts" (absolute paths of
    the two configured h5ad files and the filtered graph). Raises ValueError when
    a path is not given and FileNotFoundError when a path does not exist.
    """
    # Input file validation
    if rna_path is None:
        raise ValueError("Path to RNA-seq data file must be provided")
    if atac_path is None:
        raise ValueError("Path to ATAC-seq data file must be provided")
    if guidance_path is None:
        raise ValueError("Path to guidance graph file must be provided")

    # File existence validation
    if not Path(rna_path).exists():
        raise FileNotFoundError(f"RNA-seq file not found: {rna_path}")
    if not Path(atac_path).exists():
        raise FileNotFoundError(f"ATAC-seq file not found: {atac_path}")
    if not Path(guidance_path).exists():
        raise FileNotFoundError(f"Guidance graph file not found: {guidance_path}")

    rna = ad.read_h5ad(rna_path)
    atac = ad.read_h5ad(atac_path)
    guidance = nx.read_graphml(guidance_path)

    scglue.models.configure_dataset(
        rna,
        prob_model,
        use_highly_variable=use_highly_variable,
        use_layer=rna_use_layer,
        use_rep=rna_use_rep,
    )
    scglue.models.configure_dataset(
        atac, prob_model, use_highly_variable=use_highly_variable, use_rep=atac_use_rep
    )

    # Restrict the guidance graph to features flagged "highly_variable" in
    # either modality. The selections are computed once and reused for the
    # summary message below.
    rna_hvf = rna.var.query("highly_variable").index
    atac_hvf = atac.var.query("highly_variable").index
    guidance_hvf = guidance.subgraph(chain(rna_hvf, atac_hvf)).copy()

    # Note: anndata drops None values during save/load, but scglue's
    # configure_dataset creates these fields. Replace None with a string
    # marker so the keys survive serialization; the downstream tools turn
    # the marker back into None after loading.
    for adata in (rna, atac):
        if "__scglue__" not in adata.uns:
            continue
        config = adata.uns["__scglue__"]
        for key in (
            "batches",
            "use_batch",
            "use_cell_type",
            "cell_types",
            "use_dsc_weight",
            "use_layer",
        ):
            if key in config and config[key] is None:
                config[key] = "__none__"

    if out_prefix is None:
        # Stamp the default prefix per call. The module-level timestamp is
        # fixed at import time, so in a long-running server every call would
        # reuse it and overwrite the previous call's files.
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"glue_configured_{run_stamp}"

    rna_output = OUTPUT_DIR / f"{out_prefix}_rna_configured.h5ad"
    atac_output = OUTPUT_DIR / f"{out_prefix}_atac_configured.h5ad"
    guidance_hvf_output = OUTPUT_DIR / f"{out_prefix}_guidance_hvf.graphml.gz"

    rna.write(str(rna_output), compression="gzip")
    atac.write(str(atac_output), compression="gzip")
    nx.write_graphml(guidance_hvf, str(guidance_hvf_output))

    # Return standardized format
    return {
        "message": f"Configured datasets with {len(rna_hvf)} RNA and {len(atac_hvf)} ATAC HVFs",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/training.ipynb",
        "artifacts": [
            {
                "description": "Configured RNA-seq data",
                "path": str(rna_output.resolve()),
            },
            {
                "description": "Configured ATAC-seq data",
                "path": str(atac_output.resolve()),
            },
            {
                "description": "HVF-filtered guidance graph",
                "path": str(guidance_hvf_output.resolve()),
            },
        ],
    }
180
+
181
+
182
@training_mcp.tool
def glue_train_model(
    # Primary data inputs
    rna_path: Annotated[
        str | None, "Path to configured RNA-seq data file with extension .h5ad"
    ] = None,
    atac_path: Annotated[
        str | None, "Path to configured ATAC-seq data file with extension .h5ad"
    ] = None,
    guidance_hvf_path: Annotated[
        str | None,
        "Path to HVF-filtered guidance graph file with extension .graphml.gz",
    ] = None,
    # Training parameters
    training_dir: Annotated[
        str | None, "Directory to store model snapshots and training logs"
    ] = None,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Train GLUE model for multi-omics integration.
    Input is configured RNA/ATAC h5ad files and HVF guidance graph, output is trained GLUE model.

    Returns a dict with "message", "reference" and "artifacts" (absolute paths of
    the saved model file and the training directory). Raises ValueError when a
    path is not given and FileNotFoundError when a path does not exist.
    """
    # Input file validation
    if rna_path is None:
        raise ValueError("Path to configured RNA-seq data file must be provided")
    if atac_path is None:
        raise ValueError("Path to configured ATAC-seq data file must be provided")
    if guidance_hvf_path is None:
        raise ValueError("Path to HVF-filtered guidance graph file must be provided")

    # File existence validation
    if not Path(rna_path).exists():
        raise FileNotFoundError(f"RNA-seq file not found: {rna_path}")
    if not Path(atac_path).exists():
        raise FileNotFoundError(f"ATAC-seq file not found: {atac_path}")
    if not Path(guidance_hvf_path).exists():
        raise FileNotFoundError(
            f"Guidance HVF graph file not found: {guidance_hvf_path}"
        )

    rna = ad.read_h5ad(rna_path)
    atac = ad.read_h5ad(atac_path)
    guidance_hvf = nx.read_graphml(guidance_hvf_path)

    # Convert string markers back to None for scglue compatibility.
    # The isinstance check matters: a non-string value such as an array would
    # make `value == "__none__"` an element-wise comparison whose truth value
    # is ambiguous and raises inside `if`.
    for adata in (rna, atac):
        if "__scglue__" not in adata.uns:
            continue
        config = adata.uns["__scglue__"]
        for key in (
            "batches",
            "use_batch",
            "use_cell_type",
            "cell_types",
            "use_dsc_weight",
            "use_layer",
        ):
            value = config.get(key)
            if isinstance(value, str) and value == "__none__":
                config[key] = None

    if out_prefix is None:
        # Stamp the default prefix per call. The module-level timestamp is
        # fixed at import time, so in a long-running server every call would
        # reuse it and overwrite the previous call's model.
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"glue_model_{run_stamp}"

    # Default the training directory to a folder next to the model output.
    if training_dir is None:
        training_dir = str(OUTPUT_DIR / f"{out_prefix}_training")
    Path(training_dir).mkdir(parents=True, exist_ok=True)

    glue = scglue.models.fit_SCGLUE(
        {"rna": rna, "atac": atac}, guidance_hvf, fit_kws={"directory": training_dir}
    )

    model_output = OUTPUT_DIR / f"{out_prefix}.dill"
    glue.save(str(model_output))

    # Return standardized format
    return {
        "message": "GLUE model training completed successfully",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/training.ipynb",
        "artifacts": [
            {"description": "Trained GLUE model", "path": str(model_output.resolve())},
            {
                "description": "Training logs directory",
                "path": str(Path(training_dir).resolve()),
            },
        ],
    }
281
+
282
+
283
@training_mcp.tool
def glue_check_integration_consistency(
    # Primary data inputs
    model_path: Annotated[
        str | None, "Path to trained GLUE model file with extension .dill"
    ] = None,
    rna_path: Annotated[
        str | None, "Path to configured RNA-seq data file with extension .h5ad"
    ] = None,
    atac_path: Annotated[
        str | None, "Path to configured ATAC-seq data file with extension .h5ad"
    ] = None,
    guidance_hvf_path: Annotated[
        str | None,
        "Path to HVF-filtered guidance graph file with extension .graphml.gz",
    ] = None,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Evaluate integration quality with consistency scores across metacell granularities.
    Input is trained model, RNA/ATAC data, and HVF guidance graph, output is consistency scores table and plot.

    Returns a dict with "message", "reference" and "artifacts" (absolute paths of
    the scores CSV and the plot PNG). Raises ValueError when a path is not given
    and FileNotFoundError when a path does not exist.
    """
    # Input file validation
    if model_path is None:
        raise ValueError("Path to trained GLUE model file must be provided")
    if rna_path is None:
        raise ValueError("Path to configured RNA-seq data file must be provided")
    if atac_path is None:
        raise ValueError("Path to configured ATAC-seq data file must be provided")
    if guidance_hvf_path is None:
        raise ValueError("Path to HVF-filtered guidance graph file must be provided")

    # File existence validation
    if not Path(model_path).exists():
        raise FileNotFoundError(f"Model file not found: {model_path}")
    if not Path(rna_path).exists():
        raise FileNotFoundError(f"RNA-seq file not found: {rna_path}")
    if not Path(atac_path).exists():
        raise FileNotFoundError(f"ATAC-seq file not found: {atac_path}")
    if not Path(guidance_hvf_path).exists():
        raise FileNotFoundError(
            f"Guidance HVF graph file not found: {guidance_hvf_path}"
        )

    glue = scglue.models.load_model(model_path)
    rna = ad.read_h5ad(rna_path)
    atac = ad.read_h5ad(atac_path)
    guidance_hvf = nx.read_graphml(guidance_hvf_path)

    # Convert string markers back to None for scglue compatibility.
    # The isinstance check matters: a non-string value such as an array would
    # make `value == "__none__"` an element-wise comparison whose truth value
    # is ambiguous and raises inside `if`.
    for adata in (rna, atac):
        if "__scglue__" not in adata.uns:
            continue
        config = adata.uns["__scglue__"]
        for key in (
            "batches",
            "use_batch",
            "use_cell_type",
            "cell_types",
            "use_dsc_weight",
            "use_layer",
        ):
            value = config.get(key)
            if isinstance(value, str) and value == "__none__":
                config[key] = None

    # dx is a table with (at least) "n_meta" and "consistency" columns, which
    # are the two columns plotted and summarized below.
    dx = scglue.models.integration_consistency(
        glue, {"rna": rna, "atac": atac}, guidance_hvf
    )

    if out_prefix is None:
        # Stamp the default prefix per call. The module-level timestamp is
        # fixed at import time, so in a long-running server every call would
        # reuse it and overwrite the previous call's files.
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_prefix = f"glue_consistency_{run_stamp}"

    consistency_table = OUTPUT_DIR / f"{out_prefix}_scores.csv"
    dx.to_csv(str(consistency_table), index=False)

    # Draw on an explicit figure/axes pair rather than pyplot's implicit
    # "current figure", so the figure that is saved and closed is exactly the
    # one created here.
    fig, ax = plt.subplots(figsize=(4, 4))
    sns.lineplot(x="n_meta", y="consistency", data=dx, ax=ax)
    ax.axhline(y=0.05, c="darkred", ls="--")  # dashed reference line at 0.05
    ax.set_xlabel("Number of metacells")
    ax.set_ylabel("Consistency score")
    fig.tight_layout()

    consistency_plot = OUTPUT_DIR / f"{out_prefix}_plot.png"
    fig.savefig(str(consistency_plot), dpi=300, bbox_inches="tight")
    plt.close(fig)

    # Return standardized format
    return {
        "message": f"Integration consistency computed (range: {dx['consistency'].min():.3f}-{dx['consistency'].max():.3f})",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/training.ipynb",
        "artifacts": [
            {
                "description": "Consistency scores table",
                "path": str(consistency_table.resolve()),
            },
            {
                "description": "Consistency plot",
                "path": str(consistency_plot.resolve()),
            },
        ],
    }
394
+
395
+
396
@training_mcp.tool
def glue_generate_embeddings(
    # Primary data inputs
    model_path: Annotated[
        str | None, "Path to trained GLUE model file with extension .dill"
    ] = None,
    rna_path: Annotated[
        str | None, "Path to configured RNA-seq data file with extension .h5ad"
    ] = None,
    atac_path: Annotated[
        str | None, "Path to configured ATAC-seq data file with extension .h5ad"
    ] = None,
    guidance_hvf_path: Annotated[
        str | None,
        "Path to HVF-filtered guidance graph file with extension .graphml.gz",
    ] = None,
    # Visualization parameters with tutorial defaults
    # NOTE: the list default is kept for interface compatibility; it is only
    # read (never mutated) inside this function.
    color_vars: Annotated[list, "Variables to color UMAP by"] = ["cell_type", "domain"],
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Generate cell and feature embeddings from trained GLUE model and visualize alignment.
    Input is trained model and RNA/ATAC data, output is h5ad files with embeddings and UMAP visualization.

    Returns:
        A dict with a human-readable "message", a "reference" URL, and an
        "artifacts" list of {"description", "path"} entries for the RNA h5ad,
        the ATAC h5ad, the guidance graph and the UMAP image.

    Raises:
        ValueError: If any of the four input paths is None.
        FileNotFoundError: If any of the four input paths does not exist.
    """
    # (value, message when missing, label used in the not-found message)
    required_inputs = (
        (
            model_path,
            "Path to trained GLUE model file must be provided",
            "Model file not found",
        ),
        (
            rna_path,
            "Path to configured RNA-seq data file must be provided",
            "RNA-seq file not found",
        ),
        (
            atac_path,
            "Path to configured ATAC-seq data file must be provided",
            "ATAC-seq file not found",
        ),
        (
            guidance_hvf_path,
            "Path to HVF-filtered guidance graph file must be provided",
            "Guidance HVF graph file not found",
        ),
    )

    # Input file validation: report every missing argument before touching the
    # filesystem, so the error order matches the original two-pass checks.
    for path, missing_message, _ in required_inputs:
        if path is None:
            raise ValueError(missing_message)

    # File existence validation
    for path, _, not_found_label in required_inputs:
        if not Path(path).exists():
            raise FileNotFoundError(f"{not_found_label}: {path}")

    # Load data
    glue = scglue.models.load_model(model_path)
    rna = ad.read_h5ad(rna_path)
    atac = ad.read_h5ad(atac_path)
    guidance_hvf = nx.read_graphml(guidance_hvf_path)

    # Convert string markers back to None for scglue compatibility
    none_marker_keys = (
        "batches",
        "use_batch",
        "use_cell_type",
        "cell_types",
        "use_dsc_weight",
        "use_layer",
    )
    for adata in (rna, atac):
        if "__scglue__" not in adata.uns:
            continue
        config = adata.uns["__scglue__"]
        for key in none_marker_keys:
            if key not in config:
                continue
            value = config[key]
            # Only a plain string can be the marker. Comparing an array-valued
            # entry with `==` would produce an array whose truth value is
            # ambiguous and raise ValueError for multi-element arrays.
            if isinstance(value, str) and value == "__none__":
                config[key] = None

    # Generate cell embeddings
    rna.obsm["X_glue"] = glue.encode_data("rna", rna)
    atac.obsm["X_glue"] = glue.encode_data("atac", atac)

    # Generate feature embeddings, indexed by the model's vertices so they can
    # be aligned to each modality's feature names below
    feature_embeddings = pd.DataFrame(
        glue.encode_graph(guidance_hvf), index=glue.vertices
    )
    rna.varm["X_glue"] = feature_embeddings.reindex(rna.var_names).to_numpy()
    atac.varm["X_glue"] = feature_embeddings.reindex(atac.var_names).to_numpy()

    # Create combined dataset for visualization
    combined = ad.concat([rna, atac])

    # Compute the UMAP layout on the shared embedding
    sc.pp.neighbors(combined, use_rep="X_glue", metric="cosine")
    sc.tl.umap(combined)

    if out_prefix is None:
        out_prefix = f"glue_embeddings_{timestamp}"
    umap_plot = OUTPUT_DIR / f"{out_prefix}_umap.png"

    # Draw and save the UMAP plot. show=False keeps the figure open and current
    # so that plt.savefig writes the actual plot regardless of the matplotlib
    # backend; the finally clause releases the figure even if drawing or
    # saving fails.
    try:
        sc.pl.umap(combined, color=color_vars, wspace=0.65, show=False)
        plt.savefig(str(umap_plot), dpi=300, bbox_inches="tight")
    finally:
        plt.close()

    # Save h5ad files with embeddings
    rna_output = OUTPUT_DIR / f"{out_prefix}_rna_emb.h5ad"
    atac_output = OUTPUT_DIR / f"{out_prefix}_atac_emb.h5ad"
    guidance_hvf_output = OUTPUT_DIR / f"{out_prefix}_guidance_hvf.graphml.gz"

    rna.write(str(rna_output), compression="gzip")
    atac.write(str(atac_output), compression="gzip")
    nx.write_graphml(guidance_hvf, str(guidance_hvf_output))

    # Return standardized format
    return {
        "message": f"Generated embeddings for {rna.n_obs} RNA and {atac.n_obs} ATAC cells",
        "reference": "https://github.com/gao-lab/GLUE/blob/master/docs/training.ipynb",
        "artifacts": [
            {
                "description": "RNA data with embeddings",
                "path": str(rna_output.resolve()),
            },
            {
                "description": "ATAC data with embeddings",
                "path": str(atac_output.resolve()),
            },
            {
                "description": "HVF guidance graph",
                "path": str(guidance_hvf_output.resolve()),
            },
            {"description": "UMAP visualization", "path": str(umap_plot.resolve())},
        ],
    }