File size: 8,995 Bytes
0cdac39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
"""
Batch variant scoring tutorial demonstrating how to score multiple genetic variants using AlphaGenome.

This MCP server provides two tools:
1. score_variants_batch: Score multiple genetic variants in batch using configurable variant scorers
2. filter_variant_scores: Filter variant scores by ontology criteria (e.g., cell types)

All tools extracted from `/batch_variant_scoring.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime
from io import StringIO

# AlphaGenome imports
from alphagenome import colab_utils
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from tqdm import tqdm

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

# Default input/output locations beneath the persistent base directory.
DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Environment variables let deployments override the default directories.
INPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist (created at import time, idempotent)
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp captured once at import; all outputs from this process share it.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Fetch your secret — NOTE: raises KeyError at import time if the variable is unset.
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]

# MCP server instance
batch_variant_scoring_mcp = FastMCP(name="batch_variant_scoring")

@batch_variant_scoring_mcp.tool
def score_variants_batch(
    # Primary data inputs
    vcf_path: Annotated[str | None, "Path to VCF file with extension .vcf or .tsv. The header of the file should include the following columns: variant_id, CHROM, POS, REF, ALT"] = None,
    # Analysis parameters with tutorial defaults
    api_key: Annotated[str, "AlphaGenome API key for authentication"] = ALPHAGENOME_API_KEY,
    organism: Annotated[Literal["human", "mouse"], "Organism for variant scoring"] = "human",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variants to predict"] = "1MB",
    score_rna_seq: Annotated[bool, "Score RNA-seq effects"] = True,
    score_cage: Annotated[bool, "Score CAGE effects"] = True,
    score_procap: Annotated[bool, "Score ProCAP effects"] = True,
    score_atac: Annotated[bool, "Score ATAC-seq effects"] = True,
    score_dnase: Annotated[bool, "Score DNase effects"] = True,
    score_chip_histone: Annotated[bool, "Score ChIP histone effects"] = True,
    score_chip_tf: Annotated[bool, "Score ChIP transcription factor effects"] = True,
    score_polyadenylation: Annotated[bool, "Score polyadenylation effects"] = True,
    score_splice_sites: Annotated[bool, "Score splice sites effects"] = True,
    score_splice_site_usage: Annotated[bool, "Score splice site usage effects"] = True,
    score_splice_junctions: Annotated[bool, "Score splice junctions effects"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score multiple genetic variants in batch using AlphaGenome with configurable variant scorers.

    Input is a tab-separated VCF/TSV file with variant information; output is a
    dict describing the comprehensive variant-scores CSV written to OUTPUT_DIR.

    Raises:
        ValueError: if no VCF path is given, required columns are missing, the
            file contains no variants, or no supported scorers remain selected.
    """
    # Validate that the single required input was supplied
    if vcf_path is None:
        raise ValueError("Path to VCF file must be provided")

    # Set output prefix (timestamped default keeps runs from clobbering each other)
    if out_prefix is None:
        out_prefix = f"batch_variant_scores_{timestamp}"

    # Load VCF file containing variants (tab-separated, header row required)
    vcf = pd.read_csv(vcf_path, sep='\t')

    # Validate required columns up front; report everything that is missing at once.
    required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
    missing_columns = [c for c in required_columns if c not in vcf.columns]
    if missing_columns:
        raise ValueError(
            f'VCF file is missing required column(s): {", ".join(missing_columns)}.'
        )
    if vcf.empty:
        raise ValueError('VCF file contains no variants to score.')

    # Load the model
    dna_model = dna_client.create(api_key)

    # Map the string organism choice onto the client enum
    organism_map = {
        'human': dna_client.Organism.HOMO_SAPIENS,
        'mouse': dna_client.Organism.MUS_MUSCULUS,
    }
    organism_enum = organism_map[organism]

    # Resolve the sequence-length label to the client's supported value
    sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
        f'SEQUENCE_LENGTH_{sequence_length}'
    ]

    # Map each score_* flag to its scorer key in RECOMMENDED_VARIANT_SCORERS
    scorer_selections = {
        'rna_seq': score_rna_seq,
        'cage': score_cage,
        'procap': score_procap,
        'atac': score_atac,
        'dnase': score_dnase,
        'chip_histone': score_chip_histone,
        'chip_tf': score_chip_tf,
        'polyadenylation': score_polyadenylation,
        'splice_sites': score_splice_sites,
        'splice_site_usage': score_splice_site_usage,
        'splice_junctions': score_splice_junctions,
    }

    all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
    selected_scorers = [
        all_scorers[key]
        for key in all_scorers
        if scorer_selections.get(key.lower(), False)
    ]

    # Remove any scorers or output types that are not supported for the chosen
    # organism. Logical `or`/`and` replace the original bitwise `|`/`&`, which
    # are non-idiomatic for plain booleans and do not short-circuit.
    unsupported_scorers = [
        scorer
        for scorer in selected_scorers
        if (
            organism_enum.value
            not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
        )
        or (
            scorer.requested_output == dna_client.OutputType.PROCAP
            and organism_enum == dna_client.Organism.MUS_MUSCULUS
        )
    ]
    if unsupported_scorers:
        print(
            f'Excluding {unsupported_scorers} scorers as they are not supported for'
            f' {organism_enum}.'
        )
        # Rebuild the list rather than repeated list.remove() (avoids O(n^2) removal)
        selected_scorers = [
            scorer for scorer in selected_scorers if scorer not in unsupported_scorers
        ]

    # Guard: calling the API with an empty scorer list is a user error — fail clearly.
    if not selected_scorers:
        raise ValueError(
            'No supported variant scorers selected; enable at least one score_* flag.'
        )

    # Score each variant in the VCF file
    results = []

    for _, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
        variant = genome.Variant(
            chromosome=str(vcf_row.CHROM),
            position=int(vcf_row.POS),
            reference_bases=vcf_row.REF,
            alternate_bases=vcf_row.ALT,
            name=vcf_row.variant_id,
        )
        # Center the model's input window on the variant
        interval = variant.reference_interval.resize(sequence_length_value)

        variant_scores = dna_model.score_variant(
            interval=interval,
            variant=variant,
            variant_scorers=selected_scorers,
            organism=organism_enum,
        )
        results.append(variant_scores)

    # Flatten per-variant score objects into one tidy DataFrame
    df_scores = variant_scorers.tidy_scores(results)

    # Save results
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    df_scores.to_csv(output_file, index=False)

    return {
        "message": f"Batch variant scoring completed for {len(vcf)} variants with {len(selected_scorers)} scorers",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Batch variant scores results",
                "path": str(output_file.resolve())
            }
        ]
    }

@batch_variant_scoring_mcp.tool
def filter_variant_scores(
    # Primary data inputs
    scores_path: Annotated[str | None, "Path to variant scores CSV file from batch scoring"] = None,
    # Analysis parameters with tutorial defaults
    ontology_curie: Annotated[str, "Ontology CURIE for filtering (e.g., 'CL:0000084' for T-cells)"] = "CL:0000084",
    exclude_ontology_column: Annotated[bool, "Whether to exclude ontology_curie column from output"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Filter variant scores by ontology criteria to examine effects on specific cell types or tissues.

    Input is a variant-scores CSV (as produced by score_variants_batch); output
    is a dict describing the filtered scores CSV written to OUTPUT_DIR.

    Raises:
        ValueError: if no scores path is given, or the scores file lacks the
            'ontology_curie' column needed for filtering.
    """
    # Validate that the single required input was supplied
    if scores_path is None:
        raise ValueError("Path to variant scores CSV file must be provided")

    # Set output prefix (timestamped default keeps runs from clobbering each other)
    if out_prefix is None:
        out_prefix = f"filtered_variant_scores_{timestamp}"

    # Load variant scores
    df_scores = pd.read_csv(scores_path)

    # Fail fast with a clear message instead of an opaque KeyError below.
    if 'ontology_curie' not in df_scores.columns:
        raise ValueError("Scores file is missing required column: ontology_curie.")

    # Keep only rows matching the requested ontology term
    filtered_df = df_scores[df_scores['ontology_curie'] == ontology_curie]

    # Optionally exclude the (now-constant) ontology column from the output
    if exclude_ontology_column:
        # drop() is clearer than rebuilding the column list by hand
        filtered_df = filtered_df.drop(columns='ontology_curie')

    # Save filtered results
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    filtered_df.to_csv(output_file, index=False)

    return {
        "message": f"Filtered {len(filtered_df)} variant scores for ontology {ontology_curie}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Filtered variant scores",
                "path": str(output_file.resolve())
            }
        ]
    }