# alphagenome_mcp/tools/batch_variant_scoring.py
# (Hugging Face upload metadata: uploaded by Paper2Agent, "Upload 10 files",
#  commit 0cdac39, verified.)
"""
Batch variant scoring tutorial demonstrating how to score multiple genetic variants using AlphaGenome.
This MCP Server provides 2 tools:
1. score_variants_batch: Score multiple genetic variants in batch using configurable variant scorers
2. filter_variant_scores: Filter variant scores by ontology criteria (e.g., cell types)
All tools extracted from `/batch_variant_scoring.ipynb`.
"""
# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime
from io import StringIO
# AlphaGenome imports
from alphagenome import colab_utils
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from tqdm import tqdm
# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")
DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"
# Both locations can be overridden per-deployment via environment variables.
INPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))
# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Timestamp for unique outputs
# NOTE: captured once at import time, so every tool call in the same process
# run shares this timestamp (later calls with the same default prefix
# overwrite earlier outputs).
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Fetch your secret
# Raises KeyError at import if ALPHAGENOME_API_KEY is not set — deliberate
# fail-fast so the server does not start without credentials.
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]
# MCP server instance
batch_variant_scoring_mcp = FastMCP(name="batch_variant_scoring")
@batch_variant_scoring_mcp.tool
def score_variants_batch(
    # Primary data inputs
    vcf_path: Annotated[str | None, "Path to VCF file with extension .vcf or .tsv. The header of the file should include the following columns: variant_id, CHROM, POS, REF, ALT"] = None,
    # Analysis parameters with tutorial defaults
    api_key: Annotated[str, "AlphaGenome API key for authentication"] = ALPHAGENOME_API_KEY,
    organism: Annotated[Literal["human", "mouse"], "Organism for variant scoring"] = "human",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variants to predict"] = "1MB",
    score_rna_seq: Annotated[bool, "Score RNA-seq effects"] = True,
    score_cage: Annotated[bool, "Score CAGE effects"] = True,
    score_procap: Annotated[bool, "Score ProCAP effects"] = True,
    score_atac: Annotated[bool, "Score ATAC-seq effects"] = True,
    score_dnase: Annotated[bool, "Score DNase effects"] = True,
    score_chip_histone: Annotated[bool, "Score ChIP histone effects"] = True,
    score_chip_tf: Annotated[bool, "Score ChIP transcription factor effects"] = True,
    score_polyadenylation: Annotated[bool, "Score polyadenylation effects"] = True,
    score_splice_sites: Annotated[bool, "Score splice sites effects"] = True,
    score_splice_site_usage: Annotated[bool, "Score splice site usage effects"] = True,
    score_splice_junctions: Annotated[bool, "Score splice junctions effects"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score multiple genetic variants in batch using AlphaGenome with configurable variant scorers.
    Input is VCF file with variant information and output is comprehensive variant scores table.

    Reads a tab-separated variant table, builds a `genome.Variant` per row,
    scores each variant with the selected recommended scorers over a window
    of `sequence_length` centred on the variant, and writes a tidy scores
    table to CSV under OUTPUT_DIR.

    Returns:
        dict with a summary "message", the tutorial "reference" URL, and an
        "artifacts" list containing the absolute path of the written CSV.

    Raises:
        ValueError: if `vcf_path` is not provided, the file is missing a
            required column, or the file contains no variant rows.
    """
    # Validate exactly one input
    if vcf_path is None:
        raise ValueError("Path to VCF file must be provided")

    # Default to a timestamped prefix so repeated runs get distinct files.
    if out_prefix is None:
        out_prefix = f"batch_variant_scores_{timestamp}"

    # Load VCF file containing variants (tab-separated, VCF-like header).
    vcf = pd.read_csv(vcf_path, sep='\t')
    required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
    for column in required_columns:
        if column not in vcf.columns:
            raise ValueError(f'VCF file is missing required column: {column}.')
    # Fail early with a clear message instead of letting tidy_scores() choke
    # on an empty result list downstream.
    if vcf.empty:
        raise ValueError('VCF file contains no variant rows.')

    # Load the model
    dna_model = dna_client.create(api_key)

    # Map the user-facing organism name onto the client enum.
    organism_map = {
        'human': dna_client.Organism.HOMO_SAPIENS,
        'mouse': dna_client.Organism.MUS_MUSCULUS,
    }
    organism_enum = organism_map[organism]

    # Resolve e.g. "1MB" -> SUPPORTED_SEQUENCE_LENGTHS['SEQUENCE_LENGTH_1MB'].
    sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
        f'SEQUENCE_LENGTH_{sequence_length}'
    ]

    # Select recommended scorers according to the boolean flags. The keys of
    # RECOMMENDED_VARIANT_SCORERS are upper-case (e.g. 'RNA_SEQ'), hence the
    # .lower() when consulting the flag table.
    scorer_selections = {
        'rna_seq': score_rna_seq,
        'cage': score_cage,
        'procap': score_procap,
        'atac': score_atac,
        'dnase': score_dnase,
        'chip_histone': score_chip_histone,
        'chip_tf': score_chip_tf,
        'polyadenylation': score_polyadenylation,
        'splice_sites': score_splice_sites,
        'splice_site_usage': score_splice_site_usage,
        'splice_junctions': score_splice_junctions,
    }
    all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
    selected_scorers = [
        scorer
        for key, scorer in all_scorers.items()
        if scorer_selections.get(key.lower(), False)
    ]

    # Remove any scorers or output types that are not supported for the
    # chosen organism (ProCAP has no mouse model). Plain boolean `or`/`and`
    # replace the original bitwise `|`/`&`, which are precedence hazards on
    # comparisons and unidiomatic for boolean logic.
    unsupported_scorers = [
        scorer
        for scorer in selected_scorers
        if (
            organism_enum.value
            not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
        )
        or (
            scorer.requested_output == dna_client.OutputType.PROCAP
            and organism_enum == dna_client.Organism.MUS_MUSCULUS
        )
    ]
    if unsupported_scorers:
        print(
            f'Excluding {unsupported_scorers} scorers as they are not supported for'
            f' {organism_enum}.'
        )
        for unsupported_scorer in unsupported_scorers:
            selected_scorers.remove(unsupported_scorer)

    # Score each variant in the VCF file (one model call per variant).
    results = []
    for _, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
        variant = genome.Variant(
            chromosome=str(vcf_row.CHROM),
            position=int(vcf_row.POS),
            reference_bases=vcf_row.REF,
            alternate_bases=vcf_row.ALT,
            name=vcf_row.variant_id,
        )
        # Prediction window of the requested length centred on the variant.
        interval = variant.reference_interval.resize(sequence_length_value)
        variant_scores = dna_model.score_variant(
            interval=interval,
            variant=variant,
            variant_scorers=selected_scorers,
            organism=organism_enum,
        )
        results.append(variant_scores)

    # Flatten per-variant score objects into one tidy table and persist it.
    df_scores = variant_scorers.tidy_scores(results)
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    df_scores.to_csv(output_file, index=False)
    return {
        "message": f"Batch variant scoring completed for {len(vcf)} variants with {len(selected_scorers)} scorers",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Batch variant scores results",
                "path": str(output_file.resolve())
            }
        ]
    }
@batch_variant_scoring_mcp.tool
def filter_variant_scores(
    # Primary data inputs
    scores_path: Annotated[str | None, "Path to variant scores CSV file from batch scoring"] = None,
    # Analysis parameters with tutorial defaults
    ontology_curie: Annotated[str, "Ontology CURIE for filtering (e.g., 'CL:0000084' for T-cells)"] = "CL:0000084",
    exclude_ontology_column: Annotated[bool, "Whether to exclude ontology_curie column from output"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Filter variant scores by ontology criteria to examine effects on specific cell types or tissues.
    Input is variant scores CSV file and output is filtered scores table.

    Keeps only the rows whose `ontology_curie` column equals `ontology_curie`
    and writes the result to CSV under OUTPUT_DIR.

    Returns:
        dict with a summary "message", the tutorial "reference" URL, and an
        "artifacts" list containing the absolute path of the written CSV.

    Raises:
        ValueError: if `scores_path` is not provided, or the file has no
            `ontology_curie` column (i.e. it is not a batch-scoring output).
    """
    # Validate exactly one input
    if scores_path is None:
        raise ValueError("Path to variant scores CSV file must be provided")

    # Default to a timestamped prefix so repeated runs get distinct files.
    if out_prefix is None:
        out_prefix = f"filtered_variant_scores_{timestamp}"

    # Load variant scores
    df_scores = pd.read_csv(scores_path)
    # Fail with a clear error rather than a bare KeyError when the input is
    # not a batch-scoring output table.
    if 'ontology_curie' not in df_scores.columns:
        raise ValueError('Scores file is missing required column: ontology_curie.')

    # Filter by ontology criteria
    filtered_df = df_scores[df_scores['ontology_curie'] == ontology_curie]

    # Optionally drop the ontology column — after filtering, every row shares
    # the same CURIE, so the column carries no information.
    if exclude_ontology_column:
        filtered_df = filtered_df.drop(columns=['ontology_curie'])

    # Save filtered results
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    filtered_df.to_csv(output_file, index=False)
    return {
        "message": f"Filtered {len(filtered_df)} variant scores for ontology {ontology_curie}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Filtered variant scores",
                "path": str(output_file.resolve())
            }
        ]
    }