""" Batch variant scoring tutorial demonstrating how to score multiple genetic variants using AlphaGenome. This MCP Server provides 2 tools: 1. score_variants_batch: Score multiple genetic variants in batch using configurable variant scorers 2. filter_variant_scores: Filter variant scores by ontology criteria (e.g., cell types) All tools extracted from `/batch_variant_scoring.ipynb`. """ # Standard imports from typing import Annotated, Literal, Any import pandas as pd import numpy as np from pathlib import Path import os from fastmcp import FastMCP from datetime import datetime from io import StringIO # AlphaGenome imports from alphagenome import colab_utils from alphagenome.data import genome from alphagenome.models import dna_client, variant_scorers from tqdm import tqdm # Base persistent directory (HF Spaces guarantees /data is writable & persistent) BASE_DIR = Path("/data") DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs" DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs" INPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR)) OUTPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR)) # Ensure directories exist INPUT_DIR.mkdir(parents=True, exist_ok=True) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # Timestamp for unique outputs timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # Fetch your secret ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"] # MCP server instance batch_variant_scoring_mcp = FastMCP(name="batch_variant_scoring") @batch_variant_scoring_mcp.tool def score_variants_batch( # Primary data inputs vcf_path: Annotated[str | None, "Path to VCF file with extension .vcf or .tsv. The header of the file should include the following columns: variant_id, CHROM, POS, REF, ALT"] = None, # Analysis parameters with tutorial defaults api_key: Annotated[str, "AlphaGenome API key for authentication"] = ALPHAGENOME_API_KEY, organism: Annotated[Literal["human", "mouse"], "Organism for variant scoring"] = "human", sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variants to predict"] = "1MB", score_rna_seq: Annotated[bool, "Score RNA-seq effects"] = True, score_cage: Annotated[bool, "Score CAGE effects"] = True, score_procap: Annotated[bool, "Score ProCAP effects"] = True, score_atac: Annotated[bool, "Score ATAC-seq effects"] = True, score_dnase: Annotated[bool, "Score DNase effects"] = True, score_chip_histone: Annotated[bool, "Score ChIP histone effects"] = True, score_chip_tf: Annotated[bool, "Score ChIP transcription factor effects"] = True, score_polyadenylation: Annotated[bool, "Score polyadenylation effects"] = True, score_splice_sites: Annotated[bool, "Score splice sites effects"] = True, score_splice_site_usage: Annotated[bool, "Score splice site usage effects"] = True, score_splice_junctions: Annotated[bool, "Score splice junctions effects"] = True, out_prefix: Annotated[str | None, "Output file prefix"] = None, ) -> dict: """ Score multiple genetic variants in batch using AlphaGenome with configurable variant scorers. Input is VCF file with variant information and output is comprehensive variant scores table. """ # Validate exactly one input if vcf_path is None: raise ValueError("Path to VCF file must be provided") # Set output prefix if out_prefix is None: out_prefix = f"batch_variant_scores_{timestamp}" # Load VCF file containing variants vcf = pd.read_csv(vcf_path, sep='\t') # Validate required columns required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT'] for column in required_columns: if column not in vcf.columns: raise ValueError(f'VCF file is missing required column: {column}.') # Load the model dna_model = dna_client.create(api_key) # Parse organism specification organism_map = { 'human': dna_client.Organism.HOMO_SAPIENS, 'mouse': dna_client.Organism.MUS_MUSCULUS, } organism_enum = organism_map[organism] # Parse sequence length sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[ f'SEQUENCE_LENGTH_{sequence_length}' ] # Parse scorer specification scorer_selections = { 'rna_seq': score_rna_seq, 'cage': score_cage, 'procap': score_procap, 'atac': score_atac, 'dnase': score_dnase, 'chip_histone': score_chip_histone, 'chip_tf': score_chip_tf, 'polyadenylation': score_polyadenylation, 'splice_sites': score_splice_sites, 'splice_site_usage': score_splice_site_usage, 'splice_junctions': score_splice_junctions, } all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS selected_scorers = [ all_scorers[key] for key in all_scorers if scorer_selections.get(key.lower(), False) ] # Remove any scorers or output types that are not supported for the chosen organism unsupported_scorers = [ scorer for scorer in selected_scorers if ( organism_enum.value not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer] ) | ( (scorer.requested_output == dna_client.OutputType.PROCAP) & (organism_enum == dna_client.Organism.MUS_MUSCULUS) ) ] if len(unsupported_scorers) > 0: print( f'Excluding {unsupported_scorers} scorers as they are not supported for' f' {organism_enum}.' ) for unsupported_scorer in unsupported_scorers: selected_scorers.remove(unsupported_scorer) # Score variants in the VCF file results = [] for i, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)): variant = genome.Variant( chromosome=str(vcf_row.CHROM), position=int(vcf_row.POS), reference_bases=vcf_row.REF, alternate_bases=vcf_row.ALT, name=vcf_row.variant_id, ) interval = variant.reference_interval.resize(sequence_length_value) variant_scores = dna_model.score_variant( interval=interval, variant=variant, variant_scorers=selected_scorers, organism=organism_enum, ) results.append(variant_scores) df_scores = variant_scorers.tidy_scores(results) # Save results output_file = OUTPUT_DIR / f"{out_prefix}.csv" df_scores.to_csv(output_file, index=False) return { "message": f"Batch variant scoring completed for {len(vcf)} variants with {len(selected_scorers)} scorers", "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb", "artifacts": [ { "description": "Batch variant scores results", "path": str(output_file.resolve()) } ] } @batch_variant_scoring_mcp.tool def filter_variant_scores( # Primary data inputs scores_path: Annotated[str | None, "Path to variant scores CSV file from batch scoring"] = None, # Analysis parameters with tutorial defaults ontology_curie: Annotated[str, "Ontology CURIE for filtering (e.g., 'CL:0000084' for T-cells)"] = "CL:0000084", exclude_ontology_column: Annotated[bool, "Whether to exclude ontology_curie column from output"] = True, out_prefix: Annotated[str | None, "Output file prefix"] = None, ) -> dict: """ Filter variant scores by ontology criteria to examine effects on specific cell types or tissues. Input is variant scores CSV file and output is filtered scores table. """ # Validate exactly one input if scores_path is None: raise ValueError("Path to variant scores CSV file must be provided") # Set output prefix if out_prefix is None: out_prefix = f"filtered_variant_scores_{timestamp}" # Load variant scores df_scores = pd.read_csv(scores_path) # Filter by ontology criteria filtered_df = df_scores[df_scores['ontology_curie'] == ontology_curie] # Optionally exclude ontology column if exclude_ontology_column: columns = [c for c in filtered_df.columns if c != 'ontology_curie'] filtered_df = filtered_df[columns] # Save filtered results output_file = OUTPUT_DIR / f"{out_prefix}.csv" filtered_df.to_csv(output_file, index=False) return { "message": f"Filtered {len(filtered_df)} variant scores for ontology {ontology_curie}", "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb", "artifacts": [ { "description": "Filtered variant scores", "path": str(output_file.resolve()) } ] }