# NOTE(review): removed non-Python page artifacts (HF Spaces header, file-size
# banner, commit hash, and a pasted line-number gutter) that were fused into the
# top of this scraped source file and would be syntax errors in a Python module.
"""
Batch variant scoring tutorial demonstrating how to score multiple genetic variants using AlphaGenome.
This MCP Server provides 2 tools:
1. score_variants_batch: Score multiple genetic variants in batch using configurable variant scorers
2. filter_variant_scores: Filter variant scores by ontology criteria (e.g., cell types)
All tools extracted from `/batch_variant_scoring.ipynb`.
"""
# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime
from io import StringIO
# AlphaGenome imports
from alphagenome import colab_utils
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from tqdm import tqdm
# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")
# Fallback locations used when no override is supplied via the environment.
DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"
# Environment variables take precedence over the /data defaults, which lets the
# server run outside HF Spaces (e.g. locally) by pointing at writable paths.
INPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))
# Ensure directories exist (idempotent; parents created as needed)
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Timestamp captured once at import time; used to make default output file
# names unique per server process (not per tool call).
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Fetch your secret. Intentionally uses [] rather than .get() so the server
# fails fast at startup when the AlphaGenome API key is not configured.
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]
# MCP server instance; the @...tool decorators below register the tools on it.
batch_variant_scoring_mcp = FastMCP(name="batch_variant_scoring")
@batch_variant_scoring_mcp.tool
def score_variants_batch(
    # Primary data inputs
    vcf_path: Annotated[str | None, "Path to VCF file with extension .vcf or .tsv. The header of the file should include the following columns: variant_id, CHROM, POS, REF, ALT"] = None,
    # Analysis parameters with tutorial defaults
    api_key: Annotated[str, "AlphaGenome API key for authentication"] = ALPHAGENOME_API_KEY,
    organism: Annotated[Literal["human", "mouse"], "Organism for variant scoring"] = "human",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variants to predict"] = "1MB",
    score_rna_seq: Annotated[bool, "Score RNA-seq effects"] = True,
    score_cage: Annotated[bool, "Score CAGE effects"] = True,
    score_procap: Annotated[bool, "Score ProCAP effects"] = True,
    score_atac: Annotated[bool, "Score ATAC-seq effects"] = True,
    score_dnase: Annotated[bool, "Score DNase effects"] = True,
    score_chip_histone: Annotated[bool, "Score ChIP histone effects"] = True,
    score_chip_tf: Annotated[bool, "Score ChIP transcription factor effects"] = True,
    score_polyadenylation: Annotated[bool, "Score polyadenylation effects"] = True,
    score_splice_sites: Annotated[bool, "Score splice sites effects"] = True,
    score_splice_site_usage: Annotated[bool, "Score splice site usage effects"] = True,
    score_splice_junctions: Annotated[bool, "Score splice junctions effects"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score multiple genetic variants in batch using AlphaGenome with configurable variant scorers.
    Input is VCF file with variant information and output is comprehensive variant scores table.

    Raises:
        ValueError: If ``vcf_path`` is not provided, if the file is missing any
            required column, or if the selected scorers leave nothing to score
            for the chosen organism.
    """
    # Validate required input.
    if vcf_path is None:
        raise ValueError("Path to VCF file must be provided")
    # Default output prefix is unique per server process (module-level timestamp).
    if out_prefix is None:
        out_prefix = f"batch_variant_scores_{timestamp}"
    # Load the tab-separated variant table.
    vcf = pd.read_csv(vcf_path, sep='\t')
    # Validate required columns, reporting ALL missing columns at once rather
    # than failing on the first one.
    required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
    missing_columns = [c for c in required_columns if c not in vcf.columns]
    if missing_columns:
        raise ValueError(
            f'VCF file is missing required column(s): {", ".join(missing_columns)}.'
        )
    # Load the model (authenticates with the AlphaGenome service).
    dna_model = dna_client.create(api_key)
    # Map the user-facing organism name to the client enum.
    organism_map = {
        'human': dna_client.Organism.HOMO_SAPIENS,
        'mouse': dna_client.Organism.MUS_MUSCULUS,
    }
    organism_enum = organism_map[organism]
    # Resolve the sequence-length literal (e.g. "1MB") to its base-pair value.
    sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
        f'SEQUENCE_LENGTH_{sequence_length}'
    ]
    # Map each recommended-scorer key to its boolean flag.
    scorer_selections = {
        'rna_seq': score_rna_seq,
        'cage': score_cage,
        'procap': score_procap,
        'atac': score_atac,
        'dnase': score_dnase,
        'chip_histone': score_chip_histone,
        'chip_tf': score_chip_tf,
        'polyadenylation': score_polyadenylation,
        'splice_sites': score_splice_sites,
        'splice_site_usage': score_splice_site_usage,
        'splice_junctions': score_splice_junctions,
    }
    all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
    selected_scorers = [
        all_scorers[key]
        for key in all_scorers
        if scorer_selections.get(key.lower(), False)
    ]
    # Drop scorers that are not supported for the chosen organism. The original
    # notebook used bitwise `|`/`&` here; plain boolean `or`/`and` is the
    # correct, short-circuiting form for these conditions.
    def _unsupported(scorer) -> bool:
        # A scorer is unsupported when the organism is not in its support list,
        # or when it is the mouse/ProCAP combination (not offered by the API).
        return (
            organism_enum.value
            not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
        ) or (
            scorer.requested_output == dna_client.OutputType.PROCAP
            and organism_enum == dna_client.Organism.MUS_MUSCULUS
        )

    unsupported_scorers = [s for s in selected_scorers if _unsupported(s)]
    if unsupported_scorers:
        print(
            f'Excluding {unsupported_scorers} scorers as they are not supported for'
            f' {organism_enum}.'
        )
        # Rebuild in one pass instead of repeated O(n) list.remove() calls.
        selected_scorers = [s for s in selected_scorers if not _unsupported(s)]
    # Guard: calling the API with zero scorers cannot produce a result table.
    if not selected_scorers:
        raise ValueError(
            'No variant scorers selected (or all selected scorers are '
            'unsupported for the chosen organism).'
        )
    # Score every variant in the VCF file, one API call per row.
    results = []
    for _, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
        variant = genome.Variant(
            chromosome=str(vcf_row.CHROM),
            position=int(vcf_row.POS),
            # Coerce to str: pandas may infer non-string dtypes for allele columns.
            reference_bases=str(vcf_row.REF),
            alternate_bases=str(vcf_row.ALT),
            name=str(vcf_row.variant_id),
        )
        # Center the scoring interval of the requested length on the variant.
        interval = variant.reference_interval.resize(sequence_length_value)
        variant_scores = dna_model.score_variant(
            interval=interval,
            variant=variant,
            variant_scorers=selected_scorers,
            organism=organism_enum,
        )
        results.append(variant_scores)
    # Flatten the per-variant score objects into one tidy DataFrame.
    df_scores = variant_scorers.tidy_scores(results)
    # Persist results under the configured output directory.
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    df_scores.to_csv(output_file, index=False)
    return {
        "message": f"Batch variant scoring completed for {len(vcf)} variants with {len(selected_scorers)} scorers",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Batch variant scores results",
                "path": str(output_file.resolve())
            }
        ]
    }
@batch_variant_scoring_mcp.tool
def filter_variant_scores(
    # Primary data inputs
    scores_path: Annotated[str | None, "Path to variant scores CSV file from batch scoring"] = None,
    # Analysis parameters with tutorial defaults
    ontology_curie: Annotated[str, "Ontology CURIE for filtering (e.g., 'CL:0000084' for T-cells)"] = "CL:0000084",
    exclude_ontology_column: Annotated[bool, "Whether to exclude ontology_curie column from output"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Filter variant scores by ontology criteria to examine effects on specific cell types or tissues.
    Input is variant scores CSV file and output is filtered scores table.

    Raises:
        ValueError: If ``scores_path`` is not provided, or if the CSV lacks
            the ``ontology_curie`` column needed for filtering.
    """
    # Validate required input.
    if scores_path is None:
        raise ValueError("Path to variant scores CSV file must be provided")
    # Default output prefix is unique per server process (module-level timestamp).
    if out_prefix is None:
        out_prefix = f"filtered_variant_scores_{timestamp}"
    # Load variant scores produced by score_variants_batch.
    df_scores = pd.read_csv(scores_path)
    # Validate the filter column exists — mirrors the column validation in
    # score_variants_batch and avoids an opaque KeyError below.
    if 'ontology_curie' not in df_scores.columns:
        raise ValueError(
            "Scores file is missing required column: ontology_curie."
        )
    # Keep only rows matching the requested ontology term.
    filtered_df = df_scores[df_scores['ontology_curie'] == ontology_curie]
    # Optionally drop the (now-constant) ontology column from the output.
    if exclude_ontology_column:
        filtered_df = filtered_df.drop(columns=['ontology_curie'])
    # Persist filtered results under the configured output directory.
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    filtered_df.to_csv(output_file, index=False)
    return {
        "message": f"Filtered {len(filtered_df)} variant scores for ontology {ontology_curie}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Filtered variant scores",
                "path": str(output_file.resolve())
            }
        ]
    }