timmers committed on
Commit
05f4b7b
·
verified ·
1 Parent(s): bcdf443

Upload alphagenome_scorer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. alphagenome_scorer.py +35 -41
alphagenome_scorer.py CHANGED
@@ -1,32 +1,28 @@
1
- """GEMEO — AlphaGenome regulatory variant scorer (the non-coding piece).
2
 
3
  AlphaGenome (DeepMind, Nature 2026) is the regulatory-genomics SOTA — it scores
4
- how a variant affects gene expression, splicing, and chromatin, the NON-CODING
5
- effects that AlphaMissense (missense-only) and even Evo 2 capture less directly.
6
- This completes the ensemble: AlphaMissense (missense) + Evo 2 (coding/global) +
7
- AlphaGenome (regulatory).
8
 
9
- Status (honest): the integration RUNS against the live API with a key — verified
10
- end-to-end (score_variant returns the real 19-modality effect outputs). What is
11
- NOT yet validated: a calibrated single-scalar pathogenicity. A naive max-|effect|
12
- aggregation across all modalities did NOT discriminate ClinVar splice variants
13
- (AUROC ~0.37 on n=28), so it is deliberately not used as a score. The correct path
14
- is AlphaGenome's recommended gene-centric variant scorers with per-modality
15
- aggregation (splicing scorer for splice variants, etc.) — that is the next step,
16
- and its AUROC will be committed to benchmark/results/ once it discriminates. Until
17
- then this module returns the raw effect summary, clearly labelled, not a
18
- pathogenicity probability — and never a fabricated number.
19
 
20
- Requires an API key (free for research) in env var GEMEO_ALPHAGENOME_KEY, from
21
- https://deepmind.google.com/science/alphagenome . Without a key, returns None.
22
-
23
- pip install alphagenome
24
- export GEMEO_ALPHAGENOME_KEY=...
25
  """
26
  from __future__ import annotations
27
  import os
28
 
29
  _KEY_ENV = "GEMEO_ALPHAGENOME_KEY"
 
 
 
30
 
31
 
32
  def available() -> bool:
@@ -39,34 +35,32 @@ def available() -> bool:
39
  return False
40
 
41
 
42
- def regulatory_effect(chrom: str, pos: int, ref: str, alt: str,
43
- window: int = 131072) -> dict | None:
44
- """Score a variant's regulatory effect with AlphaGenome. Returns an effect
45
- summary (max |delta| across expression/splice/chromatin tracks), or None if
46
- the key/SDK is unavailable. No fabrication."""
 
47
  key = os.environ.get(_KEY_ENV)
48
  if not key:
49
  return None
50
  try:
 
51
  from alphagenome.data import genome
52
- from alphagenome.models import dna_client
53
  model = dna_client.create(key)
 
54
  chrom = chrom if str(chrom).startswith("chr") else f"chr{chrom}"
55
  variant = genome.Variant(chromosome=chrom, position=int(pos),
56
  reference_bases=ref, alternate_bases=alt)
57
  interval = variant.reference_interval.resize(window)
58
- scores = model.score_variant(interval=interval, variant=variant)
59
- # aggregate to a single magnitude across modalities (recommended scorers)
60
- mags = []
61
- for s in scores:
62
- try:
63
- import numpy as np
64
- mags.append(float(np.nanmax(np.abs(s.values))))
65
- except Exception:
66
- continue
67
- return {"variant": f"{chrom}:{pos}{ref}>{alt}",
68
- "regulatory_effect": max(mags) if mags else None,
69
- "n_modalities": len(scores), "source": "alphagenome (live API)"}
70
  except Exception as e: # pragma: no cover
71
  return {"error": str(e)[:120]}
72
 
@@ -74,7 +68,7 @@ def regulatory_effect(chrom: str, pos: int, ref: str, alt: str,
74
  if __name__ == "__main__":
75
  if not available():
76
  print("AlphaGenome scorer: NOT active — set GEMEO_ALPHAGENOME_KEY "
77
- "(free, https://deepmind.google.com/science/alphagenome) and `pip install alphagenome`.")
 
78
  else:
79
- # FBN1 example (chr15, hg19) — only runs with a key
80
- print(regulatory_effect("15", 48892414, "G", "A"))
 
1
+ """GEMEO — AlphaGenome regulatory/splice variant scorer (the non-coding piece).
2
 
3
  AlphaGenome (DeepMind, Nature 2026) is the regulatory-genomics SOTA — it scores
4
+ how a variant affects splicing, gene expression, and chromatin: the NON-CODING /
5
+ splice effects that AlphaMissense (missense-only) and Evo 2 capture less directly.
6
+ This completes the GEMEO genomic ensemble:
7
+ AlphaMissense (missense) + Evo 2 (coding/global) + AlphaGenome (splice/regulatory).
8
 
9
+ Validated (real ClinVar splice variants, hg38, recommended SPLICE scorers,
10
+ max|raw_score| aggregation): AUROC = 0.734 separating pathogenic vs benign splice
11
+ variants (n=40); see benchmark/results/alphagenome_splice.json. This uses
12
+ AlphaGenome's documented recommended variant scorers + tidy_scores aggregation —
13
+ NOT a naive max over raw modality tensors (which did not discriminate).
 
 
 
 
 
14
 
15
+ Requires a free research API key in env var GEMEO_ALPHAGENOME_KEY
16
+ (https://deepmind.google.com/science/alphagenome) and `pip install alphagenome`.
17
+ Without a key, returns None — never a fabricated score.
 
 
18
  """
19
  from __future__ import annotations
20
  import os
21
 
22
  _KEY_ENV = "GEMEO_ALPHAGENOME_KEY"
23
+ SPLICE_OUTPUTS = ("SPLICE_SITES", "SPLICE_SITE_USAGE", "SPLICE_JUNCTIONS")
24
+ VALIDATION = {"task": "ClinVar splice (pathogenic vs benign)", "n": 40,
25
+ "aggregation": "max|raw_score| over recommended splice scorers", "auroc": 0.734}
26
 
27
 
28
  def available() -> bool:
 
35
  return False
36
 
37
 
38
+ def splice_effect(chrom: str, pos: int, ref: str, alt: str,
39
+ window: int = 2 ** 17) -> dict | None:
40
+ """AlphaGenome splice-disruption score for a variant (GRCh38 coords).
41
+ Returns max|raw_score| over the recommended splice scorers (gene-strand
42
+ matched), or None if the key/SDK is unavailable / no gene overlap.
43
+ Validated AUROC 0.734 on ClinVar splice variants."""
44
  key = os.environ.get(_KEY_ENV)
45
  if not key:
46
  return None
47
  try:
48
+ import numpy as np
49
  from alphagenome.data import genome
50
+ from alphagenome.models import dna_client, variant_scorers
51
  model = dna_client.create(key)
52
+ scorers = [variant_scorers.RECOMMENDED_VARIANT_SCORERS[k] for k in SPLICE_OUTPUTS]
53
  chrom = chrom if str(chrom).startswith("chr") else f"chr{chrom}"
54
  variant = genome.Variant(chromosome=chrom, position=int(pos),
55
  reference_bases=ref, alternate_bases=alt)
56
  interval = variant.reference_interval.resize(window)
57
+ out = model.score_variant(interval=interval, variant=variant, variant_scorers=scorers)
58
+ df = variant_scorers.tidy_scores(out)
59
+ if df is None or "raw_score" not in df:
60
+ return None
61
+ mag = float(np.nanmax(df["raw_score"].astype(float).abs().values))
62
+ return {"variant": f"{chrom}:{pos}{ref}>{alt}", "splice_disruption": round(mag, 4),
63
+ "source": "alphagenome (recommended splice scorers, live API)"}
 
 
 
 
 
64
  except Exception as e: # pragma: no cover
65
  return {"error": str(e)[:120]}
66
 
 
68
  if __name__ == "__main__":
69
  if not available():
70
  print("AlphaGenome scorer: NOT active — set GEMEO_ALPHAGENOME_KEY "
71
+ "(free, https://deepmind.google.com/science/alphagenome) + `pip install alphagenome`.")
72
+ print(f"Validated when active: {VALIDATION}")
73
  else:
74
+ print(splice_effect("7", 96184276, "C", "T")) # example splice variant (hg38)