dpyd-classifier / src /fetch_gnomad.py
abhimanyu12's picture
Upload folder using huggingface_hub
2ea06dc verified
Raw
History Blame Contribute Delete
4.33 kB
"""Fetch gnomAD v4 SAS + global allele frequencies for DPYD variants.
Strategy: the gnomAD GraphQL *region* query over the full DPYD gene span
(~1.3 Mb) returns HTTP 502 (gateway times out). The *per-variant* query by
variant_id (`{chrom}-{pos}-{ref}-{alt}`, GRCh38) is reliable, so we query the
union of variant_ids derived from the ClinVar DPYD pull.
This is the full-run version of the scaffolding probe; it pulls *all* ClinVar
DPYD variant_ids (no cap).
Run: python -m src.fetch_gnomad --clinvar data/clinvar_dpyd.tsv --out data/gnomad_dpyd_sas.csv
"""
from __future__ import annotations
import argparse, csv, time
import requests
# gnomAD API endpoint; query_variant() POSTs the GraphQL payload here.
API = "https://gnomad.broadinstitute.org/api"
# Headers sent with every request: JSON content type plus a User-Agent string.
HEADERS = {"Content-Type": "application/json", "User-Agent": "anukriti-research/1.0"}
# GraphQL document for the per-variant lookup. It takes one variable, $vid
# (the variant id), pins the dataset to gnomad_r4, and requests the variant id,
# rsid, and - separately for the genome and exome blocks - the overall `af`
# plus per-population `id` / `ac` / `an`, which _pop_af() turns into ac / an.
VARIANT_QUERY = """
query($vid:String!){
variant(variantId:$vid, dataset: gnomad_r4){
variant_id rsid
genome{ af populations{ id ac an } }
exome{ af populations{ id ac an } }
}
}"""
def _pop_af(block: dict | None, pop: str) -> float | None:
if not block:
return None
for p in block.get("populations") or []:
if p["id"] == pop:
return (p["ac"] / p["an"]) if p.get("an") else None
return None
def query_variant(vid: str, retries: int = 3, backoff: float = 2.0):
"""Return the gnomAD variant dict, None (absent), or 'ERR' (unreachable)."""
for _ in range(retries):
try:
r = requests.post(API, json={"query": VARIANT_QUERY, "variables": {"vid": vid}},
headers=HEADERS, timeout=40)
if r.status_code == 200:
return r.json().get("data", {}).get("variant")
time.sleep(backoff)
except requests.RequestException:
time.sleep(backoff)
return "ERR"
def candidate_variant_ids(clinvar_tsv: str):
    """Build the de-duplicated list of gnomAD variant ids from a ClinVar TSV.

    The file must be tab-separated with a header naming at least the columns
    ``pos``, ``ref``, ``alt`` and ``rsid``. ``alt`` may hold several
    comma-separated alleles; one id ``1-{pos}-{ref}-{alt}`` is produced per
    allele (the chromosome is fixed to ``1``).

    Rows are skipped when ``pos``, ``ref`` or ``alt`` is empty or missing
    (short line), and alleles are skipped when ``ref`` or the allele itself is
    longer than 50 characters.

    Args:
        clinvar_tsv: Path to the TSV file.

    Returns:
        A list of ``(variant_id, rsid)`` tuples in first-seen order; ``rsid``
        is ``""`` when the row carries none.

    Raises:
        KeyError: If one of the required columns is absent from the header.
    """
    seen, out = set(), []
    # newline="" is what the csv module expects from its input file object.
    with open(clinvar_tsv, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f, delimiter="\t"):
            # DictReader fills the fields of a short line with None.
            pos, ref, alts = row["pos"], row["ref"], row["alt"]
            rsid = row["rsid"] or ""
            # An id with an empty position or reference allele is malformed,
            # so drop the row instead of sending a query that cannot match.
            if not pos or not ref or not alts or len(ref) > 50:
                continue
            for alt in alts.split(","):
                if not alt or len(alt) > 50:
                    continue
                vid = f"1-{pos}-{ref}-{alt}"
                if vid not in seen:
                    seen.add(vid)
                    out.append((vid, rsid))
    return out
def main():
    """Command-line entry point: query gnomAD per variant and write a CSV.

    Options: ``--clinvar`` (input TSV), ``--out`` (output CSV), ``--cap``
    (keep only the first N candidate ids; 0 disables the cap) and ``--sleep``
    (seconds to pause after each written row).

    One row is written per variant that was queried successfully, with
    ``in_gnomad`` set to 1 or 0. Variants whose query returned ``"ERR"`` are
    counted in the final summary but get no row. Allele frequencies come from
    the genome block when it provides them and from the exome block otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--clinvar", default="data/clinvar_dpyd.tsv")
    parser.add_argument("--out", default="data/gnomad_dpyd_sas.csv")
    parser.add_argument("--cap", type=int, default=0, help="0 = no cap (full pull)")
    parser.add_argument("--sleep", type=float, default=0.15)
    opts = parser.parse_args()

    targets = candidate_variant_ids(opts.clinvar)
    if opts.cap:
        targets = targets[: opts.cap]
    print(f"querying {len(targets)} gnomAD variants...")

    header = [
        "variant_id", "rsid", "chrom", "pos", "ref", "alt",
        "gnomad_global_af", "gnomad_sas_af", "in_gnomad", "source_dataset",
    ]
    n_found = 0
    n_absent = 0
    n_failed = 0
    with open(opts.out, "w", newline="") as sink:
        writer = csv.DictWriter(sink, fieldnames=header)
        writer.writeheader()
        for variant_id, clinvar_rsid in targets:
            _chrom, pos, ref, alt = variant_id.split("-", 3)
            hit = query_variant(variant_id)
            if hit == "ERR":
                # Unreachable after all retries: count it, write nothing.
                n_failed += 1
                continue

            record = {
                "variant_id": variant_id,
                "rsid": clinvar_rsid,
                "chrom": "1",
                "pos": pos,
                "ref": ref,
                "alt": alt,
                "source_dataset": "gnomad_r4",
            }
            if hit is not None:
                genome = hit.get("genome")
                exome = hit.get("exome")

                # Global AF: genome value first, exome value as fallback.
                global_af = genome.get("af") if genome else None
                if global_af is None and exome:
                    global_af = exome.get("af")

                # SAS AF: same genome-then-exome preference.
                sas_af = _pop_af(genome, "sas")
                if sas_af is None:
                    sas_af = _pop_af(exome, "sas")

                # Prefer the rsid reported by gnomAD over the ClinVar one.
                record["rsid"] = hit.get("rsid") or clinvar_rsid
                record["gnomad_global_af"] = "" if global_af is None else global_af
                record["gnomad_sas_af"] = "" if sas_af is None else sas_af
                record["in_gnomad"] = 1
                n_found += 1
            else:
                n_absent += 1
                record["gnomad_global_af"] = ""
                record["gnomad_sas_af"] = ""
                record["in_gnomad"] = 0

            writer.writerow(record)
            time.sleep(opts.sleep)
    print(f"done: in_gnomad={n_found} not_in_gnomad={n_absent} errors={n_failed} -> {opts.out}")


if __name__ == "__main__":
    main()