"""Fetch curated gene reference sequences + exon/intron annotations from Ensembl REST. Run once to (re)generate data/genes.json. Sequences are returned in the gene's own transcribed (5'->3') orientation so they can be fed to Carbon directly. Usage: python scripts/fetch_genes.py """ import json import os import sys import time import urllib.request ENSEMBL = "https://rest.ensembl.org" # Curated set: small enough to fit on screen, biologically famous, varied. # (BRCA1 omitted — full genomic span is ~80kb, would need a slice not a whole gene.) GENES = [ {"symbol": "HBB", "blurb": "β-globin · sickle-cell gene · 3 exons"}, {"symbol": "INS", "blurb": "insulin · the smallest classic gene · 3 exons"}, {"symbol": "TP53", "blurb": "tumor suppressor · 'guardian of the genome' · 11 exons"}, {"symbol": "MYC", "blurb": "proto-oncogene · regulator of cell growth · 3 exons"}, ] def get_json(path): req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"}) return json.loads(urllib.request.urlopen(req, timeout=30).read()) def revcomp(s): comp = str.maketrans("ACGTNacgtn", "TGCANtgcan") return s.translate(comp)[::-1] def fetch_gene(symbol): g = get_json(f"/lookup/symbol/homo_sapiens/{symbol}?expand=1") chrom = g["seq_region_name"] strand = g["strand"] # Canonical transcript — use its boundaries (TSS to PAS), not the gene's # boundaries, which can include regulatory regions before/after the transcript. ct = next(t for t in g["Transcript"] if t.get("is_canonical")) t_start = ct["start"] t_end = ct["end"] exons = sorted(ct["Exon"], key=lambda e: e["start"]) # Fetch genomic sequence on the + strand for the transcript span only seq_data = get_json(f"/sequence/region/human/{chrom}:{t_start}..{t_end}:1?content-type=application/json") plus_seq = seq_data["seq"].upper() assert len(plus_seq) == t_end - t_start + 1, (len(plus_seq), t_end - t_start + 1) # Map exons to sequence-local coords. For a minus-strand gene, transcribe (revcomp) # the plus-strand sequence and flip exon coordinates accordingly. if strand == 1: seq = plus_seq local_exons = [(e["start"] - t_start, e["end"] - t_start + 1) for e in exons] # half-open else: seq = revcomp(plus_seq) L = len(plus_seq) local_exons = [(L - (e["end"] - t_start) - 1, L - (e["start"] - t_start)) for e in exons] # Order exons 5' -> 3' local_exons.sort(key=lambda x: x[0]) # Derive introns from gaps between exons. introns = [] for (a, b), (c, d) in zip(local_exons, local_exons[1:]): if c > b: introns.append((b, c)) return { "symbol": symbol, "ensembl_gene": g["id"], "ensembl_transcript": ct["id"], "chrom": chrom, "strand": strand, "length": len(seq), "seq": seq, "exons": [{"start": s, "end": e} for s, e in local_exons], "introns": [{"start": s, "end": e} for s, e in introns], } def main(): out = [] for entry in GENES: sym = entry["symbol"] print(f"fetching {sym}…", flush=True) try: data = fetch_gene(sym) data["blurb"] = entry["blurb"] print(f" {sym}: {data['length']}bp, {len(data['exons'])} exons", flush=True) out.append(data) time.sleep(0.3) # be polite to Ensembl except Exception as e: print(f" {sym}: FAILED — {e}", file=sys.stderr) sys.exit(1) here = os.path.dirname(os.path.abspath(__file__)) out_path = os.path.join(os.path.dirname(here), "data", "genes.json") with open(out_path, "w") as f: json.dump(out, f, indent=2) total_kb = os.path.getsize(out_path) / 1024 print(f"\nwrote {out_path} ({total_kb:.1f} KB)") if __name__ == "__main__": main()