"""Fetch curated gene reference sequences + exon/intron annotations from Ensembl REST.
Run once to (re)generate data/genes.json. Sequences are returned in the gene's
own transcribed (5'->3') orientation so they can be fed to Carbon directly.
Usage:
python scripts/fetch_genes.py
"""
import json
import os
import sys
import time
import urllib.request
# Base URL of the Ensembl REST API; request paths are appended to it.
ENSEMBL = "https://rest.ensembl.org"
# Curated set: small enough to fit on screen, biologically famous, varied.
# (BRCA1 omitted — full genomic span is ~80kb, would need a slice not a whole gene.)
# Each entry pairs the gene symbol used for the Ensembl lookup with a short
# human-readable blurb that is copied into the generated record.
GENES = [
    {"symbol": "HBB", "blurb": "β-globin · sickle-cell gene · 3 exons"},
    {"symbol": "INS", "blurb": "insulin · the smallest classic gene · 3 exons"},
    {"symbol": "TP53", "blurb": "tumor suppressor · 'guardian of the genome' · 11 exons"},
    {"symbol": "MYC", "blurb": "proto-oncogene · regulator of cell growth · 3 exons"},
]
def get_json(path):
    """GET ``ENSEMBL + path`` and return the decoded JSON payload.

    Args:
        path: Request path (including any query string) appended to ENSEMBL.

    Returns:
        The parsed JSON response body.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise, e.g.
        ``urllib.error.URLError`` on a failed request or
        ``json.JSONDecodeError`` on a malformed body.
    """
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically rather than leaving the
    # connection open until the object happens to be garbage-collected.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
def revcomp(s):
    """Return the reverse complement of the DNA string *s*.

    A<->T and C<->G are swapped with case preserved, N/n map to themselves,
    and any other character is passed through unchanged. The complemented
    string is then reversed.
    """
    # Build an ordinal -> partner-base table for str.translate.
    table = {ord(base): partner for base, partner in zip("ACGTNacgtn", "TGCANtgcan")}
    complemented = s.translate(table)
    return "".join(reversed(complemented))
def fetch_gene(symbol):
    """Fetch one gene's canonical-transcript sequence and exon/intron layout.

    Looks the symbol up through the ``/lookup/symbol`` endpoint, fetches the
    plus-strand genomic sequence spanning the canonical transcript, and
    re-expresses sequence and exon coordinates in the gene's own orientation
    (reverse-complemented when ``strand`` is not 1).

    Args:
        symbol: Gene symbol to look up (e.g. ``"HBB"``).

    Returns:
        A dict with keys ``symbol``, ``ensembl_gene``, ``ensembl_transcript``,
        ``chrom``, ``strand``, ``length``, ``seq``, ``exons`` and ``introns``.
        Exon and intron entries are ``{"start", "end"}`` dicts holding 0-based,
        half-open offsets into ``seq``, ordered by start offset.

    Raises:
        LookupError: If no transcript in the lookup response is flagged
            canonical.
        ValueError: If the returned sequence length does not match the
            transcript span.
    """
    g = get_json(f"/lookup/symbol/homo_sapiens/{symbol}?expand=1")
    chrom = g["seq_region_name"]
    strand = g["strand"]

    # Canonical transcript — use its boundaries (TSS to PAS), not the gene's
    # boundaries, which can include regulatory regions before/after the transcript.
    ct = next((t for t in g["Transcript"] if t.get("is_canonical")), None)
    if ct is None:
        # A bare next() would leak a message-less StopIteration to the caller.
        raise LookupError(f"no canonical transcript found for {symbol}")
    t_start = ct["start"]
    t_end = ct["end"]
    exons = sorted(ct["Exon"], key=lambda e: e["start"])

    # Fetch genomic sequence on the + strand for the transcript span only.
    seq_data = get_json(f"/sequence/region/human/{chrom}:{t_start}..{t_end}:1?content-type=application/json")
    plus_seq = seq_data["seq"].upper()
    length = len(plus_seq)
    expected = t_end - t_start + 1
    if length != expected:
        # Explicit raise instead of assert: asserts are stripped under `python -O`,
        # which would let mis-sized sequences silently corrupt every coordinate.
        raise ValueError(
            f"{symbol}: sequence length {length} does not match transcript span {expected}"
        )

    # Map exons to sequence-local coords (half-open). For a minus-strand gene,
    # transcribe (revcomp) the plus-strand sequence and flip exon coordinates:
    # plus-strand offset p lands at length - 1 - p after reversal.
    if strand == 1:
        seq = plus_seq
        local_exons = [(e["start"] - t_start, e["end"] - t_start + 1) for e in exons]
    else:
        seq = revcomp(plus_seq)
        local_exons = [
            (length - (e["end"] - t_start) - 1, length - (e["start"] - t_start))
            for e in exons
        ]

    # Order exons 5' -> 3'.
    local_exons.sort(key=lambda x: x[0])

    # Derive introns from the gaps between consecutive exons; touching or
    # overlapping neighbours produce no intron.
    introns = [
        (prev_end, next_start)
        for (_, prev_end), (next_start, _) in zip(local_exons, local_exons[1:])
        if next_start > prev_end
    ]

    return {
        "symbol": symbol,
        "ensembl_gene": g["id"],
        "ensembl_transcript": ct["id"],
        "chrom": chrom,
        "strand": strand,
        "length": len(seq),
        "seq": seq,
        "exons": [{"start": s, "end": e} for s, e in local_exons],
        "introns": [{"start": s, "end": e} for s, e in introns],
    }
def main():
    """Fetch every gene in GENES and write the records to ``data/genes.json``.

    The output directory is ``data`` beside this script's own directory (i.e.
    under the script directory's parent). Progress is printed per gene. On the
    first failure the error is reported on stderr and the process exits with
    status 1 without writing any output.
    """
    out = []
    for entry in GENES:
        sym = entry["symbol"]
        print(f"fetching {sym}…", flush=True)
        try:
            data = fetch_gene(sym)
            data["blurb"] = entry["blurb"]
            print(f" {sym}: {data['length']}bp, {len(data['exons'])} exons", flush=True)
            out.append(data)
            time.sleep(0.3)  # be polite to Ensembl
        except Exception as e:
            # Top-level boundary: report which gene failed and abort the run so a
            # partial gene set is never written.
            print(f" {sym}: FAILED — {e}", file=sys.stderr)
            sys.exit(1)

    here = os.path.dirname(os.path.abspath(__file__))
    out_dir = os.path.join(os.path.dirname(here), "data")
    # Create the target directory if it is missing, so a fresh checkout does
    # not fail with FileNotFoundError after all the network work is done.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "genes.json")
    # json.dump escapes non-ASCII by default; the explicit encoding just keeps
    # the write independent of the platform's locale.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    total_kb = os.path.getsize(out_path) / 1024
    print(f"\nwrote {out_path} ({total_kb:.1f} KB)")
# Run the fetch only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|