Spaces:
Running
Running
File size: 5,079 Bytes
"""Fetch cross-species orthologs for a curated set of genes.

For each (human gene, target species) pair, resolve the canonical orthologous
transcript in the target species, fetch its genomic sequence on its coding
strand, and save to data/species.json.

Usage:
    python scripts/fetch_species.py
"""
import json
import os
import sys
import time
import urllib.request
# Base URL of the Ensembl REST service; request paths are appended to it.
ENSEMBL = "https://rest.ensembl.org"

# Curated set: pick genes with deeply conserved orthologs across vertebrates.
# (HBB has noisy orthology — chicken/mouse don't return the actual β-globin
# ortholog via Ensembl's homology API.)
GENES = ["INS", "TP53"]

# Species to fetch for every gene. Each entry carries the Ensembl species id,
# a common name, and a hex colour.
# Zebrafish dropped: ~450 My from human, the model usually can't pick up
# the lineage from ~400 bp of context and the row looks like noise next
# to mammals + bird.
SPECIES = [
    {"id": "homo_sapiens", "common": "human", "color": "#1a1a1a"},
    {"id": "mus_musculus", "common": "mouse", "color": "#2c5aa0"},
    {"id": "gallus_gallus", "common": "chicken", "color": "#c08030"},
]

# Cap on returned seq length per species (only ~200 will be fed as prompt).
PREFIX_LEN = 1200
def get_json(path):
    """Fetch ``ENSEMBL + path`` and return the decoded JSON payload.

    Args:
        path: Request path, including any query string. It is appended
            verbatim to the ``ENSEMBL`` base URL, so it should start with
            ``/``.

    Returns:
        The parsed JSON document (whatever ``json.loads`` yields for the
        response body).

    Raises:
        urllib.error.URLError: If the request fails; HTTP error statuses
            surface as its ``HTTPError`` subclass.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Ask for JSON explicitly via the Content-Type request header.
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the connection is closed
    # deterministically instead of whenever the object is garbage-collected.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
# Complement table for the four bases plus the IUPAC ambiguity codes, in both
# cases. Built once at import time instead of on every call.
_COMPLEMENT = str.maketrans(
    "ACGTRYKMSWBDHVNacgtrykmswbdhvn",
    "TGCAYRMKSWVHDBNtgcayrmkswvhdbn",
)


def revcomp(s):
    """Return the reverse complement of the DNA string *s*.

    Case is preserved. IUPAC ambiguity codes are complemented (R<->Y, K<->M,
    B<->V, D<->H; S, W and N map to themselves). Any other character is
    passed through unchanged, only its position is reversed.

    Args:
        s: Nucleotide sequence.

    Returns:
        A new string of the same length: the complement of *s*, reversed.
    """
    return s.translate(_COMPLEMENT)[::-1]
def _canonical_transcript(gene, label):
    """Return the canonical transcript of *gene*, else its first transcript."""
    transcripts = gene.get("Transcript")
    if not transcripts:
        raise RuntimeError(f"{label}: no transcripts")
    return next((t for t in transcripts if t.get("is_canonical")), transcripts[0])


def _exon_offsets(exons_genomic, t_start, t_end, strand, seq_len):
    """Map genomic exon intervals to sorted 0-based [start, end) offsets in seq."""
    offsets = []
    for e_start, e_end in exons_genomic:
        if strand == 1:
            # seq[i] is plus-strand position t_start + i.
            lo = e_start - t_start
            hi = e_end - t_start + 1
        else:
            # seq is the reverse complement, so seq[i] is plus position
            # t_end - i and the exon's end becomes its start offset.
            lo = t_end - e_end
            hi = t_end - e_start + 1
        # Clip to the fetched window; exons entirely outside it collapse to
        # an empty interval and are dropped.
        lo = max(0, lo)
        hi = min(seq_len, hi)
        if hi > lo:
            offsets.append({"start": lo, "end": hi})
    offsets.sort(key=lambda x: x["start"])
    return offsets


def fetch_for(symbol, species_id):
    """Fetch the start of the canonical transcript of *symbol* in *species_id*.

    For ``homo_sapiens`` the gene is looked up directly by symbol. For any
    other species the first ortholog reported by the homology endpoint is
    looked up by id. The transcript flagged canonical is used, falling back
    to the first transcript when none is flagged. At most ``PREFIX_LEN`` bases
    are fetched, counted from the transcript's 5' end on the gene's strand.

    Args:
        symbol: Human gene symbol, e.g. ``"INS"``.
        species_id: Ensembl species name, e.g. ``"mus_musculus"``.

    Returns:
        A dict with keys ``ortholog_symbol``, ``ensembl_gene``,
        ``ensembl_transcript``, ``chrom``, ``strand``, ``length``, ``seq``
        (upper-case, oriented along the gene's strand) and ``exons`` (a list
        of ``{"start", "end"}`` 0-based half-open offsets into ``seq``,
        sorted by start).

    Raises:
        RuntimeError: If no ortholog is reported or the gene has no
            transcripts.
        Exception: Whatever ``get_json`` raises for a failed request, and
            ``KeyError`` if a response lacks an expected field.
    """
    label = f"{symbol} → {species_id}"
    if species_id == "homo_sapiens":
        g = get_json(f"/lookup/symbol/{species_id}/{symbol}?expand=1")
    else:
        # Find ortholog via homology
        h = get_json(f"/homology/symbol/human/{symbol}?target_species={species_id}&type=orthologues")
        # `data` may be missing or an empty list; treat both as "no ortholog".
        data = h.get("data") or [{}]
        homologies = data[0].get("homologies", [])
        if not homologies:
            raise RuntimeError(f"{label}: no ortholog")
        target_gene_id = homologies[0]["target"]["id"]
        g = get_json(f"/lookup/id/{target_gene_id}?expand=1")
    # Same selection rule for both branches, so a gene without a canonical
    # flag falls back to its first transcript instead of raising.
    ct = _canonical_transcript(g, label)

    chrom = g["seq_region_name"]
    strand = g["strand"]
    exons_genomic = [(e["start"], e["end"]) for e in ct.get("Exon", [])]
    t_start, t_end = ct["start"], ct["end"]
    # Cap fetched length so the JSON stays small. Keep the transcript's 5'
    # end: the low coordinates on the + strand, the high ones on the - strand.
    if t_end - t_start + 1 > PREFIX_LEN:
        if strand == 1:
            t_end = t_start + PREFIX_LEN - 1
        else:
            t_start = t_end - PREFIX_LEN + 1

    # Always request the plus strand (":1") and reverse-complement locally
    # for minus-strand genes.
    seq_data = get_json(f"/sequence/region/{species_id}/{chrom}:{t_start}..{t_end}:1?content-type=application/json")
    plus_seq = seq_data["seq"].upper()
    seq = plus_seq if strand == 1 else revcomp(plus_seq)

    return {
        "ortholog_symbol": g.get("display_name", symbol),
        "ensembl_gene": g["id"],
        "ensembl_transcript": ct["id"],
        "chrom": chrom,
        "strand": strand,
        "length": len(seq),
        "seq": seq,
        "exons": _exon_offsets(exons_genomic, t_start, t_end, strand, len(seq)),
    }
def main():
    """Fetch every (gene, species) pair and write the results as JSON.

    Iterates ``GENES`` x ``SPECIES``, calling ``fetch_for`` for each pair.
    A failed pair is reported on stderr and skipped, so the output may hold
    fewer species for a gene than were requested. The result is written to
    ``data/species.json`` under the parent of this script's directory; the
    ``data`` directory is created if it does not exist.
    """
    out = []
    for sym in GENES:
        entry = {"symbol": sym, "species": []}
        for sp in SPECIES:
            print(f" {sym} → {sp['id']}…", flush=True)
            try:
                d = fetch_for(sym, sp["id"])
            except Exception as e:
                # Best-effort: one failed lookup must not abort the run.
                print(f" ✗ — {e}", file=sys.stderr)
                continue
            d["species_id"] = sp["id"]
            d["common"] = sp["common"]
            d["color"] = sp["color"]
            entry["species"].append(d)
            print(f" ✓ {d['ortholog_symbol']} · {d['length']}bp · chr{d['chrom']} strand {d['strand']}")
            # Short pause after each successful fetch (presumably to stay
            # polite to the public API).
            time.sleep(0.4)
        out.append(entry)

    here = os.path.dirname(os.path.abspath(__file__))
    out_path = os.path.join(os.path.dirname(here), "data", "species.json")
    # Create the output directory up front so a fresh checkout does not fail
    # at the very end, after all the network work has been done.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    print(f"\nwrote {out_path} ({os.path.getsize(out_path)/1024:.1f} KB)")
# Run the fetch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|