Spaces:
Running
Running
"""Fetch cross-species orthologs for a curated set of genes.

For each (human gene, target species) pair, resolve the canonical orthologous
transcript in the target species, fetch its genomic sequence on its coding
strand, and save to data/species.json.

Usage:
    python scripts/fetch_species.py
"""
| import json | |
| import os | |
| import sys | |
| import time | |
| import urllib.request | |
# Base URL of the Ensembl REST service; request paths are appended to it.
ENSEMBL = "https://rest.ensembl.org"

# Curated set: pick genes with deeply conserved orthologs across vertebrates.
# (HBB has noisy orthology — chicken/mouse don't return the actual β-globin
# ortholog via Ensembl's homology API.)
GENES = ["INS", "TP53"]

# One record per species, in output order: Ensembl species id, common name,
# and the colour stored alongside that species in the output JSON.
# Zebrafish dropped: ~450 My from human, the model usually can't pick up
# the lineage from ~400 bp of context and the row looks like noise next
# to mammals + bird.
SPECIES = [
    {"id": species_id, "common": common_name, "color": hex_color}
    for species_id, common_name, hex_color in (
        ("homo_sapiens", "human", "#1a1a1a"),
        ("mus_musculus", "mouse", "#2c5aa0"),
        ("gallus_gallus", "chicken", "#c08030"),
    )
]

# Cap on returned seq length per species (only ~200 will be fed as prompt).
PREFIX_LEN = 1200
def get_json(path):
    """Fetch ``ENSEMBL + path`` and return the decoded JSON body.

    Args:
        path: Request path (including any query string) appended to the
            ``ENSEMBL`` base URL.

    Returns:
        The parsed JSON payload.

    Raises:
        urllib.error.URLError: On network failure or an HTTP error status
            (``HTTPError`` is a subclass).
        ValueError: If the response body is not valid JSON.
    """
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the underlying connection is
    # closed deterministically rather than whenever the object is collected.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
def revcomp(s):
    """Return the reverse complement of the DNA string *s*.

    A/C/G/T are swapped for their complements with case preserved, N/n map
    to themselves, and any other character is passed through unchanged.
    """
    complement = str.maketrans("ACGTNacgtn", "TGCANtgcan")
    # The mapping is one-to-one per character, so reversing first and then
    # complementing gives the same result as the other order.
    return s[::-1].translate(complement)
def _exon_offsets(exons_genomic, t_start, t_end, strand, seq_len):
    """Map genomic exon spans to sorted 0-based [start, end) offsets in the returned sequence."""
    offsets = []
    for e_start, e_end in exons_genomic:
        if strand == 1:
            # + strand: seq[i] is plus-strand position t_start + i.
            s = e_start - t_start
            e = e_end - t_start + 1
        else:
            # - strand: seq is the reverse complement of the fetched window,
            # so seq[i] is plus-strand position t_end - i.
            s = t_end - e_end
            e = t_end - e_start + 1
        # Clip to the (possibly trimmed) window; drop exons entirely outside it.
        s = max(0, s)
        e = min(seq_len, e)
        if e > s:
            offsets.append({"start": s, "end": e})
    offsets.sort(key=lambda x: x["start"])
    return offsets


def fetch_for(symbol, species_id):
    """Fetch the leading part of a gene's (or its ortholog's) transcript sequence.

    For ``homo_sapiens`` the gene is looked up directly by symbol; for any
    other species the first ortholog reported by the homology endpoint is
    used. The canonical transcript is preferred, falling back to the first
    listed transcript when none is flagged canonical. At most ``PREFIX_LEN``
    bases are fetched, starting from the transcript's start on its own
    strand.

    Args:
        symbol: Human gene symbol, e.g. ``"INS"``.
        species_id: Ensembl species name, e.g. ``"mus_musculus"``.

    Returns:
        A dict with keys ``ortholog_symbol``, ``ensembl_gene``,
        ``ensembl_transcript``, ``chrom``, ``strand``, ``length``, ``seq``
        (upper-cased, oriented along the gene's strand) and ``exons`` (a list
        of ``{"start", "end"}`` 0-based half-open offsets into ``seq``).

    Raises:
        RuntimeError: If no ortholog is reported or the gene has no
            transcripts.
        Exception: Whatever ``get_json`` raises for network/HTTP/JSON errors
            propagates unchanged.
    """
    if species_id == "homo_sapiens":
        # Same path as fetch_genes, abbreviated
        g = get_json(f"/lookup/symbol/{species_id}/{symbol}?expand=1")
    else:
        # Find ortholog via homology
        h = get_json(f"/homology/symbol/human/{symbol}?target_species={species_id}&type=orthologues")
        # "data" can be absent *or* an empty list; treat both as "no ortholog"
        # instead of failing with an IndexError.
        data = h.get("data") or [{}]
        homologies = data[0].get("homologies", [])
        if not homologies:
            raise RuntimeError(f"{symbol} → {species_id}: no ortholog")
        target_gene_id = homologies[0]["target"]["id"]
        g = get_json(f"/lookup/id/{target_gene_id}?expand=1")

    # Shared by both branches so the human lookup gets the same validation
    # and fallback as the ortholog lookup (no bare StopIteration/KeyError).
    transcripts = g.get("Transcript")
    if not transcripts:
        raise RuntimeError(f"{symbol} → {species_id}: no transcripts")
    ct = next((t for t in transcripts if t.get("is_canonical")), transcripts[0])

    chrom = g["seq_region_name"]
    strand = g["strand"]
    exons_genomic = [(e["start"], e["end"]) for e in ct.get("Exon", [])]
    t_start, t_end = ct["start"], ct["end"]

    # Cap fetched length so the JSON stays small. Keep the end of the window
    # where the transcript begins: low coordinates on +, high coordinates on -.
    if t_end - t_start + 1 > PREFIX_LEN:
        if strand == 1:
            t_end = t_start + PREFIX_LEN - 1
        else:
            t_start = t_end - PREFIX_LEN + 1

    # Always request the plus strand and orient locally.
    seq_data = get_json(f"/sequence/region/{species_id}/{chrom}:{t_start}..{t_end}:1?content-type=application/json")
    plus_seq = seq_data["seq"].upper()
    seq = plus_seq if strand == 1 else revcomp(plus_seq)

    return {
        "ortholog_symbol": g.get("display_name", symbol),
        "ensembl_gene": g["id"],
        "ensembl_transcript": ct["id"],
        "chrom": chrom,
        "strand": strand,
        "length": len(seq),
        "seq": seq,
        "exons": _exon_offsets(exons_genomic, t_start, t_end, strand, len(seq)),
    }
def main():
    """Fetch every (gene, species) pair and write the results to data/species.json.

    Iterates ``GENES`` x ``SPECIES``, calling ``fetch_for`` for each pair. A
    failing pair is reported on stderr and skipped so that one bad lookup
    does not abort the whole run. The collected entries are written as
    indented JSON to ``data/species.json`` in the parent of this script's
    directory.
    """
    out = []
    for sym in GENES:
        entry = {"symbol": sym, "species": []}
        for sp in SPECIES:
            print(f" {sym} → {sp['id']}…", flush=True)
            try:
                d = fetch_for(sym, sp["id"])
                d["species_id"] = sp["id"]
                d["common"] = sp["common"]
                d["color"] = sp["color"]
                entry["species"].append(d)
                print(f" ✓ {d['ortholog_symbol']} · {d['length']}bp · chr{d['chrom']} strand {d['strand']}")
            except Exception as e:
                # Deliberate best-effort: report the failure and move on.
                print(f" ✗ — {e}", file=sys.stderr)
            # Pause after every attempt, failed ones included, so an error is
            # not immediately followed by another burst of requests.
            time.sleep(0.4)
        out.append(entry)

    here = os.path.dirname(os.path.abspath(__file__))
    out_path = os.path.join(os.path.dirname(here), "data", "species.json")
    # Create the output directory if needed so the fetched data is not lost
    # to a FileNotFoundError at the very end of the run.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    print(f"\nwrote {out_path} ({os.path.getsize(out_path)/1024:.1f} KB)")
# Run the fetch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()