Spaces:

HuggingFaceBio
/

carbon-demo

Running

App Files Files Community

carbon-demo / scripts /fetch_genes.py

lvwerra HF Staff

Add tabbed Demo / Model / Sandbox page (default = Sandbox)

971b586 28 days ago

raw

history blame

3.91 kB

	"""Fetch curated gene reference sequences + exon/intron annotations from Ensembl REST.

	Run once to (re)generate data/genes.json. Sequences are returned in the gene's
	own transcribed (5'->3') orientation so they can be fed to Carbon directly.

	Usage:
	python scripts/fetch_genes.py
	"""
	import json
	import os
	import sys
	import time
	import urllib.request

	# Root of the Ensembl REST service; request paths are appended to this.
	ENSEMBL = "https://rest.ensembl.org"

	# Hand-picked genes: well known, varied, and short enough to display whole.
	# BRCA1 is deliberately left out — its genomic span is ~80kb, so it would
	# need a slice rather than the whole gene.
	# "symbol" is the name looked up on Ensembl; "blurb" is a display caption
	# that main() copies into the gene's output record.
	GENES = [
	    {"symbol": "HBB", "blurb": "β-globin · sickle-cell gene · 3 exons"},
	    {"symbol": "INS", "blurb": "insulin · the smallest classic gene · 3 exons"},
	    {"symbol": "TP53", "blurb": "tumor suppressor · 'guardian of the genome' · 11 exons"},
	    {"symbol": "MYC", "blurb": "proto-oncogene · regulator of cell growth · 3 exons"},
	]


	def get_json(path):
	    """GET ``ENSEMBL + path`` and return the decoded JSON body.

	    Args:
	        path: Request path, including any query string, appended verbatim
	            to the ``ENSEMBL`` base URL.

	    Returns:
	        The response body parsed by ``json.loads``.

	    Raises:
	        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise, e.g.
	        ``urllib.error.HTTPError`` for an HTTP error status or
	        ``json.JSONDecodeError`` for a malformed body. Nothing is caught here.
	    """
	    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
	    # Use the response as a context manager so the connection is closed
	    # deterministically rather than whenever the object is garbage-collected.
	    with urllib.request.urlopen(req, timeout=30) as resp:
	        return json.loads(resp.read())


	# Complement table for the IUPAC nucleotide alphabet, upper and lower case.
	# Built once at import time instead of on every revcomp() call.
	# Pairs: A<->T, C<->G, R<->Y, K<->M, B<->V, D<->H; S, W and N map to themselves.
	_COMPLEMENT = str.maketrans(
	    "ACGTRYSWKMBDHVNacgtryswkmbdhvn",
	    "TGCAYRSWMKVHDBNtgcayrswmkvhdbn",
	)


	def revcomp(s):
	    """Return the reverse complement of DNA sequence *s*.

	    Each base is replaced by its complement (case preserved) and the result
	    is reversed. IUPAC ambiguity codes are complemented as well; any
	    character outside the IUPAC alphabet is left unchanged.

	    Args:
	        s: Nucleotide string; may be empty.

	    Returns:
	        The reverse-complemented string, the same length as *s*.
	    """
	    return s.translate(_COMPLEMENT)[::-1]


	def fetch_gene(symbol):
	    """Fetch a gene's canonical-transcript sequence and exon/intron layout.

	    Looks the symbol up through Ensembl's ``/lookup/symbol`` endpoint,
	    downloads the plus-strand genomic sequence spanning the canonical
	    transcript, and re-expresses sequence and exon coordinates in the
	    transcript's own 5'->3' orientation.

	    Args:
	        symbol: Gene symbol to look up (e.g. ``"HBB"``). It is interpolated
	            into the request path as-is.

	    Returns:
	        A dict with keys ``symbol``, ``ensembl_gene``, ``ensembl_transcript``,
	        ``chrom``, ``strand``, ``length``, ``seq``, ``exons`` and ``introns``.
	        ``exons`` and ``introns`` are lists of ``{"start", "end"}`` dicts
	        holding 0-based half-open offsets into ``seq``, ordered 5'->3'.

	    Raises:
	        LookupError: If no transcript of the gene is flagged canonical.
	        ValueError: If the downloaded sequence length does not match the
	            requested transcript span.
	    """
	    g = get_json(f"/lookup/symbol/homo_sapiens/{symbol}?expand=1")
	    chrom = g["seq_region_name"]
	    strand = g["strand"]
	    # Canonical transcript — use its boundaries (TSS to PAS), not the gene's
	    # boundaries, which can include regulatory regions before/after the transcript.
	    # A default is passed to next() so a missing canonical transcript becomes
	    # a descriptive error instead of a bare StopIteration.
	    ct = next((t for t in g["Transcript"] if t.get("is_canonical")), None)
	    if ct is None:
	        raise LookupError(f"no canonical transcript found for {symbol}")
	    t_start = ct["start"]
	    t_end = ct["end"]
	    exons = sorted(ct["Exon"], key=lambda e: e["start"])

	    # Fetch genomic sequence on the + strand for the transcript span only.
	    seq_data = get_json(f"/sequence/region/human/{chrom}:{t_start}..{t_end}:1?content-type=application/json")
	    plus_seq = seq_data["seq"].upper()
	    # Explicit check rather than assert, so it still runs under `python -O`.
	    expected_len = t_end - t_start + 1
	    if len(plus_seq) != expected_len:
	        raise ValueError(
	            f"{symbol}: expected {expected_len} bp for {chrom}:{t_start}..{t_end}, "
	            f"got {len(plus_seq)}"
	        )

	    # Map exons to sequence-local, half-open coordinates. Any strand value
	    # other than 1 is treated as the minus strand: the sequence is
	    # reverse-complemented and exon coordinates are mirrored to match.
	    if strand == 1:
	        seq = plus_seq
	        local_exons = [(e["start"] - t_start, e["end"] - t_start + 1) for e in exons]
	    else:
	        seq = revcomp(plus_seq)
	        seq_len = len(plus_seq)
	        local_exons = [
	            (seq_len - (e["end"] - t_start) - 1, seq_len - (e["start"] - t_start))
	            for e in exons
	        ]
	    # Order exons 5' -> 3' (mirroring reverses the order on the minus strand).
	    local_exons.sort(key=lambda x: x[0])

	    # Introns are the gaps between consecutive exons; adjacent or overlapping
	    # exons contribute none.
	    introns = [
	        (prev_end, next_start)
	        for (_, prev_end), (next_start, _) in zip(local_exons, local_exons[1:])
	        if next_start > prev_end
	    ]

	    return {
	        "symbol": symbol,
	        "ensembl_gene": g["id"],
	        "ensembl_transcript": ct["id"],
	        "chrom": chrom,
	        "strand": strand,
	        "length": len(seq),
	        "seq": seq,
	        "exons": [{"start": s, "end": e} for s, e in local_exons],
	        "introns": [{"start": s, "end": e} for s, e in introns],
	    }


	def main():
	    """Fetch every gene in ``GENES`` and write the records to ``data/genes.json``.

	    Each record returned by ``fetch_gene`` gets the entry's ``blurb`` added.
	    The output path is ``data/genes.json`` under the parent of this script's
	    directory; the ``data`` directory is created if it does not exist.

	    On the first fetch failure the error is printed to stderr and the process
	    exits with status 1 before anything is written.
	    """
	    out = []
	    for entry in GENES:
	        sym = entry["symbol"]
	        print(f"fetching {sym}…", flush=True)
	        # Keep the try body to the call that is expected to fail (network or
	        # validation errors) so unrelated bugs are not reported as fetch failures.
	        try:
	            data = fetch_gene(sym)
	        except Exception as exc:
	            print(f" {sym}: FAILED — {exc}", file=sys.stderr)
	            sys.exit(1)
	        data["blurb"] = entry["blurb"]
	        print(f" {sym}: {data['length']}bp, {len(data['exons'])} exons", flush=True)
	        out.append(data)
	        time.sleep(0.3)  # be polite to Ensembl
	    here = os.path.dirname(os.path.abspath(__file__))
	    data_dir = os.path.join(os.path.dirname(here), "data")
	    # A missing data/ directory would otherwise make open() fail.
	    os.makedirs(data_dir, exist_ok=True)
	    out_path = os.path.join(data_dir, "genes.json")
	    # json.dump escapes non-ASCII by default; the explicit encoding just makes
	    # the file independent of the platform's locale.
	    with open(out_path, "w", encoding="utf-8") as f:
	        json.dump(out, f, indent=2)
	    total_kb = os.path.getsize(out_path) / 1024
	    print(f"\nwrote {out_path} ({total_kb:.1f} KB)")


	# Script entry point: run the fetch only when executed directly, not on import.
	if __name__ == "__main__":
	    main()