Spaces:

HuggingFaceBio
/

carbon-demo

Running

App Files Files Community

carbon-demo / scripts /build_real_umap.py

tfrere HF Staff

Add §7 species tree, slim down §6 UMAP, mount /experiments

1f90847 21 days ago

raw

history blame contribute delete

14.3 kB

	"""Convert Dana's viz.csv (Carbon 3B embedding UMAP) into our packed binary.

	Input:
	/tmp/carbon-umap/viz.csv (or pass --csv PATH)

	Columns expected:
	row_idx, gene_id, name, biotype, chrom, start, end, strand, assembly,
	species, length, gc_content, seq_len_used,
	umap2d_x, umap2d_y,
	umap3d_x, umap3d_y, umap3d_z

	Output (this format matches what demo.html's initDemoUmap() now parses):

	data/umap.bin (~5 MB raw → ~3.1 MB gzipped)
	Header (64 bytes, little-endian):
	uint32 magic 0xCAB0FA1D
	uint32 n_points
	uint32 n_species
	uint32 n_biotypes
	uint32 n_strands
	uint32 flags (bit 0 = has 3D, bit 1 = has gc, bit 2 = has length)
	float32 bounds_2d[4] (x_min, x_max, y_min, y_max)
	float32 bounds_3d[6] (x_min, x_max, y_min, y_max, z_min, z_max)
	zeros when bit 0 of flags is unset.
	Payload (column-major so each column gzips independently well):
	int16 pos_2d[n_points * 2] interleaved x,y
	int16 pos_3d[n_points * 3] interleaved x,y,z (only when
	bit 0 of flags is set — currently
	disabled: see "Why no pos_3d?"
	below)
	uint8 species[n_points]
	uint8 biotype[n_points]
	uint8 strand[n_points]
	uint8 gc_content[n_points] quantized to 0..255 over [0, 1]
	uint8 length[n_points] quantized to 0..255 over
	[log10_length_min, log10_length_max].
	uint8 + log gives ~2.5% relative
	precision over a 5+ decade dynamic
	range (~6 bp → ~3 Mb in this
	dataset), which is well below
	visual noise on a continuous
	gradient overlay.

	Why no pos_3d? v1 of the viewer ships a 2D scatter only — the frontend
	explicitly skips the pos_3d block on parse (see initDemoUmap, the
	`if (has3D)` branch that just advances the read cursor without
	storing the data). Shipping the int16 × 3 coords meant ~3.3 MB of
	wasted bandwidth per visit. We keep the computation alive in this
	script (negligible CPU, ~half a second on 570 K rows) but skip writing
	it to the binary; to re-enable, set `WRITE_POS_3D = True` in main().

	data/umap_labels.json
	Label arrays + kingdom mapping + bounds, consumed by the frontend
	to build the legend, tooltip, and color palettes. Now also carries
	`length_log10_range` (so the frontend can de-quantize a uint8 byte
	back to bp for the tooltip) and `length_bp_range` (for the legend
	tick marks).

	data/umap_names.txt
	One gene name per line, in the SAME species-sorted order as the
	binary's per-point columns. Plain text (not JSON) because the
	format is trivial, gzips identically well, and is ~10x faster to
	parse than JSON.parse over 500 K strings. Tooltip-only data, so
	the frontend fetches it lazily after the WebGL render is up.

	Points are sorted by species before writing so the species column becomes
	runs of identical bytes — gzip RLEs it down to ~10% of raw size.

	Usage:
	python scripts/build_real_umap.py [--csv /path/to/viz.csv]
	"""
	import argparse
	import array
	import csv
	import json
	import math
	import os
	import struct
	import sys
	import time

	# Absolute path of this script; HERE is the directory that contains it.
	_SCRIPT_PATH = os.path.abspath(__file__)
	HERE = os.path.dirname(_SCRIPT_PATH)
	# Outputs are written to the `data` directory that sits next to HERE
	# (i.e. a sibling of the directory holding this script).
	_PARENT_DIR = os.path.dirname(HERE)
	DATA = os.path.join(_PARENT_DIR, "data")
	# Input CSV used when --csv is not given on the command line.
	DEFAULT_CSV = "/tmp/carbon-umap/viz.csv"

	# Species in the bucket, grouped by kingdom (27 in total). Both the order
	# of the kingdoms and the order of the species inside each kingdom matter:
	# the frontend assigns adjacent hues to adjacent species, so this ordering
	# decides which palette band each species lands in.
	KINGDOMS = {
	    "vertebrates": [
	        "human", "macaque", "mouse", "rat", "dog",
	        "cow", "pig", "chicken", "frog", "zebrafish",
	    ],
	    "invertebrates": [
	        "fly", "worm",
	    ],
	    "plants": [
	        "arabidopsis", "soybean", "tomato", "maize", "rice",
	    ],
	    "fungi": [
	        "yeast", "fission_yeast", "candida", "aspergillus", "neurospora",
	    ],
	    "bacteria": [
	        "ecoli", "bsubtilis", "saureus",
	    ],
	    "viruses": [
	        "sarscov2", "hiv1",
	    ],
	}


	def build_species_order():
	    """Flatten KINGDOMS into a species list plus a species -> kingdom map.

	    Returns:
	        A ``(order, kingdom_map)`` pair. ``order`` lists every species in
	        KINGDOMS iteration order (kingdom by kingdom, members in their
	        listed order). ``kingdom_map`` maps each species name to the
	        kingdom key it appears under.
	    """
	    order = [
	        species
	        for members in KINGDOMS.values()
	        for species in members
	    ]
	    kingdom_map = {
	        species: kingdom
	        for kingdom, members in KINGDOMS.items()
	        for species in members
	    }
	    return order, kingdom_map


	def main():
	    """Convert viz.csv into umap.bin, umap_labels.json and umap_names.txt.

	    Reads the CSV given by ``--csv`` (default ``DEFAULT_CSV``), groups the
	    rows by species in ``build_species_order()`` order, quantizes the UMAP
	    coordinates (int16), gc_content (uint8) and log10(length) (uint8), and
	    writes the three output files into ``DATA``. Progress messages and
	    warnings are printed to stderr.

	    Rows whose species is not returned by ``build_species_order()`` are
	    skipped and reported in a single warning. A biotype or strand outside
	    the hard-coded lists below still raises ``KeyError``.

	    Raises:
	        SystemExit: if the CSV file does not exist, or if it yields no
	            usable rows (empty file / no known species).
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--csv", default=DEFAULT_CSV, help="path to viz.csv")
	    args = ap.parse_args()

	    if not os.path.exists(args.csv):
	        sys.exit(f"viz.csv not found at {args.csv}\n"
	                 "Download from the HF bucket:\n"
	                 " huggingface.co/buckets/HuggingFaceBio/carbon_embeddings/"
	                 "tree/carbon_dana_embeddings_middle/viz.csv")

	    species_order, kingdom_map = build_species_order()
	    species_idx = {sp: i for i, sp in enumerate(species_order)}
	    biotypes = ["protein_coding", "lncRNA", "snRNA", "misc_RNA"]
	    biotype_idx = {b: i for i, b in enumerate(biotypes)}
	    strands = ["+", "-"]
	    strand_idx = {s: i for i, s in enumerate(strands)}

	    t0 = time.time()
	    print(f"reading {args.csv}", file=sys.stderr)

	    # Single pass: bucket rows by species so we end up species-sorted in the
	    # output (gzip-friendly) without an explicit O(n log n) sort.
	    buckets = [[] for _ in species_order]
	    unknown_species = set()
	    # newline="" is what the csv module asks for (keeps newlines embedded in
	    # quoted fields intact); the explicit encoding makes the build
	    # independent of the machine's locale.
	    with open(args.csv, newline="", encoding="utf-8") as f:
	        for row in csv.DictReader(f):
	            sp = row["species"]
	            if sp not in species_idx:
	                unknown_species.add(sp)
	                continue
	            # `length` is the genomic span in bp. We clamp at >=1 so log10()
	            # is always well-defined; the rare 0-length entries (corrupt
	            # rows in the source GFFs) get binned into the floor of the
	            # colour scale instead of crashing the build.
	            try:
	                length_bp = max(1, int(row["length"]))
	            except (KeyError, ValueError):
	                length_bp = 1
	            # Prefer the human-readable `name` (HBB, BRCA1, AT1G30814) but
	            # fall back to gene_id when the source had no symbol — keeps
	            # the tooltip from ever rendering "(empty)" rows.
	            name = row.get("name") or row.get("gene_id") or "—"
	            buckets[species_idx[sp]].append((
	                float(row["umap2d_x"]), float(row["umap2d_y"]),
	                float(row["umap3d_x"]), float(row["umap3d_y"]), float(row["umap3d_z"]),
	                biotype_idx[row["biotype"]],
	                strand_idx[row["strand"]],
	                float(row["gc_content"]),
	                length_bp,
	                name,
	            ))

	    if unknown_species:
	        print(f"warning: {len(unknown_species)} unknown species skipped: "
	              f"{sorted(unknown_species)}", file=sys.stderr)

	    flat = [(sp_i, p) for sp_i, bucket in enumerate(buckets) for p in bucket]
	    n = len(flat)
	    print(f" {n:,} points kept in {time.time()-t0:.1f}s", file=sys.stderr)
	    if n == 0:
	        # Without this guard the min()/max() calls below fail with an
	        # opaque "min() arg is an empty sequence".
	        sys.exit(f"no usable rows found in {args.csv} "
	                 "(empty file or no known species)")

	    # --- Compute bounds for quantization -----------------------------------
	    def col_range(col):
	        """Return (min, max) of tuple field `col` over every kept point."""
	        return (min(p[col] for _, p in flat), max(p[col] for _, p in flat))

	    def int16_scale(lo, hi):
	        """Scale mapping [lo, hi] onto the 65534-step int16 span.

	        Returns 0.0 for a degenerate axis (hi == lo) so every point on it
	        quantizes to the same value instead of dividing by zero.
	        """
	        return 65534.0 / (hi - lo) if hi > lo else 0.0

	    x2_min, x2_max = col_range(0)
	    y2_min, y2_max = col_range(1)
	    x3_min, x3_max = col_range(2)
	    y3_min, y3_max = col_range(3)
	    z3_min, z3_max = col_range(4)

	    # Length is log-distributed (5+ decades from miRNAs to titin). We
	    # quantize log10(length) into uint8 — gives a perceptually uniform
	    # color ramp and keeps the byte storage compact. We also stash the
	    # log range in umap_labels.json so the tooltip can de-quantize back
	    # to bp for human-readable display. log10 is monotonic, so the log
	    # range is simply the log of the bp range.
	    len_bp_min, len_bp_max = col_range(8)
	    len_log_min = math.log10(len_bp_min)
	    len_log_max = math.log10(len_bp_max)

	    rx2 = int16_scale(x2_min, x2_max)
	    ry2 = int16_scale(y2_min, y2_max)
	    rx3 = int16_scale(x3_min, x3_max)
	    ry3 = int16_scale(y3_min, y3_max)
	    rz3 = int16_scale(z3_min, z3_max)
	    # Avoid div-by-zero on the (highly degenerate) case of a single-length
	    # input. With the 1.0 fallback every length byte comes out as 0, which
	    # is the visually-correct behaviour.
	    rlen = 255.0 / (len_log_max - len_log_min) if len_log_max > len_log_min else 1.0

	    t1 = time.time()
	    print(f"packing binary ({n:,} points)...", file=sys.stderr)

	    pos2 = array.array("h", [0] * (2 * n))
	    pos3 = array.array("h", [0] * (3 * n))
	    sp_col = bytearray(n)
	    bt_col = bytearray(n)
	    st_col = bytearray(n)
	    gc_col = bytearray(n)
	    len_col = bytearray(n)
	    names = [None] * n

	    for i, (sp_i, p) in enumerate(flat):
	        x2, y2, x3, y3, z3, bt, st, gc, length_bp, name = p
	        pos2[2 * i] = int(round((x2 - x2_min) * rx2 - 32767))
	        pos2[2 * i + 1] = int(round((y2 - y2_min) * ry2 - 32767))
	        pos3[3 * i] = int(round((x3 - x3_min) * rx3 - 32767))
	        pos3[3 * i + 1] = int(round((y3 - y3_min) * ry3 - 32767))
	        pos3[3 * i + 2] = int(round((z3 - z3_min) * rz3 - 32767))
	        sp_col[i] = sp_i
	        bt_col[i] = bt
	        st_col[i] = st
	        # gc_content is in [0, 1], so the full uint8 range gives ~0.4%
	        # precision which is well below visual noise.
	        gc_col[i] = min(255, max(0, int(round(gc * 255))))
	        len_col[i] = min(255, max(0, int(round((math.log10(length_bp) - len_log_min) * rlen))))
	        # Strip newlines from gene names — we serialise as one-name-per-line
	        # text below, so any embedded \n would shift the alignment of every
	        # subsequent point. Tabs and stray whitespace pass through fine.
	        names[i] = name.replace("\n", " ").replace("\r", " ")

	    # array("h") stores values in the host's native byte order, but the
	    # file format is little-endian throughout — swap on big-endian hosts.
	    if sys.byteorder == "big":
	        pos2.byteswap()
	        pos3.byteswap()

	    print(f" {time.time()-t1:.1f}s", file=sys.stderr)

	    # --- Write binary -------------------------------------------------------
	    # Toggle for the optional pos_3d payload. When False (current default),
	    # bit 0 of `flags` is cleared, the int16×3 coords are NOT appended to
	    # the binary, and the bounds_3d slots in the header are zeroed out so
	    # the frontend never reads stale ranges. Flip back to True if/when a
	    # 3D rotating viewer is added — the script always computes pos3/bounds_3d
	    # so this is a single-edit, one-rebuild change.
	    WRITE_POS_3D = False

	    buf = bytearray()
	    # bit 0: has 3D positions, bit 1: has gc_content, bit 2: has length
	    flags = 0b111 if WRITE_POS_3D else 0b110
	    buf += struct.pack("<6I", 0xCAB0FA1D, n, len(species_order),
	                       len(biotypes), len(strands), flags)
	    buf += struct.pack("<4f", x2_min, x2_max, y2_min, y2_max)
	    if WRITE_POS_3D:
	        buf += struct.pack("<6f", x3_min, x3_max, y3_min, y3_max, z3_min, z3_max)
	    else:
	        # Keep the header at exactly 64 bytes: emit six zero floats so the
	        # offset of the pos_2d payload (and the frontend's fixed-offset
	        # reads) stays unchanged.
	        buf += struct.pack("<6f", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
	    assert len(buf) == 64, f"header size {len(buf)} != 64"
	    buf += pos2.tobytes()
	    if WRITE_POS_3D:
	        buf += pos3.tobytes()
	    buf += bytes(sp_col)
	    buf += bytes(bt_col)
	    buf += bytes(st_col)
	    buf += bytes(gc_col)
	    buf += bytes(len_col)

	    # Make sure the output directory exists before the first write.
	    os.makedirs(DATA, exist_ok=True)

	    out_bin = os.path.join(DATA, "umap.bin")
	    with open(out_bin, "wb") as f:
	        f.write(buf)
	    print(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)", file=sys.stderr)

	    # --- Write labels -------------------------------------------------------
	    out_labels = os.path.join(DATA, "umap_labels.json")
	    with open(out_labels, "w", encoding="utf-8") as f:
	        json.dump({
	            "species": species_order,
	            "biotypes": biotypes,
	            "strands": strands,
	            "species_kingdom": kingdom_map,
	            "kingdoms": list(KINGDOMS.keys()),
	            "bounds_2d": [x2_min, x2_max, y2_min, y2_max],
	            "bounds_3d": [x3_min, x3_max, y3_min, y3_max, z3_min, z3_max],
	            # log-space range used for the uint8 quantization of `length`.
	            # The frontend de-quantizes a byte b back to bp by:
	            # bp = round(10 ** (log_min + b/255 * (log_max - log_min)))
	            # Stored as a list (not tuple) for stable JSON.
	            "length_log10_range": [len_log_min, len_log_max],
	            "length_bp_range": [int(len_bp_min), int(len_bp_max)],
	            "n_points": n,
	            "has_3d": WRITE_POS_3D,
	            "has_gc": True,
	            "has_length": True,
	            "has_names": True,
	            "source": "HuggingFaceBio/carbon_embeddings (carbon_dana_embeddings_middle)",
	        }, f, indent=2)
	    print(f" wrote {out_labels}", file=sys.stderr)

	    # --- Write per-point gene names ----------------------------------------
	    # Plain-text, one-per-line, in the same species-sorted order as the
	    # binary's column data. The frontend lazy-loads this AFTER the WebGL
	    # render is up, then re-aligns it to the in-memory shuffled order
	    # (see scatter.js shuffleParallel + permutation tracking).
	    out_names = os.path.join(DATA, "umap_names.txt")
	    with open(out_names, "w", encoding="utf-8") as f:
	        f.write("\n".join(names))
	    print(f" wrote {out_names}", file=sys.stderr)
	    print(f"total: {time.time()-t0:.1f}s", file=sys.stderr)


	# Script entry point: run the conversion only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()