Spaces:

HuggingFaceBio
/

carbon-demo

Running

App Files Files Community

carbon-demo / scripts /gen_fake_umap.py

tfrere HF Staff

§6: WebGL UMAP scatter, 500K points on a fake fixture

f039f0b 23 days ago

raw

history blame contribute delete

7.51 kB

	"""Generate a fake UMAP scatter dataset for prototyping the §6 viewer.

	The real embedding-UMAP output (from Dana's Carbon 3B pipeline) will land
	in roughly the same shape. Until then, this gives us a 500K-point fixture
	to develop the WebGL frontend at target scale, so we don't discover
	performance cliffs later.

	Layout strategy:
	- 24 eukaryotic species grouped into 5 kingdoms (vertebrates, invertebrates,
	plants, fungi, protozoa). Kingdom centers are placed evenly around a
	radius-12 circle in UMAP space, with species centers jittered nearby.
	- Each point is drawn from a 2D Gaussian around its species center.
	- Biotype, strand, and codon phase are roughly orthogonal axes — toggling
	color in the UI reveals different organizations of the same points.
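	- Points are output sorted by species, so the species column is long runs
	of a single value that gzip can collapse aggressively (typically 90%+
	compression on those bytes). The biotype, strand, and phase columns are
	drawn independently per point, so they do not form such runs.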

	Binary layout (little-endian, 40-byte header + ~4 MB payload for 500K pts):
	uint32 magic 0xCAB0FA1D
	uint32 n_points
	uint32 n_species
	uint32 n_biotypes
	uint32 n_strands
	uint32 n_phases
	float32 x_min, x_max, y_min, y_max (16 bytes — for de-quantization)
	int16 positions[n_points * 2] (interleaved x,y; each axis quantized to
	[-32767, 32767] over its bounds, so
	x = x_min + (q + 32767) / 65534 * (x_max - x_min))
	uint8 species[n_points]
	uint8 biotype[n_points]
	uint8 strand[n_points]
	uint8 phase[n_points]

	Usage:
	python scripts/gen_fake_umap.py
	"""
	import array
	import json
	import math
	import os
	import random
	import struct
	import sys
	import time

	# --- Dataset shape ---------------------------------------------------------

	# Total number of fake points to emit, and the RNG seed that makes the
	# fixture reproducible.
	N_POINTS = 500_000
	SEED = 42

	# Species grouped by kingdom. Dict order is the kingdom order, and the order
	# of names inside each list is the species order used everywhere else.
	KINGDOMS = {
	    "vertebrates": [
	        "human", "mouse", "rat", "chicken", "zebrafish", "xenopus",
	        "dog", "cow", "pig",
	    ],
	    "invertebrates": ["fly", "worm", "mosquito", "honeybee", "sea_urchin"],
	    "plants": ["arabidopsis", "rice", "maize", "wheat", "soybean"],
	    "fungi": ["yeast", "neurospora", "candida"],
	    "protozoa": ["plasmodium", "trypanosoma"],
	}

	# 24 eukaryotic species — matches the rough scale of the real run (24 species
	# from Leandro's Gemini-annotated set, 500/species in Dana's first batch).
	# Flattened from KINGDOMS so the two can never drift apart; a species'
	# position in this list is its uint8 code in the binary file.
	SPECIES = [name for members in KINGDOMS.values() for name in members]

	BIOTYPES = ["protein_coding", "lncRNA", "miRNA", "pseudogene"]
	# Roughly realistic frequencies for a mixed eukaryote set (same order as
	# BIOTYPES).
	BIOTYPE_WEIGHTS = [0.70, 0.15, 0.08, 0.07]

	STRANDS = ["+", "-"]
	PHASES = ["0", "1", "2"]

	# Output goes to the "data" directory that sits next to this script's
	# own directory.
	HERE = os.path.dirname(os.path.abspath(__file__))
	DATA = os.path.join(os.path.dirname(HERE), "data")

	# --- Generation ------------------------------------------------------------


	def species_centers():
	    """Assign every species a 2D center in UMAP space.

	    Kingdoms are spaced at equal angles around a circle of radius 12, in
	    ``KINGDOMS`` iteration order. Each member species is then offset from
	    its kingdom's center by an independent Gaussian (sigma 1.8) per axis.

	    Draws from the module-level ``random`` state (x offset first, then y),
	    so seed it beforehand for reproducible output.

	    Returns:
	        dict mapping species name -> ``(x, y)`` tuple of floats.
	    """
	    ring_radius = 12.0
	    jitter_sigma = 1.8
	    n_kingdoms = len(KINGDOMS)

	    placed = {}
	    for slot, members in enumerate(KINGDOMS.values()):
	        # Angle of this kingdom's slot on the ring.
	        angle = 2 * math.pi * slot / n_kingdoms
	        ring_x = math.cos(angle) * ring_radius
	        ring_y = math.sin(angle) * ring_radius
	        for name in members:
	            jitter_x = random.gauss(0, jitter_sigma)
	            jitter_y = random.gauss(0, jitter_sigma)
	            placed[name] = (ring_x + jitter_x, ring_y + jitter_y)
	    return placed


	def weighted_choice_idx(weights):
	    """Pick an index at random, with probability proportional to its weight.

	    Intended as a lightweight stand-in for ``random.choices()`` inside tight
	    loops. One uniform draw in [0, 1) is compared against the running sum of
	    ``weights``; the first index whose cumulative weight exceeds the draw
	    wins. The weights are used as-is (not normalized), so they are expected
	    to sum to about 1.

	    Args:
	        weights: Sized iterable of non-negative numbers.

	    Returns:
	        The chosen index, or ``len(weights) - 1`` when the draw is not below
	        the total (e.g. float round-off leaves the sum slightly under 1).
	    """
	    draw = random.random()
	    cumulative = 0.0
	    for idx, weight in enumerate(weights):
	        cumulative += weight
	        if draw < cumulative:
	            break
	    else:
	        # Never crossed the threshold: fall back to the last slot.
	        idx = len(weights) - 1
	    return idx


	def generate():
	    """Build the whole fake dataset as six parallel columns.

	    Seeds the module-level RNG with ``SEED``, picks species centers, then
	    emits points species by species (so the output is sorted by species).
	    ``N_POINTS`` is split evenly across ``SPECIES``; any remainder goes one
	    point each to the first species in the list.

	    Per point, in this RNG order: x and y are Gaussian (sigma 1.2) around
	    the species center, biotype follows ``BIOTYPE_WEIGHTS``, and strand and
	    phase are uniform over 2 and 3 values respectively.

	    Returns:
	        ``(xs, ys, sp_col, bt_col, st_col, ph_col)`` — two float32 arrays
	        followed by four bytearrays, all of length ``N_POINTS``.
	    """
	    random.seed(SEED)
	    centers = species_centers()

	    xs, ys = array.array("f"), array.array("f")
	    sp_col, bt_col = bytearray(), bytearray()
	    st_col, ph_col = bytearray(), bytearray()

	    per_species, leftover = divmod(N_POINTS, len(SPECIES))

	    for code, name in enumerate(SPECIES):
	        count = per_species + 1 if code < leftover else per_species
	        center_x, center_y = centers[name]

	        # The species code is constant for the whole run, so fill it in bulk.
	        sp_col += bytes([code]) * count

	        # Draw order matters for reproducibility: x, y, biotype, strand, phase.
	        for _ in range(count):
	            xs.append(center_x + random.gauss(0, 1.2))
	            ys.append(center_y + random.gauss(0, 1.2))
	            bt_col.append(weighted_choice_idx(BIOTYPE_WEIGHTS))
	            st_col.append(random.randrange(2))
	            ph_col.append(random.randrange(3))

	    return xs, ys, sp_col, bt_col, st_col, ph_col


	def pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col, category_counts=None):
	    """Pack parallel per-point columns into the little-endian binary layout.

	    Layout: a 40-byte header (six uint32: magic, n_points, n_species,
	    n_biotypes, n_strands, n_phases; then four float32 bounds), followed by
	    interleaved int16 x,y positions and four uint8 category columns
	    (species, biotype, strand, phase), each ``n_points`` long.

	    Each axis is quantized linearly so its minimum maps to -32767 and its
	    maximum to 32767; a reader recovers the coordinate with
	    ``v = v_min + (q + 32767) / 65534 * (v_max - v_min)``. An axis whose
	    values are all equal is written as -32767 throughout, which that formula
	    still de-quantizes correctly.

	    Args:
	        xs, ys: Equal-length sequences of float coordinates.
	        sp_col, bt_col, st_col, ph_col: Per-point category codes (0-255),
	            one entry per point.
	        category_counts: Optional ``(n_species, n_biotypes, n_strands,
	            n_phases)`` to record in the header. Defaults to the lengths of
	            the module-level ``SPECIES``, ``BIOTYPES``, ``STRANDS`` and
	            ``PHASES`` lists.

	    Returns:
	        ``(buf, bounds)`` where ``buf`` is a bytearray with the complete file
	        contents and ``bounds`` is ``(x_min, x_max, y_min, y_max)``.

	    Raises:
	        ValueError: If there are no points, or the columns differ in length.
	    """
	    n = len(xs)
	    if n == 0:
	        raise ValueError("pack_binary() needs at least one point")
	    if any(len(col) != n for col in (ys, sp_col, bt_col, st_col, ph_col)):
	        # A length mismatch would silently shift every later column.
	        raise ValueError("all per-point columns must have the same length as xs")
	    if category_counts is None:
	        category_counts = (len(SPECIES), len(BIOTYPES),
	                           len(STRANDS), len(PHASES))

	    x_min, x_max = min(xs), max(xs)
	    y_min, y_max = min(ys), max(ys)

	    # Scale factors mapping [min, max] onto the 65534-step int16 range. A
	    # zero-width axis gets scale 0 instead of dividing by zero.
	    x_span = x_max - x_min
	    y_span = y_max - y_min
	    rx = 65534.0 / x_span if x_span else 0.0
	    ry = 65534.0 / y_span if y_span else 0.0

	    # Quantize straight into one interleaved int16 stream (WebGL friendly:
	    # stride 4 bytes per point).
	    pos = array.array("h")
	    for x, y in zip(xs, ys):
	        pos.append(round((x - x_min) * rx - 32767))
	        pos.append(round((y - y_min) * ry - 32767))

	    # array.tobytes() emits native byte order; the file format is
	    # little-endian, so swap on big-endian hosts.
	    if sys.byteorder != "little":
	        pos.byteswap()

	    # Header — 40 bytes (6 uint32 + 4 float32), no padding with "<".
	    buf = bytearray(struct.pack("<6I4f",
	                                0xCAB0FA1D, n, *category_counts,
	                                x_min, x_max, y_min, y_max))

	    buf += pos.tobytes()
	    buf += bytes(sp_col)
	    buf += bytes(bt_col)
	    buf += bytes(st_col)
	    buf += bytes(ph_col)

	    return buf, (x_min, x_max, y_min, y_max)


	def main():
	    """Generate the fixture and write it under ``DATA``.

	    Creates the ``DATA`` directory if needed, then writes two files:
	    ``umap.bin`` (the packed binary produced by ``pack_binary``) and
	    ``umap_labels.json`` (label lists, the species -> kingdom mapping, the
	    coordinate bounds, the point count, and a ``"fake": true`` marker).
	    Progress and timings are reported on stderr.
	    """
	    def log(message):
	        # All progress output goes to stderr.
	        print(message, file=sys.stderr)

	    os.makedirs(DATA, exist_ok=True)

	    started = time.time()
	    log(f"generating {N_POINTS:,} fake UMAP points ...")
	    xs, ys, sp_col, bt_col, st_col, ph_col = generate()
	    log(f" done in {time.time() - started:.1f}s")

	    pack_started = time.time()
	    log("packing binary ...")
	    buf, bounds = pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col)
	    log(f" done in {time.time() - pack_started:.1f}s")

	    out_bin = os.path.join(DATA, "umap.bin")
	    with open(out_bin, "wb") as bin_file:
	        bin_file.write(buf)
	    log(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)")

	    out_labels = os.path.join(DATA, "umap_labels.json")

	    # Invert KINGDOMS into a species -> kingdom lookup for the frontend.
	    kingdom_of = {}
	    for kingdom, members in KINGDOMS.items():
	        for species in members:
	            kingdom_of[species] = kingdom

	    labels = {
	        "species": SPECIES,
	        "biotypes": BIOTYPES,
	        "strands": STRANDS,
	        "phases": PHASES,
	        "species_kingdom": kingdom_of,
	        "kingdoms": list(KINGDOMS.keys()),
	        "bounds": list(bounds),
	        "n_points": len(xs),
	        "fake": True,
	    }
	    with open(out_labels, "w") as labels_file:
	        json.dump(labels, labels_file, indent=2)
	    log(f" wrote {out_labels}")
	    log(f"total: {time.time() - started:.1f}s")


	# Run the generator only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()