# carbon-demo/scripts/gen_fake_umap.py
# Author: tfrere (HF Staff)
# Commit f039f0b — §6: WebGL UMAP scatter, 500K points on a fake fixture
"""Generate a fake UMAP scatter dataset for prototyping the §6 viewer.
The real embedding-UMAP output (from Dana's Carbon 3B pipeline) will land
in roughly the same shape. Until then, this gives us a 500K-point fixture
to develop the WebGL frontend at target scale, so we don't discover
performance cliffs later.
Layout strategy:
- 24 eukaryotic species grouped into 5 kingdoms (vertebrates, invertebrates,
plants, fungi, protozoa). Kingdom centers are placed evenly around a
radius-12 circle in UMAP space, with species centers jittered nearby.
- Each point is drawn from a 2D Gaussian around its species center.
- Biotype, strand, and codon phase are sampled independently of species and
  position, so they act as orthogonal color axes: toggling the color mode in
  the UI recolors the same points without changing their spatial layout.
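- Points are output sorted by species, so the species column is long runs of
  identical bytes that gzip collapses aggressively (typically 90%+ compression
  on those bytes). The biotype, strand, and phase columns are drawn at random
  per point and do not benefit from the sort.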
Binary layout (little-endian, 40-byte header + ~4 MB payload for 500K pts):
uint32 magic 0xCAB0FA1D
uint32 n_points
uint32 n_species
uint32 n_biotypes
uint32 n_strands
uint32 n_phases
float32 x_min, x_max, y_min, y_max (16 bytes — for de-quantization)
int16 positions[n_points * 2] (interleaved x,y; quantized to [-32767, 32767]:
x = x_min + (q + 32767) / 65534 * (x_max - x_min), likewise for y)
uint8 species[n_points]
uint8 biotype[n_points]
uint8 strand[n_points]
uint8 phase[n_points]
Usage:
python scripts/gen_fake_umap.py
"""
import array
import json
import math
import os
import random
import struct
import sys
import time
# --- Dataset shape ---------------------------------------------------------
N_POINTS = 500_000
SEED = 42

# Kingdom -> member species. This mapping is the single source of truth for
# the species set; SPECIES below is derived from it so the two cannot drift
# apart. 24 eukaryotic species — matches the rough scale of the real run
# (24 species from Leandro's Gemini-annotated set, 500/species in Dana's
# first batch).
KINGDOMS = {
    "vertebrates": [
        "human", "mouse", "rat", "chicken", "zebrafish", "xenopus",
        "dog", "cow", "pig",
    ],
    "invertebrates": ["fly", "worm", "mosquito", "honeybee", "sea_urchin"],
    "plants": ["arabidopsis", "rice", "maize", "wheat", "soybean"],
    "fungi": ["yeast", "neurospora", "candida"],
    "protozoa": ["plasmodium", "trypanosoma"],
}

# Flat species list in kingdom order. A species' position in this list is the
# uint8 index written to the binary species column, so the order matters.
SPECIES = [sp for members in KINGDOMS.values() for sp in members]

BIOTYPES = ["protein_coding", "lncRNA", "miRNA", "pseudogene"]
# Roughly realistic frequencies for a mixed eukaryote set (parallel to
# BIOTYPES, one weight per entry).
BIOTYPE_WEIGHTS = [0.70, 0.15, 0.08, 0.07]
STRANDS = ["+", "-"]
PHASES = ["0", "1", "2"]

# Output directory: "data", located next to the directory holding this script.
HERE = os.path.dirname(os.path.abspath(__file__))
DATA = os.path.join(os.path.dirname(HERE), "data")
# --- Generation ------------------------------------------------------------
def species_centers():
    """Assign every species a 2D center in UMAP space.

    Kingdoms are spaced at equal angles on a circle of radius 12 around the
    origin. Each member species is then offset from its kingdom's anchor by
    an independent Gaussian jitter (sigma 1.8) on each axis.

    Draws from the module-level ``random`` state, so the result depends on
    how that generator was seeded before the call.

    Returns:
        dict mapping species name -> ``(x, y)`` tuple of floats.
    """
    n_kingdoms = len(KINGDOMS)
    result = {}
    for slot, members in enumerate(KINGDOMS.values()):
        theta = 2 * math.pi * slot / n_kingdoms
        anchor_x = math.cos(theta) * 12.0
        anchor_y = math.sin(theta) * 12.0
        for name in members:
            # The x jitter is drawn before the y jitter; keeping that order
            # means a fixed seed keeps producing the same layout.
            jitter_x = random.gauss(0, 1.8)
            jitter_y = random.gauss(0, 1.8)
            result[name] = (anchor_x + jitter_x, anchor_y + jitter_y)
    return result
def weighted_choice_idx(weights):
    """Pick an index at random according to ``weights``.

    Hand-rolled single-draw alternative to ``random.choices()`` intended for
    tight loops: one ``random.random()`` call followed by a linear scan of
    the cumulative weights.

    Args:
        weights: Non-empty sequence of non-negative numbers, expected to sum
            to 1.0 (the draw is compared against the running total as-is;
            nothing is normalized).

    Returns:
        The index of the first entry whose cumulative weight exceeds the
        uniform draw. If no entry does (the weights sum to less than the
        draw, e.g. through float rounding), the last index is returned.

    Raises:
        ValueError: If ``weights`` is empty. Without this check the fallback
            below would return -1, which is not a valid index.
    """
    n = len(weights)
    if n == 0:
        raise ValueError("weights must not be empty")

    draw = random.random()
    cumulative = 0.0
    for idx, weight in enumerate(weights):
        cumulative += weight
        if draw < cumulative:
            return idx
    # Rounding can leave the total a hair under the draw; clamp to the end.
    return n - 1
def generate():
    """Produce all per-point data as parallel arrays (sorted by species).

    Reseeds the module-level ``random`` generator with ``SEED`` so repeated
    runs give the same fixture, then emits the points species by species in
    ``SPECIES`` order. ``N_POINTS`` is split as evenly as possible: every
    species gets ``N_POINTS // len(SPECIES)`` points and the first few
    species absorb the remainder, one extra point each.

    Each point is a 2D Gaussian sample (sigma 1.2 per axis) around its
    species center, plus a biotype index drawn with ``BIOTYPE_WEIGHTS`` and
    uniformly drawn strand and phase indices.

    Returns:
        Tuple ``(xs, ys, sp_col, bt_col, st_col, ph_col)`` where ``xs`` and
        ``ys`` are ``array.array("f")`` coordinates and the four columns are
        ``bytearray`` objects holding one category index per point.
    """
    random.seed(SEED)
    centers = species_centers()

    xs = array.array("f")
    ys = array.array("f")
    sp_col = bytearray()
    bt_col = bytearray()
    st_col = bytearray()
    ph_col = bytearray()

    # Category cardinalities come from the label lists rather than literal
    # 2 / 3, so the generated indices always agree with the counts that
    # pack_binary() writes into the header.
    n_strands = len(STRANDS)
    n_phases = len(PHASES)

    base, extra = divmod(N_POINTS, len(SPECIES))
    for sp_idx, sp in enumerate(SPECIES):
        n_this = base + (1 if sp_idx < extra else 0)
        cx, cy = centers[sp]
        # The species index is constant for the whole run of points, so it
        # is written in one go instead of once per point.
        sp_col += bytes([sp_idx]) * n_this
        # Sample n_this points around this species' center. The order of the
        # RNG calls below determines the fixture's exact contents for a
        # given seed; do not reorder them.
        for _ in range(n_this):
            xs.append(cx + random.gauss(0, 1.2))
            ys.append(cy + random.gauss(0, 1.2))
            bt_col.append(weighted_choice_idx(BIOTYPE_WEIGHTS))
            st_col.append(random.randrange(n_strands))
            ph_col.append(random.randrange(n_phases))
    return xs, ys, sp_col, bt_col, st_col, ph_col
def pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col):
    """Pack parallel arrays into the binary layout described in the module docstring.

    Layout, all little-endian: a 40-byte header (magic, point count and the
    four category cardinalities as uint32, then x_min, x_max, y_min, y_max as
    float32), the interleaved int16 x,y positions, and finally the species,
    biotype, strand and phase columns as one uint8 per point each.

    Coordinates are quantized linearly per axis so the axis minimum maps to
    -32767 and the axis maximum to +32767. A reader recovers a coordinate as
    ``v_min + (q + 32767) / 65534 * (v_max - v_min)``.

    Args:
        xs: Sequence of x coordinates (floats).
        ys: Sequence of y coordinates (floats), same length as ``xs``.
        sp_col: Per-point species indices (each 0-255).
        bt_col: Per-point biotype indices (each 0-255).
        st_col: Per-point strand indices (each 0-255).
        ph_col: Per-point phase indices (each 0-255).

    Returns:
        Tuple ``(buf, bounds)`` where ``buf`` is a ``bytearray`` with the
        packed file contents and ``bounds`` is ``(x_min, x_max, y_min, y_max)``.

    Raises:
        ValueError: If the six inputs do not all have the same length; the
            output would otherwise disagree with the point count in its own
            header.
    """
    n = len(xs)
    if any(len(col) != n for col in (ys, sp_col, bt_col, st_col, ph_col)):
        raise ValueError("all per-point columns must have the same length")

    if n:
        x_min, x_max = min(xs), max(xs)
        y_min, y_max = min(ys), max(ys)
    else:
        # No points means no extent; report a zero-size box at the origin
        # instead of failing inside min()/max().
        x_min = x_max = y_min = y_max = 0.0

    # Map floats into int16 range [-32767, 32767] using the bounds (so the
    # client can reconstitute float coords if it ever needs to). An axis with
    # zero extent (single point, or all values equal) gets scale 0 so every
    # value lands on -32767 rather than dividing by zero.
    x_span = x_max - x_min
    y_span = y_max - y_min
    x_scale = 65534.0 / x_span if x_span > 0 else 0.0
    y_scale = 65534.0 / y_span if y_span > 0 else 0.0

    # Interleave xy into a single int16 stream (WebGL friendly: stride 4 bytes).
    pos = array.array("h", [0]) * (2 * n)
    for i, (x, y) in enumerate(zip(xs, ys)):
        pos[2 * i] = round((x - x_min) * x_scale - 32767)
        pos[2 * i + 1] = round((y - y_min) * y_scale - 32767)

    # array.tobytes() emits native byte order; the file format is
    # little-endian, so swap on big-endian hosts to match the header.
    if sys.byteorder != "little":
        pos.byteswap()

    # Header — 40 bytes (6 uint32 + 4 float32).
    buf = bytearray(struct.pack(
        "<6I4f",
        0xCAB0FA1D, n,
        len(SPECIES), len(BIOTYPES),
        len(STRANDS), len(PHASES),
        x_min, x_max, y_min, y_max,
    ))
    buf += pos.tobytes()
    for col in (sp_col, bt_col, st_col, ph_col):
        buf += bytes(col)
    return buf, (x_min, x_max, y_min, y_max)
def main():
    """Generate the fake fixture and write it into the ``DATA`` directory.

    Creates ``DATA`` if needed, then writes two files:

    - ``umap.bin``: the packed binary produced by ``pack_binary()``.
    - ``umap_labels.json``: the label lists for each category column, the
      species -> kingdom mapping, the coordinate bounds, the point count and
      a ``"fake": true`` marker.

    Progress and timings are printed to stderr.
    """
    os.makedirs(DATA, exist_ok=True)

    # perf_counter() is monotonic, so the elapsed times below cannot be
    # skewed by wall-clock adjustments the way time.time() differences can.
    t_start = time.perf_counter()
    print(f"generating {N_POINTS:,} fake UMAP points ...", file=sys.stderr)
    xs, ys, sp_col, bt_col, st_col, ph_col = generate()
    print(f" done in {time.perf_counter() - t_start:.1f}s", file=sys.stderr)

    t_pack = time.perf_counter()
    print("packing binary ...", file=sys.stderr)
    buf, bounds = pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col)
    print(f" done in {time.perf_counter() - t_pack:.1f}s", file=sys.stderr)

    out_bin = os.path.join(DATA, "umap.bin")
    with open(out_bin, "wb") as f:
        f.write(buf)
    print(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)", file=sys.stderr)

    out_labels = os.path.join(DATA, "umap_labels.json")
    species_to_kingdom = {
        sp: kd for kd, members in KINGDOMS.items() for sp in members
    }
    labels = {
        "species": SPECIES,
        "biotypes": BIOTYPES,
        "strands": STRANDS,
        "phases": PHASES,
        "species_kingdom": species_to_kingdom,
        "kingdoms": list(KINGDOMS.keys()),
        "bounds": list(bounds),
        "n_points": len(xs),
        "fake": True,
    }
    # Explicit encoding keeps the output independent of the host locale.
    with open(out_labels, "w", encoding="utf-8") as f:
        json.dump(labels, f, indent=2)
    print(f" wrote {out_labels}", file=sys.stderr)

    print(f"total: {time.perf_counter() - t_start:.1f}s", file=sys.stderr)
if __name__ == "__main__":
    # Run the generator only when executed as a script, not when imported.
    main()