"""Generate a fake UMAP scatter dataset for prototyping the §6 viewer. The real embedding-UMAP output (from Dana's Carbon 3B pipeline) will land in roughly the same shape. Until then, this gives us a 500K-point fixture to develop the WebGL frontend at target scale, so we don't discover performance cliffs later. Layout strategy: - 24 eukaryotic species grouped into 5 kingdoms (vertebrates, invertebrates, plants, fungi, protozoa). Kingdom centers are placed evenly around a radius-12 circle in UMAP space, with species centers jittered nearby. - Each point is drawn from a 2D Gaussian around its species center. - Biotype, strand, and codon phase are roughly orthogonal axes — toggling color in the UI reveals different organizations of the same points. - Points are output sorted by species so gzip can RLE the category columns aggressively (typically 90%+ compression on those bytes). Binary layout (little-endian, 40-byte header + ~4 MB payload for 500K pts): uint32 magic 0xCAB0FA1D uint32 n_points uint32 n_species uint32 n_biotypes uint32 n_strands uint32 n_phases float32 x_min, x_max, y_min, y_max (16 bytes — for de-quantization) int16 positions[n_points * 2] (interleaved x,y; quantized) uint8 species[n_points] uint8 biotype[n_points] uint8 strand[n_points] uint8 phase[n_points] Usage: python scripts/gen_fake_umap.py """ import array import json import math import os import random import struct import sys import time # --- Dataset shape --------------------------------------------------------- N_POINTS = 500_000 SEED = 42 # 24 eukaryotic species — matches the rough scale of the real run (24 species # from Leandro's Gemini-annotated set, 500/species in Dana's first batch). SPECIES = [ # Vertebrates "human", "mouse", "rat", "chicken", "zebrafish", "xenopus", "dog", "cow", "pig", # Invertebrates "fly", "worm", "mosquito", "honeybee", "sea_urchin", # Plants "arabidopsis", "rice", "maize", "wheat", "soybean", # Fungi "yeast", "neurospora", "candida", # Protozoa "plasmodium", "trypanosoma", ] KINGDOMS = { "vertebrates": ["human", "mouse", "rat", "chicken", "zebrafish", "xenopus", "dog", "cow", "pig"], "invertebrates": ["fly", "worm", "mosquito", "honeybee", "sea_urchin"], "plants": ["arabidopsis", "rice", "maize", "wheat", "soybean"], "fungi": ["yeast", "neurospora", "candida"], "protozoa": ["plasmodium", "trypanosoma"], } BIOTYPES = ["protein_coding", "lncRNA", "miRNA", "pseudogene"] # Roughly realistic frequencies for a mixed eukaryote set. BIOTYPE_WEIGHTS = [0.70, 0.15, 0.08, 0.07] STRANDS = ["+", "-"] PHASES = ["0", "1", "2"] HERE = os.path.dirname(os.path.abspath(__file__)) DATA = os.path.join(os.path.dirname(HERE), "data") # --- Generation ------------------------------------------------------------ def species_centers(): """Pick a UMAP-space center for each species, kingdom by kingdom.""" centers = {} kingdom_names = list(KINGDOMS.keys()) for i, kingdom in enumerate(kingdom_names): # Kingdom center placed evenly around a circle. cx = math.cos(2 * math.pi * i / len(kingdom_names)) * 12.0 cy = math.sin(2 * math.pi * i / len(kingdom_names)) * 12.0 for sp in KINGDOMS[kingdom]: ox = random.gauss(0, 1.8) oy = random.gauss(0, 1.8) centers[sp] = (cx + ox, cy + oy) return centers def weighted_choice_idx(weights): """Faster than random.choices() for tight loops — returns the index.""" r = random.random() acc = 0.0 for i, w in enumerate(weights): acc += w if r < acc: return i return len(weights) - 1 def generate(): """Produce all per-point data as parallel arrays (sorted by species).""" random.seed(SEED) centers = species_centers() xs = array.array("f") ys = array.array("f") sp_col = bytearray() bt_col = bytearray() st_col = bytearray() ph_col = bytearray() base = N_POINTS // len(SPECIES) extra = N_POINTS - base * len(SPECIES) for sp_idx, sp in enumerate(SPECIES): n_this = base + (1 if sp_idx < extra else 0) cx, cy = centers[sp] # Sample n_this points around this species' center. for _ in range(n_this): xs.append(cx + random.gauss(0, 1.2)) ys.append(cy + random.gauss(0, 1.2)) sp_col.append(sp_idx) bt_col.append(weighted_choice_idx(BIOTYPE_WEIGHTS)) st_col.append(random.randrange(2)) ph_col.append(random.randrange(3)) return xs, ys, sp_col, bt_col, st_col, ph_col def pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col): """Pack parallel arrays into the binary layout described in the docstring.""" n = len(xs) x_min, x_max = min(xs), max(xs) y_min, y_max = min(ys), max(ys) # Map floats into int16 range [-32767, 32767] using the bounds (so the # client can reconstitute float coords if it ever needs to). qx = array.array("h") qy = array.array("h") rx = 65534.0 / (x_max - x_min) ry = 65534.0 / (y_max - y_min) for i in range(n): qx.append(int(round((xs[i] - x_min) * rx - 32767))) qy.append(int(round((ys[i] - y_min) * ry - 32767))) # Interleave xy into a single int16 stream (WebGL friendly: stride 4 bytes). pos = array.array("h", [0] * (2 * n)) for i in range(n): pos[2 * i] = qx[i] pos[2 * i + 1] = qy[i] buf = bytearray() # Header — 40 bytes (6 uint32 + 4 float32). buf += struct.pack("<6I", 0xCAB0FA1D, n, len(SPECIES), len(BIOTYPES), len(STRANDS), len(PHASES)) buf += struct.pack("<4f", x_min, x_max, y_min, y_max) buf += pos.tobytes() buf += bytes(sp_col) buf += bytes(bt_col) buf += bytes(st_col) buf += bytes(ph_col) return buf, (x_min, x_max, y_min, y_max) def main(): os.makedirs(DATA, exist_ok=True) t0 = time.time() print(f"generating {N_POINTS:,} fake UMAP points ...", file=sys.stderr) xs, ys, sp_col, bt_col, st_col, ph_col = generate() print(f" done in {time.time() - t0:.1f}s", file=sys.stderr) t1 = time.time() print("packing binary ...", file=sys.stderr) buf, bounds = pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col) print(f" done in {time.time() - t1:.1f}s", file=sys.stderr) out_bin = os.path.join(DATA, "umap.bin") with open(out_bin, "wb") as f: f.write(buf) print(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)", file=sys.stderr) out_labels = os.path.join(DATA, "umap_labels.json") species_to_kingdom = {sp: kd for kd, members in KINGDOMS.items() for sp in members} with open(out_labels, "w") as f: json.dump({ "species": SPECIES, "biotypes": BIOTYPES, "strands": STRANDS, "phases": PHASES, "species_kingdom": species_to_kingdom, "kingdoms": list(KINGDOMS.keys()), "bounds": list(bounds), "n_points": len(xs), "fake": True, }, f, indent=2) print(f" wrote {out_labels}", file=sys.stderr) print(f"total: {time.time() - t0:.1f}s", file=sys.stderr) if __name__ == "__main__": main()