"""Convert Dana's viz.csv (Carbon 3B embedding UMAP) into our packed binary. Input: /tmp/carbon-umap/viz.csv (or pass --csv PATH) Columns expected: row_idx, gene_id, name, biotype, chrom, start, end, strand, assembly, species, length, gc_content, seq_len_used, umap2d_x, umap2d_y, umap3d_x, umap3d_y, umap3d_z Output (this format matches what demo.html's initDemoUmap() now parses): data/umap.bin (~5 MB raw → ~3.1 MB gzipped) Header (64 bytes, little-endian): uint32 magic 0xCAB0FA1D uint32 n_points uint32 n_species uint32 n_biotypes uint32 n_strands uint32 flags (bit 0 = has 3D, bit 1 = has gc, bit 2 = has length) float32 bounds_2d[4] (x_min, x_max, y_min, y_max) float32 bounds_3d[6] (x_min, x_max, y_min, y_max, z_min, z_max) zeros when bit 0 of flags is unset. Payload (column-major so each column gzips independently well): int16 pos_2d[n_points * 2] interleaved x,y int16 pos_3d[n_points * 3] interleaved x,y,z (only when bit 0 of flags is set — currently disabled: see "Why no pos_3d?" below) uint8 species[n_points] uint8 biotype[n_points] uint8 strand[n_points] uint8 gc_content[n_points] quantized to 0..255 over [0, 1] uint8 length[n_points] quantized to 0..255 over [log10_length_min, log10_length_max]. uint8 + log gives ~2.5% relative precision over a 5+ decade dynamic range (~6 bp → ~3 Mb in this dataset), which is well below visual noise on a continuous gradient overlay. Why no pos_3d? v1 of the viewer ships a 2D scatter only — the front explicitly skips the pos_3d block on parse (see initDemoUmap, the `if (has3D)` branch that just advances the read cursor without storing). Shipping the int16 × 3 coords meant ~3.3 MB of wasted bandwidth per visit. We keep the computation alive in this script (negligible CPU, ~half a second on 570 K rows) but skip writing it to the binary; to re-enable, flip `WRITE_POS_3D` below. data/umap_labels.json Label arrays + kingdom mapping + bounds, consumed by the frontend to build the legend, tooltip, and color palettes. 
Now also carries `length_log10_range` (so the front can de-quantize a uint8 byte back to bp for the tooltip) and `length_bp_range` (for the legend tick marks). data/umap_names.txt One gene name per line, in the SAME species-sorted order as the binary's per-point columns. Plain text (not JSON) because the format is trivial, gzips identically well, and is ~10x faster to parse than JSON.parse over 500 K strings. Tooltip-only data, so the frontend fetches it lazily after the WebGL render is up. Points are sorted by species before writing so the species column becomes runs of identical bytes — gzip RLEs it down to ~10% of raw size. Usage: python scripts/build_real_umap.py [--csv /path/to/viz.csv] """ import argparse import array import csv import json import math import os import struct import sys import time HERE = os.path.dirname(os.path.abspath(__file__)) DATA = os.path.join(os.path.dirname(HERE), "data") DEFAULT_CSV = "/tmp/carbon-umap/viz.csv" # Kingdom grouping for the 27 species in the bucket. Order inside each # kingdom controls the palette band that the species gets (the frontend # assigns adjacent hues to adjacent species). 
KINGDOMS = {
    "vertebrates": ["human", "macaque", "mouse", "rat", "dog", "cow", "pig",
                    "chicken", "frog", "zebrafish"],
    "invertebrates": ["fly", "worm"],
    "plants": ["arabidopsis", "soybean", "tomato", "maize", "rice"],
    "fungi": ["yeast", "fission_yeast", "candida", "aspergillus",
              "neurospora"],
    "bacteria": ["ecoli", "bsubtilis", "saureus"],
    "viruses": ["sarscov2", "hiv1"],
}

# Positions are stored as int16 in the symmetric range [-32767, +32767]
# (65534 steps), so -32768 is never emitted.
_I16_LIMIT = 32767
_I16_SPAN = 65534.0


def build_species_order():
    """Flatten KINGDOMS into a species list plus a species -> kingdom map.

    Returns:
        (order, kingdom_map): `order` lists every species in KINGDOMS
        iteration order (kingdom by kingdom, members in their listed order);
        `kingdom_map` maps each species name to its kingdom key.
    """
    order, kingdom_map = [], {}
    for kingdom, members in KINGDOMS.items():
        for sp in members:
            order.append(sp)
            kingdom_map[sp] = kingdom
    return order, kingdom_map


def axis_scale(lo, hi, span):
    """Return the multiplier that maps the interval [lo, hi] onto [0, span].

    A degenerate interval (hi <= lo, i.e. every value identical) yields 0.0
    instead of dividing by zero, so every value quantizes to the bottom of
    the range.
    """
    return span / (hi - lo) if hi > lo else 0.0


def quantize_int16(v, lo, scale):
    """Quantize `v` to the int16 position encoding in [-32767, 32767].

    `scale` is `axis_scale(lo, hi, 65534.0)` for the axis. The result is
    clamped so float round-off at the upper bound can never overflow the
    int16 array.
    """
    q = int(round((v - lo) * scale - _I16_LIMIT))
    return max(-_I16_LIMIT, min(_I16_LIMIT, q))


def parse_length_bp(raw):
    """Parse a CSV `length` cell into a base-pair count clamped to >= 1.

    Accepts plain integers and float-formatted integers ("1234.0").
    Missing, empty, non-numeric or non-finite cells fall back to 1 so that
    log10() is always well-defined downstream.
    """
    try:
        return max(1, int(raw))
    except (TypeError, ValueError):
        pass
    try:
        return max(1, int(float(raw)))
    except (TypeError, ValueError, OverflowError):
        return 1


def main():
    """Read viz.csv and write umap.bin, umap_labels.json and umap_names.txt.

    The CSV path comes from `--csv` (default DEFAULT_CSV); all outputs go to
    the DATA directory, which is created if missing. Rows whose species is
    not listed in KINGDOMS are skipped with a warning. The script exits with
    a message when the CSV is missing, when no usable row remains, or when a
    kept row carries a biotype/strand outside the supported sets.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", default=DEFAULT_CSV, help="path to viz.csv")
    args = ap.parse_args()

    if not os.path.exists(args.csv):
        sys.exit(f"viz.csv not found at {args.csv}\n"
                 "Download from the HF bucket:\n"
                 " huggingface.co/buckets/HuggingFaceBio/carbon_embeddings/"
                 "tree/carbon_dana_embeddings_middle/viz.csv")

    species_order, kingdom_map = build_species_order()
    species_idx = {sp: i for i, sp in enumerate(species_order)}
    biotypes = ["protein_coding", "lncRNA", "snRNA", "misc_RNA"]
    biotype_idx = {b: i for i, b in enumerate(biotypes)}
    strands = ["+", "-"]
    strand_idx = {s: i for i, s in enumerate(strands)}

    t0 = time.time()
    print(f"reading {args.csv}", file=sys.stderr)

    # Single pass: bucket rows by species so we end up species-sorted in the
    # output (gzip-friendly) without an explicit O(n log n) sort.
    buckets = [[] for _ in species_order]
    unknown_species = set()
    # newline="" is what the csv module requires to parse quoted fields that
    # contain line breaks; the explicit encoding matches the UTF-8 names
    # file written below instead of depending on the locale.
    with open(args.csv, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            sp = row["species"]
            if sp not in species_idx:
                unknown_species.add(sp)
                continue

            biotype = row["biotype"]
            strand = row["strand"]
            if biotype not in biotype_idx:
                sys.exit(f"{args.csv}: line {reader.line_num}: unknown "
                         f"biotype {biotype!r} (expected one of {biotypes})")
            if strand not in strand_idx:
                sys.exit(f"{args.csv}: line {reader.line_num}: unknown "
                         f"strand {strand!r} (expected one of {strands})")

            # Prefer the human-readable `name` (HBB, BRCA1, AT1G30814) but
            # fall back to gene_id when the source had no symbol — keeps
            # the tooltip from ever rendering "(empty)" rows.
            name = row.get("name") or row.get("gene_id") or "—"

            buckets[species_idx[sp]].append((
                float(row["umap2d_x"]), float(row["umap2d_y"]),
                float(row["umap3d_x"]), float(row["umap3d_y"]),
                float(row["umap3d_z"]),
                biotype_idx[biotype],
                strand_idx[strand],
                float(row["gc_content"]),
                # `length` is the genomic span in bp, clamped at >= 1 so
                # log10() is always well-defined; the rare 0-length entries
                # Dana sees (corrupt rows in the source GFFs) get binned
                # into the floor of the colour scale instead of crashing
                # the build.
                parse_length_bp(row.get("length")),
                name,
            ))

    if unknown_species:
        print(f"warning: {len(unknown_species)} unknown species skipped: "
              f"{sorted(unknown_species, key=str)}", file=sys.stderr)

    points = [p for bucket in buckets for p in bucket]
    n = len(points)
    if n == 0:
        # Bounds below would otherwise fail with an opaque min() error.
        sys.exit(f"no usable rows found in {args.csv}")
    print(f" {n:,} points kept in {time.time()-t0:.1f}s", file=sys.stderr)

    # --- Compute bounds for quantization -----------------------------------
    def column_bounds(k):
        col = [p[k] for p in points]
        return min(col), max(col)

    x2_min, x2_max = column_bounds(0)
    y2_min, y2_max = column_bounds(1)
    x3_min, x3_max = column_bounds(2)
    y3_min, y3_max = column_bounds(3)
    z3_min, z3_max = column_bounds(4)

    # Length is *log-distributed* (5+ decades from miRNAs to titin). We
    # quantize log10(length) into uint8 — gives a perceptually uniform
    # color ramp and keeps the byte storage compact. We also stash the
    # log range in umap_labels.json so the tooltip can de-quantize back
    # to bp for human-readable display.
    len_bp_min, len_bp_max = column_bounds(8)
    # log10 is monotonic, so the log range is the log of the bp range.
    len_log_min = math.log10(len_bp_min)
    len_log_max = math.log10(len_bp_max)

    # axis_scale() returns 0.0 for a degenerate axis (all values equal),
    # which collapses every point of that axis to the same quantized value
    # instead of raising ZeroDivisionError.
    rx2 = axis_scale(x2_min, x2_max, _I16_SPAN)
    ry2 = axis_scale(y2_min, y2_max, _I16_SPAN)
    rx3 = axis_scale(x3_min, x3_max, _I16_SPAN)
    ry3 = axis_scale(y3_min, y3_max, _I16_SPAN)
    rz3 = axis_scale(z3_min, z3_max, _I16_SPAN)
    rlen = axis_scale(len_log_min, len_log_max, 255.0)

    t1 = time.time()
    print(f"packing binary ({n:,} points)...", file=sys.stderr)

    pos2 = array.array("h", [0]) * (2 * n)
    pos3 = array.array("h", [0]) * (3 * n)
    # Points are already grouped by species, so the species column is just
    # one run of identical bytes per bucket.
    sp_col = bytearray()
    for sp_i, bucket in enumerate(buckets):
        sp_col += bytes([sp_i]) * len(bucket)
    bt_col = bytearray(n)
    st_col = bytearray(n)
    gc_col = bytearray(n)
    len_col = bytearray(n)
    names = [None] * n

    for i, p in enumerate(points):
        x2, y2, x3, y3, z3, bt, st, gc, length_bp, name = p
        pos2[2*i] = quantize_int16(x2, x2_min, rx2)
        pos2[2*i + 1] = quantize_int16(y2, y2_min, ry2)
        pos3[3*i] = quantize_int16(x3, x3_min, rx3)
        pos3[3*i + 1] = quantize_int16(y3, y3_min, ry3)
        pos3[3*i + 2] = quantize_int16(z3, z3_min, rz3)
        bt_col[i] = bt
        st_col[i] = st
        # gc_content is in [0, 1] but Dana's data tops at 0.9837, so the full
        # uint8 range gives ~0.4% precision which is well below visual noise.
        gc_col[i] = min(255, max(0, int(round(gc * 255))))
        len_col[i] = min(255, max(0, int(round(
            (math.log10(length_bp) - len_log_min) * rlen))))
        # Strip newlines from gene names — we serialise as one-name-per-line
        # text below, so any embedded \n would shift the alignment of every
        # subsequent point. Tabs and stray whitespace pass through fine.
        names[i] = name.replace("\n", " ").replace("\r", " ")

    # array.array stores values in native byte order, but the file format is
    # little-endian throughout; swap on big-endian hosts so the payload
    # matches the "<"-packed header.
    if sys.byteorder != "little":
        pos2.byteswap()
        pos3.byteswap()

    print(f" {time.time()-t1:.1f}s", file=sys.stderr)

    # --- Write binary -------------------------------------------------------
    # Toggle for the optional pos_3d payload. When False (current default),
    # bit 0 of `flags` is cleared, the int16×3 coords are NOT appended to
    # the binary, and the bounds_3d slots in the header are zeroed out so
    # the frontend never reads stale ranges. Flip back to True if/when a
    # 3D rotating viewer is added — the script always computes pos3/bounds_3d
    # so this is a single-edit, one-rebuild change.
    WRITE_POS_3D = False

    buf = bytearray()
    # bit 0: has 3D positions, bit 1: has gc_content, bit 2: has length
    flags = 0b111 if WRITE_POS_3D else 0b110
    buf += struct.pack("<6I", 0xCAB0FA1D, n, len(species_order),
                       len(biotypes), len(strands), flags)
    buf += struct.pack("<4f", x2_min, x2_max, y2_min, y2_max)
    if WRITE_POS_3D:
        buf += struct.pack("<6f", x3_min, x3_max, y3_min, y3_max,
                           z3_min, z3_max)
    else:
        # Keep the header at exactly 64 bytes: emit six zero floats so the
        # offset of the pos_2d payload (and the frontend's fixed-offset
        # reads) stays unchanged.
        buf += struct.pack("<6f", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    assert len(buf) == 64, f"header size {len(buf)} != 64"

    buf += pos2.tobytes()
    if WRITE_POS_3D:
        buf += pos3.tobytes()
    buf += bytes(sp_col)
    buf += bytes(bt_col)
    buf += bytes(st_col)
    buf += bytes(gc_col)
    buf += bytes(len_col)

    # A fresh checkout may not have the output directory yet.
    os.makedirs(DATA, exist_ok=True)

    out_bin = os.path.join(DATA, "umap.bin")
    with open(out_bin, "wb") as f:
        f.write(buf)
    print(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)",
          file=sys.stderr)

    # --- Write labels -------------------------------------------------------
    out_labels = os.path.join(DATA, "umap_labels.json")
    with open(out_labels, "w", encoding="utf-8") as f:
        json.dump({
            "species": species_order,
            "biotypes": biotypes,
            "strands": strands,
            "species_kingdom": kingdom_map,
            "kingdoms": list(KINGDOMS.keys()),
            "bounds_2d": [x2_min, x2_max, y2_min, y2_max],
            "bounds_3d": [x3_min, x3_max, y3_min, y3_max, z3_min, z3_max],
            # log-space range used for the uint8 quantization of `length`.
            # The frontend de-quantizes a byte b back to bp by:
            #   bp = round(10 ** (log_min + b/255 * (log_max - log_min)))
            # Stored as a list (not tuple) for stable JSON.
            "length_log10_range": [len_log_min, len_log_max],
            "length_bp_range": [int(len_bp_min), int(len_bp_max)],
            "n_points": n,
            "has_3d": WRITE_POS_3D,
            "has_gc": True,
            "has_length": True,
            "has_names": True,
            "source": "HuggingFaceBio/carbon_embeddings (carbon_dana_embeddings_middle)",
        }, f, indent=2)
    print(f" wrote {out_labels}", file=sys.stderr)

    # --- Write per-point gene names ----------------------------------------
    # Plain-text, one-per-line, in the same species-sorted order as the
    # binary's column data. The frontend lazy-loads this AFTER the WebGL
    # render is up, then re-aligns it to the in-memory shuffled order
    # (see scatter.js shuffleParallel + permutation tracking).
    out_names = os.path.join(DATA, "umap_names.txt")
    # newline="\n" disables platform newline translation so the separator is
    # a bare "\n" on every OS.
    with open(out_names, "w", encoding="utf-8", newline="\n") as f:
        f.write("\n".join(names))
    print(f" wrote {out_names}", file=sys.stderr)
    print(f"total: {time.time()-t0:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()