# carbon-demo/scripts/build_real_umap.py
# Author: tfrere (HF Staff)
# Commit 1f90847: Add §7 species tree, slim down §6 UMAP, mount /experiments
"""Convert Dana's viz.csv (Carbon 3B embedding UMAP) into our packed binary.
Input:
/tmp/carbon-umap/viz.csv (or pass --csv PATH)
Columns expected:
row_idx, gene_id, name, biotype, chrom, start, end, strand, assembly,
species, length, gc_content, seq_len_used,
umap2d_x, umap2d_y,
umap3d_x, umap3d_y, umap3d_z
Output (this format matches what demo.html's initDemoUmap() now parses):
data/umap.bin (~5 MB raw → ~3.1 MB gzipped)
Header (64 bytes, little-endian):
uint32 magic 0xCAB0FA1D
uint32 n_points
uint32 n_species
uint32 n_biotypes
uint32 n_strands
uint32 flags (bit 0 = has 3D, bit 1 = has gc, bit 2 = has length)
float32 bounds_2d[4] (x_min, x_max, y_min, y_max)
float32 bounds_3d[6] (x_min, x_max, y_min, y_max, z_min, z_max)
zeros when bit 0 of flags is unset.
Payload (column-major so each column gzips independently well):
int16 pos_2d[n_points * 2] interleaved x,y
int16 pos_3d[n_points * 3] interleaved x,y,z (only when
bit 0 of flags is set — currently
disabled: see "Why no pos_3d?"
below)
uint8 species[n_points]
uint8 biotype[n_points]
uint8 strand[n_points]
uint8 gc_content[n_points] quantized to 0..255 over [0, 1]
uint8 length[n_points] quantized to 0..255 over
[log10_length_min, log10_length_max].
uint8 + log gives ~2.5% relative
precision over a 5+ decade dynamic
range (~6 bp → ~3 Mb in this
dataset), which is well below
visual noise on a continuous
gradient overlay.
Why no pos_3d? v1 of the viewer ships a 2D scatter only — the frontend
explicitly skips the pos_3d block on parse (see initDemoUmap, the
`if (has3D)` branch that just advances the read cursor without
storing anything). Shipping the int16 × 3 coords meant ~3.3 MB of wasted
bandwidth per visit. We keep the computation alive in this script
(negligible CPU, about half a second on 570K rows) but skip writing
it to the binary; to re-enable, flip `WRITE_POS_3D` below.
data/umap_labels.json
Label arrays + kingdom mapping + bounds, consumed by the frontend
to build the legend, tooltip, and color palettes. Now also carries
`length_log10_range` (so the frontend can de-quantize a uint8 byte
back to bp for the tooltip) and `length_bp_range` (for the legend
tick marks).
data/umap_names.txt
One gene name per line, in the SAME species-sorted order as the
binary's per-point columns. Plain text (not JSON) because the
format is trivial, gzips identically well, and is ~10x faster to
parse than JSON.parse over 500 K strings. Tooltip-only data, so
the frontend fetches it lazily after the WebGL render is up.
Points are sorted by species before writing so the species column becomes
runs of identical bytes — gzip RLEs it down to ~10% of raw size.
Usage:
python scripts/build_real_umap.py [--csv /path/to/viz.csv]
"""
import argparse
import array
import csv
import json
import math
import os
import struct
import sys
import time
# Paths are resolved relative to this script: HERE is the directory holding
# the script, DATA is the sibling "data" directory one level up.
HERE = os.path.dirname(os.path.abspath(__file__))
DATA = os.path.join(os.path.dirname(HERE), "data")

# Input location used when --csv is not given on the command line.
DEFAULT_CSV = "/tmp/carbon-umap/viz.csv"

# The 27 species in the bucket, grouped by kingdom. Both the kingdom order
# and the order of species inside a kingdom are significant: flattening this
# table yields the species index stored per point, and the frontend assigns
# adjacent hues to adjacent species, so neighbours here share a palette band.
KINGDOMS = {
    "vertebrates": [
        "human", "macaque", "mouse", "rat", "dog",
        "cow", "pig", "chicken", "frog", "zebrafish",
    ],
    "invertebrates": [
        "fly", "worm",
    ],
    "plants": [
        "arabidopsis", "soybean", "tomato", "maize", "rice",
    ],
    "fungi": [
        "yeast", "fission_yeast", "candida", "aspergillus", "neurospora",
    ],
    "bacteria": [
        "ecoli", "bsubtilis", "saureus",
    ],
    "viruses": [
        "sarscov2", "hiv1",
    ],
}
def build_species_order(kingdoms=None):
    """Flatten a kingdom -> species table into one global species order.

    Args:
        kingdoms: mapping of kingdom name to an iterable of species names.
            Defaults to the module-level ``KINGDOMS`` table.

    Returns:
        A ``(order, kingdom_map)`` tuple. ``order`` lists every species in
        kingdom order, then in-kingdom order; a species' position in it is
        the index written to the per-point species column. ``kingdom_map``
        maps each species name to its kingdom.

    Raises:
        ValueError: if a species is listed more than once. A duplicate would
            occupy two slots in ``order`` while a name -> index lookup built
            from it keeps only the last one, leaving an empty slot and an
            inflated species count.
    """
    if kingdoms is None:
        kingdoms = KINGDOMS
    order, kingdom_map = [], {}
    for kingdom, members in kingdoms.items():
        for sp in members:
            if sp in kingdom_map:
                raise ValueError(
                    f"species {sp!r} is listed twice "
                    f"(in {kingdom_map[sp]!r} and {kingdom!r})"
                )
            order.append(sp)
            kingdom_map[sp] = kingdom
    return order, kingdom_map
# Toggle for the optional pos_3d payload. When False (current default), bit 0
# of `flags` is cleared, the int16×3 coords are NOT appended to the binary,
# and the bounds_3d slots in the header are zeroed out so the frontend never
# reads stale ranges. The script always computes pos3/bounds_3d, so turning
# this back on is a single-edit, one-rebuild change.
WRITE_POS_3D = False

# Fixed-size little-endian header: 6 × uint32 (magic, n_points, n_species,
# n_biotypes, n_strands, flags), then bounds_2d (4 × float32) and bounds_3d
# (6 × float32) — 64 bytes in total.
_HEADER = struct.Struct("<6I4f6f")
_MAGIC = 0xCAB0FA1D

# int16 coordinates are spread over [-32767, 32767], i.e. 65534 steps.
_INT16_STEPS = 65534.0
_INT16_OFFSET = 32767

_COORD_COLUMNS = ("umap2d_x", "umap2d_y", "umap3d_x", "umap3d_y", "umap3d_z")
_REQUIRED_COLUMNS = ("species", "biotype", "strand", "gc_content") + _COORD_COLUMNS


def _parse_length_bp(raw):
    """Parse a genomic length in bp, clamped to >= 1 so log10() is defined.

    Accepts integer text ("1234") and float-formatted integers ("1234.0").
    Missing, empty, non-numeric or non-finite values fall back to 1, which
    bins the row into the floor of the colour scale instead of failing the
    build.
    """
    try:
        return max(1, int(raw))
    except (TypeError, ValueError):
        pass
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return 1
    if not math.isfinite(value):
        return 1
    return max(1, int(value))


def _parse_row(row, biotype_idx, strand_idx):
    """Turn one csv.DictReader row into the per-point record tuple.

    Returns ``(x2, y2, x3, y3, z3, biotype, strand, gc, length_bp, name)``.

    Raises:
        KeyError: unknown biotype or strand value.
        TypeError: a required field is absent from the row (short row).
        ValueError: a coordinate or gc_content is not a finite number.
    """
    coords = tuple(float(row[col]) for col in _COORD_COLUMNS)
    gc = float(row["gc_content"])
    if not all(math.isfinite(v) for v in (*coords, gc)):
        raise ValueError("non-finite coordinate or gc_content")
    # Prefer the human-readable `name` (HBB, BRCA1, AT1G30814) but fall back
    # to gene_id when the source had no symbol, so the tooltip never shows an
    # empty label.
    name = row.get("name") or row.get("gene_id") or "—"
    return (
        *coords,
        biotype_idx[row["biotype"]],
        strand_idx[row["strand"]],
        gc,
        _parse_length_bp(row.get("length")),
        name,
    )


def _column_bounds(records, index):
    """Return ``(min, max)`` of field ``index`` over a non-empty record list."""
    column = [rec[index] for rec in records]
    return min(column), max(column)


def _axis_scale(lo, hi, steps=_INT16_STEPS):
    """Return the factor mapping ``[lo, hi]`` onto ``steps`` quantization steps.

    A degenerate axis (``hi <= lo``, e.g. a single point) yields 0.0 so every
    value collapses onto the lowest code instead of dividing by zero.
    """
    return steps / (hi - lo) if hi > lo else 0.0


def _pack_header(n_points, n_species, n_biotypes, n_strands, flags,
                 bounds_2d, bounds_3d=None):
    """Pack the 64-byte little-endian file header.

    ``bounds_3d=None`` writes six zero floats, which keeps the payload offset
    fixed when the 3D block is not shipped.
    """
    if bounds_3d is None:
        bounds_3d = (0.0,) * 6
    return _HEADER.pack(_MAGIC, n_points, n_species, n_biotypes, n_strands,
                        flags, *bounds_2d, *bounds_3d)


def main(argv=None):
    """Convert viz.csv into umap.bin, umap_labels.json and umap_names.txt.

    Args:
        argv: optional list of command-line arguments; ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Exits with a message (non-zero status) when the CSV is missing, lacks a
    required column, contains an unparsable row, or yields no usable points.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", default=DEFAULT_CSV, help="path to viz.csv")
    args = ap.parse_args(argv)
    if not os.path.exists(args.csv):
        sys.exit(f"viz.csv not found at {args.csv}\n"
                 "Download from the HF bucket:\n"
                 " huggingface.co/buckets/HuggingFaceBio/carbon_embeddings/"
                 "tree/carbon_dana_embeddings_middle/viz.csv")

    species_order, kingdom_map = build_species_order()
    species_idx = {sp: i for i, sp in enumerate(species_order)}
    biotypes = ["protein_coding", "lncRNA", "snRNA", "misc_RNA"]
    biotype_idx = {b: i for i, b in enumerate(biotypes)}
    strands = ["+", "-"]
    strand_idx = {s: i for i, s in enumerate(strands)}

    t0 = time.time()
    print(f"reading {args.csv}", file=sys.stderr)

    # Single pass: bucket rows by species so we end up species-sorted in the
    # output (gzip-friendly) without an explicit O(n log n) sort.
    buckets = [[] for _ in species_order]
    unknown_species = set()
    # newline="" is what the csv module requires for correct handling of
    # quoted fields; utf-8-sig reads plain UTF-8 and also drops a BOM.
    with open(args.csv, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        present = reader.fieldnames or ()
        missing = [col for col in _REQUIRED_COLUMNS if col not in present]
        if missing:
            sys.exit(f"{args.csv}: missing required column(s): "
                     f"{', '.join(missing)}")
        for row in reader:
            sp = row["species"]
            if sp not in species_idx:
                unknown_species.add(sp)
                continue
            try:
                record = _parse_row(row, biotype_idx, strand_idx)
            except (KeyError, TypeError, ValueError) as exc:
                # Fail the build, but say where: a bare traceback on a
                # 500K-row file is not actionable.
                sys.exit(f"{args.csv}: line {reader.line_num}: "
                         f"cannot parse row: {exc!r}")
            buckets[species_idx[sp]].append(record)

    if unknown_species:
        print(f"warning: {len(unknown_species)} unknown species skipped: "
              f"{sorted(map(str, unknown_species))}", file=sys.stderr)

    flat = [(sp_i, rec) for sp_i, bucket in enumerate(buckets) for rec in bucket]
    n = len(flat)
    print(f" {n:,} points kept in {time.time()-t0:.1f}s", file=sys.stderr)
    if not flat:
        sys.exit(f"no usable rows in {args.csv}: nothing to write")

    # --- Compute bounds for quantization -----------------------------------
    records = [rec for _, rec in flat]
    x2_min, x2_max = _column_bounds(records, 0)
    y2_min, y2_max = _column_bounds(records, 1)
    x3_min, x3_max = _column_bounds(records, 2)
    y3_min, y3_max = _column_bounds(records, 3)
    z3_min, z3_max = _column_bounds(records, 4)
    # Length is *log-distributed* (5+ decades from miRNAs to titin). We
    # quantize log10(length) into uint8 — a perceptually uniform colour ramp
    # in one byte. log10 is monotonic, so the log range is just the log of
    # the bp range; it is stored in umap_labels.json so the tooltip can
    # de-quantize back to bp.
    len_bp_min, len_bp_max = _column_bounds(records, 8)
    len_log_min = math.log10(len_bp_min)
    len_log_max = math.log10(len_bp_max)
    del records

    rx2 = _axis_scale(x2_min, x2_max)
    ry2 = _axis_scale(y2_min, y2_max)
    rx3 = _axis_scale(x3_min, x3_max)
    ry3 = _axis_scale(y3_min, y3_max)
    rz3 = _axis_scale(z3_min, z3_max)
    rlen = _axis_scale(len_log_min, len_log_max, 255.0)

    t1 = time.time()
    print(f"packing binary ({n:,} points)...", file=sys.stderr)
    pos2 = array.array("h", [0] * (2 * n))
    pos3 = array.array("h", [0] * (3 * n))
    sp_col = bytearray(n)
    bt_col = bytearray(n)
    st_col = bytearray(n)
    gc_col = bytearray(n)
    len_col = bytearray(n)
    names = [None] * n
    for i, (sp_i, rec) in enumerate(flat):
        x2, y2, x3, y3, z3, bt, st, gc, length_bp, name = rec
        pos2[2*i] = int(round((x2 - x2_min) * rx2 - _INT16_OFFSET))
        pos2[2*i + 1] = int(round((y2 - y2_min) * ry2 - _INT16_OFFSET))
        pos3[3*i] = int(round((x3 - x3_min) * rx3 - _INT16_OFFSET))
        pos3[3*i + 1] = int(round((y3 - y3_min) * ry3 - _INT16_OFFSET))
        pos3[3*i + 2] = int(round((z3 - z3_min) * rz3 - _INT16_OFFSET))
        sp_col[i] = sp_i
        bt_col[i] = bt
        st_col[i] = st
        # gc_content is in [0, 1]; the full uint8 range gives ~0.4%
        # precision, well below visual noise. Clamp in case of stray values.
        gc_col[i] = min(255, max(0, int(round(gc * 255))))
        len_col[i] = min(255, max(0, int(round(
            (math.log10(length_bp) - len_log_min) * rlen))))
        # Names are serialised one per line below, so an embedded line break
        # would shift the alignment of every subsequent point.
        names[i] = name.replace("\n", " ").replace("\r", " ")
    print(f" {time.time()-t1:.1f}s", file=sys.stderr)

    # array.array stores native-endian values, but the file format is
    # little-endian: swap on big-endian hosts so the payload matches the
    # header.
    if sys.byteorder != "little":
        pos2.byteswap()
        pos3.byteswap()

    # --- Write binary -------------------------------------------------------
    os.makedirs(DATA, exist_ok=True)
    # bit 0: has 3D positions, bit 1: has gc_content, bit 2: has length
    flags = 0b111 if WRITE_POS_3D else 0b110
    bounds_2d = (x2_min, x2_max, y2_min, y2_max)
    bounds_3d = (x3_min, x3_max, y3_min, y3_max, z3_min, z3_max)
    header = _pack_header(n, len(species_order), len(biotypes), len(strands),
                          flags, bounds_2d,
                          bounds_3d if WRITE_POS_3D else None)
    parts = [header, pos2.tobytes()]
    if WRITE_POS_3D:
        parts.append(pos3.tobytes())
    parts += [sp_col, bt_col, st_col, gc_col, len_col]
    buf = b"".join(parts)
    out_bin = os.path.join(DATA, "umap.bin")
    with open(out_bin, "wb") as f:
        f.write(buf)
    print(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)", file=sys.stderr)

    # --- Write labels -------------------------------------------------------
    out_labels = os.path.join(DATA, "umap_labels.json")
    with open(out_labels, "w", encoding="utf-8") as f:
        json.dump({
            "species": species_order,
            "biotypes": biotypes,
            "strands": strands,
            "species_kingdom": kingdom_map,
            "kingdoms": list(KINGDOMS.keys()),
            "bounds_2d": list(bounds_2d),
            "bounds_3d": list(bounds_3d),
            # log-space range used for the uint8 quantization of `length`.
            # The frontend de-quantizes a byte b back to bp by:
            #   bp = round(10 ** (log_min + b/255 * (log_max - log_min)))
            "length_log10_range": [len_log_min, len_log_max],
            "length_bp_range": [int(len_bp_min), int(len_bp_max)],
            "n_points": n,
            "has_3d": WRITE_POS_3D,
            "has_gc": True,
            "has_length": True,
            "has_names": True,
            "source": "HuggingFaceBio/carbon_embeddings (carbon_dana_embeddings_middle)",
        }, f, indent=2)
    print(f" wrote {out_labels}", file=sys.stderr)

    # --- Write per-point gene names ----------------------------------------
    # Plain text, one name per line, in the same species-sorted order as the
    # binary's column data. newline="\n" disables platform newline
    # translation so the file is identical on every OS.
    out_names = os.path.join(DATA, "umap_names.txt")
    with open(out_names, "w", encoding="utf-8", newline="\n") as f:
        f.write("\n".join(names))
    print(f" wrote {out_names}", file=sys.stderr)
    print(f"total: {time.time()-t0:.1f}s", file=sys.stderr)
# Run the build only when this file is executed as a script; importing the
# module must not trigger any reads or writes.
if __name__ == "__main__":
    main()