Spaces:
Running
Running
| """Convert Dana's viz.csv (Carbon 3B embedding UMAP) into our packed binary. | |
| Input: | |
| /tmp/carbon-umap/viz.csv (or pass --csv PATH) | |
| Columns expected: | |
| row_idx, gene_id, name, biotype, chrom, start, end, strand, assembly, | |
| species, length, gc_content, seq_len_used, | |
| umap2d_x, umap2d_y, | |
| umap3d_x, umap3d_y, umap3d_z | |
| Output (this format matches what demo.html's initDemoUmap() now parses): | |
| data/umap.bin (~5 MB raw → ~3.1 MB gzipped) | |
| Header (64 bytes, little-endian): | |
| uint32 magic 0xCAB0FA1D | |
| uint32 n_points | |
| uint32 n_species | |
| uint32 n_biotypes | |
| uint32 n_strands | |
| uint32 flags (bit 0 = has 3D, bit 1 = has gc, bit 2 = has length) | |
| float32 bounds_2d[4] (x_min, x_max, y_min, y_max) | |
| float32 bounds_3d[6] (x_min, x_max, y_min, y_max, z_min, z_max) | |
| zeros when bit 0 of flags is unset. | |
| Payload (column-major so each column gzips independently well): | |
| int16 pos_2d[n_points * 2] interleaved x,y | |
| int16 pos_3d[n_points * 3] interleaved x,y,z (only when | |
| bit 0 of flags is set — currently | |
| disabled: see "Why no pos_3d?" | |
| below) | |
| uint8 species[n_points] | |
| uint8 biotype[n_points] | |
| uint8 strand[n_points] | |
| uint8 gc_content[n_points] quantized to 0..255 over [0, 1] | |
| uint8 length[n_points] quantized to 0..255 over | |
| [log10_length_min, log10_length_max]. | |
| uint8 + log gives ~2.5% relative | |
| precision over a 5+ decade dynamic | |
| range (~6 bp → ~3 Mb in this | |
| dataset), which is well below | |
| visual noise on a continuous | |
| gradient overlay. | |
| Why no pos_3d? v1 of the viewer ships a 2D scatter only — the frontend | |
| explicitly skips the pos_3d block on parse (see initDemoUmap, the | |
| `if (has3D)` branch that just advances the read cursor without | |
| storing). Shipping the int16 × 3 coords meant ~3.3 MB of wasted | |
| bandwidth per visit. We keep the computation alive in this script | |
| (negligible CPU, ~half a second on 570 K rows) but skip writing | |
| it to the binary; to re-enable, flip `WRITE_POS_3D` below. | |
| data/umap_labels.json | |
| Label arrays + kingdom mapping + bounds, consumed by the frontend | |
| to build the legend, tooltip, and color palettes. Now also carries | |
| `length_log10_range` (so the frontend can de-quantize a uint8 byte | |
| back to bp for the tooltip) and `length_bp_range` (for the legend | |
| tick marks). | |
| data/umap_names.txt | |
| One gene name per line, in the SAME species-sorted order as the | |
| binary's per-point columns. Plain text (not JSON) because the | |
| format is trivial, gzips identically well, and is ~10x faster to | |
| parse than JSON.parse over 500 K strings. Tooltip-only data, so | |
| the frontend fetches it lazily after the WebGL render is up. | |
| Points are sorted by species before writing so the species column becomes | |
| runs of identical bytes — gzip RLEs it down to ~10% of raw size. | |
| Usage: | |
| python scripts/build_real_umap.py [--csv /path/to/viz.csv] | |
| """ | |
| import argparse | |
| import array | |
| import csv | |
| import json | |
| import math | |
| import os | |
| import struct | |
| import sys | |
| import time | |
# Directory containing this script; DATA is resolved relative to it so the
# script works regardless of the current working directory.
HERE = os.path.dirname(os.path.abspath(__file__))
# Output directory: the `data` folder that sits next to the script's folder.
DATA = os.path.join(os.path.dirname(HERE), "data")
# Input used when --csv is not given on the command line.
DEFAULT_CSV = "/tmp/carbon-umap/viz.csv"
# Kingdom grouping for the 27 species in the bucket. Order inside each
# kingdom controls the palette band that the species gets (the frontend
# assigns adjacent hues to adjacent species).
KINGDOMS = {
    "vertebrates": ["human", "macaque", "mouse", "rat", "dog", "cow", "pig",
                    "chicken", "frog", "zebrafish"],
    "invertebrates": ["fly", "worm"],
    "plants": ["arabidopsis", "soybean", "tomato", "maize", "rice"],
    "fungi": ["yeast", "fission_yeast", "candida", "aspergillus",
              "neurospora"],
    "bacteria": ["ecoli", "bsubtilis", "saureus"],
    "viruses": ["sarscov2", "hiv1"],
}
def build_species_order(kingdoms=None):
    """Flatten a kingdom table into a species list and a species -> kingdom map.

    Args:
        kingdoms: optional mapping of kingdom name -> ordered list of species
            names. When omitted, the module-level ``KINGDOMS`` table is used.

    Returns:
        A ``(order, kingdom_map)`` tuple. ``order`` lists every species in
        kingdom order, then member order within the kingdom; ``kingdom_map``
        maps each species name to the kingdom it is listed under.

    Raises:
        ValueError: if a species appears more than once. A duplicate would
            make ``order`` longer than the number of distinct species, so any
            index map derived from it would no longer line up.
    """
    if kingdoms is None:
        kingdoms = KINGDOMS
    order, kingdom_map = [], {}
    for kingdom, members in kingdoms.items():
        for sp in members:
            if sp in kingdom_map:
                raise ValueError(
                    f"species {sp!r} listed under both "
                    f"{kingdom_map[sp]!r} and {kingdom!r}")
            order.append(sp)
            kingdom_map[sp] = kingdom
    return order, kingdom_map
def main():
    """Build umap.bin, umap_labels.json and umap_names.txt from viz.csv.

    Reads the CSV given by ``--csv`` (default ``DEFAULT_CSV``), buckets the
    rows by species, quantizes the UMAP coordinates to int16 and gc_content /
    log10(length) to uint8, then writes the three output files under ``DATA``.
    Progress and warnings are printed to stderr.

    Exits through ``sys.exit`` when the CSV does not exist or yields no row
    for a known species. Rows whose species is not in the species table are
    skipped with a warning. A biotype or strand outside the fixed label lists
    raises ``KeyError``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", default=DEFAULT_CSV, help="path to viz.csv")
    args = ap.parse_args()
    if not os.path.exists(args.csv):
        sys.exit(f"viz.csv not found at {args.csv}\n"
                 "Download from the HF bucket:\n"
                 " huggingface.co/buckets/HuggingFaceBio/carbon_embeddings/"
                 "tree/carbon_dana_embeddings_middle/viz.csv")

    species_order, kingdom_map = build_species_order()
    species_idx = {sp: i for i, sp in enumerate(species_order)}
    biotypes = ["protein_coding", "lncRNA", "snRNA", "misc_RNA"]
    biotype_idx = {b: i for i, b in enumerate(biotypes)}
    strands = ["+", "-"]
    strand_idx = {s: i for i, s in enumerate(strands)}

    def parse_length(raw):
        """Return the genomic span in bp as an int >= 1 (1 if unusable)."""
        # `raw` is None when the column is absent or the row is short.
        try:
            return max(1, int(raw))
        except (TypeError, ValueError):
            pass
        # Accept float-formatted integers such as "1234.0".
        try:
            return max(1, int(float(raw)))
        except (TypeError, ValueError, OverflowError):
            return 1

    t0 = time.time()
    print(f"reading {args.csv}", file=sys.stderr)

    # Single pass: bucket rows by species so we end up species-sorted in the
    # output (gzip-friendly) without an explicit O(n log n) sort.
    buckets = [[] for _ in species_order]
    unknown_species = set()
    # newline="" is what the csv module requires for correct handling of
    # newlines embedded in quoted fields; the encoding is pinned so the
    # result does not depend on the locale of the build machine.
    with open(args.csv, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            sp = row["species"]
            if sp not in species_idx:
                unknown_species.add(sp)
                continue
            # `length` is the genomic span in bp. We clamp at >=1 so log10()
            # is always well-defined; the rare 0-length entries Dana sees
            # (corrupt rows in the source GFFs) get binned into the floor
            # of the colour scale instead of crashing the build.
            length_bp = parse_length(row.get("length"))
            # Prefer the human-readable `name` (HBB, BRCA1, AT1G30814) but
            # fall back to gene_id when the source had no symbol — keeps
            # the tooltip from ever rendering "(empty)" rows.
            name = row.get("name") or row.get("gene_id") or "—"
            buckets[species_idx[sp]].append((
                float(row["umap2d_x"]), float(row["umap2d_y"]),
                float(row["umap3d_x"]), float(row["umap3d_y"]),
                float(row["umap3d_z"]),
                biotype_idx[row["biotype"]],
                strand_idx[row["strand"]],
                float(row["gc_content"]),
                length_bp,
                name,
            ))
    if unknown_species:
        print(f"warning: {len(unknown_species)} unknown species skipped: "
              f"{sorted(unknown_species)}", file=sys.stderr)

    flat = [(sp_i, p) for sp_i, bucket in enumerate(buckets) for p in bucket]
    n = len(flat)
    print(f" {n:,} points kept in {time.time()-t0:.1f}s", file=sys.stderr)
    if n == 0:
        # Without any point there are no bounds to quantize against.
        sys.exit(f"no usable rows found in {args.csv}; nothing written")

    # --- Compute bounds for quantization -----------------------------------
    def column_range(k):
        """Return (min, max) of field `k` of the per-point tuples."""
        values = [p[k] for _, p in flat]
        return min(values), max(values)

    x2_min, x2_max = column_range(0)
    y2_min, y2_max = column_range(1)
    x3_min, x3_max = column_range(2)
    y3_min, y3_max = column_range(3)
    z3_min, z3_max = column_range(4)
    # Length is *log-distributed* (5+ decades from miRNAs to titin). We
    # quantize log10(length) into uint8 — gives a perceptually uniform
    # color ramp and keeps the byte storage compact. We also stash the
    # log range in umap_labels.json so the tooltip can de-quantize back
    # to bp for human-readable display.
    len_bp_min, len_bp_max = column_range(8)
    len_log_min = math.log10(len_bp_min)
    len_log_max = math.log10(len_bp_max)

    def int16_scale(lo, hi):
        """Scale mapping [lo, hi] onto 65534 int16 steps; 0.0 if lo == hi."""
        span = hi - lo
        # A zero-span axis would divide by zero; with a 0.0 scale every
        # point of that axis lands on -32767, i.e. on the axis minimum.
        return 65534.0 / span if span > 0 else 0.0

    rx2 = int16_scale(x2_min, x2_max)
    ry2 = int16_scale(y2_min, y2_max)
    rx3 = int16_scale(x3_min, x3_max)
    ry3 = int16_scale(y3_min, y3_max)
    rz3 = int16_scale(z3_min, z3_max)
    # Avoid div-by-zero on the (highly degenerate) case of a single-length
    # input. The 1.0 fallback collapses every byte to the same value, which
    # is the visually-correct behaviour.
    rlen = 255.0 / (len_log_max - len_log_min) if len_log_max > len_log_min else 1.0

    t1 = time.time()
    print(f"packing binary ({n:,} points)...", file=sys.stderr)
    pos2 = array.array("h", [0]) * (2 * n)
    pos3 = array.array("h", [0]) * (3 * n)
    sp_col = bytearray(n)
    bt_col = bytearray(n)
    st_col = bytearray(n)
    gc_col = bytearray(n)
    len_col = bytearray(n)
    names = [None] * n
    for i, (sp_i, p) in enumerate(flat):
        x2, y2, x3, y3, z3, bt, st, gc, length_bp, name = p
        pos2[2*i] = int(round((x2 - x2_min) * rx2 - 32767))
        pos2[2*i + 1] = int(round((y2 - y2_min) * ry2 - 32767))
        pos3[3*i] = int(round((x3 - x3_min) * rx3 - 32767))
        pos3[3*i + 1] = int(round((y3 - y3_min) * ry3 - 32767))
        pos3[3*i + 2] = int(round((z3 - z3_min) * rz3 - 32767))
        sp_col[i] = sp_i
        bt_col[i] = bt
        st_col[i] = st
        # gc_content is in [0, 1] but Dana's data tops at 0.9837, so the full
        # uint8 range gives ~0.4% precision which is well below visual noise.
        gc_col[i] = min(255, max(0, int(round(gc * 255))))
        len_col[i] = min(255, max(0, int(round((math.log10(length_bp) - len_log_min) * rlen))))
        # Strip newlines from gene names — we serialise as one-name-per-line
        # text below, so any embedded \n would shift the alignment of every
        # subsequent point. Tabs and stray whitespace pass through fine.
        names[i] = name.replace("\n", " ").replace("\r", " ")
    print(f" {time.time()-t1:.1f}s", file=sys.stderr)

    # array.tobytes() emits native byte order, but the file format (and the
    # "<"-packed header) is little-endian: swap on big-endian hosts.
    if sys.byteorder != "little":
        pos2.byteswap()
        pos3.byteswap()

    # --- Write binary -------------------------------------------------------
    # Toggle for the optional pos_3d payload. When False (current default),
    # bit 0 of `flags` is cleared, the int16×3 coords are NOT appended to
    # the binary, and the bounds_3d slots in the header are zeroed out so
    # the frontend never reads stale ranges. Flip back to True if/when a
    # 3D rotating viewer is added — the script always computes pos3/bounds_3d
    # so this is a single-edit, one-rebuild change.
    WRITE_POS_3D = False
    buf = bytearray()
    # bit 0: has 3D positions, bit 1: has gc_content, bit 2: has length
    flags = 0b111 if WRITE_POS_3D else 0b110
    buf += struct.pack("<6I", 0xCAB0FA1D, n, len(species_order),
                       len(biotypes), len(strands), flags)
    buf += struct.pack("<4f", x2_min, x2_max, y2_min, y2_max)
    if WRITE_POS_3D:
        buf += struct.pack("<6f", x3_min, x3_max, y3_min, y3_max, z3_min, z3_max)
    else:
        # Keep the header at exactly 64 bytes: emit six zero floats so the
        # offset of the pos_2d payload (and the frontend's fixed-offset
        # reads) stays unchanged.
        buf += struct.pack("<6f", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    assert len(buf) == 64, f"header size {len(buf)} != 64"
    buf += pos2.tobytes()
    if WRITE_POS_3D:
        buf += pos3.tobytes()
    buf += bytes(sp_col)
    buf += bytes(bt_col)
    buf += bytes(st_col)
    buf += bytes(gc_col)
    buf += bytes(len_col)

    # A fresh checkout may not have the output directory yet.
    os.makedirs(DATA, exist_ok=True)
    out_bin = os.path.join(DATA, "umap.bin")
    with open(out_bin, "wb") as f:
        f.write(buf)
    print(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)", file=sys.stderr)

    # --- Write labels -------------------------------------------------------
    out_labels = os.path.join(DATA, "umap_labels.json")
    with open(out_labels, "w", encoding="utf-8") as f:
        json.dump({
            "species": species_order,
            "biotypes": biotypes,
            "strands": strands,
            "species_kingdom": kingdom_map,
            "kingdoms": list(KINGDOMS.keys()),
            "bounds_2d": [x2_min, x2_max, y2_min, y2_max],
            "bounds_3d": [x3_min, x3_max, y3_min, y3_max, z3_min, z3_max],
            # log-space range used for the uint8 quantization of `length`.
            # The frontend de-quantizes a byte b back to bp by:
            #   bp = round(10 ** (log_min + b/255 * (log_max - log_min)))
            # Stored as a list (not tuple) for stable JSON.
            "length_log10_range": [len_log_min, len_log_max],
            "length_bp_range": [int(len_bp_min), int(len_bp_max)],
            "n_points": n,
            "has_3d": WRITE_POS_3D,
            "has_gc": True,
            "has_length": True,
            "has_names": True,
            "source": "HuggingFaceBio/carbon_embeddings (carbon_dana_embeddings_middle)",
        }, f, indent=2)
    print(f" wrote {out_labels}", file=sys.stderr)

    # --- Write per-point gene names ----------------------------------------
    # Plain-text, one-per-line, in the same species-sorted order as the
    # binary's column data. The frontend lazy-loads this AFTER the WebGL
    # render is up, then re-aligns it to the in-memory shuffled order
    # (see scatter.js shuffleParallel + permutation tracking).
    out_names = os.path.join(DATA, "umap_names.txt")
    with open(out_names, "w", encoding="utf-8") as f:
        f.write("\n".join(names))
    print(f" wrote {out_names}", file=sys.stderr)
    print(f"total: {time.time()-t0:.1f}s", file=sys.stderr)
# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()