"""Convert Dana's viz.csv (Carbon 3B embedding UMAP) into our packed binary.
Input:
/tmp/carbon-umap/viz.csv (or pass --csv PATH)
Columns expected:
row_idx, gene_id, name, biotype, chrom, start, end, strand, assembly,
species, length, gc_content, seq_len_used,
umap2d_x, umap2d_y,
umap3d_x, umap3d_y, umap3d_z
Output (this format matches what demo.html's initDemoUmap() now parses):
data/umap.bin (~5 MB raw → ~3.1 MB gzipped)
Header (64 bytes, little-endian):
uint32 magic 0xCAB0FA1D
uint32 n_points
uint32 n_species
uint32 n_biotypes
uint32 n_strands
uint32 flags (bit 0 = has 3D, bit 1 = has gc, bit 2 = has length)
float32 bounds_2d[4] (x_min, x_max, y_min, y_max)
float32 bounds_3d[6] (x_min, x_max, y_min, y_max, z_min, z_max)
zeros when bit 0 of flags is unset.
Payload (column-major so each column gzips independently well):
int16 pos_2d[n_points * 2] interleaved x,y
int16 pos_3d[n_points * 3] interleaved x,y,z (only when
bit 0 of flags is set — currently
disabled: see "Why no pos_3d?"
below)
uint8 species[n_points]
uint8 biotype[n_points]
uint8 strand[n_points]
uint8 gc_content[n_points] quantized to 0..255 over [0, 1]
uint8 length[n_points] quantized to 0..255 over
[log10_length_min, log10_length_max].
uint8 + log gives ~2.5% relative
precision over a 5+ decade dynamic
range (~6 bp → ~3 Mb in this
dataset), which is well below
visual noise on a continuous
gradient overlay.
Why no pos_3d? v1 of the viewer ships a 2D scatter only — the front
explicitly skips the pos_3d block on parse (see initDemoUmap, the
`if (has3D)` branch that just advances the read cursor without
storing). Shipping the int16 × 3 coords meant ~3.3 MB of wasted
bandwidth per visit. We keep the computation alive in this script
(negligible CPU, ~half a second on 570 K rows) but skip writing
it to the binary; to re-enable, flip `WRITE_POS_3D` below.
data/umap_labels.json
Label arrays + kingdom mapping + bounds, consumed by the frontend
to build the legend, tooltip, and color palettes. Now also carries
`length_log10_range` (so the front can de-quantize a uint8 byte
back to bp for the tooltip) and `length_bp_range` (for the legend
tick marks).
data/umap_names.txt
One gene name per line, in the SAME species-sorted order as the
binary's per-point columns. Plain text (not JSON) because the
format is trivial, gzips identically well, and is ~10x faster to
parse than JSON.parse over 500 K strings. Tooltip-only data, so
the frontend fetches it lazily after the WebGL render is up.
Points are sorted by species before writing so the species column becomes
runs of identical bytes — gzip RLEs it down to ~10% of raw size.
Usage:
python scripts/build_real_umap.py [--csv /path/to/viz.csv]
"""
import argparse
import array
import csv
import json
import math
import os
import struct
import sys
import time
# Filesystem anchors: HERE is the directory containing this script, DATA is
# the `data/` directory that sits next to HERE's parent (where the outputs
# are written). DEFAULT_CSV is used when --csv is not given.
HERE = os.path.split(os.path.abspath(__file__))[0]
DATA = os.path.join(os.path.split(HERE)[0], "data")
DEFAULT_CSV = "/tmp/carbon-umap/viz.csv"

# The 27 species in the bucket, grouped by kingdom. Both the kingdom order
# and the species order within a kingdom are significant: the frontend hands
# out adjacent hues to adjacent species, so this layout decides which palette
# band each species lands in.
KINGDOMS = {
    "vertebrates": [
        "human",
        "macaque",
        "mouse",
        "rat",
        "dog",
        "cow",
        "pig",
        "chicken",
        "frog",
        "zebrafish",
    ],
    "invertebrates": [
        "fly",
        "worm",
    ],
    "plants": [
        "arabidopsis",
        "soybean",
        "tomato",
        "maize",
        "rice",
    ],
    "fungi": [
        "yeast",
        "fission_yeast",
        "candida",
        "aspergillus",
        "neurospora",
    ],
    "bacteria": [
        "ecoli",
        "bsubtilis",
        "saureus",
    ],
    "viruses": [
        "sarscov2",
        "hiv1",
    ],
}
def build_species_order(kingdoms=None):
    """Flatten a kingdom table into a species list and a species->kingdom map.

    Args:
        kingdoms: Mapping of kingdom name to an ordered list of species
            names. When omitted, the module-level ``KINGDOMS`` table is used.

    Returns:
        A ``(order, kingdom_map)`` tuple. ``order`` lists every species in
        kingdom order, then in member order within each kingdom.
        ``kingdom_map`` maps each species name to its kingdom name.

    Raises:
        ValueError: If a species appears more than once. A duplicate would
            occupy two positions in ``order`` while any name->index lookup
            built from it can only point at one of them.
    """
    if kingdoms is None:
        kingdoms = KINGDOMS
    order, kingdom_map = [], {}
    for kingdom, members in kingdoms.items():
        for sp in members:
            if sp in kingdom_map:
                raise ValueError(
                    f"species {sp!r} is listed under both "
                    f"{kingdom_map[sp]!r} and {kingdom!r}"
                )
            order.append(sp)
            kingdom_map[sp] = kingdom
    return order, kingdom_map
# Columns the build cannot proceed without. `length`, `name` and `gene_id`
# are optional: each has a fallback in _read_buckets().
_REQUIRED_COLUMNS = (
    "species", "biotype", "strand",
    "umap2d_x", "umap2d_y",
    "umap3d_x", "umap3d_y", "umap3d_z",
    "gc_content",
)


def _axis_scale(lo, hi, levels):
    """Return the factor that maps ``[lo, hi]`` onto ``levels`` integer steps.

    A zero-width range (every point shares the same value) returns 0.0, so
    every point quantizes to the bottom of the range instead of raising
    ZeroDivisionError.
    """
    span = hi - lo
    return levels / span if span > 0 else 0.0


def _read_buckets(csv_path, species_order, biotypes, strands):
    """Read viz.csv and group the usable rows by species.

    Args:
        csv_path: Path of the CSV file to read.
        species_order: Species names; a row's species must be one of these
            or the row is skipped.
        biotypes: Accepted biotype labels, in output index order.
        strands: Accepted strand labels, in output index order.

    Returns:
        ``(buckets, unknown_species)``. ``buckets[i]`` holds, in file order,
        one tuple per kept row of ``species_order[i]``:
        ``(x2, y2, x3, y3, z3, biotype_i, strand_i, gc, length_bp, name)``.
        ``unknown_species`` is the set of species labels that were skipped.

    Exits the process (``sys.exit``) with a message when a required column
    is missing or a row carries an unsupported biotype or strand.
    """
    species_idx = {sp: i for i, sp in enumerate(species_order)}
    biotype_idx = {b: i for i, b in enumerate(biotypes)}
    strand_idx = {s: i for i, s in enumerate(strands)}

    # Single pass: bucket rows by species so we end up species-sorted in the
    # output (gzip-friendly) without an explicit O(n log n) sort.
    buckets = [[] for _ in species_order]
    unknown_species = set()

    # newline="" is what the csv module asks for (quoted fields may contain
    # line breaks); the explicit encoding keeps gene names independent of
    # the platform default.
    with open(csv_path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames or []
        missing = [c for c in _REQUIRED_COLUMNS if c not in header]
        if missing:
            sys.exit(f"{csv_path}: missing required column(s): "
                     f"{', '.join(missing)}")

        for row in reader:
            sp = row["species"]
            if sp not in species_idx:
                unknown_species.add(sp)
                continue

            bt, st = row["biotype"], row["strand"]
            if bt not in biotype_idx or st not in strand_idx:
                # Fail with the offending row instead of a bare KeyError.
                sys.exit(f"{csv_path}, line {reader.line_num}: unsupported "
                         f"biotype {bt!r} / strand {st!r} "
                         f"(expected biotype in {biotypes}, "
                         f"strand in {strands})")

            # `length` is the genomic span in bp. We clamp at >=1 so log10()
            # is always well-defined; the rare 0-length entries Dana sees
            # (corrupt rows in the source GFFs) get binned into the floor
            # of the colour scale instead of crashing the build.
            try:
                length_bp = max(1, int(row["length"]))
            except (KeyError, TypeError, ValueError):
                length_bp = 1

            # Prefer the human-readable `name` (HBB, BRCA1, AT1G30814) but
            # fall back to gene_id when the source had no symbol — keeps
            # the tooltip from ever rendering "(empty)" rows.
            name = row.get("name") or row.get("gene_id") or "—"

            buckets[species_idx[sp]].append((
                float(row["umap2d_x"]), float(row["umap2d_y"]),
                float(row["umap3d_x"]), float(row["umap3d_y"]),
                float(row["umap3d_z"]),
                biotype_idx[bt],
                strand_idx[st],
                float(row["gc_content"]),
                length_bp,
                name,
            ))
    return buckets, unknown_species


def main(argv=None):
    """Build data/umap.bin, data/umap_labels.json and data/umap_names.txt.

    Args:
        argv: Optional argument list (e.g. ``["--csv", "viz.csv"]``). When
            None, argparse reads ``sys.argv`` as before.

    Exits the process with a message when the CSV is missing, lacks a
    required column, contains an unsupported biotype/strand, or yields no
    usable rows.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", default=DEFAULT_CSV, help="path to viz.csv")
    args = ap.parse_args(argv)
    if not os.path.exists(args.csv):
        sys.exit(f"viz.csv not found at {args.csv}\n"
                 "Download from the HF bucket:\n"
                 " huggingface.co/buckets/HuggingFaceBio/carbon_embeddings/"
                 "tree/carbon_dana_embeddings_middle/viz.csv")

    species_order, kingdom_map = build_species_order()
    biotypes = ["protein_coding", "lncRNA", "snRNA", "misc_RNA"]
    strands = ["+", "-"]

    t0 = time.time()
    print(f"reading {args.csv}", file=sys.stderr)
    buckets, unknown_species = _read_buckets(
        args.csv, species_order, biotypes, strands)
    if unknown_species:
        print(f"warning: {len(unknown_species)} unknown species skipped: "
              f"{sorted(unknown_species)}", file=sys.stderr)

    # Each entry is (species index, point tuple), already species-sorted.
    flat = [(sp_i, p) for sp_i, bucket in enumerate(buckets) for p in bucket]
    n = len(flat)
    print(f" {n:,} points kept in {time.time()-t0:.1f}s", file=sys.stderr)
    if n == 0:
        # min()/max() below would raise on an empty sequence.
        sys.exit(f"no usable rows in {args.csv}; nothing to write")

    # --- Compute bounds for quantization -----------------------------------
    def column_bounds(col):
        values = [p[col] for _, p in flat]
        return min(values), max(values)

    x2_min, x2_max = column_bounds(0)
    y2_min, y2_max = column_bounds(1)
    x3_min, x3_max = column_bounds(2)
    y3_min, y3_max = column_bounds(3)
    z3_min, z3_max = column_bounds(4)

    # Length is *log-distributed* (5+ decades from miRNAs to titin). We
    # quantize log10(length) into uint8 — gives a perceptually uniform
    # color ramp and keeps the byte storage compact. We also stash the
    # log range in umap_labels.json so the tooltip can de-quantize back
    # to bp for human-readable display.
    log_lengths = [math.log10(p[8]) for _, p in flat]
    len_log_min = min(log_lengths)
    len_log_max = max(log_lengths)
    len_bp_min, len_bp_max = column_bounds(8)

    # int16 coordinates span [-32767, 32767] (65534 steps); uint8 length
    # spans [0, 255]. A degenerate axis gets scale 0.0 and collapses to the
    # bottom of its range.
    rx2 = _axis_scale(x2_min, x2_max, 65534.0)
    ry2 = _axis_scale(y2_min, y2_max, 65534.0)
    rx3 = _axis_scale(x3_min, x3_max, 65534.0)
    ry3 = _axis_scale(y3_min, y3_max, 65534.0)
    rz3 = _axis_scale(z3_min, z3_max, 65534.0)
    rlen = _axis_scale(len_log_min, len_log_max, 255.0)

    t1 = time.time()
    print(f"packing binary ({n:,} points)...", file=sys.stderr)
    pos2 = array.array("h", [0]) * (2 * n)
    pos3 = array.array("h", [0]) * (3 * n)
    sp_col = bytearray(n)
    bt_col = bytearray(n)
    st_col = bytearray(n)
    gc_col = bytearray(n)
    len_col = bytearray(n)
    names = [None] * n
    for i, ((sp_i, p), log_len) in enumerate(zip(flat, log_lengths)):
        x2, y2, x3, y3, z3, bt, st, gc, _length_bp, name = p
        pos2[2*i] = int(round((x2 - x2_min) * rx2 - 32767))
        pos2[2*i + 1] = int(round((y2 - y2_min) * ry2 - 32767))
        pos3[3*i] = int(round((x3 - x3_min) * rx3 - 32767))
        pos3[3*i + 1] = int(round((y3 - y3_min) * ry3 - 32767))
        pos3[3*i + 2] = int(round((z3 - z3_min) * rz3 - 32767))
        sp_col[i] = sp_i
        bt_col[i] = bt
        st_col[i] = st
        # gc_content is in [0, 1] but Dana's data tops at 0.9837, so the full
        # uint8 range gives ~0.4% precision which is well below visual noise.
        gc_col[i] = min(255, max(0, int(round(gc * 255))))
        len_col[i] = min(255, max(0, int(round((log_len - len_log_min) * rlen))))
        # Strip newlines from gene names — we serialise as one-name-per-line
        # text below, so any embedded \n would shift the alignment of every
        # subsequent point. Tabs and stray whitespace pass through fine.
        names[i] = name.replace("\n", " ").replace("\r", " ")
    print(f" {time.time()-t1:.1f}s", file=sys.stderr)

    # array.tobytes() emits native byte order, but the file format is
    # little-endian; swap on big-endian hosts so the output is portable.
    if sys.byteorder != "little":
        pos2.byteswap()
        pos3.byteswap()

    # --- Write binary -------------------------------------------------------
    # Toggle for the optional pos_3d payload. When False (current default),
    # bit 0 of `flags` is cleared, the int16×3 coords are NOT appended to
    # the binary, and the bounds_3d slots in the header are zeroed out so
    # the frontend never reads stale ranges. Flip back to True if/when a
    # 3D rotating viewer is added — the script always computes pos3/bounds_3d
    # so this is a single-edit, one-rebuild change.
    WRITE_POS_3D = False
    buf = bytearray()
    # bit 0: has 3D positions, bit 1: has gc_content, bit 2: has length
    flags = 0b111 if WRITE_POS_3D else 0b110
    buf += struct.pack("<6I", 0xCAB0FA1D, n, len(species_order),
                       len(biotypes), len(strands), flags)
    buf += struct.pack("<4f", x2_min, x2_max, y2_min, y2_max)
    if WRITE_POS_3D:
        buf += struct.pack("<6f", x3_min, x3_max, y3_min, y3_max, z3_min, z3_max)
    else:
        # Keep the header at exactly 64 bytes: emit six zero floats so the
        # offset of the pos_2d payload (and the frontend's fixed-offset
        # reads) stays unchanged.
        buf += struct.pack("<6f", 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    assert len(buf) == 64, f"header size {len(buf)} != 64"
    buf += pos2.tobytes()
    if WRITE_POS_3D:
        buf += pos3.tobytes()
    buf += bytes(sp_col)
    buf += bytes(bt_col)
    buf += bytes(st_col)
    buf += bytes(gc_col)
    buf += bytes(len_col)

    # A fresh checkout may not have the output directory yet.
    os.makedirs(DATA, exist_ok=True)
    out_bin = os.path.join(DATA, "umap.bin")
    with open(out_bin, "wb") as f:
        f.write(buf)
    print(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)", file=sys.stderr)

    # --- Write labels -------------------------------------------------------
    out_labels = os.path.join(DATA, "umap_labels.json")
    with open(out_labels, "w") as f:
        json.dump({
            "species": species_order,
            "biotypes": biotypes,
            "strands": strands,
            "species_kingdom": kingdom_map,
            "kingdoms": list(KINGDOMS.keys()),
            "bounds_2d": [x2_min, x2_max, y2_min, y2_max],
            "bounds_3d": [x3_min, x3_max, y3_min, y3_max, z3_min, z3_max],
            # log-space range used for the uint8 quantization of `length`.
            # The frontend de-quantizes a byte b back to bp by:
            # bp = round(10 ** (log_min + b/255 * (log_max - log_min)))
            # Stored as a list (not tuple) for stable JSON.
            "length_log10_range": [len_log_min, len_log_max],
            "length_bp_range": [int(len_bp_min), int(len_bp_max)],
            "n_points": n,
            "has_3d": WRITE_POS_3D,
            "has_gc": True,
            "has_length": True,
            "has_names": True,
            "source": "HuggingFaceBio/carbon_embeddings (carbon_dana_embeddings_middle)",
        }, f, indent=2)
    print(f" wrote {out_labels}", file=sys.stderr)

    # --- Write per-point gene names ----------------------------------------
    # Plain-text, one-per-line, in the same species-sorted order as the
    # binary's column data. The frontend lazy-loads this AFTER the WebGL
    # render is up, then re-aligns it to the in-memory shuffled order
    # (see scatter.js shuffleParallel + permutation tracking).
    # newline="\n" disables platform newline translation so the separator
    # is a bare "\n" on every OS.
    out_names = os.path.join(DATA, "umap_names.txt")
    with open(out_names, "w", encoding="utf-8", newline="\n") as f:
        f.write("\n".join(names))
    print(f" wrote {out_names}", file=sys.stderr)
    print(f"total: {time.time()-t0:.1f}s", file=sys.stderr)
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()