tfrere HF Staff Cursor committed on
Commit
45f472e
·
2 Parent(s): 28428b8 f039f0b

Merge feature/umap-scatter into feat/folding-3d (local-only)

Browse files

Combines the §6 UMAP scatter and §5 Folding work into one branch for
end-to-end local testing. To be split back into two separate PRs.

demo.html conflict resolution: kept §6 UMAP IIFE from umap-scatter,
kept the Canvas 2D banner comment from folding-3d (perf branch
already merged into main). No code changes, just a comment block.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (5) hide show
  1. app.py +28 -0
  2. data/umap.bin +3 -0
  3. data/umap_labels.json +84 -0
  4. demo.html +617 -9
  5. scripts/gen_fake_umap.py +219 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import os
4
 
5
  import httpx
6
  from fastapi import FastAPI, Request
 
7
  from fastapi.responses import FileResponse, StreamingResponse
8
  from fastapi.staticfiles import StaticFiles
9
  from openai import OpenAI
@@ -69,6 +70,11 @@ def left_pad_to_six(seq: str) -> tuple[str, int]:
69
 
70
 
71
  app = FastAPI()
 
 
 
 
 
72
  app.mount("/img", StaticFiles(directory=os.path.join(HERE, "img")), name="img")
73
 
74
 
@@ -108,6 +114,28 @@ def species():
108
  return FileResponse(os.path.join(HERE, "data", "species.json"), media_type="application/json")
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  @app.post("/score")
112
  async def score(request: Request):
113
  """Return per-token logprobs over a (forced) sequence using echo=True.
 
4
 
5
  import httpx
6
  from fastapi import FastAPI, Request
7
+ from fastapi.middleware.gzip import GZipMiddleware
8
  from fastapi.responses import FileResponse, StreamingResponse
9
  from fastapi.staticfiles import StaticFiles
10
  from openai import OpenAI
 
70
 
71
 
72
  app = FastAPI()
73
+ # Compress responses >= 1 KB. Mostly aimed at /umap (~4 MB binary blob
74
+ # → ~2 MB on the wire) and the JSON gene/variant/species catalogs.
75
+ # compresslevel=6 is the gzip(1) system default — within ~3% of level 9
76
+ # in ratio but ~5x cheaper in CPU. Worth it on every request.
77
+ app.add_middleware(GZipMiddleware, minimum_size=1024, compresslevel=6)
78
  app.mount("/img", StaticFiles(directory=os.path.join(HERE, "img")), name="img")
79
 
80
 
 
114
  return FileResponse(os.path.join(HERE, "data", "species.json"), media_type="application/json")
115
 
116
 
117
+ @app.get("/umap")
118
+ def umap():
119
+ """Binary packed scatter (int16 positions + uint8 categories) for §6.
120
+
121
+ The §6 frontend fetches this as an ArrayBuffer and feeds it straight
122
+ into WebGL — no JSON parse, no per-point allocations. See
123
+ scripts/gen_fake_umap.py for the binary layout.
124
+ """
125
+ return FileResponse(
126
+ os.path.join(HERE, "data", "umap.bin"),
127
+ media_type="application/octet-stream",
128
+ )
129
+
130
+
131
+ @app.get("/umap_labels")
132
+ def umap_labels():
133
+ return FileResponse(
134
+ os.path.join(HERE, "data", "umap_labels.json"),
135
+ media_type="application/json",
136
+ )
137
+
138
+
139
  @app.post("/score")
140
  async def score(request: Request):
141
  """Return per-token logprobs over a (forced) sequence using echo=True.
data/umap.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c8de51da32a9fa05d12b8b580b8dad36bbe1a257ee6a721b18b4030c12daa92
3
+ size 4000040
data/umap_labels.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "species": [
3
+ "human",
4
+ "mouse",
5
+ "rat",
6
+ "chicken",
7
+ "zebrafish",
8
+ "xenopus",
9
+ "dog",
10
+ "cow",
11
+ "pig",
12
+ "fly",
13
+ "worm",
14
+ "mosquito",
15
+ "honeybee",
16
+ "sea_urchin",
17
+ "arabidopsis",
18
+ "rice",
19
+ "maize",
20
+ "wheat",
21
+ "soybean",
22
+ "yeast",
23
+ "neurospora",
24
+ "candida",
25
+ "plasmodium",
26
+ "trypanosoma"
27
+ ],
28
+ "biotypes": [
29
+ "protein_coding",
30
+ "lncRNA",
31
+ "miRNA",
32
+ "pseudogene"
33
+ ],
34
+ "strands": [
35
+ "+",
36
+ "-"
37
+ ],
38
+ "phases": [
39
+ "0",
40
+ "1",
41
+ "2"
42
+ ],
43
+ "species_kingdom": {
44
+ "human": "vertebrates",
45
+ "mouse": "vertebrates",
46
+ "rat": "vertebrates",
47
+ "chicken": "vertebrates",
48
+ "zebrafish": "vertebrates",
49
+ "xenopus": "vertebrates",
50
+ "dog": "vertebrates",
51
+ "cow": "vertebrates",
52
+ "pig": "vertebrates",
53
+ "fly": "invertebrates",
54
+ "worm": "invertebrates",
55
+ "mosquito": "invertebrates",
56
+ "honeybee": "invertebrates",
57
+ "sea_urchin": "invertebrates",
58
+ "arabidopsis": "plants",
59
+ "rice": "plants",
60
+ "maize": "plants",
61
+ "wheat": "plants",
62
+ "soybean": "plants",
63
+ "yeast": "fungi",
64
+ "neurospora": "fungi",
65
+ "candida": "fungi",
66
+ "plasmodium": "protozoa",
67
+ "trypanosoma": "protozoa"
68
+ },
69
+ "kingdoms": [
70
+ "vertebrates",
71
+ "invertebrates",
72
+ "plants",
73
+ "fungi",
74
+ "protozoa"
75
+ ],
76
+ "bounds": [
77
+ -19.07978630065918,
78
+ 17.736162185668945,
79
+ -15.417572021484375,
80
+ 16.83187484741211
81
+ ],
82
+ "n_points": 500000,
83
+ "fake": true
84
+ }
demo.html CHANGED
@@ -291,6 +291,77 @@
291
  font-size: 9px; color: #aaa; margin-bottom: 8px;
292
  }
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  /* --- Gene-completion specifics (§1) --- */
295
  .gene-info {
296
  font-family: "JetBrains Mono", monospace;
@@ -1081,21 +1152,57 @@
1081
  </section>
1082
 
1083
  <!-- ============================================================ -->
1084
- <!-- §6 — UMAP (stub, coming soon) -->
1085
  <!-- ============================================================ -->
1086
  <section id="umap">
1087
- <div class="section-num">§6 · Coming soon</div>
1088
  <div class="section-title">The genome, organized</div>
1089
  <p class="lede">
1090
- Embed every human gene with Carbon, project to 2D with UMAP, color by gene family, function,
1091
- or species. Genes the model never saw labeled cluster anyway — kinases with kinases,
1092
- histones with histones, mitochondrial DNA off in its own corner. The structure of biology
1093
- falls out of the structure of sequence.
1094
  </p>
1095
 
1096
- <div class="stub" style="padding:64px 24px">
1097
- <div class="stub-tag">coming soon</div>
1098
- <div style="margin-top:6px">interactive UMAP of carbon embeddings · ~20k human genes</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1099
  </div>
1100
  </section>
1101
 
@@ -3880,6 +3987,507 @@ function loadGenes() {
3880
  }
3881
  })();
3882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3883
  // =========================================================================
3884
  // Carbon banner — animated DNA helix (Canvas 2D)
3885
  //
 
291
  font-size: 9px; color: #aaa; margin-bottom: 8px;
292
  }
293
 
294
+ /* --- UMAP scatter specifics (§6) --- */
295
+ .umap-frame {
296
+ position: relative;
297
+ width: 100%;
298
+ aspect-ratio: 16 / 10;
299
+ background: #fff;
300
+ border: 1px solid #eee;
301
+ overflow: hidden;
302
+ }
303
+ .umap-canvas {
304
+ position: absolute; inset: 0;
305
+ width: 100%; height: 100%;
306
+ display: block;
307
+ cursor: grab;
308
+ touch-action: none;
309
+ }
310
+ .umap-canvas.panning { cursor: grabbing; }
311
+ .umap-tooltip {
312
+ position: absolute;
313
+ pointer-events: none;
314
+ background: #1f1f1d; color: #f7f5ee;
315
+ font-family: "JetBrains Mono", monospace;
316
+ font-size: 10px; line-height: 1.4;
317
+ padding: 6px 9px;
318
+ border-radius: 2px;
319
+ white-space: nowrap;
320
+ opacity: 0;
321
+ transform: translate(8px, -100%);
322
+ transition: opacity 0.12s;
323
+ z-index: 4;
324
+ }
325
+ .umap-tooltip.visible { opacity: 0.96; }
326
+ .umap-tooltip .t-label {
327
+ color: #8c918b;
328
+ text-transform: uppercase; letter-spacing: 1px;
329
+ font-size: 8px;
330
+ margin-right: 4px;
331
+ }
332
+ .umap-status-overlay {
333
+ position: absolute; inset: 0;
334
+ display: flex; align-items: center; justify-content: center;
335
+ color: #aaa;
336
+ font-family: "JetBrains Mono", monospace;
337
+ font-size: 11px; letter-spacing: 1.5px;
338
+ text-transform: uppercase;
339
+ background: rgba(247, 245, 238, 0.85);
340
+ pointer-events: none;
341
+ transition: opacity 0.2s;
342
+ }
343
+ .umap-status-overlay.hidden { opacity: 0; }
344
+ .umap-legend {
345
+ display: flex; flex-wrap: wrap;
346
+ gap: 6px 14px;
347
+ margin-top: 10px;
348
+ font-family: "JetBrains Mono", monospace;
349
+ font-size: 10px;
350
+ color: #666;
351
+ }
352
+ .umap-legend .swatch {
353
+ display: inline-block;
354
+ width: 9px; height: 9px;
355
+ margin-right: 5px;
356
+ vertical-align: middle;
357
+ border-radius: 2px;
358
+ }
359
+ .umap-legend .item {
360
+ display: inline-flex;
361
+ align-items: center;
362
+ cursor: default;
363
+ }
364
+
365
  /* --- Gene-completion specifics (§1) --- */
366
  .gene-info {
367
  font-family: "JetBrains Mono", monospace;
 
1152
  </section>
1153
 
1154
  <!-- ============================================================ -->
1155
+ <!-- §6 — UMAP (interactive scatter) -->
1156
  <!-- ============================================================ -->
1157
  <section id="umap">
1158
+ <div class="section-num">§6 · Embedding space</div>
1159
  <div class="section-title">The genome, organized</div>
1160
  <p class="lede">
1161
+ Embed half a million sequences from 24 eukaryotes with Carbon, project to 2D
1162
+ with UMAP, color by anything. Switch the coloring and a completely different
1163
+ organization emerges from the same points — the model's embedding space
1164
+ carries multiple axes of biology at once, none of which were ever labeled.
1165
  </p>
1166
 
1167
+ <div class="demo" id="demoUmap">
1168
+ <div class="demo-toolbar">
1169
+ <span>color by</span>
1170
+ <span id="dumap-color-pills" class="pills">
1171
+ <button class="pill active" data-color="species">species</button>
1172
+ <button class="pill" data-color="biotype">biotype</button>
1173
+ <button class="pill" data-color="strand">strand</button>
1174
+ <button class="pill" data-color="phase">codon phase</button>
1175
+ </span>
1176
+ <span class="spacer"></span>
1177
+ <button id="dumap-reset" class="action">↺ reset view</button>
1178
+ <span class="status" id="dumap-status"><span class="dot"></span><span>idle</span></span>
1179
+ </div>
1180
+
1181
+ <div class="gene-info" id="dumap-info">scroll to zoom · drag to pan · hover for details</div>
1182
+
1183
+ <div class="umap-frame">
1184
+ <canvas class="umap-canvas" id="dumap-canvas"></canvas>
1185
+ <div class="umap-tooltip" id="dumap-tooltip"></div>
1186
+ <div class="umap-status-overlay" id="dumap-overlay">loading 500K points · ~2 MB gzipped</div>
1187
+ </div>
1188
+
1189
+ <div class="umap-legend" id="dumap-legend"></div>
1190
+
1191
+ <div class="stat-row" id="dumap-stats">
1192
+ <div class="stat-pair"><span class="stat-pair-label">points</span><span class="stat-pair-val muted" id="dumap-n">—</span></div>
1193
+ <div class="stat-pair"><span class="stat-pair-label">species</span><span class="stat-pair-val muted" id="dumap-nsp">—</span></div>
1194
+ <div class="stat-pair"><span class="stat-pair-label">embedding dim</span><span class="stat-pair-val muted">3072</span></div>
1195
+ <div class="stat-pair"><span class="stat-pair-label">render</span><span class="stat-pair-val muted" id="dumap-fps">—</span></div>
1196
+ </div>
1197
+ </div>
1198
+
1199
+ <div class="takeaway">
1200
+ <strong>What to look for</strong>
1201
+ Switch coloring from <em>species</em> to <em>biotype</em>: same points, completely
1202
+ different organization emerges. The five rough macro-clusters trace the eukaryotic
1203
+ kingdoms — vertebrates, invertebrates, plants, fungi, protozoa — discovered from
1204
+ raw sequence alone. <em>The current 500K dataset is synthetic, awaiting the real
1205
+ Carbon 3B embeddings.</em>
1206
  </div>
1207
  </section>
1208
 
 
3987
  }
3988
  })();
3989
 
3990
+ // =========================================================================
3991
+ // §6 — UMAP scatter (WebGL, 500K points)
3992
+ //
3993
+ // Loads a binary-packed scatter (int16 quantized positions + 4 uint8 category
3994
+ // columns) and renders it via WebGL gl.POINTS with a 1D palette texture for
3995
+ // coloring. Toggle between coloring axes (species / biotype / strand / phase)
3996
+ // rebinds a single byte-attribute buffer and swaps the palette texture — no
3997
+ // re-upload of the 500K vertex stream. Hover lookup uses a flat grid index
3998
+ // so picking stays O(small) regardless of total point count.
3999
+ // =========================================================================
4000
+ (function initDemoUmap() {
4001
+ const canvas = document.getElementById("dumap-canvas");
4002
+ if (!canvas) return;
4003
+ const tooltip = document.getElementById("dumap-tooltip");
4004
+ const overlay = document.getElementById("dumap-overlay");
4005
+ const info = document.getElementById("dumap-info");
4006
+ const legend = document.getElementById("dumap-legend");
4007
+ const resetBtn = document.getElementById("dumap-reset");
4008
+ const status = document.getElementById("dumap-status");
4009
+ const statusText = status.querySelector("span:last-child");
4010
+ const colorPills = document.querySelectorAll("#dumap-color-pills .pill");
4011
+ const elN = document.getElementById("dumap-n");
4012
+ const elNsp = document.getElementById("dumap-nsp");
4013
+ const elFps = document.getElementById("dumap-fps");
4014
+
4015
+ // ---- Palettes ----------------------------------------------------------
4016
+ // 24 species are grouped into 5 kingdoms — each kingdom gets a hue band.
4017
+ // Within a band, lightness varies to keep adjacent species distinguishable.
4018
+ const SPECIES_PALETTE = [
4019
+ // vertebrates (9) — blue/indigo band
4020
+ [69,117,180],[97,144,200],[125,170,220],[153,194,240],
4021
+ [120,90,170],[140,110,190],
4022
+ [80,90,150],[100,110,170],[120,130,190],
4023
+ // invertebrates (5) — orange band
4024
+ [217,95,2],[230,120,30],[240,150,60],[250,180,90],[253,210,120],
4025
+ // plants (5) — olive/lime band (intentionally different from Carbon's
4026
+ // signal-green #317f3f so the UI chrome doesn't blend with the data)
4027
+ [85,140,55],[115,165,75],[145,195,100],[175,220,135],[205,240,170],
4028
+ // fungi (3) — magenta/rose band
4029
+ [200,40,120],[220,80,140],[240,130,170],
4030
+ // protozoa (2) — gold band
4031
+ [200,150,30],[230,180,60],
4032
+ ];
4033
+ const BIOTYPE_PALETTE = [
4034
+ [49,127,63], // protein_coding — Carbon green
4035
+ [188,46,37], // lncRNA — Carbon red
4036
+ [70,90,140], // miRNA — slate blue
4037
+ [170,170,170], // pseudogene — neutral gray
4038
+ ];
4039
+ const STRAND_PALETTE = [
4040
+ [49,127,63], // + (forward)
4041
+ [188,46,37], // - (reverse)
4042
+ ];
4043
+ // 3-step ordinal palette (viridis-ish endpoints) — codon phase 0/1/2.
4044
+ const PHASE_PALETTE = [
4045
+ [68,1,84], [33,144,140], [253,231,37],
4046
+ ];
4047
+ const PALETTES = {
4048
+ species: SPECIES_PALETTE,
4049
+ biotype: BIOTYPE_PALETTE,
4050
+ strand: STRAND_PALETTE,
4051
+ phase: PHASE_PALETTE,
4052
+ };
4053
+
4054
+ // ---- State -------------------------------------------------------------
4055
+ let gl, program;
4056
+ let posBuf; // int16 interleaved x,y
4057
+ let catBufs = {}; // { species|biotype|strand|phase: GLBuffer of uint8 }
4058
+ let paletteTex;
4059
+ let n = 0;
4060
+ let labels = null; // { species:[], biotypes:[], strands:[], phases:[], bounds:[xmin,xmax,ymin,ymax] }
4061
+ // Raw category bytes — kept on CPU side too for tooltip lookups.
4062
+ let cats = { species: null, biotype: null, strand: null, phase: null };
4063
+ // World bounds + current colorBy axis.
4064
+ let bounds = [0,0,0,0];
4065
+ let colorBy = "species";
4066
+ // Viewport: translate (tx, ty) + scale around origin, in NDC space.
4067
+ // The whole world is fit into [-0.95, 0.95]² at initial zoom.
4068
+ let view = { tx: 0, ty: 0, scale: 1 };
4069
+ let dpr = Math.max(1, window.devicePixelRatio || 1);
4070
+ let needsRedraw = false;
4071
+ // Spatial grid for hover (built once after data load, in normalized world space).
4072
+ let grid = null;
4073
+
4074
+ function setStatus(state, text) {
4075
+ status.classList.remove("streaming", "error");
4076
+ if (state === "streaming") status.classList.add("streaming");
4077
+ if (state === "error") status.classList.add("error");
4078
+ statusText.textContent = text;
4079
+ }
4080
+
4081
+ // ---- WebGL setup -------------------------------------------------------
4082
+ const VS = `
4083
+ attribute vec2 a_pos; // raw int16, normalized via attribPointer (-1..1)
4084
+ attribute float a_cat; // category index (uint8 -> float)
4085
+ uniform vec3 u_xform; // x: scale, y: tx, z: ty
4086
+ uniform float u_pointSize;
4087
+ varying float v_cat;
4088
+ void main() {
4089
+ vec2 world = a_pos * u_xform.x + vec2(u_xform.y, u_xform.z);
4090
+ gl_Position = vec4(world, 0.0, 1.0);
4091
+ gl_PointSize = u_pointSize;
4092
+ v_cat = a_cat;
4093
+ }
4094
+ `;
4095
+ const FS = `
4096
+ precision mediump float;
4097
+ varying float v_cat;
4098
+ uniform sampler2D u_palette;
4099
+ uniform float u_paletteN;
4100
+ uniform float u_alpha;
4101
+ void main() {
4102
+ vec2 d = gl_PointCoord - 0.5;
4103
+ float r = length(d);
4104
+ float aa = smoothstep(0.50, 0.42, r);
4105
+ if (aa <= 0.001) discard;
4106
+ float t = (v_cat + 0.5) / u_paletteN;
4107
+ vec3 color = texture2D(u_palette, vec2(t, 0.5)).rgb;
4108
+ gl_FragColor = vec4(color, aa * u_alpha);
4109
+ }
4110
+ `;
4111
+ function compile(type, src) {
4112
+ const sh = gl.createShader(type);
4113
+ gl.shaderSource(sh, src);
4114
+ gl.compileShader(sh);
4115
+ if (!gl.getShaderParameter(sh, gl.COMPILE_STATUS)) {
4116
+ throw new Error("shader compile: " + gl.getShaderInfoLog(sh));
4117
+ }
4118
+ return sh;
4119
+ }
4120
+ function setupGL() {
4121
+ gl = canvas.getContext("webgl", {
4122
+ antialias: true, alpha: true, premultipliedAlpha: true,
4123
+ preserveDrawingBuffer: false,
4124
+ });
4125
+ if (!gl) throw new Error("WebGL unavailable");
4126
+ program = gl.createProgram();
4127
+ gl.attachShader(program, compile(gl.VERTEX_SHADER, VS));
4128
+ gl.attachShader(program, compile(gl.FRAGMENT_SHADER, FS));
4129
+ gl.linkProgram(program);
4130
+ if (!gl.getProgramParameter(program, gl.LINK_STATUS)) {
4131
+ throw new Error("program link: " + gl.getProgramInfoLog(program));
4132
+ }
4133
+ gl.useProgram(program);
4134
+
4135
+ // Standard premultiplied-alpha additive-ish blending — points blend over
4136
+ // the paper background and over each other cleanly at dense overlaps.
4137
+ gl.enable(gl.BLEND);
4138
+ gl.blendFunc(gl.ONE, gl.ONE_MINUS_SRC_ALPHA);
4139
+ gl.clearColor(1, 1, 1, 0);
4140
+
4141
+ paletteTex = gl.createTexture();
4142
+ gl.bindTexture(gl.TEXTURE_2D, paletteTex);
4143
+ gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.NEAREST);
4144
+ gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.NEAREST);
4145
+ gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_S, gl.CLAMP_TO_EDGE);
4146
+ gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_T, gl.CLAMP_TO_EDGE);
4147
+ }
4148
+
4149
+ function uploadPalette(palette) {
4150
+ const n = palette.length;
4151
+ const buf = new Uint8Array(n * 3);
4152
+ for (let i = 0; i < n; i++) {
4153
+ buf[i*3] = palette[i][0];
4154
+ buf[i*3+1] = palette[i][1];
4155
+ buf[i*3+2] = palette[i][2];
4156
+ }
4157
+ gl.bindTexture(gl.TEXTURE_2D, paletteTex);
4158
+ gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGB, n, 1, 0, gl.RGB, gl.UNSIGNED_BYTE, buf);
4159
+ gl.uniform1f(gl.getUniformLocation(program, "u_paletteN"), n);
4160
+ }
4161
+
4162
+ // ---- Data load ---------------------------------------------------------
4163
+ async function loadData() {
4164
+ setStatus("streaming", "loading…");
4165
+ const t0 = performance.now();
4166
+ const [binResp, labelsResp] = await Promise.all([
4167
+ fetch("/umap"),
4168
+ fetch("/umap_labels"),
4169
+ ]);
4170
+ if (!binResp.ok) throw new Error("fetch /umap failed: " + binResp.status);
4171
+ const buf = await binResp.arrayBuffer();
4172
+ labels = await labelsResp.json();
4173
+
4174
+ // Parse header (matches scripts/gen_fake_umap.py).
4175
+ const hdrU32 = new Uint32Array(buf, 0, 6);
4176
+ const magic = hdrU32[0];
4177
+ if (magic !== 0xCAB0FA1D) throw new Error("bad magic: " + magic.toString(16));
4178
+ n = hdrU32[1];
4179
+ const nSp = hdrU32[2], nBt = hdrU32[3], nSt = hdrU32[4], nPh = hdrU32[5];
4180
+ const hdrF32 = new Float32Array(buf, 24, 4);
4181
+ bounds = [hdrF32[0], hdrF32[1], hdrF32[2], hdrF32[3]];
4182
+
4183
+ let off = 40;
4184
+ const pos16 = new Int16Array(buf, off, n * 2); off += n * 2 * 2;
4185
+ cats.species = new Uint8Array(buf, off, n); off += n;
4186
+ cats.biotype = new Uint8Array(buf, off, n); off += n;
4187
+ cats.strand = new Uint8Array(buf, off, n); off += n;
4188
+ cats.phase = new Uint8Array(buf, off, n); off += n;
4189
+
4190
+ // Upload to GPU.
4191
+ posBuf = gl.createBuffer();
4192
+ gl.bindBuffer(gl.ARRAY_BUFFER, posBuf);
4193
+ gl.bufferData(gl.ARRAY_BUFFER, pos16, gl.STATIC_DRAW);
4194
+ for (const key of ["species", "biotype", "strand", "phase"]) {
4195
+ const b = gl.createBuffer();
4196
+ gl.bindBuffer(gl.ARRAY_BUFFER, b);
4197
+ gl.bufferData(gl.ARRAY_BUFFER, cats[key], gl.STATIC_DRAW);
4198
+ catBufs[key] = b;
4199
+ }
4200
+
4201
+ // Wire attributes (position is constant; category attribute is rebound on toggle).
4202
+ const posLoc = gl.getAttribLocation(program, "a_pos");
4203
+ gl.bindBuffer(gl.ARRAY_BUFFER, posBuf);
4204
+ gl.enableVertexAttribArray(posLoc);
4205
+ // normalize=true → int16 mapped to [-1, 1] in shader — exactly the
4206
+ // quantization we did in the Python packer.
4207
+ gl.vertexAttribPointer(posLoc, 2, gl.SHORT, true, 0, 0);
4208
+
4209
+ // Build spatial grid (in [-1, 1]² normalized world space).
4210
+ buildGrid(pos16);
4211
+
4212
+ elN.textContent = n.toLocaleString("en-US");
4213
+ elN.classList.remove("muted");
4214
+ elNsp.textContent = labels.species.length;
4215
+ elNsp.classList.remove("muted");
4216
+
4217
+ const ms = (performance.now() - t0) | 0;
4218
+ setStatus("idle", `loaded ${(n/1000)|0}k pts · ${ms} ms`);
4219
+ info.textContent = `${n.toLocaleString("en-US")} sequences · ${labels.species.length} eukaryotic species · drag to pan, wheel to zoom`;
4220
+ overlay.classList.add("hidden");
4221
+
4222
+ return pos16;
4223
+ }
4224
+
4225
+ // ---- Spatial grid (hover picking) --------------------------------------
4226
+ // We store, per cell, a list of point indices whose normalized (x,y) falls
4227
+ // in that cell. At hover, look up the cell under the cursor plus the 8
4228
+ // neighbors, then scan for the nearest point within a screen-space radius.
4229
+ const GRID_N = 128;
4230
+ function buildGrid(pos16) {
4231
+ const cells = new Array(GRID_N * GRID_N);
4232
+ for (let i = 0; i < cells.length; i++) cells[i] = null;
4233
+ for (let i = 0; i < n; i++) {
4234
+ // pos16 entries are in [-32767, 32767] → normalize to [0, GRID_N).
4235
+ const x = (pos16[2*i] + 32767) / 65534;
4236
+ const y = (pos16[2*i + 1] + 32767) / 65534;
4237
+ const cx = Math.min(GRID_N - 1, Math.max(0, (x * GRID_N) | 0));
4238
+ const cy = Math.min(GRID_N - 1, Math.max(0, (y * GRID_N) | 0));
4239
+ const id = cy * GRID_N + cx;
4240
+ const list = cells[id];
4241
+ if (list === null) cells[id] = [i];
4242
+ else list.push(i);
4243
+ }
4244
+ grid = cells;
4245
+ }
4246
+
4247
+ // ---- Render ------------------------------------------------------------
4248
+ function resize() {
4249
+ const rect = canvas.getBoundingClientRect();
4250
+ if (rect.width === 0 || rect.height === 0) return false;
4251
+ dpr = Math.max(1, window.devicePixelRatio || 1);
4252
+ const w = Math.round(rect.width * dpr);
4253
+ const h = Math.round(rect.height * dpr);
4254
+ if (canvas.width !== w || canvas.height !== h) {
4255
+ canvas.width = w; canvas.height = h;
4256
+ }
4257
+ gl.viewport(0, 0, w, h);
4258
+ return true;
4259
+ }
4260
+
4261
+ let lastFrameTs = 0, frameCount = 0, fpsTs = 0;
4262
+ function draw() {
4263
+ needsRedraw = false;
4264
+ if (!resize()) return;
4265
+ gl.clear(gl.COLOR_BUFFER_BIT);
4266
+
4267
+ // The vertex shader does world = pos * scale + (tx, ty). We choose scale
4268
+ // so the data (normalized to [-1, 1]) fits in [-0.92, 0.92] of NDC at
4269
+ // zoom 1, with a tiny margin so points at the edge aren't clipped.
4270
+ const baseScale = 0.92;
4271
+ gl.uniform3f(gl.getUniformLocation(program, "u_xform"),
4272
+ baseScale * view.scale, view.tx, view.ty);
4273
+ // Point size scales sub-linearly with zoom — denser areas stay readable
4274
+ // but the dots get visibly bigger when you zoom in.
4275
+ const ps = Math.min(8.0, Math.max(1.4, 1.4 + 0.6 * Math.log2(view.scale + 1))) * dpr;
4276
+ gl.uniform1f(gl.getUniformLocation(program, "u_pointSize"), ps);
4277
+ // Alpha falls off slightly with zoom-out so the dense cloud doesn't burn.
4278
+ const alpha = Math.min(0.85, Math.max(0.35, 0.35 + 0.18 * Math.log2(view.scale + 1)));
4279
+ gl.uniform1f(gl.getUniformLocation(program, "u_alpha"), alpha);
4280
+
4281
+ gl.drawArrays(gl.POINTS, 0, n);
4282
+
4283
+ // FPS counter — sampled, not per-frame.
4284
+ const now = performance.now();
4285
+ frameCount++;
4286
+ if (now - fpsTs > 500) {
4287
+ const fps = (frameCount * 1000) / (now - fpsTs);
4288
+ elFps.textContent = `${fps.toFixed(0)} fps`;
4289
+ elFps.classList.remove("muted");
4290
+ fpsTs = now;
4291
+ frameCount = 0;
4292
+ }
4293
+ lastFrameTs = now;
4294
+ }
4295
+ function requestRedraw() {
4296
+ if (needsRedraw) return;
4297
+ needsRedraw = true;
4298
+ requestAnimationFrame(draw);
4299
+ }
4300
+
4301
+ // ---- Color toggle ------------------------------------------------------
4302
+ function setColorBy(key) {
4303
+ colorBy = key;
4304
+ const catLoc = gl.getAttribLocation(program, "a_cat");
4305
+ gl.bindBuffer(gl.ARRAY_BUFFER, catBufs[key]);
4306
+ gl.enableVertexAttribArray(catLoc);
4307
+ // Unnormalized — we want the raw byte value in the shader.
4308
+ gl.vertexAttribPointer(catLoc, 1, gl.UNSIGNED_BYTE, false, 0, 0);
4309
+ uploadPalette(PALETTES[key]);
4310
+ renderLegend();
4311
+ requestRedraw();
4312
+ }
4313
+
4314
+ // ---- Legend ------------------------------------------------------------
4315
+ function renderLegend() {
4316
+ if (!labels) return;
4317
+ const palette = PALETTES[colorBy];
4318
+ const itemLabels = (colorBy === "species") ? labels.species
4319
+ : (colorBy === "biotype") ? labels.biotypes
4320
+ : (colorBy === "strand") ? labels.strands
4321
+ : labels.phases;
4322
+ legend.innerHTML = itemLabels.map((name, i) => {
4323
+ const [r, g, b] = palette[i % palette.length];
4324
+ return `<span class="item"><span class="swatch" style="background:rgb(${r},${g},${b})"></span>${name}</span>`;
4325
+ }).join("");
4326
+ }
4327
+
4328
+ // ---- Pan / zoom / hover ------------------------------------------------
4329
+ function resetView() { view = { tx: 0, ty: 0, scale: 1 }; requestRedraw(); }
4330
+
4331
+ // Convert a clientX/Y to NDC (-1..1) and to normalized data space ([-1, 1]).
4332
+ function clientToNDC(e) {
4333
+ const rect = canvas.getBoundingClientRect();
4334
+ return {
4335
+ x: ((e.clientX - rect.left) / rect.width) * 2 - 1,
4336
+ y: -((e.clientY - rect.top) / rect.height) * 2 + 1,
4337
+ };
4338
+ }
4339
+ function ndcToData(ndc) {
4340
+ const baseScale = 0.92;
4341
+ return {
4342
+ x: (ndc.x - view.tx) / (baseScale * view.scale),
4343
+ y: (ndc.y - view.ty) / (baseScale * view.scale),
4344
+ };
4345
+ }
4346
+
4347
+ let panning = false, panLast = null;
4348
+ canvas.addEventListener("pointerdown", e => {
4349
+ canvas.setPointerCapture(e.pointerId);
4350
+ panning = true;
4351
+ panLast = { x: e.clientX, y: e.clientY };
4352
+ canvas.classList.add("panning");
4353
+ hideTooltip();
4354
+ });
4355
+ canvas.addEventListener("pointermove", e => {
4356
+ if (panning) {
4357
+ const rect = canvas.getBoundingClientRect();
4358
+ const dx = ((e.clientX - panLast.x) / rect.width) * 2;
4359
+ const dy = -((e.clientY - panLast.y) / rect.height) * 2;
4360
+ view.tx += dx; view.ty += dy;
4361
+ panLast = { x: e.clientX, y: e.clientY };
4362
+ requestRedraw();
4363
+ } else {
4364
+ handleHover(e);
4365
+ }
4366
+ });
4367
+ function endPan(e) {
4368
+ if (!panning) return;
4369
+ panning = false;
4370
+ canvas.classList.remove("panning");
4371
+ try { canvas.releasePointerCapture(e.pointerId); } catch {}
4372
+ }
4373
+ canvas.addEventListener("pointerup", endPan);
4374
+ canvas.addEventListener("pointercancel", endPan);
4375
+ canvas.addEventListener("pointerleave", () => hideTooltip());
4376
+
4377
+ canvas.addEventListener("wheel", e => {
4378
+ e.preventDefault();
4379
+ const ndc = clientToNDC(e);
4380
+ // Zoom factor — natural feeling on both trackpad and mouse wheel.
4381
+ const factor = Math.exp(-e.deltaY * 0.0018);
4382
+ const newScale = Math.min(50, Math.max(0.5, view.scale * factor));
4383
+ const k = newScale / view.scale;
4384
+ // Zoom around the cursor: shift translate so the point under the cursor
4385
+ // stays under the cursor.
4386
+ view.tx = ndc.x - (ndc.x - view.tx) * k;
4387
+ view.ty = ndc.y - (ndc.y - view.ty) * k;
4388
+ view.scale = newScale;
4389
+ requestRedraw();
4390
+ hideTooltip();
4391
+ }, { passive: false });
4392
+
4393
+ resetBtn.addEventListener("click", resetView);
4394
+
4395
+ // ---- Hover picking -----------------------------------------------------
4396
+ function showTooltip(idx, x, y) {
4397
+ const sp = labels.species[cats.species[idx]];
4398
+ const bt = labels.biotypes[cats.biotype[idx]];
4399
+ const st = labels.strands[cats.strand[idx]];
4400
+ const ph = labels.phases[cats.phase[idx]];
4401
+ tooltip.innerHTML =
4402
+ `<div><span class="t-label">species</span>${sp}</div>` +
4403
+ `<div><span class="t-label">biotype</span>${bt}</div>` +
4404
+ `<div><span class="t-label">strand</span>${st} &nbsp; <span class="t-label">phase</span>${ph}</div>`;
4405
+ tooltip.style.left = x + "px";
4406
+ tooltip.style.top = y + "px";
4407
+ tooltip.classList.add("visible");
4408
+ }
4409
+ function hideTooltip() { tooltip.classList.remove("visible"); }
4410
+
4411
+ function handleHover(e) {
4412
+ if (!grid) return;
4413
+ const ndc = clientToNDC(e);
4414
+ const data = ndcToData(ndc);
4415
+ // Convert data-space (-1..1) into grid coords.
4416
+ const gx = (data.x + 1) * 0.5 * GRID_N;
4417
+ const gy = (data.y + 1) * 0.5 * GRID_N;
4418
+ const cx = Math.floor(gx), cy = Math.floor(gy);
4419
+ if (cx < -1 || cx > GRID_N || cy < -1 || cy > GRID_N) return hideTooltip();
4420
+
4421
+ // Adaptive search radius: at higher zoom, we want a tighter pick radius.
4422
+ // ~8px screen radius converted to data space.
4423
+ const rect = canvas.getBoundingClientRect();
4424
+ const screenR = 8;
4425
+ const dataR = (screenR / rect.width) * 2 / (0.92 * view.scale);
4426
+ const dataR2 = dataR * dataR;
4427
+
4428
+ let best = -1, bestD2 = dataR2;
4429
+ const cellSpan = Math.max(1, Math.ceil(dataR * GRID_N * 0.5) + 1);
4430
+ for (let dy = -cellSpan; dy <= cellSpan; dy++) {
4431
+ const yy = cy + dy;
4432
+ if (yy < 0 || yy >= GRID_N) continue;
4433
+ for (let dx = -cellSpan; dx <= cellSpan; dx++) {
4434
+ const xx = cx + dx;
4435
+ if (xx < 0 || xx >= GRID_N) continue;
4436
+ const list = grid[yy * GRID_N + xx];
4437
+ if (!list) continue;
4438
+ for (let k = 0; k < list.length; k++) {
4439
+ const idx = list[k];
4440
+ // Re-derive the point's normalized [-1, 1] position from posSnapshot, the
4441
+ // CPU-side int16 copy of the positions — dividing by 32767 is cheap.
4442
+ const px = posSnapshot[2*idx] / 32767;
4443
+ const py = posSnapshot[2*idx + 1] / 32767;
4444
+ const ex = px - data.x, ey = py - data.y;
4445
+ const d2 = ex*ex + ey*ey;
4446
+ if (d2 < bestD2) { bestD2 = d2; best = idx; }
4447
+ }
4448
+ }
4449
+ }
4450
+ if (best === -1) return hideTooltip();
4451
+ // Place tooltip near cursor, offset to the right & above.
4452
+ const relX = e.clientX - rect.left;
4453
+ const relY = e.clientY - rect.top;
4454
+ showTooltip(best, relX, relY);
4455
+ }
4456
+
4457
+ // We need an unattached CPU-side copy of positions for hover hit-testing
4458
+ // because WebGL buffers aren't readable from JS without a roundtrip.
4459
+ let posSnapshot = null;
4460
+
4461
+ // ---- Bootstrap ---------------------------------------------------------
4462
+ setupGL();
4463
+
4464
+ colorPills.forEach(p => {
4465
+ p.addEventListener("click", () => {
4466
+ colorPills.forEach(x => x.classList.toggle("active", x === p));
4467
+ setColorBy(p.dataset.color);
4468
+ });
4469
+ });
4470
+
4471
+ // Defer loading until the umap section is near the viewport — the 500K-point
4472
+ // payload doesn't need to fight for bandwidth on first paint.
4473
+ const io = new IntersectionObserver(async (entries) => {
4474
+ if (!entries[0].isIntersecting) return;
4475
+ io.disconnect();
4476
+ try {
4477
+ const pos16 = await loadData();
4478
+ posSnapshot = pos16;
4479
+ setColorBy("species"); // initial coloring + first draw
4480
+ } catch (err) {
4481
+ console.error(err);
4482
+ setStatus("error", "load failed");
4483
+ overlay.textContent = "load failed · " + err.message;
4484
+ }
4485
+ }, { rootMargin: "400px" });
4486
+ io.observe(canvas);
4487
+
4488
+ window.addEventListener("resize", () => requestRedraw());
4489
+ })();
4490
+
4491
  // =========================================================================
4492
  // Carbon banner — animated DNA helix (Canvas 2D)
4493
  //
scripts/gen_fake_umap.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate a fake UMAP scatter dataset for prototyping the §6 viewer.
2
+
3
+ The real embedding-UMAP output (from Dana's Carbon 3B pipeline) will land
4
+ in roughly the same shape. Until then, this gives us a 500K-point fixture
5
+ to develop the WebGL frontend at target scale, so we don't discover
6
+ performance cliffs later.
7
+
8
+ Layout strategy:
9
+ - 24 eukaryotic species grouped into 5 kingdoms (vertebrates, invertebrates,
10
+ plants, fungi, protozoa). Kingdom centers are placed evenly around a
11
+ radius-12 circle in UMAP space, with species centers jittered nearby.
12
+ - Each point is drawn from a 2D Gaussian around its species center.
13
+ - Biotype, strand, and codon phase are roughly orthogonal axes — toggling
14
+ color in the UI reveals different organizations of the same points.
15
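+ - Points are output sorted by species so gzip can RLE the species column
16
+ aggressively (typically 90%+ compression on those bytes); the biotype, strand, and phase columns are sampled independently per point, so the sort does not help them.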
17
+
18
+ Binary layout (little-endian, 40-byte header + ~4 MB payload for 500K pts):
19
+ uint32 magic 0xCAB0FA1D
20
+ uint32 n_points
21
+ uint32 n_species
22
+ uint32 n_biotypes
23
+ uint32 n_strands
24
+ uint32 n_phases
25
+ float32 x_min, x_max, y_min, y_max (16 bytes — for de-quantization)
26
+ int16 positions[n_points * 2] (interleaved x,y; quantized)
27
+ uint8 species[n_points]
28
+ uint8 biotype[n_points]
29
+ uint8 strand[n_points]
30
+ uint8 phase[n_points]
31
+
32
+ Usage:
33
+ python scripts/gen_fake_umap.py
34
+ """
35
+ import array
36
+ import json
37
+ import math
38
+ import os
39
+ import random
40
+ import struct
41
+ import sys
42
+ import time
43
+
44
+ # --- Dataset shape ---------------------------------------------------------
45
+
46
N_POINTS = 500_000
SEED = 42

# Species grouped by kingdom. This mapping is the single source of truth for
# the species roster: SPECIES below is derived from it, so a name can never be
# present in one structure and missing from the other.
KINGDOMS = {
    "vertebrates": ["human", "mouse", "rat", "chicken", "zebrafish", "xenopus",
                    "dog", "cow", "pig"],
    "invertebrates": ["fly", "worm", "mosquito", "honeybee", "sea_urchin"],
    "plants": ["arabidopsis", "rice", "maize", "wheat", "soybean"],
    "fungi": ["yeast", "neurospora", "candida"],
    "protozoa": ["plasmodium", "trypanosoma"],
}

# 24 eukaryotic species — matches the rough scale of the real run (24 species
# from Leandro's Gemini-annotated set, 500/species in Dana's first batch).
# Flattened in kingdom order (dicts preserve insertion order), which yields
# exactly the same sequence the hand-written list used to spell out.
SPECIES = [sp for members in KINGDOMS.values() for sp in members]

BIOTYPES = ["protein_coding", "lncRNA", "miRNA", "pseudogene"]
# Roughly realistic frequencies for a mixed eukaryote set.
# One weight per BIOTYPES entry, in the same order.
BIOTYPE_WEIGHTS = [0.70, 0.15, 0.08, 0.07]

STRANDS = ["+", "-"]
PHASES = ["0", "1", "2"]

# Output directory: the "data" folder next to this script's parent directory.
HERE = os.path.dirname(os.path.abspath(__file__))
DATA = os.path.join(os.path.dirname(HERE), "data")
83
+
84
+ # --- Generation ------------------------------------------------------------
85
+
86
+
87
def species_centers():
    """Return a ``{species: (x, y)}`` map of cluster centers in UMAP space.

    Every kingdom in ``KINGDOMS`` gets an anchor on a radius-12 circle, the
    anchors being evenly spaced by angle. Each member species is then placed
    at its kingdom's anchor plus an independent Gaussian offset (sigma 1.8)
    on each axis. Draws come from the module-level ``random`` state, so the
    caller must seed it beforehand for reproducible output.
    """
    ring_radius = 12.0
    jitter_sigma = 1.8
    n_kingdoms = len(KINGDOMS)

    placed = {}
    for slot, members in enumerate(KINGDOMS.values()):
        angle = 2 * math.pi * slot / n_kingdoms
        anchor_x = math.cos(angle) * ring_radius
        anchor_y = math.sin(angle) * ring_radius
        for name in members:
            # x jitter is drawn before y; the order fixes the seeded output.
            dx = random.gauss(0, jitter_sigma)
            dy = random.gauss(0, jitter_sigma)
            placed[name] = (anchor_x + dx, anchor_y + dy)
    return placed
100
+
101
+
102
def weighted_choice_idx(weights):
    """Pick an index at random, with probability proportional to its weight.

    Hand-rolled replacement for ``random.choices()`` (the original author
    notes it is faster in tight loops). Consumes exactly one
    ``random.random()`` draw and walks the cumulative sum of ``weights``
    until the draw falls below it.

    The weights are used as given, not normalized. If the draw is never
    below the running total (weights summing to less than 1, or float
    round-off), the last index is returned; for an empty sequence that
    fallback is ``-1``.
    """
    draw = random.random()
    cumulative = 0.0
    for idx, weight in enumerate(weights):
        cumulative += weight
        if draw < cumulative:
            break
    else:
        # Loop ran to completion without a hit: fall back to the last slot.
        idx = len(weights) - 1
    return idx
111
+
112
+
113
def generate():
    """Build the whole fake dataset as six parallel per-point columns.

    Seeds the module-level RNG with ``SEED``, picks a center for every
    species, then emits points species by species (so the output is sorted
    by species code). ``N_POINTS`` is split evenly across ``SPECIES``; when it
    does not divide exactly, the first few species receive one extra point.

    Returns:
        ``(xs, ys, sp_col, bt_col, st_col, ph_col)`` where ``xs``/``ys`` are
        float32 ``array.array`` coordinates and the remaining four are
        ``bytearray`` columns holding the species, biotype, strand and phase
        code of each point.
    """
    random.seed(SEED)
    centers = species_centers()

    xs, ys = array.array("f"), array.array("f")
    sp_col, bt_col = bytearray(), bytearray()
    st_col, ph_col = bytearray(), bytearray()

    per_species, remainder = divmod(N_POINTS, len(SPECIES))

    for code, name in enumerate(SPECIES):
        center_x, center_y = centers[name]
        count = per_species + 1 if code < remainder else per_species

        # The species code is constant for the whole run and involves no
        # randomness, so the column can be filled in one shot.
        sp_col += bytes([code]) * count

        for _ in range(count):
            # Draw order (x, y, biotype, strand, phase) is part of the
            # seeded output; do not reorder these calls.
            xs.append(center_x + random.gauss(0, 1.2))
            ys.append(center_y + random.gauss(0, 1.2))
            bt_col.append(weighted_choice_idx(BIOTYPE_WEIGHTS))
            st_col.append(random.randrange(2))
            ph_col.append(random.randrange(3))

    return xs, ys, sp_col, bt_col, st_col, ph_col
141
+
142
+
143
def _axis_quantizer(lo, hi):
    """Return ``(scale, offset)`` mapping ``[lo, hi]`` onto int16 ``[-32767, 32767]``.

    A value ``v`` is quantized as ``round((v - lo) * scale - offset)``. For a
    zero-width axis (``lo == hi``) the pair ``(0.0, 0)`` is returned, which
    sends every value to 0 instead of dividing by zero.
    """
    span = hi - lo
    if span:
        return 65534.0 / span, 32767
    return 0.0, 0


def pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col):
    """Pack parallel arrays into the binary layout described in the docstring.

    Layout, all little-endian: a 40-byte header (six uint32 — magic, point
    count, and the sizes of the four label vocabularies — followed by four
    float32 bounds), then interleaved int16 ``x, y`` positions, then the four
    uint8 category columns in the order species, biotype, strand, phase.

    Args:
        xs, ys: per-point coordinates (sequences of floats).
        sp_col, bt_col, st_col, ph_col: per-point category codes, each
            convertible with ``bytes(...)``.

    Returns:
        ``(buf, bounds)`` — the packed ``bytearray`` and the
        ``(x_min, x_max, y_min, y_max)`` tuple used for quantization.

    Raises:
        ValueError: if the six columns do not all have the same length.

    Edge cases: empty input produces a header-only buffer with all-zero
    bounds; an axis on which every point shares one value is quantized to 0
    rather than raising ``ZeroDivisionError``.
    """
    n = len(xs)
    if any(len(col) != n for col in (ys, sp_col, bt_col, st_col, ph_col)):
        raise ValueError("all per-point columns must have the same length")

    if n:
        x_min, x_max = min(xs), max(xs)
        y_min, y_max = min(ys), max(ys)
    else:
        # min()/max() reject empty sequences; fall back to neutral bounds.
        x_min = x_max = y_min = y_max = 0.0

    # Map floats into int16 range [-32767, 32767] using the bounds (so the
    # client can reconstitute float coords if it ever needs to).
    x_scale, x_offset = _axis_quantizer(x_min, x_max)
    y_scale, y_offset = _axis_quantizer(y_min, y_max)

    # Quantize and interleave xy in a single pass (WebGL friendly: stride
    # 4 bytes per point).
    pos = array.array("h")
    for x, y in zip(xs, ys):
        pos.append(round((x - x_min) * x_scale - x_offset))
        pos.append(round((y - y_min) * y_scale - y_offset))

    # array.tobytes() emits native byte order, but the file format is
    # little-endian; swap on big-endian hosts so the payload matches the header.
    if sys.byteorder != "little":
        pos.byteswap()

    buf = bytearray()
    # Header — 40 bytes (6 uint32 + 4 float32).
    buf += struct.pack("<6I",
                       0xCAB0FA1D, n,
                       len(SPECIES), len(BIOTYPES),
                       len(STRANDS), len(PHASES))
    buf += struct.pack("<4f", x_min, x_max, y_min, y_max)

    buf += pos.tobytes()
    buf += bytes(sp_col)
    buf += bytes(bt_col)
    buf += bytes(st_col)
    buf += bytes(ph_col)

    return buf, (x_min, x_max, y_min, y_max)
180
+
181
+
182
def main():
    """Generate the fixture and write it into the ``DATA`` directory.

    Produces ``umap.bin`` (the packed binary from ``pack_binary``) and
    ``umap_labels.json`` (label vocabularies, the species-to-kingdom map,
    quantization bounds, point count and a ``"fake": True`` marker).
    Progress messages and timings are printed to stderr.
    """
    def report(message):
        # All progress output goes to stderr.
        print(message, file=sys.stderr)

    os.makedirs(DATA, exist_ok=True)

    started = time.time()
    report(f"generating {N_POINTS:,} fake UMAP points ...")
    xs, ys, sp_col, bt_col, st_col, ph_col = generate()
    report(f" done in {time.time() - started:.1f}s")

    pack_started = time.time()
    report("packing binary ...")
    buf, bounds = pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col)
    report(f" done in {time.time() - pack_started:.1f}s")

    bin_path = os.path.join(DATA, "umap.bin")
    with open(bin_path, "wb") as handle:
        handle.write(buf)
    report(f" wrote {bin_path} ({len(buf):,} bytes uncompressed)")

    labels_path = os.path.join(DATA, "umap_labels.json")
    # Invert KINGDOMS (kingdom -> members) into species -> kingdom.
    kingdom_of = {}
    for kingdom, members in KINGDOMS.items():
        for member in members:
            kingdom_of[member] = kingdom
    with open(labels_path, "w") as handle:
        json.dump({
            "species": SPECIES,
            "biotypes": BIOTYPES,
            "strands": STRANDS,
            "phases": PHASES,
            "species_kingdom": kingdom_of,
            "kingdoms": list(KINGDOMS.keys()),
            "bounds": list(bounds),
            "n_points": len(xs),
            "fake": True,
        }, handle, indent=2)
    report(f" wrote {labels_path}")
    report(f"total: {time.time() - started:.1f}s")


if __name__ == "__main__":
    main()