Spaces:
Running
Running
Merge feature/umap-scatter into feat/folding-3d (local-only)
Browse files
Combines the §6 UMAP scatter and §5 Folding work into one branch for
end-to-end local testing. To be split back into two separate PRs.
demo.html conflict resolution: kept §6 UMAP IIFE from umap-scatter,
kept the Canvas 2D banner comment from folding-3d (perf branch
already merged into main). No code changes, just a comment block.
Co-authored-by: Cursor <cursoragent@cursor.com>
- app.py +28 -0
- data/umap.bin +3 -0
- data/umap_labels.json +84 -0
- demo.html +617 -9
- scripts/gen_fake_umap.py +219 -0
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import os
|
|
| 4 |
|
| 5 |
import httpx
|
| 6 |
from fastapi import FastAPI, Request
|
|
|
|
| 7 |
from fastapi.responses import FileResponse, StreamingResponse
|
| 8 |
from fastapi.staticfiles import StaticFiles
|
| 9 |
from openai import OpenAI
|
|
@@ -69,6 +70,11 @@ def left_pad_to_six(seq: str) -> tuple[str, int]:
|
|
| 69 |
|
| 70 |
|
| 71 |
app = FastAPI()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
app.mount("/img", StaticFiles(directory=os.path.join(HERE, "img")), name="img")
|
| 73 |
|
| 74 |
|
|
@@ -108,6 +114,28 @@ def species():
|
|
| 108 |
return FileResponse(os.path.join(HERE, "data", "species.json"), media_type="application/json")
|
| 109 |
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
@app.post("/score")
|
| 112 |
async def score(request: Request):
|
| 113 |
"""Return per-token logprobs over a (forced) sequence using echo=True.
|
|
|
|
| 4 |
|
| 5 |
import httpx
|
| 6 |
from fastapi import FastAPI, Request
|
| 7 |
+
from fastapi.middleware.gzip import GZipMiddleware
|
| 8 |
from fastapi.responses import FileResponse, StreamingResponse
|
| 9 |
from fastapi.staticfiles import StaticFiles
|
| 10 |
from openai import OpenAI
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
app = FastAPI()
|
| 73 |
+
# Compress responses >= 1 KB. Mostly aimed at /umap (~4 MB binary blob
|
| 74 |
+
# → ~2 MB on the wire) and the JSON gene/variant/species catalogs.
|
| 75 |
+
# compresslevel=6 is the gzip(1) system default — within ~3% of level 9
|
| 76 |
+
# in ratio but ~5x cheaper in CPU. Worth it on every request.
|
| 77 |
+
app.add_middleware(GZipMiddleware, minimum_size=1024, compresslevel=6)
|
| 78 |
app.mount("/img", StaticFiles(directory=os.path.join(HERE, "img")), name="img")
|
| 79 |
|
| 80 |
|
|
|
|
| 114 |
return FileResponse(os.path.join(HERE, "data", "species.json"), media_type="application/json")
|
| 115 |
|
| 116 |
|
| 117 |
+
@app.get("/umap")
def umap():
    """Serve the packed §6 scatter blob (``data/umap.bin``) as raw bytes.

    The payload holds int16 point positions followed by uint8 category
    columns. The §6 frontend reads the response as an ArrayBuffer and hands
    it directly to WebGL, so there is no JSON parsing and no per-point
    allocation on the client. The byte layout is defined by
    scripts/gen_fake_umap.py.
    """
    blob_path = os.path.join(HERE, "data", "umap.bin")
    return FileResponse(blob_path, media_type="application/octet-stream")
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
@app.get("/umap_labels")
def umap_labels():
    """Serve ``data/umap_labels.json`` as ``application/json``.

    This is the label catalog that accompanies the ``/umap`` blob (category
    names such as species, biotypes, strands and phases, plus the world
    bounds and point count).
    """
    labels_path = os.path.join(HERE, "data", "umap_labels.json")
    return FileResponse(labels_path, media_type="application/json")
|
| 137 |
+
|
| 138 |
+
|
| 139 |
@app.post("/score")
|
| 140 |
async def score(request: Request):
|
| 141 |
"""Return per-token logprobs over a (forced) sequence using echo=True.
|
data/umap.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c8de51da32a9fa05d12b8b580b8dad36bbe1a257ee6a721b18b4030c12daa92
|
| 3 |
+
size 4000040
|
data/umap_labels.json
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"species": [
|
| 3 |
+
"human",
|
| 4 |
+
"mouse",
|
| 5 |
+
"rat",
|
| 6 |
+
"chicken",
|
| 7 |
+
"zebrafish",
|
| 8 |
+
"xenopus",
|
| 9 |
+
"dog",
|
| 10 |
+
"cow",
|
| 11 |
+
"pig",
|
| 12 |
+
"fly",
|
| 13 |
+
"worm",
|
| 14 |
+
"mosquito",
|
| 15 |
+
"honeybee",
|
| 16 |
+
"sea_urchin",
|
| 17 |
+
"arabidopsis",
|
| 18 |
+
"rice",
|
| 19 |
+
"maize",
|
| 20 |
+
"wheat",
|
| 21 |
+
"soybean",
|
| 22 |
+
"yeast",
|
| 23 |
+
"neurospora",
|
| 24 |
+
"candida",
|
| 25 |
+
"plasmodium",
|
| 26 |
+
"trypanosoma"
|
| 27 |
+
],
|
| 28 |
+
"biotypes": [
|
| 29 |
+
"protein_coding",
|
| 30 |
+
"lncRNA",
|
| 31 |
+
"miRNA",
|
| 32 |
+
"pseudogene"
|
| 33 |
+
],
|
| 34 |
+
"strands": [
|
| 35 |
+
"+",
|
| 36 |
+
"-"
|
| 37 |
+
],
|
| 38 |
+
"phases": [
|
| 39 |
+
"0",
|
| 40 |
+
"1",
|
| 41 |
+
"2"
|
| 42 |
+
],
|
| 43 |
+
"species_kingdom": {
|
| 44 |
+
"human": "vertebrates",
|
| 45 |
+
"mouse": "vertebrates",
|
| 46 |
+
"rat": "vertebrates",
|
| 47 |
+
"chicken": "vertebrates",
|
| 48 |
+
"zebrafish": "vertebrates",
|
| 49 |
+
"xenopus": "vertebrates",
|
| 50 |
+
"dog": "vertebrates",
|
| 51 |
+
"cow": "vertebrates",
|
| 52 |
+
"pig": "vertebrates",
|
| 53 |
+
"fly": "invertebrates",
|
| 54 |
+
"worm": "invertebrates",
|
| 55 |
+
"mosquito": "invertebrates",
|
| 56 |
+
"honeybee": "invertebrates",
|
| 57 |
+
"sea_urchin": "invertebrates",
|
| 58 |
+
"arabidopsis": "plants",
|
| 59 |
+
"rice": "plants",
|
| 60 |
+
"maize": "plants",
|
| 61 |
+
"wheat": "plants",
|
| 62 |
+
"soybean": "plants",
|
| 63 |
+
"yeast": "fungi",
|
| 64 |
+
"neurospora": "fungi",
|
| 65 |
+
"candida": "fungi",
|
| 66 |
+
"plasmodium": "protozoa",
|
| 67 |
+
"trypanosoma": "protozoa"
|
| 68 |
+
},
|
| 69 |
+
"kingdoms": [
|
| 70 |
+
"vertebrates",
|
| 71 |
+
"invertebrates",
|
| 72 |
+
"plants",
|
| 73 |
+
"fungi",
|
| 74 |
+
"protozoa"
|
| 75 |
+
],
|
| 76 |
+
"bounds": [
|
| 77 |
+
-19.07978630065918,
|
| 78 |
+
17.736162185668945,
|
| 79 |
+
-15.417572021484375,
|
| 80 |
+
16.83187484741211
|
| 81 |
+
],
|
| 82 |
+
"n_points": 500000,
|
| 83 |
+
"fake": true
|
| 84 |
+
}
|
demo.html
CHANGED
|
@@ -291,6 +291,77 @@
|
|
| 291 |
font-size: 9px; color: #aaa; margin-bottom: 8px;
|
| 292 |
}
|
| 293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
/* --- Gene-completion specifics (§1) --- */
|
| 295 |
.gene-info {
|
| 296 |
font-family: "JetBrains Mono", monospace;
|
|
@@ -1081,21 +1152,57 @@
|
|
| 1081 |
</section>
|
| 1082 |
|
| 1083 |
<!-- ============================================================ -->
|
| 1084 |
-
<!-- §6 — UMAP (
|
| 1085 |
<!-- ============================================================ -->
|
| 1086 |
<section id="umap">
|
| 1087 |
-
<div class="section-num">§6 ·
|
| 1088 |
<div class="section-title">The genome, organized</div>
|
| 1089 |
<p class="lede">
|
| 1090 |
-
Embed
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
</p>
|
| 1095 |
|
| 1096 |
-
<div class="
|
| 1097 |
-
<div class="
|
| 1098 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1099 |
</div>
|
| 1100 |
</section>
|
| 1101 |
|
|
@@ -3880,6 +3987,507 @@ function loadGenes() {
|
|
| 3880 |
}
|
| 3881 |
})();
|
| 3882 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3883 |
// =========================================================================
|
| 3884 |
// Carbon banner — animated DNA helix (Canvas 2D)
|
| 3885 |
//
|
|
|
|
| 291 |
font-size: 9px; color: #aaa; margin-bottom: 8px;
|
| 292 |
}
|
| 293 |
|
| 294 |
+
/* --- UMAP scatter specifics (§6) --- */
|
| 295 |
+
.umap-frame {
|
| 296 |
+
position: relative;
|
| 297 |
+
width: 100%;
|
| 298 |
+
aspect-ratio: 16 / 10;
|
| 299 |
+
background: #fff;
|
| 300 |
+
border: 1px solid #eee;
|
| 301 |
+
overflow: hidden;
|
| 302 |
+
}
|
| 303 |
+
.umap-canvas {
|
| 304 |
+
position: absolute; inset: 0;
|
| 305 |
+
width: 100%; height: 100%;
|
| 306 |
+
display: block;
|
| 307 |
+
cursor: grab;
|
| 308 |
+
touch-action: none;
|
| 309 |
+
}
|
| 310 |
+
.umap-canvas.panning { cursor: grabbing; }
|
| 311 |
+
.umap-tooltip {
|
| 312 |
+
position: absolute;
|
| 313 |
+
pointer-events: none;
|
| 314 |
+
background: #1f1f1d; color: #f7f5ee;
|
| 315 |
+
font-family: "JetBrains Mono", monospace;
|
| 316 |
+
font-size: 10px; line-height: 1.4;
|
| 317 |
+
padding: 6px 9px;
|
| 318 |
+
border-radius: 2px;
|
| 319 |
+
white-space: nowrap;
|
| 320 |
+
opacity: 0;
|
| 321 |
+
transform: translate(8px, -100%);
|
| 322 |
+
transition: opacity 0.12s;
|
| 323 |
+
z-index: 4;
|
| 324 |
+
}
|
| 325 |
+
.umap-tooltip.visible { opacity: 0.96; }
|
| 326 |
+
.umap-tooltip .t-label {
|
| 327 |
+
color: #8c918b;
|
| 328 |
+
text-transform: uppercase; letter-spacing: 1px;
|
| 329 |
+
font-size: 8px;
|
| 330 |
+
margin-right: 4px;
|
| 331 |
+
}
|
| 332 |
+
.umap-status-overlay {
|
| 333 |
+
position: absolute; inset: 0;
|
| 334 |
+
display: flex; align-items: center; justify-content: center;
|
| 335 |
+
color: #aaa;
|
| 336 |
+
font-family: "JetBrains Mono", monospace;
|
| 337 |
+
font-size: 11px; letter-spacing: 1.5px;
|
| 338 |
+
text-transform: uppercase;
|
| 339 |
+
background: rgba(247, 245, 238, 0.85);
|
| 340 |
+
pointer-events: none;
|
| 341 |
+
transition: opacity 0.2s;
|
| 342 |
+
}
|
| 343 |
+
.umap-status-overlay.hidden { opacity: 0; }
|
| 344 |
+
.umap-legend {
|
| 345 |
+
display: flex; flex-wrap: wrap;
|
| 346 |
+
gap: 6px 14px;
|
| 347 |
+
margin-top: 10px;
|
| 348 |
+
font-family: "JetBrains Mono", monospace;
|
| 349 |
+
font-size: 10px;
|
| 350 |
+
color: #666;
|
| 351 |
+
}
|
| 352 |
+
.umap-legend .swatch {
|
| 353 |
+
display: inline-block;
|
| 354 |
+
width: 9px; height: 9px;
|
| 355 |
+
margin-right: 5px;
|
| 356 |
+
vertical-align: middle;
|
| 357 |
+
border-radius: 2px;
|
| 358 |
+
}
|
| 359 |
+
.umap-legend .item {
|
| 360 |
+
display: inline-flex;
|
| 361 |
+
align-items: center;
|
| 362 |
+
cursor: default;
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
/* --- Gene-completion specifics (§1) --- */
|
| 366 |
.gene-info {
|
| 367 |
font-family: "JetBrains Mono", monospace;
|
|
|
|
| 1152 |
</section>
|
| 1153 |
|
| 1154 |
<!-- ============================================================ -->
|
| 1155 |
+
<!-- §6 — UMAP (interactive scatter) -->
|
| 1156 |
<!-- ============================================================ -->
|
| 1157 |
<section id="umap">
|
| 1158 |
+
<div class="section-num">§6 · Embedding space</div>
|
| 1159 |
<div class="section-title">The genome, organized</div>
|
| 1160 |
<p class="lede">
|
| 1161 |
+
Embed half a million sequences from 24 eukaryotes with Carbon, project to 2D
|
| 1162 |
+
with UMAP, color by anything. Switch the coloring and a completely different
|
| 1163 |
+
organization emerges from the same points — the model's embedding space
|
| 1164 |
+
carries multiple axes of biology at once, none of which were ever labeled.
|
| 1165 |
</p>
|
| 1166 |
|
| 1167 |
+
<div class="demo" id="demoUmap">
|
| 1168 |
+
<div class="demo-toolbar">
|
| 1169 |
+
<span>color by</span>
|
| 1170 |
+
<span id="dumap-color-pills" class="pills">
|
| 1171 |
+
<button class="pill active" data-color="species">species</button>
|
| 1172 |
+
<button class="pill" data-color="biotype">biotype</button>
|
| 1173 |
+
<button class="pill" data-color="strand">strand</button>
|
| 1174 |
+
<button class="pill" data-color="phase">codon phase</button>
|
| 1175 |
+
</span>
|
| 1176 |
+
<span class="spacer"></span>
|
| 1177 |
+
<button id="dumap-reset" class="action">↺ reset view</button>
|
| 1178 |
+
<span class="status" id="dumap-status"><span class="dot"></span><span>idle</span></span>
|
| 1179 |
+
</div>
|
| 1180 |
+
|
| 1181 |
+
<div class="gene-info" id="dumap-info">scroll to zoom · drag to pan · hover for details</div>
|
| 1182 |
+
|
| 1183 |
+
<div class="umap-frame">
|
| 1184 |
+
<canvas class="umap-canvas" id="dumap-canvas"></canvas>
|
| 1185 |
+
<div class="umap-tooltip" id="dumap-tooltip"></div>
|
| 1186 |
+
<div class="umap-status-overlay" id="dumap-overlay">loading 500K points · ~2 MB gzipped</div>
|
| 1187 |
+
</div>
|
| 1188 |
+
|
| 1189 |
+
<div class="umap-legend" id="dumap-legend"></div>
|
| 1190 |
+
|
| 1191 |
+
<div class="stat-row" id="dumap-stats">
|
| 1192 |
+
<div class="stat-pair"><span class="stat-pair-label">points</span><span class="stat-pair-val muted" id="dumap-n">—</span></div>
|
| 1193 |
+
<div class="stat-pair"><span class="stat-pair-label">species</span><span class="stat-pair-val muted" id="dumap-nsp">—</span></div>
|
| 1194 |
+
<div class="stat-pair"><span class="stat-pair-label">embedding dim</span><span class="stat-pair-val muted">3072</span></div>
|
| 1195 |
+
<div class="stat-pair"><span class="stat-pair-label">render</span><span class="stat-pair-val muted" id="dumap-fps">—</span></div>
|
| 1196 |
+
</div>
|
| 1197 |
+
</div>
|
| 1198 |
+
|
| 1199 |
+
<div class="takeaway">
|
| 1200 |
+
<strong>What to look for</strong>
|
| 1201 |
+
Switch coloring from <em>species</em> to <em>biotype</em>: same points, completely
|
| 1202 |
+
different organization emerges. The five rough macro-clusters trace the eukaryotic
|
| 1203 |
+
kingdoms — vertebrates, invertebrates, plants, fungi, protozoa — discovered from
|
| 1204 |
+
raw sequence alone. <em>The current 500K dataset is synthetic, awaiting the real
|
| 1205 |
+
Carbon 3B embeddings.</em>
|
| 1206 |
</div>
|
| 1207 |
</section>
|
| 1208 |
|
|
|
|
| 3987 |
}
|
| 3988 |
})();
|
| 3989 |
|
| 3990 |
+
// =========================================================================
|
| 3991 |
+
// §6 — UMAP scatter (WebGL, 500K points)
|
| 3992 |
+
//
|
| 3993 |
+
// Loads a binary-packed scatter (int16 quantized positions + 4 uint8 category
|
| 3994 |
+
// columns) and renders it via WebGL gl.POINTS with a 1D palette texture for
|
| 3995 |
+
// coloring. Toggling between coloring axes (species / biotype / strand / phase)
|
| 3996 |
+
// rebinds a single byte-attribute buffer and swaps the palette texture — no
|
| 3997 |
+
// re-upload of the 500K vertex stream. Hover lookup uses a flat grid index
|
| 3998 |
+
// so picking stays O(small) regardless of total point count.
|
| 3999 |
+
// =========================================================================
|
| 4000 |
+
(function initDemoUmap() {
|
| 4001 |
+
const canvas = document.getElementById("dumap-canvas");
|
| 4002 |
+
if (!canvas) return;
|
| 4003 |
+
const tooltip = document.getElementById("dumap-tooltip");
|
| 4004 |
+
const overlay = document.getElementById("dumap-overlay");
|
| 4005 |
+
const info = document.getElementById("dumap-info");
|
| 4006 |
+
const legend = document.getElementById("dumap-legend");
|
| 4007 |
+
const resetBtn = document.getElementById("dumap-reset");
|
| 4008 |
+
const status = document.getElementById("dumap-status");
|
| 4009 |
+
const statusText = status.querySelector("span:last-child");
|
| 4010 |
+
const colorPills = document.querySelectorAll("#dumap-color-pills .pill");
|
| 4011 |
+
const elN = document.getElementById("dumap-n");
|
| 4012 |
+
const elNsp = document.getElementById("dumap-nsp");
|
| 4013 |
+
const elFps = document.getElementById("dumap-fps");
|
| 4014 |
+
|
| 4015 |
+
// ---- Palettes ----------------------------------------------------------
|
| 4016 |
+
// 24 species are grouped into 5 kingdoms — each kingdom gets a hue band.
|
| 4017 |
+
// Within a band, lightness varies to keep adjacent species distinguishable.
|
| 4018 |
+
const SPECIES_PALETTE = [
|
| 4019 |
+
// vertebrates (9) — blue/indigo band
|
| 4020 |
+
[69,117,180],[97,144,200],[125,170,220],[153,194,240],
|
| 4021 |
+
[120,90,170],[140,110,190],
|
| 4022 |
+
[80,90,150],[100,110,170],[120,130,190],
|
| 4023 |
+
// invertebrates (5) — orange band
|
| 4024 |
+
[217,95,2],[230,120,30],[240,150,60],[250,180,90],[253,210,120],
|
| 4025 |
+
// plants (5) — olive/lime band (intentionally different from Carbon's
|
| 4026 |
+
// signal-green #317f3f so the UI chrome doesn't blend with the data)
|
| 4027 |
+
[85,140,55],[115,165,75],[145,195,100],[175,220,135],[205,240,170],
|
| 4028 |
+
// fungi (3) — magenta/rose band
|
| 4029 |
+
[200,40,120],[220,80,140],[240,130,170],
|
| 4030 |
+
// protozoa (2) — gold band
|
| 4031 |
+
[200,150,30],[230,180,60],
|
| 4032 |
+
];
|
| 4033 |
+
const BIOTYPE_PALETTE = [
|
| 4034 |
+
[49,127,63], // protein_coding — Carbon green
|
| 4035 |
+
[188,46,37], // lncRNA — Carbon red
|
| 4036 |
+
[70,90,140], // miRNA — slate blue
|
| 4037 |
+
[170,170,170], // pseudogene — neutral gray
|
| 4038 |
+
];
|
| 4039 |
+
const STRAND_PALETTE = [
|
| 4040 |
+
[49,127,63], // + (forward)
|
| 4041 |
+
[188,46,37], // - (reverse)
|
| 4042 |
+
];
|
| 4043 |
+
// 3-step ordinal palette (viridis-ish endpoints) — codon phase 0/1/2.
|
| 4044 |
+
const PHASE_PALETTE = [
|
| 4045 |
+
[68,1,84], [33,144,140], [253,231,37],
|
| 4046 |
+
];
|
| 4047 |
+
const PALETTES = {
|
| 4048 |
+
species: SPECIES_PALETTE,
|
| 4049 |
+
biotype: BIOTYPE_PALETTE,
|
| 4050 |
+
strand: STRAND_PALETTE,
|
| 4051 |
+
phase: PHASE_PALETTE,
|
| 4052 |
+
};
|
| 4053 |
+
|
| 4054 |
+
// ---- State -------------------------------------------------------------
|
| 4055 |
+
let gl, program;
|
| 4056 |
+
let posBuf; // int16 interleaved x,y
|
| 4057 |
+
let catBufs = {}; // { species|biotype|strand|phase: GLBuffer of uint8 }
|
| 4058 |
+
let paletteTex;
|
| 4059 |
+
let n = 0;
|
| 4060 |
+
let labels = null; // { species:[], biotypes:[], strands:[], phases:[], bounds:[xmin,xmax,ymin,ymax] }
|
| 4061 |
+
// Raw category bytes — kept on CPU side too for tooltip lookups.
|
| 4062 |
+
let cats = { species: null, biotype: null, strand: null, phase: null };
|
| 4063 |
+
// World bounds + current colorBy axis.
|
| 4064 |
+
let bounds = [0,0,0,0];
|
| 4065 |
+
let colorBy = "species";
|
| 4066 |
+
// Viewport: translate (tx, ty) + scale around origin, in NDC space.
|
| 4067 |
+
// The whole world is fit into [-0.92, 0.92]² at initial zoom.
|
| 4068 |
+
let view = { tx: 0, ty: 0, scale: 1 };
|
| 4069 |
+
let dpr = Math.max(1, window.devicePixelRatio || 1);
|
| 4070 |
+
let needsRedraw = false;
|
| 4071 |
+
// Spatial grid for hover (built once after data load, in normalized world space).
|
| 4072 |
+
let grid = null;
|
| 4073 |
+
|
| 4074 |
+
// Update the toolbar status pill: clear any previous "streaming"/"error"
// modifier class, apply the one matching `state` (any other state leaves the
// pill unstyled), and show `text` as the status label.
function setStatus(state, text) {
  const classes = status.classList;
  classes.remove("streaming", "error");
  if (state === "streaming" || state === "error") {
    classes.add(state);
  }
  statusText.textContent = text;
}
|
| 4080 |
+
|
| 4081 |
+
// ---- WebGL setup -------------------------------------------------------
|
| 4082 |
+
const VS = `
|
| 4083 |
+
attribute vec2 a_pos; // raw int16, normalized via attribPointer (-1..1)
|
| 4084 |
+
attribute float a_cat; // category index (uint8 -> float)
|
| 4085 |
+
uniform vec3 u_xform; // x: scale, y: tx, z: ty
|
| 4086 |
+
uniform float u_pointSize;
|
| 4087 |
+
varying float v_cat;
|
| 4088 |
+
void main() {
|
| 4089 |
+
vec2 world = a_pos * u_xform.x + vec2(u_xform.y, u_xform.z);
|
| 4090 |
+
gl_Position = vec4(world, 0.0, 1.0);
|
| 4091 |
+
gl_PointSize = u_pointSize;
|
| 4092 |
+
v_cat = a_cat;
|
| 4093 |
+
}
|
| 4094 |
+
`;
|
| 4095 |
+
const FS = `
|
| 4096 |
+
precision mediump float;
|
| 4097 |
+
varying float v_cat;
|
| 4098 |
+
uniform sampler2D u_palette;
|
| 4099 |
+
uniform float u_paletteN;
|
| 4100 |
+
uniform float u_alpha;
|
| 4101 |
+
void main() {
|
| 4102 |
+
vec2 d = gl_PointCoord - 0.5;
|
| 4103 |
+
float r = length(d);
|
| 4104 |
+
float aa = smoothstep(0.50, 0.42, r);
|
| 4105 |
+
if (aa <= 0.001) discard;
|
| 4106 |
+
float t = (v_cat + 0.5) / u_paletteN;
|
| 4107 |
+
vec3 color = texture2D(u_palette, vec2(t, 0.5)).rgb;
|
| 4108 |
+
gl_FragColor = vec4(color, aa * u_alpha);
|
| 4109 |
+
}
|
| 4110 |
+
`;
|
| 4111 |
+
// Compile a single shader stage of the given `type` from GLSL source `src`.
// Returns the compiled shader; throws an Error carrying the driver's info log
// when compilation fails.
function compile(type, src) {
  const shader = gl.createShader(type);
  gl.shaderSource(shader, src);
  gl.compileShader(shader);
  const compiled = gl.getShaderParameter(shader, gl.COMPILE_STATUS);
  if (compiled) return shader;
  throw new Error("shader compile: " + gl.getShaderInfoLog(shader));
}
|
| 4120 |
+
// Create the WebGL context, build and activate the point-sprite program,
// configure blending, and allocate the palette texture.
// Throws if WebGL is unavailable or the program fails to compile/link.
function setupGL() {
  const contextOptions = {
    antialias: true,
    alpha: true,
    premultipliedAlpha: true,
    preserveDrawingBuffer: false,
  };
  gl = canvas.getContext("webgl", contextOptions);
  if (!gl) throw new Error("WebGL unavailable");

  // Compile and attach the vertex stage first, then the fragment stage.
  program = gl.createProgram();
  const stages = [
    [gl.VERTEX_SHADER, VS],
    [gl.FRAGMENT_SHADER, FS],
  ];
  for (const [stageType, source] of stages) {
    gl.attachShader(program, compile(stageType, source));
  }
  gl.linkProgram(program);
  const linked = gl.getProgramParameter(program, gl.LINK_STATUS);
  if (!linked) {
    throw new Error("program link: " + gl.getProgramInfoLog(program));
  }
  gl.useProgram(program);

  // Blend points over the background and over each other using the
  // premultiplied-alpha factors (ONE, ONE_MINUS_SRC_ALPHA).
  // NOTE(review): the fragment shader writes rgb without multiplying by
  // alpha, so with these factors overlapping points brighten ("additive-ish").
  // That looks intentional; confirm before changing either side.
  gl.enable(gl.BLEND);
  gl.blendFunc(gl.ONE, gl.ONE_MINUS_SRC_ALPHA);
  gl.clearColor(1, 1, 1, 0);

  // Palette texture: nearest sampling so each category maps to exactly one
  // texel, clamped at the edges.
  paletteTex = gl.createTexture();
  gl.bindTexture(gl.TEXTURE_2D, paletteTex);
  const samplerParams = [
    [gl.TEXTURE_MIN_FILTER, gl.NEAREST],
    [gl.TEXTURE_MAG_FILTER, gl.NEAREST],
    [gl.TEXTURE_WRAP_S, gl.CLAMP_TO_EDGE],
    [gl.TEXTURE_WRAP_T, gl.CLAMP_TO_EDGE],
  ];
  for (const [pname, value] of samplerParams) {
    gl.texParameteri(gl.TEXTURE_2D, pname, value);
  }
}
|
| 4148 |
+
|
| 4149 |
+
// Upload `palette` (an array of [r, g, b] byte triples) into the palette
// texture as a (palette.length x 1) RGB image, and tell the fragment shader
// how many entries it holds via u_paletteN.
function uploadPalette(palette) {
  const count = palette.length;
  const texels = new Uint8Array(count * 3);
  palette.forEach(([r, g, b], i) => {
    texels.set([r, g, b], i * 3);
  });
  gl.bindTexture(gl.TEXTURE_2D, paletteTex);
  gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGB, count, 1, 0, gl.RGB, gl.UNSIGNED_BYTE, texels);
  gl.uniform1f(gl.getUniformLocation(program, "u_paletteN"), count);
}
|
| 4161 |
+
|
| 4162 |
+
// ---- Data load ---------------------------------------------------------
|
| 4163 |
+
// Fetch the packed scatter (/umap) and its label catalog (/umap_labels),
// validate and parse the binary blob, upload positions and the four category
// columns to the GPU, build the hover grid, and refresh the stat/status UI.
//
// Blob layout as read here (scripts/gen_fake_umap.py is the writer):
//   bytes  0..23  six uint32: magic, point count, then four values this
//                 loader does not use (presumably per-axis category counts;
//                 verify against the packer)
//   bytes 24..39  four float32: bounds [xmin, xmax, ymin, ymax]
//   bytes 40..    n interleaved int16 (x, y) pairs, then four uint8 columns
//                 of n bytes each: species, biotype, strand, phase
//
// Returns the Int16Array view of the positions.
// Throws if either request fails, the magic number is wrong, or the blob is
// shorter than its header claims.
async function loadData() {
  setStatus("streaming", "loading…");
  const t0 = performance.now();
  const [binResp, labelsResp] = await Promise.all([
    fetch("/umap"),
    fetch("/umap_labels"),
  ]);
  if (!binResp.ok) throw new Error("fetch /umap failed: " + binResp.status);
  // Without this check an error response would be parsed as the labels
  // object and fail much later on `labels.species`.
  if (!labelsResp.ok) throw new Error("fetch /umap_labels failed: " + labelsResp.status);
  const buf = await binResp.arrayBuffer();
  labels = await labelsResp.json();

  // Parse header (matches scripts/gen_fake_umap.py).
  const HEADER_BYTES = 40;
  if (buf.byteLength < HEADER_BYTES) {
    throw new Error("umap blob too short: " + buf.byteLength + " bytes");
  }
  const hdrU32 = new Uint32Array(buf, 0, 6);
  const magic = hdrU32[0];
  if (magic !== 0xCAB0FA1D) throw new Error("bad magic: " + magic.toString(16));
  const count = hdrU32[1];
  // 4 bytes of position (two int16) + 4 category bytes per point.
  const expectedBytes = HEADER_BYTES + count * 8;
  if (buf.byteLength < expectedBytes) {
    throw new Error(
      "umap blob truncated: expected " + expectedBytes + " bytes, got " + buf.byteLength
    );
  }
  // Only commit the shared point count once the blob is known to be complete.
  n = count;
  const hdrF32 = new Float32Array(buf, 24, 4);
  bounds = [hdrF32[0], hdrF32[1], hdrF32[2], hdrF32[3]];

  let off = HEADER_BYTES;
  const pos16 = new Int16Array(buf, off, n * 2); off += n * 2 * 2;
  cats.species = new Uint8Array(buf, off, n); off += n;
  cats.biotype = new Uint8Array(buf, off, n); off += n;
  cats.strand = new Uint8Array(buf, off, n); off += n;
  cats.phase = new Uint8Array(buf, off, n); off += n;

  // Upload to GPU.
  posBuf = gl.createBuffer();
  gl.bindBuffer(gl.ARRAY_BUFFER, posBuf);
  gl.bufferData(gl.ARRAY_BUFFER, pos16, gl.STATIC_DRAW);
  for (const key of ["species", "biotype", "strand", "phase"]) {
    const b = gl.createBuffer();
    gl.bindBuffer(gl.ARRAY_BUFFER, b);
    gl.bufferData(gl.ARRAY_BUFFER, cats[key], gl.STATIC_DRAW);
    catBufs[key] = b;
  }

  // Wire attributes (position is constant; category attribute is rebound on toggle).
  const posLoc = gl.getAttribLocation(program, "a_pos");
  gl.bindBuffer(gl.ARRAY_BUFFER, posBuf);
  gl.enableVertexAttribArray(posLoc);
  // normalized=true: the int16 components reach the shader mapped to [-1, 1].
  gl.vertexAttribPointer(posLoc, 2, gl.SHORT, true, 0, 0);

  // Build spatial grid (in [-1, 1]² normalized world space).
  buildGrid(pos16);

  elN.textContent = n.toLocaleString("en-US");
  elN.classList.remove("muted");
  elNsp.textContent = labels.species.length;
  elNsp.classList.remove("muted");

  const ms = (performance.now() - t0) | 0;
  setStatus("idle", `loaded ${(n/1000)|0}k pts · ${ms} ms`);
  info.textContent = `${n.toLocaleString("en-US")} sequences · ${labels.species.length} eukaryotic species · drag to pan, wheel to zoom`;
  overlay.classList.add("hidden");

  return pos16;
}
|
| 4224 |
+
|
| 4225 |
+
// ---- Spatial grid (hover picking) --------------------------------------
|
| 4226 |
+
// We store, per cell, a list of point indices whose normalized (x,y) falls
|
| 4227 |
+
// in that cell. At hover, look up the cell under the cursor plus the 8
|
| 4228 |
+
// neighbors, then scan for the nearest point within a screen-space radius.
|
| 4229 |
+
const GRID_N = 128;
|
| 4230 |
+
// Bucket every point index into a GRID_N x GRID_N grid keyed by its quantized
// position, and publish the result as `grid`. Cells without points stay null;
// occupied cells hold an array of point indices in ascending order.
function buildGrid(pos16) {
  const cells = new Array(GRID_N * GRID_N).fill(null);
  // Map one int16 coordinate (nominally [-32767, 32767]) to a cell index,
  // clamped to [0, GRID_N - 1].
  const toCell = (v) => {
    const unit = (v + 32767) / 65534;
    return Math.min(GRID_N - 1, Math.max(0, (unit * GRID_N) | 0));
  };
  for (let i = 0; i < n; i++) {
    const col = toCell(pos16[2 * i]);
    const row = toCell(pos16[2 * i + 1]);
    const cellId = row * GRID_N + col;
    if (cells[cellId] === null) {
      cells[cellId] = [i];
    } else {
      cells[cellId].push(i);
    }
  }
  grid = cells;
}
|
| 4246 |
+
|
| 4247 |
+
// ---- Render ------------------------------------------------------------
|
| 4248 |
+
// Match the canvas backing store to its CSS size times the device pixel
// ratio and reset the GL viewport. Returns false (doing nothing) while the
// canvas has no layout size, true otherwise.
function resize() {
  const { width: cssWidth, height: cssHeight } = canvas.getBoundingClientRect();
  if (cssWidth === 0 || cssHeight === 0) return false;

  dpr = Math.max(1, window.devicePixelRatio || 1);
  const w = Math.round(cssWidth * dpr);
  const h = Math.round(cssHeight * dpr);

  // Only touch width/height when they actually change.
  const sizeChanged = canvas.width !== w || canvas.height !== h;
  if (sizeChanged) {
    canvas.width = w;
    canvas.height = h;
  }
  gl.viewport(0, 0, w, h);
  return true;
}
|
| 4260 |
+
|
| 4261 |
+
let lastFrameTs = 0, frameCount = 0, fpsTs = 0;
|
| 4262 |
+
// Render one frame: clear, push the view transform plus zoom-dependent point
// size and alpha to the shader, draw all n points, and update the sampled
// FPS readout. Skips rendering while the canvas has no layout size.
function draw() {
  needsRedraw = false;
  if (!resize()) return;
  gl.clear(gl.COLOR_BUFFER_BIT);

  const uniform = (name) => gl.getUniformLocation(program, name);
  // Shared zoom term: both point size and alpha grow with log2(scale + 1).
  const zoomTerm = Math.log2(view.scale + 1);

  // Vertex shader computes pos * scale + (tx, ty). At zoom 1 the data
  // (normalized to [-1, 1]) fills [-0.92, 0.92] of NDC, leaving a small
  // margin so edge points are not clipped.
  const baseScale = 0.92;
  gl.uniform3f(uniform("u_xform"), baseScale * view.scale, view.tx, view.ty);

  // Point size grows sub-linearly with zoom, clamped to [1.4, 8] CSS px.
  const pointSize = Math.min(8.0, Math.max(1.4, 1.4 + 0.6 * zoomTerm)) * dpr;
  gl.uniform1f(uniform("u_pointSize"), pointSize);

  // Alpha is lower when zoomed out so the dense cloud doesn't burn,
  // clamped to [0.35, 0.85].
  const alpha = Math.min(0.85, Math.max(0.35, 0.35 + 0.18 * zoomTerm));
  gl.uniform1f(uniform("u_alpha"), alpha);

  gl.drawArrays(gl.POINTS, 0, n);

  // FPS readout is refreshed at most every 500 ms, not on every frame.
  const now = performance.now();
  frameCount += 1;
  const elapsed = now - fpsTs;
  if (elapsed > 500) {
    const fps = (frameCount * 1000) / elapsed;
    elFps.textContent = `${fps.toFixed(0)} fps`;
    elFps.classList.remove("muted");
    fpsTs = now;
    frameCount = 0;
  }
  lastFrameTs = now;
}
|
| 4295 |
+
// Coalesce redraw requests: while one is pending (draw() clears the flag when
// it starts), further calls are no-ops.
function requestRedraw() {
  if (!needsRedraw) {
    needsRedraw = true;
    requestAnimationFrame(draw);
  }
}
|
| 4300 |
+
|
| 4301 |
+
// ---- Color toggle ------------------------------------------------------
|
| 4302 |
+
// Switch the category used for coloring: bind that category's byte buffer to
// the `a_cat` attribute, upload the matching palette, rebuild the legend and
// schedule a redraw.
function setColorBy(key) {
  colorBy = key;

  const categoryAttrib = gl.getAttribLocation(program, "a_cat");
  gl.bindBuffer(gl.ARRAY_BUFFER, catBufs[key]);
  gl.enableVertexAttribArray(categoryAttrib);
  // One unsigned byte per vertex, not normalized — the shader wants the raw
  // category index.
  const normalize = false;
  gl.vertexAttribPointer(categoryAttrib, 1, gl.UNSIGNED_BYTE, normalize, 0, 0);

  uploadPalette(PALETTES[key]);
  renderLegend();
  requestRedraw();
}
|
| 4313 |
+
|
| 4314 |
+
// ---- Legend ------------------------------------------------------------
|
| 4315 |
+
// Rebuild the legend for the active `colorBy` category: one swatch + label per
// entry, with colors cycling through the palette when there are more labels
// than palette entries. Does nothing until `labels` has been loaded.
function renderLegend() {
  if (!labels) return;

  // Label text comes from fetched JSON, so escape it before it is placed into
  // markup; a name containing `<` or `&` must show up as text, not as HTML.
  const HTML_ENTITIES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

  const palette = PALETTES[colorBy];
  const itemLabels = (colorBy === "species") ? labels.species
    : (colorBy === "biotype") ? labels.biotypes
    : (colorBy === "strand") ? labels.strands
    : labels.phases;

  legend.innerHTML = itemLabels.map((name, i) => {
    const [r, g, b] = palette[i % palette.length];
    return `<span class="item"><span class="swatch" style="background:rgb(${r},${g},${b})"></span>${escapeHtml(name)}</span>`;
  }).join("");
}
|
| 4327 |
+
|
| 4328 |
+
// ---- Pan / zoom / hover ------------------------------------------------
|
| 4329 |
+
// Restore the default view (no pan, zoom 1) and schedule a redraw.
function resetView() {
  view = { tx: 0, ty: 0, scale: 1 };
  requestRedraw();
}
|
| 4330 |
+
|
| 4331 |
+
// Map an event's clientX/clientY to normalized device coordinates relative to
// the canvas: x runs -1 (left) to 1 (right), y runs -1 (bottom) to 1 (top).
function clientToNDC(e) {
  const { left, top, width, height } = canvas.getBoundingClientRect();
  const fracX = (e.clientX - left) / width;
  const fracY = (e.clientY - top) / height;
  // Screen y grows downward while NDC y grows upward, hence the sign flip.
  return { x: fracX * 2 - 1, y: -fracY * 2 + 1 };
}
|
| 4339 |
+
// Undo the view transform (ndc = data * 0.92 * view.scale + translate) to get
// from NDC back to the normalized data space.
function ndcToData(ndc) {
  const baseScale = 0.92;
  const totalScale = baseScale * view.scale;
  const dataX = (ndc.x - view.tx) / totalScale;
  const dataY = (ndc.y - view.ty) / totalScale;
  return { x: dataX, y: dataY };
}
|
| 4346 |
+
|
| 4347 |
+
// Drag-to-pan state: whether a drag is in progress and the last pointer
// position seen (in client coordinates).
let panning = false, panLast = null;

// Start a drag: capture the pointer so moves keep arriving even when the
// cursor leaves the canvas.
function onPanPointerDown(e) {
  canvas.setPointerCapture(e.pointerId);
  panning = true;
  panLast = { x: e.clientX, y: e.clientY };
  canvas.classList.add("panning");
  hideTooltip();
}

// While dragging, translate the view by the pointer delta expressed in NDC
// units (the canvas spans 2 NDC units per axis). Otherwise treat it as hover.
function onPanPointerMove(e) {
  if (!panning) {
    handleHover(e);
    return;
  }
  const rect = canvas.getBoundingClientRect();
  const dx = ((e.clientX - panLast.x) / rect.width) * 2;
  const dy = -((e.clientY - panLast.y) / rect.height) * 2;
  view.tx += dx;
  view.ty += dy;
  panLast = { x: e.clientX, y: e.clientY };
  requestRedraw();
}

// Finish a drag, if one is in progress, and release the captured pointer.
function endPan(e) {
  if (!panning) return;
  panning = false;
  canvas.classList.remove("panning");
  // Releasing is best-effort; a failure here is deliberately ignored.
  try { canvas.releasePointerCapture(e.pointerId); } catch {}
}

canvas.addEventListener("pointerdown", onPanPointerDown);
canvas.addEventListener("pointermove", onPanPointerMove);
canvas.addEventListener("pointerup", endPan);
canvas.addEventListener("pointercancel", endPan);
canvas.addEventListener("pointerleave", () => hideTooltip());
|
| 4376 |
+
|
| 4377 |
+
// Wheel zoom, anchored at the cursor. Registered as non-passive because the
// handler calls preventDefault().
canvas.addEventListener("wheel", (e) => {
  e.preventDefault();
  const cursor = clientToNDC(e);

  // Exponential step so equal wheel deltas give equal zoom ratios; the result
  // is clamped to the [0.5, 50] zoom range.
  const zoomStep = Math.exp(-e.deltaY * 0.0018);
  const previousScale = view.scale;
  const nextScale = Math.min(50, Math.max(0.5, previousScale * zoomStep));
  const ratio = nextScale / previousScale;

  // Adjust the translation so the data point under the cursor stays put.
  view.tx = cursor.x - (cursor.x - view.tx) * ratio;
  view.ty = cursor.y - (cursor.y - view.ty) * ratio;
  view.scale = nextScale;

  requestRedraw();
  hideTooltip();
}, { passive: false });

resetBtn.addEventListener("click", resetView);
|
| 4394 |
+
|
| 4395 |
+
// ---- Hover picking -----------------------------------------------------
|
| 4396 |
+
// Fill the tooltip with the four category labels of point `idx` and show it at
// (x, y), given in pixels relative to the tooltip's positioning container.
function showTooltip(idx, x, y) {
  // Label text comes from fetched JSON, so escape it before it is placed into
  // markup; a name containing `<` or `&` must show up as text, not as HTML.
  const HTML_ENTITIES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

  const sp = escapeHtml(labels.species[cats.species[idx]]);
  const bt = escapeHtml(labels.biotypes[cats.biotype[idx]]);
  const st = escapeHtml(labels.strands[cats.strand[idx]]);
  const ph = escapeHtml(labels.phases[cats.phase[idx]]);
  tooltip.innerHTML =
    `<div><span class="t-label">species</span>${sp}</div>` +
    `<div><span class="t-label">biotype</span>${bt}</div>` +
    `<div><span class="t-label">strand</span>${st} <span class="t-label">phase</span>${ph}</div>`;
  tooltip.style.left = x + "px";
  tooltip.style.top = y + "px";
  tooltip.classList.add("visible");
}

// Hide the tooltip; its content is left in place until the next show.
function hideTooltip() { tooltip.classList.remove("visible"); }
|
| 4410 |
+
|
| 4411 |
+
// Hover picking: find the point nearest the cursor within an ~8 px screen
// radius, using the spatial grid to limit the search, and show its tooltip
// (or hide the tooltip when nothing is close enough).
function handleHover(e) {
  // Nothing to pick against until both the grid and the CPU-side position
  // copy exist.
  if (!grid || !posSnapshot) return;
  const rect = canvas.getBoundingClientRect();
  // A zero-size canvas would make the pick radius (and the cell span) infinite.
  if (rect.width === 0 || rect.height === 0) return hideTooltip();

  const ndc = clientToNDC(e);
  const data = ndcToData(ndc);
  // Convert data-space (-1..1) into grid coords.
  const gx = (data.x + 1) * 0.5 * GRID_N;
  const gy = (data.y + 1) * 0.5 * GRID_N;
  const cx = Math.floor(gx), cy = Math.floor(gy);
  if (cx < -1 || cx > GRID_N || cy < -1 || cy > GRID_N) return hideTooltip();

  // Screen pixels per data unit on each axis. NDC spans 2 units across the
  // canvas and the view applies the same scale to x and y, so on a non-square
  // canvas the two axes differ; measuring distance in pixels keeps the pick
  // area a true circle on screen at every zoom level.
  const screenR = 8;
  const totalScale = 0.92 * view.scale;
  const pxPerDataX = rect.width * 0.5 * totalScale;
  const pxPerDataY = rect.height * 0.5 * totalScale;

  // Grid cells to scan on each side: the pick radius in data units times
  // cells-per-data-unit (GRID_N cells cover 2 data units), plus one for slack.
  const cellsPerData = GRID_N * 0.5;
  const spanX = Math.max(1, Math.ceil((screenR / pxPerDataX) * cellsPerData) + 1);
  const spanY = Math.max(1, Math.ceil((screenR / pxPerDataY) * cellsPerData) + 1);

  let best = -1, bestD2 = screenR * screenR;
  for (let dy = -spanY; dy <= spanY; dy++) {
    const yy = cy + dy;
    if (yy < 0 || yy >= GRID_N) continue;
    for (let dx = -spanX; dx <= spanX; dx++) {
      const xx = cx + dx;
      if (xx < 0 || xx >= GRID_N) continue;
      const list = grid[yy * GRID_N + xx];
      if (!list) continue;
      for (let k = 0; k < list.length; k++) {
        const idx = list[k];
        // Re-derive the point's normalized [-1, 1] position from the int16
        // snapshot rather than keeping a float copy around.
        const px = posSnapshot[2 * idx] / 32767;
        const py = posSnapshot[2 * idx + 1] / 32767;
        const ex = (px - data.x) * pxPerDataX;
        const ey = (py - data.y) * pxPerDataY;
        const d2 = ex * ex + ey * ey;
        if (d2 < bestD2) { bestD2 = d2; best = idx; }
      }
    }
  }
  if (best === -1) return hideTooltip();

  // Position the tooltip at the cursor, in canvas-relative pixels.
  const relX = e.clientX - rect.left;
  const relY = e.clientY - rect.top;
  showTooltip(best, relX, relY);
}
|
| 4456 |
+
|
| 4457 |
+
// CPU-side copy of the point positions, used for hover hit-testing because
// WebGL buffers aren't readable from JS without a roundtrip. Set once the
// data has loaded.
let posSnapshot = null;

// ---- Bootstrap ---------------------------------------------------------
setupGL();

// Color pills: clicking one marks it active (and the rest inactive) and
// recolors the scatter by that pill's data-color key.
for (const pill of colorPills) {
  pill.addEventListener("click", () => {
    for (const other of colorPills) {
      other.classList.toggle("active", other === pill);
    }
    setColorBy(pill.dataset.color);
  });
}

// Lazy load: wait until the canvas is within 400px of the viewport before
// fetching, so the point data doesn't compete for bandwidth on first paint.
const io = new IntersectionObserver(async ([firstEntry]) => {
  if (!firstEntry.isIntersecting) return;
  io.disconnect();  // one-shot: load only once
  try {
    posSnapshot = await loadData();
    setColorBy("species");  // initial coloring + first draw
  } catch (err) {
    console.error(err);
    setStatus("error", "load failed");
    overlay.textContent = "load failed · " + err.message;
  }
}, { rootMargin: "400px" });
io.observe(canvas);

// draw() re-measures the canvas, so a window resize only needs a redraw.
window.addEventListener("resize", () => { requestRedraw(); });
|
| 4489 |
+
})();
|
| 4490 |
+
|
| 4491 |
// =========================================================================
|
| 4492 |
// Carbon banner — animated DNA helix (Canvas 2D)
|
| 4493 |
//
|
scripts/gen_fake_umap.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate a fake UMAP scatter dataset for prototyping the §6 viewer.
|
| 2 |
+
|
| 3 |
+
The real embedding-UMAP output (from Dana's Carbon 3B pipeline) will land
|
| 4 |
+
in roughly the same shape. Until then, this gives us a 500K-point fixture
|
| 5 |
+
to develop the WebGL frontend at target scale, so we don't discover
|
| 6 |
+
performance cliffs later.
|
| 7 |
+
|
| 8 |
+
Layout strategy:
|
| 9 |
+
- 24 eukaryotic species grouped into 5 kingdoms (vertebrates, invertebrates,
|
| 10 |
+
plants, fungi, protozoa). Kingdom centers are placed evenly around a
|
| 11 |
+
radius-12 circle in UMAP space, with species centers jittered nearby.
|
| 12 |
+
- Each point is drawn from a 2D Gaussian around its species center.
|
| 13 |
+
- Biotype, strand, and codon phase are roughly orthogonal axes — toggling
|
| 14 |
+
color in the UI reveals different organizations of the same points.
|
| 15 |
+
- Points are output sorted by species so gzip can RLE the category columns
|
| 16 |
+
aggressively (typically 90%+ compression on those bytes).
|
| 17 |
+
|
| 18 |
+
Binary layout (little-endian, 40-byte header + ~4 MB payload for 500K pts):
|
| 19 |
+
uint32 magic 0xCAB0FA1D
|
| 20 |
+
uint32 n_points
|
| 21 |
+
uint32 n_species
|
| 22 |
+
uint32 n_biotypes
|
| 23 |
+
uint32 n_strands
|
| 24 |
+
uint32 n_phases
|
| 25 |
+
float32 x_min, x_max, y_min, y_max (16 bytes — for de-quantization)
|
| 26 |
+
int16 positions[n_points * 2] (interleaved x,y; quantized)
|
| 27 |
+
uint8 species[n_points]
|
| 28 |
+
uint8 biotype[n_points]
|
| 29 |
+
uint8 strand[n_points]
|
| 30 |
+
uint8 phase[n_points]
|
| 31 |
+
|
| 32 |
+
Usage:
|
| 33 |
+
python scripts/gen_fake_umap.py
|
| 34 |
+
"""
|
| 35 |
+
import array
|
| 36 |
+
import json
|
| 37 |
+
import math
|
| 38 |
+
import os
|
| 39 |
+
import random
|
| 40 |
+
import struct
|
| 41 |
+
import sys
|
| 42 |
+
import time
|
| 43 |
+
|
| 44 |
+
# --- Dataset shape ---------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
N_POINTS = 500_000
SEED = 42

# Species grouped by kingdom. This is the single source of truth for the
# species list; insertion order here defines the species index order used in
# the output files.
KINGDOMS = {
    "vertebrates": ["human", "mouse", "rat", "chicken", "zebrafish", "xenopus",
                    "dog", "cow", "pig"],
    "invertebrates": ["fly", "worm", "mosquito", "honeybee", "sea_urchin"],
    "plants": ["arabidopsis", "rice", "maize", "wheat", "soybean"],
    "fungi": ["yeast", "neurospora", "candida"],
    "protozoa": ["plasmodium", "trypanosoma"],
}

# 24 eukaryotic species — matches the rough scale of the real run (24 species
# from Leandro's Gemini-annotated set, 500/species in Dana's first batch).
# Flattened from KINGDOMS (kingdom by kingdom) so the two cannot drift apart.
SPECIES = [sp for members in KINGDOMS.values() for sp in members]

BIOTYPES = ["protein_coding", "lncRNA", "miRNA", "pseudogene"]
# Roughly realistic frequencies for a mixed eukaryote set; one weight per
# entry of BIOTYPES, in the same order.
BIOTYPE_WEIGHTS = [0.70, 0.15, 0.08, 0.07]

STRANDS = ["+", "-"]
PHASES = ["0", "1", "2"]

# Output goes to the "data" directory next to this script's parent directory.
HERE = os.path.dirname(os.path.abspath(__file__))
DATA = os.path.join(os.path.dirname(HERE), "data")
|
| 83 |
+
|
| 84 |
+
# --- Generation ------------------------------------------------------------
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def species_centers():
    """Return a ``{species: (x, y)}`` map of cluster centers in UMAP space.

    Kingdoms are spaced evenly on a circle of radius 12; each species center
    is its kingdom's point on that circle plus Gaussian jitter (sigma 1.8) on
    each axis. Draws from the module-level ``random`` state, so the result
    depends on how it was seeded.
    """
    placed = {}
    n_kingdoms = len(KINGDOMS)
    for slot, members in enumerate(KINGDOMS.values()):
        angle = 2 * math.pi * slot / n_kingdoms
        ring_x = math.cos(angle) * 12.0
        ring_y = math.sin(angle) * 12.0
        for name in members:
            # x jitter is drawn before y jitter; the order fixes the output
            # for a given seed.
            jitter_x = random.gauss(0, 1.8)
            jitter_y = random.gauss(0, 1.8)
            placed[name] = (ring_x + jitter_x, ring_y + jitter_y)
    return placed
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def weighted_choice_idx(weights):
    """Pick a random index, treating *weights* as per-index probabilities.

    Makes a single ``random.random()`` draw and walks the running total of
    *weights* until it exceeds that draw. If no running total exceeds the draw
    (float rounding, or weights summing to less than 1), the last index is
    returned. Intended as a lightweight alternative to ``random.choices()``
    inside tight loops.
    """
    draw = random.random()
    running_total = 0.0
    for position, weight in enumerate(weights):
        running_total += weight
        if running_total > draw:
            return position
    return len(weights) - 1
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def generate():
    """Build the per-point columns, grouped by species in ``SPECIES`` order.

    Reseeds ``random`` with ``SEED`` first, so repeated calls give identical
    output. ``N_POINTS`` is split as evenly as possible across species, with
    the earliest species absorbing the remainder one point each.

    Returns:
        ``(xs, ys, sp_col, bt_col, st_col, ph_col)`` — two float arrays of
        coordinates followed by four bytearrays of category indices, all of
        length ``N_POINTS``.
    """
    random.seed(SEED)
    centers = species_centers()

    xs, ys = array.array("f"), array.array("f")
    sp_col, bt_col, st_col, ph_col = (bytearray() for _ in range(4))

    per_species, remainder = divmod(N_POINTS, len(SPECIES))

    for sp_idx, sp in enumerate(SPECIES):
        count = per_species + 1 if sp_idx < remainder else per_species
        cx, cy = centers[sp]
        for _ in range(count):
            # The order of these random draws determines the dataset for a
            # given seed, so it must not be rearranged.
            xs.append(cx + random.gauss(0, 1.2))
            ys.append(cy + random.gauss(0, 1.2))
            sp_col.append(sp_idx)
            bt_col.append(weighted_choice_idx(BIOTYPE_WEIGHTS))
            st_col.append(random.randrange(2))
            ph_col.append(random.randrange(3))

    return xs, ys, sp_col, bt_col, st_col, ph_col
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def pack_binary(xs, ys, sp_col, bt_col, st_col, ph_col):
    """Pack parallel arrays into the binary layout described in the docstring.

    Coordinates are quantized per axis from ``[min, max]`` onto the int16
    range ``[-32767, 32767]`` and interleaved as x, y pairs. An axis with no
    extent (all values equal) maps to 0 instead of dividing by zero, and empty
    input produces a header-only buffer with all bounds set to 0.0.

    Args:
        xs, ys: Point coordinates, one value per point.
        sp_col, bt_col, st_col, ph_col: Per-point category indices as
            byte-sized values, in the same point order as the coordinates.

    Returns:
        ``(buf, (x_min, x_max, y_min, y_max))`` — the packed ``bytearray`` and
        the bounds that were used for quantization.
    """

    def quantizer(lo, hi):
        """Return a function mapping [lo, hi] onto the int16 range."""
        extent = hi - lo
        if extent <= 0:
            # Degenerate axis: every value is the same, so park it mid-range.
            return lambda value: 0
        scale = 65534.0 / extent
        return lambda value: int(round((value - lo) * scale - 32767))

    n = len(xs)
    if n:
        x_min, x_max = min(xs), max(xs)
        y_min, y_max = min(ys), max(ys)
    else:
        x_min = x_max = y_min = y_max = 0.0

    quantize_x = quantizer(x_min, x_max)
    quantize_y = quantizer(y_min, y_max)

    # Quantize straight into a single interleaved int16 stream (WebGL
    # friendly: stride 4 bytes) — one pass, no temporary per-axis arrays.
    pos = array.array("h", [0]) * (2 * n)
    for i, (x, y) in enumerate(zip(xs, ys)):
        pos[2 * i] = quantize_x(x)
        pos[2 * i + 1] = quantize_y(y)

    # array.tobytes() emits the machine's native byte order, but the file
    # format is little-endian; swap on big-endian hosts.
    if sys.byteorder != "little":
        pos.byteswap()

    buf = bytearray()
    # Header — 40 bytes (6 uint32 + 4 float32).
    buf += struct.pack("<6I",
                       0xCAB0FA1D, n,
                       len(SPECIES), len(BIOTYPES),
                       len(STRANDS), len(PHASES))
    buf += struct.pack("<4f", x_min, x_max, y_min, y_max)

    buf += pos.tobytes()
    buf += bytes(sp_col)
    buf += bytes(bt_col)
    buf += bytes(st_col)
    buf += bytes(ph_col)

    return buf, (x_min, x_max, y_min, y_max)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def main():
    """Generate the fake dataset and write ``umap.bin`` and ``umap_labels.json``.

    Both files go into ``DATA`` (created if missing). Progress and timings are
    reported on stderr.
    """
    os.makedirs(DATA, exist_ok=True)

    started = time.time()
    print(f"generating {N_POINTS:,} fake UMAP points ...", file=sys.stderr)
    columns = generate()
    print(f" done in {time.time() - started:.1f}s", file=sys.stderr)

    pack_started = time.time()
    print("packing binary ...", file=sys.stderr)
    buf, bounds = pack_binary(*columns)
    print(f" done in {time.time() - pack_started:.1f}s", file=sys.stderr)

    out_bin = os.path.join(DATA, "umap.bin")
    with open(out_bin, "wb") as f:
        f.write(buf)
    print(f" wrote {out_bin} ({len(buf):,} bytes uncompressed)", file=sys.stderr)

    out_labels = os.path.join(DATA, "umap_labels.json")
    # Invert KINGDOMS into a species -> kingdom lookup.
    species_to_kingdom = {}
    for kingdom, members in KINGDOMS.items():
        for sp in members:
            species_to_kingdom[sp] = kingdom
    label_doc = {
        "species": SPECIES,
        "biotypes": BIOTYPES,
        "strands": STRANDS,
        "phases": PHASES,
        "species_kingdom": species_to_kingdom,
        "kingdoms": list(KINGDOMS.keys()),
        "bounds": list(bounds),
        "n_points": len(columns[0]),
        "fake": True,
    }
    with open(out_labels, "w") as f:
        json.dump(label_doc, f, indent=2)
    print(f" wrote {out_labels}", file=sys.stderr)
    print(f"total: {time.time() - started:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
|