// Phase 1 minimal viewer:
//   - white scene background to match viser's paper theme
//   - hand MESH (778v + 1538f) and SKELETON (21 joints + bones) toggleable
//   - YouTube iframe drives playback (autoplay + clip-loop, muted)
//   - depth.mp4 + flow.mp4 sync via the same wall-clock as YouTube (fps now
//     derived from manifest.start_sec/end_sec → matches real time)
// Object point clouds intentionally NOT rendered in Phase 1.
// NOTE(review): buildObjectMeshes() below does load and display object point
// clouds (state.toggle.objects defaults to true) — confirm which is intended.
import * as THREE from 'three';
import { OrbitControls } from 'three/addons/controls/OrbitControls.js';
import { PLYLoader } from 'three/addons/loaders/PLYLoader.js';

// ── params ────────────────────────────────────────────────────
const u = new URL(window.location.href);
const SAMPLE_ID = u.searchParams.get('id') || '-4eiU1SHpzY_319.4_323.4';
const ASSET_BASE = u.searchParams.get('base') || `../_dist/samples/${SAMPLE_ID}`;

// Fixed layout of the hand binaries: [T, MAX_HANDS, N_JOINTS | N_VERTS, 3].
const MAX_HANDS = 2;
const N_JOINTS = 21;
const N_VERTS = 778;
// Used when scene.json carries no usable fps.
const DEFAULT_FPS = 10;

// MANO 21-joint kinematic edges (matches exo2ego_pipeline/config.py).
const HAND_EDGES = [
  [0, 1], [1, 2], [2, 3], [3, 4],
  [0, 5], [5, 6], [6, 7], [7, 8],
  [0, 9], [9, 10], [10, 11], [11, 12],
  [0, 13], [13, 14], [14, 15], [15, 16],
  [0, 17], [17, 18], [18, 19], [19, 20],
];

// 5 fingers × 4 bones each, deeper-saturated to read on light bg.
const FINGER_COLOURS = [
  [0.85, 0.18, 0.20], // thumb · red
  [0.95, 0.55, 0.10], // index · orange
  [0.20, 0.65, 0.25], // middle · green
  [0.85, 0.75, 0.05], // ring · yellow
  [0.18, 0.45, 0.85], // pinky · blue
];

// Index pairs into the 8 OBB corners that form the 12 drawn box edges:
// corners 0-3 are treated as one face loop, 4-7 as the opposite loop, and
// corner k is joined to corner k+4.
// NOTE(review): the exact corner permutation depends on the producer (SAM3D
// obb_corners); these fixed pairs are only correct for that ordering — confirm
// against the exporter.
const OBB_EDGES = [
  [0, 1], [1, 2], [2, 3], [3, 0],
  [4, 5], [5, 6], [6, 7], [7, 4],
  [0, 4], [1, 5], [2, 6], [3, 7],
];

// Scratch buffers reused by every OBB update so per-frame work allocates nothing.
const obbScratchCorner = new THREE.Vector3();
const obbScratchCorners = new Float32Array(8 * 3);

// ── state ─────────────────────────────────────────────────────
const state = {
  scene: null,
  fps: DEFAULT_FPS,
  T: 0,
  joints: null,
  jointsShape: null, // [T, MAX_HANDS, 21, 3]
  verts: null,
  vertsShape: null, // [T, MAX_HANDS, 778, 3]
  faces: null, // Int32Array (1538 * 3)
  hands: [], // [{ mesh, skeleton, dots }]
  objects: [], // [{ pts, obb, oid, color_hex, index }]
  poses: null, // Float32Array (T, N_obj, 4, 4) row-major
  posesShape: null,
  obbs: null, // Float32Array (N_obj, 8, 3)
  three: null,
  toggle: { mesh: true, skeleton: true, joints: true, objects: true, obb: false },
};

// ── bootstrap ─────────────────────────────────────────────────

/**
 * Entry point: initialises the renderer, loads scene.json and the binary
 * assets it references, builds the hand/object scene graph, wires the UI and
 * starts the render loop. Any rejection is reported by the caller at the
 * bottom of the file.
 */
async function main() {
  document.getElementById('sub').textContent = `id = ${SAMPLE_ID} · base = ${ASSET_BASE}`;
  initThree();

  const scene = await fetchJSON(`${ASSET_BASE}/scene.json`);
  state.scene = scene;
  state.fps = resolveFps(scene);
  state.T = scene.stats.n_frames;
  document.getElementById('scene-meta').textContent = JSON.stringify({
    id: scene.id,
    youtube_id: scene.video_source.youtube_id,
    fps: scene.fps,
    duration_s: scene.duration,
    n_frames: scene.stats.n_frames,
    n_objects: scene.stats.n_objects,
    has_grasp: scene.stats.has_grasp,
    license: scene._license,
  }, null, 2);

  // Load binary assets. Joints and verts are mandatory; the rest are optional.
  const r = scene.reconstruction;
  const [jBuf, vBuf, fBuf, pBuf, oBuf] = await Promise.all([
    fetchBin(`${ASSET_BASE}/${r.hand_joints}`),
    fetchBin(`${ASSET_BASE}/${r.hand_verts}`),
    fetchOptionalBin(r.hand_faces),
    fetchOptionalBin(r.object_pose_traj),
    fetchOptionalBin(r.object_obb),
  ]);
  state.joints = new Float32Array(jBuf);
  state.jointsShape = [state.T, MAX_HANDS, N_JOINTS, 3];
  state.verts = new Float32Array(vBuf);
  state.vertsShape = [state.T, MAX_HANDS, N_VERTS, 3];
  warnOnSizeMismatch('hand_joints', state.joints, state.jointsShape);
  warnOnSizeMismatch('hand_verts', state.verts, state.vertsShape);
  if (fBuf) state.faces = new Int32Array(fBuf);
  if (pBuf) {
    state.poses = new Float32Array(pBuf);
    // Fall back to the manifest's object count when the shape is not exported,
    // so per-frame indexing never dereferences an undefined shape.
    state.posesShape = r.object_pose_shape ?? [state.T, (r.objects || []).length, 4, 4];
  }
  if (oBuf) state.obbs = new Float32Array(oBuf);

  buildHandObjects();
  await buildObjectMeshes(scene);
  fitCameraToHand();

  // Wire video + slider
  setupVideos(scene);
  setupSlider();
  setupGUI();
  setFrame(0);
  animate();
}

/**
 * Returns the playback frame rate from scene.json, or DEFAULT_FPS when the
 * manifest value is missing, non-numeric or not positive.
 * @param {{fps?: number}} scene - parsed scene.json
 * @returns {number}
 */
function resolveFps(scene) {
  const fps = Number(scene.fps);
  return Number.isFinite(fps) && fps > 0 ? fps : DEFAULT_FPS;
}

/**
 * Fetches an asset relative to ASSET_BASE, or resolves to null when the
 * manifest does not name one.
 * @param {string|undefined|null} relPath
 * @returns {Promise<ArrayBuffer|null>}
 */
function fetchOptionalBin(relPath) {
  return relPath ? fetchBin(`${ASSET_BASE}/${relPath}`) : Promise.resolve(null);
}

/**
 * Logs a warning when a typed array's length differs from the product of the
 * shape it is about to be indexed with. Purely diagnostic; never throws.
 */
function warnOnSizeMismatch(name, arr, shape) {
  const expected = shape.reduce((acc, dim) => acc * dim, 1);
  if (arr.length !== expected) {
    console.warn(`${name}: expected ${expected} values for shape [${shape.join(', ')}], got ${arr.length}`);
  }
}

// ── three boilerplate ─────────────────────────────────────────

/**
 * Creates the renderer, scene, camera, orbit controls, lights and helpers on
 * the #three-canvas element and stores them in state.three.
 */
function initThree() {
  const canvas = document.getElementById('three-canvas');
  const stage = canvas.parentElement;
  const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: false });
  renderer.setPixelRatio(window.devicePixelRatio);

  const scene = new THREE.Scene();
  // Match viser's paper theme: warm off-white.
  scene.background = new THREE.Color('#f6f4ee');

  const camera = new THREE.PerspectiveCamera(35, 16 / 10, 0.005, 50);
  camera.position.set(0, -0.3, 1.2);
  // Gravity in our pipeline is roughly -Y; align Three.js up accordingly so
  // OrbitControls "up" feels right (we'll re-target to hand center later).
  camera.up.set(0, -1, 0);

  const controls = new OrbitControls(camera, canvas);
  controls.target.set(0, 0, 1.0);
  controls.enableDamping = true;
  controls.dampingFactor = 0.08;

  // Lighting: strong ambient + a single soft directional. The hand meshes use
  // a matte MeshStandardMaterial (roughness 0.9, metalness 0), which is lit,
  // so the high ambient term keeps them readable on the light background.
  scene.add(new THREE.AmbientLight(0xffffff, 0.85));
  const dir = new THREE.DirectionalLight(0xffffff, 0.55);
  dir.position.set(0.5, -0.8, 0.5);
  scene.add(dir);

  // Ground grid (matches viser's gravity-aligned grid). XZ plane.
  const grid = new THREE.GridHelper(2, 20, 0x999999, 0xcccccc);
  grid.material.opacity = 0.6;
  grid.material.transparent = true;
  scene.add(grid);

  // Axes helper at world origin
  scene.add(new THREE.AxesHelper(0.15));

  const resize = () => {
    const w = stage.clientWidth;
    const h = stage.clientHeight;
    // A hidden or not-yet-laid-out stage reports 0; skipping avoids a NaN or
    // Infinity aspect ratio corrupting the projection matrix.
    if (w === 0 || h === 0) return;
    renderer.setSize(w, h, false);
    camera.aspect = w / h;
    camera.updateProjectionMatrix();
  };
  window.addEventListener('resize', resize);
  resize();

  state.three = { scene, camera, renderer, controls };
}

/**
 * Creates, for each of the MAX_HANDS hand slots, a mesh, a coloured skeleton
 * (line segments) and joint dots. All start hidden with zeroed geometry;
 * setFrame() fills positions and toggles visibility.
 */
function buildHandObjects() {
  // edge colours: 20 segments × 2 endpoints × 3 channels
  const edgeColors = new Float32Array(HAND_EDGES.length * 2 * 3);
  HAND_EDGES.forEach((_edge, ei) => {
    // Edges are listed finger by finger, four bones per finger.
    const colour = FINGER_COLOURS[Math.floor(ei / 4) % 5];
    for (let s = 0; s < 2; s++) {
      edgeColors.set(colour, (ei * 2 + s) * 3);
    }
  });

  for (let h = 0; h < MAX_HANDS; h++) {
    // ── mesh ──
    const meshGeom = new THREE.BufferGeometry();
    meshGeom.setAttribute('position', new THREE.BufferAttribute(new Float32Array(N_VERTS * 3), 3));
    if (state.faces) {
      meshGeom.setIndex(new THREE.Uint32BufferAttribute(state.faces.slice(), 1));
    }
    const meshMat = new THREE.MeshStandardMaterial({
      color: h === 0 ? 0xc89089 : 0x8aa3c0, // skin-pink left, blue-grey right
      roughness: 0.9,
      metalness: 0.0,
      side: THREE.DoubleSide,
      transparent: true,
      opacity: 0.92,
      flatShading: false,
    });
    const mesh = new THREE.Mesh(meshGeom, meshMat);
    mesh.visible = false;
    state.three.scene.add(mesh);

    // ── skeleton ──
    const skelGeom = new THREE.BufferGeometry();
    skelGeom.setAttribute('position', new THREE.BufferAttribute(new Float32Array(HAND_EDGES.length * 2 * 3), 3));
    skelGeom.setAttribute('color', new THREE.BufferAttribute(edgeColors.slice(), 3));
    const skelMat = new THREE.LineBasicMaterial({ vertexColors: true });
    const skeleton = new THREE.LineSegments(skelGeom, skelMat);
    skeleton.visible = false;
    state.three.scene.add(skeleton);

    // ── joint dots ──
    const dotGeom = new THREE.BufferGeometry();
    dotGeom.setAttribute('position', new THREE.BufferAttribute(new Float32Array(N_JOINTS * 3), 3));
    const dotMat = new THREE.PointsMaterial({
      color: 0x222222,
      size: 0.013,
      sizeAttenuation: true,
    });
    const dots = new THREE.Points(dotGeom, dotMat);
    dots.visible = false;
    state.three.scene.add(dots);

    state.hands.push({ mesh, skeleton, dots });
  }
}

/**
 * Loads each object's PLY, scales it by scale_correction (the missing factor
 * that brings canonical points into world units), and builds a Points cloud
 * plus an OBB wireframe. Per frame, setFrame() drives the Points matrix from
 * the pose trajectory and pushes the mesh-local OBB corners through it.
 *
 * Objects whose PLY fails to load are skipped with a warning. Each stored
 * entry keeps its manifest index so that poses/obbs, which are laid out by
 * manifest order, stay aligned even when an earlier object was skipped.
 *
 * @param {object} scene - parsed scene.json
 */
async function buildObjectMeshes(scene) {
  const loader = new PLYLoader();
  const objs = scene.reconstruction.objects || [];
  for (let oi = 0; oi < objs.length; oi++) {
    const o = objs[oi];
    const url = `${ASSET_BASE}/${o.ply}`;
    let geom;
    try {
      geom = await loader.loadAsync(url);
    } catch (e) {
      console.warn(`obj ${o.id}: PLY load failed`, e);
      continue;
    }

    // Scale geometry vertices by scale_correction so per-frame pose_R/pose_t
    // (which expects canonical-scaled-to-world points) is applied correctly.
    const sc = o.scale_correction ?? 1.0;
    if (Math.abs(sc - 1.0) > 1e-6) {
      const pos = geom.getAttribute('position');
      for (let i = 0; i < pos.count * 3; i++) pos.array[i] *= sc;
      pos.needsUpdate = true;
      geom.computeBoundingSphere();
    }

    const hasColor = geom.getAttribute('color') !== undefined;
    const mat = new THREE.PointsMaterial({
      size: 0.003,
      sizeAttenuation: true,
      vertexColors: hasColor,
      // With per-vertex colours the material colour must stay white so it
      // does not tint them.
      color: hasColor ? 0xffffff : new THREE.Color(o.color_hex),
    });
    const pts = new THREE.Points(geom, mat);
    // The matrix is written directly in setFrame(); stop three.js from
    // recomposing it from position/quaternion/scale.
    pts.matrixAutoUpdate = false;
    state.three.scene.add(pts);

    // OBB wireframe: 12 edges, 2 endpoints each; positions filled per frame.
    const obbGeom = new THREE.BufferGeometry();
    obbGeom.setAttribute('position', new THREE.BufferAttribute(new Float32Array(OBB_EDGES.length * 2 * 3), 3));
    const obbMat = new THREE.LineBasicMaterial({ color: new THREE.Color(o.color_hex) });
    const obb = new THREE.LineSegments(obbGeom, obbMat);
    obb.visible = false;
    state.three.scene.add(obb);

    state.objects.push({ pts, obb, oid: o.id, color_hex: o.color_hex, index: oi });
  }
}

/**
 * Centres OrbitControls on the per-axis median wrist position (joint 0) over
 * all frames and hands with finite coordinates, and places the camera behind
 * and slightly above that point. Does nothing if no wrist sample is finite.
 */
function fitCameraToHand() {
  const T = state.jointsShape[0];
  const H = state.jointsShape[1];
  const strideHand = N_JOINTS * 3;
  const strideFrame = strideHand * H;
  const xs = [];
  const ys = [];
  const zs = [];
  for (let t = 0; t < T; t++) {
    for (let h = 0; h < H; h++) {
      const off = t * strideFrame + h * strideHand; // wrist = joint 0
      const x = state.joints[off];
      const y = state.joints[off + 1];
      const z = state.joints[off + 2];
      if (Number.isFinite(x) && Number.isFinite(y) && Number.isFinite(z)) {
        xs.push(x);
        ys.push(y);
        zs.push(z);
      }
    }
  }
  if (xs.length === 0) return;

  // Upper median; sorts a copy with a numeric comparator.
  const median = (a) => [...a].sort((p, q) => p - q)[Math.floor(a.length / 2)];
  const cx = median(xs);
  const cy = median(ys);
  const cz = median(zs);

  const controls = state.three.controls;
  controls.target.set(cx, cy, cz);
  // Position camera ~0.6m back along -Z, slightly above hand (in viser's
  // gravity-up frame, "above" = -Y).
  state.three.camera.position.set(cx, cy - 0.25, cz - 0.6);
  controls.update();
}

// ── per-frame update ──────────────────────────────────────────

/**
 * Displays frame `t`: updates hand geometry, object poses and OBBs, then
 * mirrors the frame index into the slider and its label.
 * @param {number} t - requested frame; rounded and clamped to [0, T-1]
 */
function setFrame(t) {
  t = Math.max(0, Math.min(state.T - 1, Math.round(t)));
  updateHandsForFrame(t);
  updateObjectsForFrame(t);
  document.getElementById('frame').value = String(t);
  document.getElementById('frame-label').textContent = `${t + 1} / ${state.T}`;
}

/**
 * Writes skeleton, joint-dot and mesh positions for every hand slot at frame
 * `t`. A hand whose first wrist coordinate is not finite is treated as absent
 * for that frame and hidden.
 */
function updateHandsForFrame(t) {
  const H = state.jointsShape[1];
  const strideJointsHand = N_JOINTS * 3;
  const strideJointsFrame = strideJointsHand * H;
  const strideVertsHand = N_VERTS * 3;
  const strideVertsFrame = strideVertsHand * H;
  const showMesh = state.toggle?.mesh !== false;
  const showSkel = state.toggle?.skeleton !== false;
  const showDots = state.toggle?.joints !== false;

  for (let h = 0; h < H; h++) {
    const offJ = t * strideJointsFrame + h * strideJointsHand;
    const offV = t * strideVertsFrame + h * strideVertsHand;
    const valid = Number.isFinite(state.joints[offJ]);
    const hand = state.hands[h];
    hand.mesh.visible = valid && showMesh;
    hand.skeleton.visible = valid && showSkel;
    hand.dots.visible = valid && showDots;
    if (!valid) continue;

    // skeleton edges: each edge contributes its two endpoint joints
    const skelAttr = hand.skeleton.geometry.getAttribute('position');
    const skelArr = skelAttr.array;
    HAND_EDGES.forEach(([a, b], ei) => {
      for (let c = 0; c < 3; c++) {
        skelArr[ei * 6 + c] = state.joints[offJ + a * 3 + c];
        skelArr[ei * 6 + 3 + c] = state.joints[offJ + b * 3 + c];
      }
    });
    skelAttr.needsUpdate = true;

    // joint dots: contiguous copy of this hand's joints
    const dotAttr = hand.dots.geometry.getAttribute('position');
    dotAttr.array.set(state.joints.subarray(offJ, offJ + strideJointsHand));
    dotAttr.needsUpdate = true;

    // mesh verts
    if (state.verts && state.faces) {
      const vertAttr = hand.mesh.geometry.getAttribute('position');
      vertAttr.array.set(state.verts.subarray(offV, offV + strideVertsHand));
      vertAttr.needsUpdate = true;
      hand.mesh.geometry.computeVertexNormals();
      hand.mesh.geometry.computeBoundingSphere();
    }
  }
}

/**
 * Applies the frame-`t` 4×4 pose to every loaded object and refreshes its OBB
 * wireframe. Offsets use each object's manifest index, not its position in
 * state.objects, because objects that failed to load are absent from the
 * latter while the binaries still contain their slots.
 */
function updateObjectsForFrame(t) {
  if (!state.poses || state.objects.length === 0) return;
  const N = state.posesShape[1];
  const strideObj = 16;
  const strideFrame = strideObj * N;
  const showObj = state.toggle.objects !== false;
  const showObb = state.toggle.obb !== false;

  for (const o of state.objects) {
    const off = t * strideFrame + o.index * strideObj;
    // No pose slot for this object (shape/count mismatch): hide it instead of
    // reading a neighbouring object's or frame's matrix.
    if (o.index >= N || off + strideObj > state.poses.length) {
      o.pts.visible = false;
      o.obb.visible = false;
      continue;
    }

    // Numpy float32 .tofile() is row-major; Matrix4.set takes row-major args.
    o.pts.matrix.set(...state.poses.subarray(off, off + strideObj));
    o.pts.matrixWorldNeedsUpdate = true;
    o.pts.visible = showObj;

    // OBB: 8 mesh-local corners → push through same per-frame matrix.
    const cornersOff = o.index * 24;
    const hasCorners = state.obbs !== null && cornersOff + 24 <= state.obbs.length;
    if (hasCorners) {
      for (let k = 0; k < 8; k++) {
        obbScratchCorner
          .fromArray(state.obbs, cornersOff + k * 3)
          .applyMatrix4(o.pts.matrix)
          .toArray(obbScratchCorners, k * 3);
      }
      const obbAttr = o.obb.geometry.getAttribute('position');
      const obbArr = obbAttr.array;
      OBB_EDGES.forEach(([a, b], ei) => {
        for (let c = 0; c < 3; c++) {
          obbArr[ei * 6 + c] = obbScratchCorners[a * 3 + c];
          obbArr[ei * 6 + 3 + c] = obbScratchCorners[b * 3 + c];
        }
      });
      obbAttr.needsUpdate = true;
    }
    // Without corner data the wireframe would be a degenerate box at the
    // origin, so keep it hidden.
    o.obb.visible = showObb && hasCorners;
  }
}

// ── playback synchronisation ──────────────────────────────────

/**
 * Points the depth/flow/annotated <video> elements and the YouTube iframe at
 * this sample, and makes depth.mp4's timeupdate event drive setFrame().
 * @param {object} scene - parsed scene.json
 */
function setupVideos(scene) {
  const fps = resolveFps(scene);
  const dv = document.getElementById('depth-video');
  const fv = document.getElementById('flow-video');
  const av = document.getElementById('annotated-video');
  dv.src = `${ASSET_BASE}/${scene.reconstruction.depth_video}`;
  fv.src = `${ASSET_BASE}/${scene.reconstruction.flow_video}`;
  // Phase 1: no annotated mp4. Re-use depth.mp4 in the spare pane.
  av.src = `${ASSET_BASE}/${scene.reconstruction.depth_video}`;

  const yt = document.getElementById('yt-iframe');
  const v = scene.video_source;
  if (v && v.youtube_id) {
    const start = Math.floor(v.start_seconds || 0);
    const end = Math.ceil(v.end_seconds || (start + scene.duration));
    // autoplay=1 + mute=1 → satisfies browser autoplay gate
    // loop=1 + playlist=  → YouTube quirk: single-video loops require playlist
    // Playback re-enters at `start` once it reaches `end`.
    const params = new URLSearchParams({
      autoplay: '1',
      mute: '1',
      loop: '1',
      playlist: v.youtube_id,
      start: String(start),
      end: String(end),
      controls: '1',
      rel: '0',
      modestbranding: '1',
      playsinline: '1',
    });
    yt.src = `https://www.youtube.com/embed/${v.youtube_id}?${params.toString()}`;
  }

  // Master clock = depth.mp4 (encoded at native fps to match real-time).
  dv.addEventListener('timeupdate', () => {
    setFrame(Math.floor(dv.currentTime * fps));
  });
}

/**
 * Wires the frame slider (seeks both videos and shows the chosen frame) and
 * the play/pause button (toggles both videos based on depth.mp4's state).
 */
function setupSlider() {
  const slider = document.getElementById('frame');
  const dv = document.getElementById('depth-video');
  const fv = document.getElementById('flow-video');
  slider.max = String(state.T - 1);

  slider.addEventListener('input', () => {
    const t = parseInt(slider.value, 10);
    const seconds = t / state.fps;
    dv.currentTime = seconds;
    fv.currentTime = seconds;
    setFrame(t);
  });

  document.getElementById('play').addEventListener('click', () => {
    if (dv.paused) {
      // play() returns a promise that rejects when playback is blocked or
      // interrupted; log it rather than leaving an unhandled rejection.
      for (const video of [dv, fv]) {
        video.play()?.catch((err) => console.warn('video play() rejected', err));
      }
    } else {
      dv.pause();
      fv.pause();
    }
  });
}

/**
 * Binds the visibility checkboxes to state.toggle. Checkboxes missing from
 * the page are skipped, leaving that toggle at its current value.
 */
function setupGUI() {
  // state.toggle initialised in `state` declaration above.
  const ids = {
    mesh: 't-mesh',
    skeleton: 't-skeleton',
    joints: 't-joints',
    objects: 't-objects',
    obb: 't-obb',
  };

  const apply = () => {
    for (const [key, id] of Object.entries(ids)) {
      const el = document.getElementById(id);
      if (el) state.toggle[key] = el.checked;
    }
    // Re-render the current frame so the new visibility takes effect.
    setFrame(parseInt(document.getElementById('frame').value, 10) || 0);
  };

  for (const [key, id] of Object.entries(ids)) {
    const el = document.getElementById(id);
    if (!el) continue;
    el.checked = state.toggle[key];
    el.addEventListener('change', apply);
  }
}

/** Render loop: advances damped controls, draws, and schedules the next frame. */
function animate() {
  state.three.controls.update();
  state.three.renderer.render(state.three.scene, state.three.camera);
  requestAnimationFrame(animate);
}

// ── helpers ──────────────────────────────────────────────────

/**
 * GETs `url` and parses the body as JSON.
 * @param {string} url
 * @returns {Promise<any>}
 * @throws {Error} `${url}: ${status}` on a non-2xx response
 */
async function fetchJSON(url) {
  const r = await fetch(url);
  if (!r.ok) throw new Error(`${url}: ${r.status}`);
  return r.json();
}

/**
 * GETs `url` and returns the body as an ArrayBuffer.
 * @param {string} url
 * @returns {Promise<ArrayBuffer>}
 * @throws {Error} `${url}: ${status}` on a non-2xx response
 */
async function fetchBin(url) {
  const r = await fetch(url);
  if (!r.ok) throw new Error(`${url}: ${r.status}`);
  return r.arrayBuffer();
}

main().catch((e) => {
  console.error(e);
  document.getElementById('sub').textContent = `error: ${e.message}`;
});