Spaces:
Running
Running
// Phase 1 minimal viewer:
// - white scene background to match viser's paper theme
// - hand MESH (778v + 1538f) and SKELETON (21 joints + bones) toggleable
// - YouTube iframe drives playback (autoplay + clip-loop, muted)
// - depth.mp4 + flow.mp4 sync via the same wall-clock as YouTube (fps now
//   derived from manifest.start_sec/end_sec → matches real time)
// Object point clouds intentionally NOT rendered in Phase 1.
| import * as THREE from 'three'; | |
| import { OrbitControls } from 'three/addons/controls/OrbitControls.js'; | |
| import { PLYLoader } from 'three/addons/loaders/PLYLoader.js'; | |
// ── params ────────────────────────────────────────────────────
// Both values can be overridden from the page URL (`?id=…&base=…`).
const u = new URL(window.location.href);
// `||` rather than `??` so an empty `?id=` also falls back to the default.
const SAMPLE_ID = u.searchParams.get('id') || '-4eiU1SHpzY_319.4_323.4';
// Directory from which scene.json and every other asset path is resolved.
const ASSET_BASE = u.searchParams.get('base') || `../_dist/samples/${SAMPLE_ID}`;
// MANO 21-joint kinematic edges (matches exo2ego_pipeline/config.py).
// Joint 0 is the wrist; each finger is a chain of four joints (1-4, 5-8,
// 9-12, 13-16, 17-20) whose first bone starts at the wrist. Rows are grouped
// four bones per finger because buildHandObjects derives the finger index
// from `floor(edgeIndex / 4)`.
const HAND_EDGES = [
  [0, 1], [1, 2], [2, 3], [3, 4],
  [0, 5], [5, 6], [6, 7], [7, 8],
  [0, 9], [9, 10], [10, 11], [11, 12],
  [0, 13], [13, 14], [14, 15], [15, 16],
  [0, 17], [17, 18], [18, 19], [19, 20],
];
// 5 fingers × 4 bones each, deeper-saturated to read on light bg.
// Each entry is a linear [r, g, b] triple in 0..1; buildHandObjects copies it
// straight into the skeleton's per-vertex `color` attribute.
// NOTE(review): the numeric triples were missing from this copy of the file
// (every row was an empty array, which produced NaN vertex colours). The
// values below are stand-ins chosen to match the hue named on each row —
// confirm against the upstream source.
const FINGER_COLOURS = [
  [0.84, 0.15, 0.16], // thumb · red
  [0.93, 0.49, 0.07], // index · orange
  [0.17, 0.63, 0.17], // middle · green
  [0.80, 0.65, 0.05], // ring · yellow
  [0.12, 0.40, 0.78], // pinky · blue
];
// ── state ─────────────────────────────────────────────────────
// Single mutable store shared by every function in this module. main() fills
// in the data fields, initThree() fills `three`, the build* functions fill
// `hands` / `objects`, and setupGUI() keeps `toggle` in sync with the page.
const state = {
  scene: null,        // parsed scene.json
  fps: 10,            // overwritten with scene.fps once scene.json is loaded
  T: 0,               // number of frames (scene.stats.n_frames)
  joints: null,       // Float32Array, flattened
  jointsShape: null,  // [T, MAX_HANDS, 21, 3]
  verts: null,        // Float32Array, flattened
  vertsShape: null,   // [T, MAX_HANDS, 778, 3]
  faces: null,        // Int32Array (1538 * 3)
  hands: [],          // [{ mesh, skeleton, dots }]
  objects: [],        // [{ pts, obb, oid }]
  poses: null,        // Float32Array (T, N_obj, 4, 4) row-major
  posesShape: null,
  obbs: null,         // Float32Array (N_obj, 8, 3)
  three: null,        // { scene, camera, renderer, controls }
  // Initial visibility of each layer; mirrored onto the checkboxes by setupGUI().
  toggle: { mesh: true, skeleton: true, joints: true, objects: true, obb: false },
};
// ── bootstrap ─────────────────────────────────────────────────
/**
 * Entry point. Sets up the three.js stage, downloads scene.json plus the
 * binary hand/object buffers it references, builds the scene graph, wires the
 * videos, slider and toggles, shows frame 0 and starts the render loop.
 *
 * @returns {Promise<void>} Rejects if scene.json or a required buffer cannot
 *   be fetched (see fetchJSON / fetchBin).
 */
async function main() {
  document.getElementById('sub').textContent =
    `id = ${SAMPLE_ID} · base = ${ASSET_BASE}`;
  initThree();

  const scene = await fetchJSON(`${ASSET_BASE}/scene.json`);
  state.scene = scene;
  state.fps = scene.fps;
  state.T = scene.stats.n_frames;
  document.getElementById('scene-meta').textContent = JSON.stringify({
    id: scene.id,
    // Optional, exactly as setupVideos() treats it; JSON.stringify drops the
    // key when it is undefined instead of the whole page failing to load.
    youtube_id: scene.video_source?.youtube_id,
    fps: scene.fps,
    duration_s: scene.duration,
    n_frames: scene.stats.n_frames,
    n_objects: scene.stats.n_objects,
    has_grasp: scene.stats.has_grasp,
    license: scene._license,
  }, null, 2);

  // Load binary assets. Joints and verts are required; the rest are optional
  // and resolve to null when scene.json does not name them.
  const r = scene.reconstruction;
  const fetchOptional = (name) =>
    (name ? fetchBin(`${ASSET_BASE}/${name}`) : Promise.resolve(null));
  const [jBuf, vBuf, fBuf, pBuf, oBuf] = await Promise.all([
    fetchBin(`${ASSET_BASE}/${r.hand_joints}`),
    fetchBin(`${ASSET_BASE}/${r.hand_verts}`),
    fetchOptional(r.hand_faces),
    fetchOptional(r.object_pose_traj),
    fetchOptional(r.object_obb),
  ]);

  state.joints = new Float32Array(jBuf);
  state.jointsShape = [state.T, 2, 21, 3];
  state.verts = new Float32Array(vBuf);
  state.vertsShape = [state.T, 2, 778, 3];
  // The shapes above are hard-coded, so surface a mismatch with the actual
  // buffers instead of silently indexing the wrong floats in setFrame().
  const count = (shape) => shape.reduce((acc, n) => acc * n, 1);
  if (state.joints.length !== count(state.jointsShape)) {
    console.warn(`hand_joints: expected ${count(state.jointsShape)} floats, got ${state.joints.length}`);
  }
  if (state.verts.length !== count(state.vertsShape)) {
    console.warn(`hand_verts: expected ${count(state.vertsShape)} floats, got ${state.verts.length}`);
  }
  if (fBuf) state.faces = new Int32Array(fBuf);
  if (pBuf) {
    state.poses = new Float32Array(pBuf);
    state.posesShape = r.object_pose_shape;
  }
  if (oBuf) state.obbs = new Float32Array(oBuf);

  buildHandObjects();
  await buildObjectMeshes(scene);
  fitCameraToHand();

  // Wire video + slider
  setupVideos(scene);
  setupSlider();
  setupGUI();
  setFrame(0);
  animate();
}
// ── three boilerplate ─────────────────────────────────────────
/**
 * Creates the renderer (bound to `#three-canvas`), scene, camera, orbit
 * controls, lights and helper gizmos, installs a window-resize handler, and
 * publishes `{ scene, camera, renderer, controls }` as `state.three`.
 */
function initThree() {
  const canvas = document.getElementById('three-canvas');
  const stage = canvas.parentElement;
  const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: false });
  renderer.setPixelRatio(window.devicePixelRatio);

  const scene = new THREE.Scene();
  // Match viser's paper theme: warm off-white.
  scene.background = new THREE.Color('#f6f4ee');

  const camera = new THREE.PerspectiveCamera(35, 16 / 10, 0.005, 50);
  camera.position.set(0, -0.3, 1.2);
  // Gravity in our pipeline is roughly -Y; align Three.js up accordingly so
  // OrbitControls "up" feels right (we'll re-target to hand center later).
  camera.up.set(0, -1, 0);

  const controls = new OrbitControls(camera, canvas);
  controls.target.set(0, 0, 1.0);
  controls.enableDamping = true;
  controls.dampingFactor = 0.08;

  // Lighting: strong ambient plus a single soft directional light, so lit
  // materials stay readable against the light background.
  scene.add(new THREE.AmbientLight(0xffffff, 0.85));
  const dir = new THREE.DirectionalLight(0xffffff, 0.55);
  dir.position.set(0.5, -0.8, 0.5);
  scene.add(dir);

  // Ground grid (matches viser's gravity-aligned grid). XZ plane.
  const grid = new THREE.GridHelper(2, 20, 0x999999, 0xcccccc);
  grid.material.opacity = 0.6;
  grid.material.transparent = true;
  scene.add(grid);

  // Axes helper at world origin
  scene.add(new THREE.AxesHelper(0.15));

  const resize = () => {
    const w = stage.clientWidth;
    const h = stage.clientHeight;
    // A hidden or not-yet-laid-out stage reports 0×0; w / h would then be
    // NaN or Infinity and corrupt the projection matrix. Keep the previous
    // size until a real one is available.
    if (w === 0 || h === 0) return;
    renderer.setSize(w, h, false);
    camera.aspect = w / h;
    camera.updateProjectionMatrix();
  };
  window.addEventListener('resize', resize);
  resize();

  state.three = { scene, camera, renderer, controls };
}
/**
 * Creates the three renderables for each of the two hand slots — a surface
 * mesh, a colour-coded skeleton and a joint point cloud — adds them to the
 * scene hidden, and records them in `state.hands`. Geometry buffers are only
 * allocated here; setFrame() fills in positions and toggles visibility.
 */
function buildHandObjects() {
  const HAND_SLOTS = 2;
  const VERT_COUNT = 778;
  const JOINT_COUNT = 21;
  const skeletonFloats = HAND_EDGES.length * 2 * 3;

  // edge colours: 20 segments × 2 endpoints × 3 channels
  const edgeColors = new Float32Array(skeletonFloats);
  HAND_EDGES.forEach((_edge, ei) => {
    // HAND_EDGES lists four bones per finger, so the finger is ei / 4.
    const rgb = FINGER_COLOURS[Math.floor(ei / 4) % 5];
    for (let end = 0; end < 2; end++) {
      const base = (ei * 2 + end) * 3;
      edgeColors[base + 0] = rgb[0];
      edgeColors[base + 1] = rgb[1];
      edgeColors[base + 2] = rgb[2];
    }
  });

  for (let h = 0; h < HAND_SLOTS; h++) {
    // ── mesh ──
    const meshGeom = new THREE.BufferGeometry();
    meshGeom.setAttribute(
      'position',
      new THREE.BufferAttribute(new Float32Array(VERT_COUNT * 3), 3),
    );
    if (state.faces) {
      meshGeom.setIndex(new THREE.Uint32BufferAttribute(state.faces.slice(), 1));
    }
    const meshMat = new THREE.MeshStandardMaterial({
      color: h === 0 ? 0xc89089 : 0x8aa3c0, // skin-pink left, blue-grey right
      roughness: 0.9,
      metalness: 0.0,
      side: THREE.DoubleSide,
      transparent: true,
      opacity: 0.92,
      flatShading: false,
    });
    const mesh = new THREE.Mesh(meshGeom, meshMat);
    mesh.visible = false;
    state.three.scene.add(mesh);

    // ── skeleton ──
    const skelGeom = new THREE.BufferGeometry();
    skelGeom.setAttribute(
      'position',
      new THREE.BufferAttribute(new Float32Array(skeletonFloats), 3),
    );
    skelGeom.setAttribute('color', new THREE.BufferAttribute(edgeColors.slice(), 3));
    const skelMat = new THREE.LineBasicMaterial({ vertexColors: true });
    const skeleton = new THREE.LineSegments(skelGeom, skelMat);
    skeleton.visible = false;
    state.three.scene.add(skeleton);

    // ── joint dots ──
    const dotGeom = new THREE.BufferGeometry();
    dotGeom.setAttribute(
      'position',
      new THREE.BufferAttribute(new Float32Array(JOINT_COUNT * 3), 3),
    );
    const dotMat = new THREE.PointsMaterial({
      color: 0x222222,
      size: 0.013,
      sizeAttenuation: true,
    });
    const dots = new THREE.Points(dotGeom, dotMat);
    dots.visible = false;
    state.three.scene.add(dots);

    state.hands.push({ mesh, skeleton, dots });
  }
}
// Load each object's PLY, scale by scale_correction (the missing factor that
// brings canonical points into world units), build a Points + OBB wireframe.
// Per-frame we drive the matrix from object_pose.bin; at rest the OBB sits at
// the canonical box corners since obbs[oi] are stored in mesh-local frame.
/**
 * @param {object} scene Parsed scene.json; reads `scene.reconstruction.objects`
 *   (each entry: `ply`, `id`, optional `scale_correction`, `color_hex`).
 * @returns {Promise<void>} Resolves once every object has an entry in
 *   `state.objects`. Never rejects on a failed PLY download.
 */
async function buildObjectMeshes(scene) {
  const loader = new PLYLoader();
  const objs = scene.reconstruction.objects || [];

  // Download all point clouds concurrently; a failure is logged and mapped to
  // null rather than aborting the remaining objects.
  const geoms = await Promise.all(objs.map(async (o) => {
    try {
      return await loader.loadAsync(`${ASSET_BASE}/${o.ply}`);
    } catch (e) {
      console.warn(`obj ${o.id}: PLY load failed`, e);
      return null;
    }
  }));

  objs.forEach((o, oi) => {
    // setFrame() addresses pose and OBB slots by position in state.objects,
    // so a failed load must still occupy its slot. Skipping it would shift
    // every later object onto its neighbour's pose. Use an empty cloud.
    let geom = geoms[oi];
    if (!geom) {
      geom = new THREE.BufferGeometry();
      geom.setAttribute('position', new THREE.BufferAttribute(new Float32Array(0), 3));
    }

    // Scale geometry vertices by scale_correction so per-frame pose_R/pose_t
    // (which expects canonical-scaled-to-world points) is applied correctly.
    const sc = o.scale_correction ?? 1.0;
    if (Math.abs(sc - 1.0) > 1e-6) {
      const pos = geom.getAttribute('position');
      for (let i = 0; i < pos.count * 3; i++) pos.array[i] *= sc;
      pos.needsUpdate = true;
      geom.computeBoundingSphere();
    }

    // Prefer per-point colours from the PLY; otherwise tint with color_hex.
    const hasColor = geom.getAttribute('color') !== undefined;
    const mat = new THREE.PointsMaterial({
      size: 0.003,
      sizeAttenuation: true,
      vertexColors: hasColor,
      color: hasColor ? 0xffffff : new THREE.Color(o.color_hex),
    });
    const pts = new THREE.Points(geom, mat);
    // The matrix is written directly from the pose buffer every frame.
    pts.matrixAutoUpdate = false;
    state.three.scene.add(pts);

    // OBB wireframe: 12 edges × 2 endpoints × xyz, filled in by setFrame().
    const obbGeom = new THREE.BufferGeometry();
    obbGeom.setAttribute(
      'position',
      new THREE.BufferAttribute(new Float32Array(12 * 2 * 3), 3),
    );
    const obbMat = new THREE.LineBasicMaterial({ color: new THREE.Color(o.color_hex) });
    const obb = new THREE.LineSegments(obbGeom, obbMat);
    obb.visible = false;
    state.three.scene.add(obb);

    state.objects.push({ pts, obb, oid: o.id, color_hex: o.color_hex });
  });
}
// 8-corner index pairs forming the 12 edges of a bounding box: the loop
// 0-1-2-3, the loop 4-5-6-7, and the four connectors k ↔ k+4.
// NOTE(review): this is a fixed table, not a convex-hull computation. It
// assumes the producer (SAM3D obb_corners) emits corners 0-3 around one face
// and 4-7 around the opposite face in matching order — confirm, because a
// different permutation would draw diagonals instead of box edges.
const OBB_EDGES = [
  [0, 1], [1, 2], [2, 3], [3, 0],
  [4, 5], [5, 6], [6, 7], [7, 4],
  [0, 4], [1, 5], [2, 6], [3, 7],
];
/**
 * Centres OrbitControls on the per-axis median wrist position over all frames
 * and hand slots, and places the camera at a fixed offset from that point.
 * Does nothing when no wrist sample has finite coordinates.
 */
function fitCameraToHand() {
  const [T, H] = state.jointsShape;
  const floatsPerHand = 21 * 3;
  const floatsPerFrame = floatsPerHand * H;

  const xs = [];
  const ys = [];
  const zs = [];
  for (let t = 0; t < T; t++) {
    for (let h = 0; h < H; h++) {
      const off = t * floatsPerFrame + h * floatsPerHand; // wrist = joint 0
      const x = state.joints[off];
      const y = state.joints[off + 1];
      const z = state.joints[off + 2];
      // Samples with any non-finite coordinate are left out of the median.
      if (Number.isFinite(x) && Number.isFinite(y) && Number.isFinite(z)) {
        xs.push(x);
        ys.push(y);
        zs.push(z);
      }
    }
  }
  if (xs.length === 0) return;

  // Upper median of a sorted copy; the input array is not mutated.
  const median = (values) =>
    [...values].sort((p, q) => p - q)[Math.floor(values.length / 2)];
  const cx = median(xs);
  const cy = median(ys);
  const cz = median(zs);

  const { controls, camera } = state.three;
  controls.target.set(cx, cy, cz);
  // Position camera ~0.6m back along -Z, slightly above hand (in viser's
  // gravity-up frame, "above" = -Y).
  camera.position.set(cx, cy - 0.25, cz - 0.6);
  controls.update();
}
// ── per-frame update ──────────────────────────────────────────
/**
 * Displays frame `t`: copies that frame's joints/vertices into the hand
 * geometries, writes each object's pose matrix and OBB wireframe, applies the
 * visibility toggles, and updates the slider and frame label.
 *
 * @param {number} t Requested frame index. Rounded and clamped to
 *   [0, state.T - 1]; a non-finite value is treated as 0.
 */
function setFrame(t) {
  // NaN would survive Math.min/Math.max and poison every buffer offset below.
  const frame = Number.isFinite(t)
    ? Math.max(0, Math.min(state.T - 1, Math.round(t)))
    : 0;

  const H = state.jointsShape[1];
  const J = 21;
  const V = 778;
  const jointFloatsPerHand = J * 3;
  const jointFloatsPerFrame = jointFloatsPerHand * H;
  const vertFloatsPerHand = V * 3;
  const vertFloatsPerFrame = vertFloatsPerHand * H;

  const showMesh = state.toggle.mesh !== false;
  const showSkel = state.toggle.skeleton !== false;
  const showDots = state.toggle.joints !== false;
  // Without faces (or verts) the mesh geometry is never filled in, so keep it
  // hidden rather than drawing its all-zero placeholder positions.
  const hasMesh = Boolean(state.verts && state.faces);

  for (let h = 0; h < H; h++) {
    const offJ = frame * jointFloatsPerFrame + h * jointFloatsPerHand;
    const offV = frame * vertFloatsPerFrame + h * vertFloatsPerHand;
    // A hand slot counts as absent in this frame when its wrist x is not
    // finite (presumably NaN-padded by the exporter — confirm).
    const valid = Number.isFinite(state.joints[offJ]);
    const hand = state.hands[h];
    hand.mesh.visible = valid && showMesh && hasMesh;
    hand.skeleton.visible = valid && showSkel;
    hand.dots.visible = valid && showDots;
    if (!valid) continue;

    // skeleton edges: two endpoints (6 floats) per bone
    const skelAttr = hand.skeleton.geometry.getAttribute('position');
    HAND_EDGES.forEach(([a, b], ei) => {
      for (let c = 0; c < 3; c++) {
        skelAttr.array[ei * 6 + c] = state.joints[offJ + a * 3 + c];
        skelAttr.array[ei * 6 + 3 + c] = state.joints[offJ + b * 3 + c];
      }
    });
    skelAttr.needsUpdate = true;

    // joint dots: the 21 joints are contiguous, so copy them in one go
    const dotAttr = hand.dots.geometry.getAttribute('position');
    dotAttr.array.set(state.joints.subarray(offJ, offJ + jointFloatsPerHand));
    dotAttr.needsUpdate = true;

    // mesh verts
    if (hasMesh) {
      const vertAttr = hand.mesh.geometry.getAttribute('position');
      vertAttr.array.set(state.verts.subarray(offV, offV + vertFloatsPerHand));
      vertAttr.needsUpdate = true;
      hand.mesh.geometry.computeVertexNormals();
      hand.mesh.geometry.computeBoundingSphere();
    }
  }

  // ── objects ──
  if (state.poses && state.objects.length) {
    const FLOATS_PER_POSE = 16;
    // Poses are laid out [T, N_obj, 4, 4]. Prefer the declared object count;
    // if the shape is missing, recover it from the buffer length.
    const N = state.posesShape?.[1]
      ?? Math.round(state.poses.length / (Math.max(state.T, 1) * FLOATS_PER_POSE));
    const floatsPerFrame = FLOATS_PER_POSE * N;
    const showObj = state.toggle.objects !== false;
    const showObb = state.toggle.obb !== false;
    // Scratch storage reused for every object to avoid per-frame allocation.
    const corner = new THREE.Vector3();
    const xformed = new Float32Array(8 * 3);

    state.objects.forEach((o, oi) => {
      const off = frame * floatsPerFrame + oi * FLOATS_PER_POSE;
      const p = state.poses;
      // Numpy float32 .tofile() is row-major; Matrix4.set takes row-major args.
      o.pts.matrix.set(
        p[off + 0], p[off + 1], p[off + 2], p[off + 3],
        p[off + 4], p[off + 5], p[off + 6], p[off + 7],
        p[off + 8], p[off + 9], p[off + 10], p[off + 11],
        p[off + 12], p[off + 13], p[off + 14], p[off + 15],
      );
      o.pts.matrixWorldNeedsUpdate = true;
      o.pts.visible = showObj;

      // OBB: 8 mesh-local corners → push through same per-frame matrix.
      if (state.obbs) {
        const cornersOff = oi * 24;
        for (let k = 0; k < 8; k++) {
          corner.set(
            state.obbs[cornersOff + k * 3 + 0],
            state.obbs[cornersOff + k * 3 + 1],
            state.obbs[cornersOff + k * 3 + 2],
          );
          corner.applyMatrix4(o.pts.matrix);
          xformed[k * 3 + 0] = corner.x;
          xformed[k * 3 + 1] = corner.y;
          xformed[k * 3 + 2] = corner.z;
        }
        const obbAttr = o.obb.geometry.getAttribute('position');
        OBB_EDGES.forEach(([a, b], ei) => {
          for (let c = 0; c < 3; c++) {
            obbAttr.array[ei * 6 + c] = xformed[a * 3 + c];
            obbAttr.array[ei * 6 + 3 + c] = xformed[b * 3 + c];
          }
        });
        obbAttr.needsUpdate = true;
      }
      o.obb.visible = showObb;
    });
  }

  document.getElementById('frame').value = String(frame);
  document.getElementById('frame-label').textContent = `${frame + 1} / ${state.T}`;
}
// ── playback synchronisation ──────────────────────────────────
/**
 * Points the three <video> panes at the sample's mp4s, configures the YouTube
 * embed for a muted, looping clip, and slaves the 3D frame to the depth
 * video's playback position.
 *
 * @param {object} scene Parsed scene.json; reads `fps`, `duration`,
 *   `reconstruction.depth_video`, `reconstruction.flow_video` and the
 *   optional `video_source` ({ youtube_id, start_seconds, end_seconds }).
 */
function setupVideos(scene) {
  const fps = scene.fps;
  const dv = document.getElementById('depth-video');
  const fv = document.getElementById('flow-video');
  const av = document.getElementById('annotated-video');
  dv.src = `${ASSET_BASE}/${scene.reconstruction.depth_video}`;
  fv.src = `${ASSET_BASE}/${scene.reconstruction.flow_video}`;
  // Phase 1: no annotated mp4. Re-use depth.mp4 in the spare pane.
  av.src = `${ASSET_BASE}/${scene.reconstruction.depth_video}`;

  const yt = document.getElementById('yt-iframe');
  const v = scene.video_source;
  if (v && v.youtube_id) {
    const start = Math.floor(v.start_seconds || 0);
    const end = Math.ceil(v.end_seconds || (start + scene.duration));
    // autoplay=1 + mute=1 → satisfies browser autoplay gate
    // loop=1 + playlist=<id> → YouTube quirk: single-video loops require playlist
    // Playback re-enters at `start` once it reaches `end`.
    const params = new URLSearchParams({
      autoplay: '1', mute: '1', loop: '1', playlist: v.youtube_id,
      start: String(start),
      controls: '1', rel: '0', modestbranding: '1', playsinline: '1',
    });
    // With neither end_seconds nor duration available `end` is NaN; omit the
    // parameter instead of sending the literal string "NaN".
    if (Number.isFinite(end)) params.set('end', String(end));
    yt.src = `https://www.youtube.com/embed/${v.youtube_id}?${params.toString()}`;
  }

  // Master clock = depth.mp4 (encoded at native fps to match real-time).
  // The epsilon absorbs float error: the slider seeks to t / fps, and
  // (t / fps) * fps can come back a hair below t, which a bare floor() would
  // report as frame t - 1.
  const FRAME_EPS = 1e-6;
  const syncTo = (seconds) => setFrame(Math.floor(seconds * fps + FRAME_EPS));

  // timeupdate is always registered; it also covers seeks while paused.
  dv.addEventListener('timeupdate', () => syncTo(dv.currentTime));

  // timeupdate may fire only a few times per second, so where the browser
  // offers per-presented-frame callbacks use them as well to avoid skipping
  // 3D frames during playback.
  if (typeof dv.requestVideoFrameCallback === 'function') {
    const onVideoFrame = (_now, metadata) => {
      syncTo(metadata.mediaTime);
      dv.requestVideoFrameCallback(onVideoFrame);
    };
    dv.requestVideoFrameCallback(onVideoFrame);
  }
}
/**
 * Wires the `#frame` range input (scrubbing seeks the depth and flow videos
 * and redraws immediately) and the `#play` button (toggles both videos).
 */
function setupSlider() {
  const slider = document.getElementById('frame');
  const dv = document.getElementById('depth-video');
  const fv = document.getElementById('flow-video');
  slider.max = String(state.T - 1);

  slider.addEventListener('input', () => {
    const t = Number.parseInt(slider.value, 10);
    const seconds = t / state.fps;
    // Assigning a non-finite currentTime throws, so skip the seek when fps is
    // missing or zero and still redraw the requested frame.
    if (Number.isFinite(seconds)) {
      dv.currentTime = seconds;
      fv.currentTime = seconds;
    }
    setFrame(t);
  });

  document.getElementById('play').addEventListener('click', () => {
    if (dv.paused) {
      for (const video of [dv, fv]) {
        // play() returns a promise that rejects when playback is blocked or
        // interrupted; report it instead of leaving an unhandled rejection.
        video.play()?.catch((err) => console.warn('video play() failed', err));
      }
    } else {
      dv.pause();
      fv.pause();
    }
  });
}
/**
 * Binds the layer checkboxes to `state.toggle`: each present checkbox starts
 * at the value from `state.toggle`, and any change re-reads all of them and
 * redraws the current frame. Checkboxes absent from the page are ignored.
 */
function setupGUI() {
  // state.toggle initialised in `state` declaration above.
  const ids = {
    mesh: 't-mesh', skeleton: 't-skeleton', joints: 't-joints',
    objects: 't-objects', obb: 't-obb',
  };
  // Resolve each checkbox once and drop the ones the page does not have, so
  // initialisation and the change handler apply the same null guard.
  const boxes = Object.entries(ids)
    .map(([key, id]) => [key, document.getElementById(id)])
    .filter(([, el]) => el !== null);

  const apply = () => {
    for (const [key, el] of boxes) {
      state.toggle[key] = el.checked;
    }
    setFrame(Number.parseInt(document.getElementById('frame').value, 10) || 0);
  };

  for (const [key, el] of boxes) {
    el.checked = state.toggle[key];
    el.addEventListener('change', apply);
  }
}
/**
 * Render loop: advances the damped orbit controls, draws the scene, and
 * schedules itself for the next animation frame.
 */
function animate() {
  const { controls, renderer, scene, camera } = state.three;
  controls.update();
  renderer.render(scene, camera);
  requestAnimationFrame(animate);
}
// ── helpers ───────────────────────────────────────────────────
/**
 * Fetches `url` and parses the body as JSON.
 *
 * @param {string} url
 * @returns {Promise<any>} The parsed JSON value.
 * @throws {Error} `"<url>: <status>"` when the response is not ok.
 */
async function fetchJSON(url) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`${url}: ${response.status}`);
  }
  return response.json();
}
/**
 * Fetches `url` and returns the raw body.
 *
 * @param {string} url
 * @returns {Promise<ArrayBuffer>} The response bytes.
 * @throws {Error} `"<url>: <status>"` when the response is not ok.
 */
async function fetchBin(url) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`${url}: ${response.status}`);
  }
  return response.arrayBuffer();
}
// Start the viewer; any bootstrap failure is logged and shown in the `#sub`
// status line so a broken sample does not just leave a blank page.
main().catch((e) => {
  console.error(e);
  // Rejections are not guaranteed to be Error instances.
  document.getElementById('sub').textContent = `error: ${e?.message ?? e}`;
});