EgoInfinity / js /sample_viewer.minimal.js.bak
VectorW's picture
Initial commit
66d097c
Raw
History Blame Contribute Delete
19.5 kB
// Phase 1 minimal viewer:
// - white scene background to match viser's paper theme
// - hand MESH (778v + 1538f) and SKELETON (21 joints + bones) toggleable
// - YouTube iframe drives playback (autoplay + clip-loop, muted)
// - depth.mp4 + flow.mp4 sync via the same wall-clock as YouTube (fps now
//   derived from manifest.start_sec/end_sec → matches real time)
// Object point clouds intentionally NOT rendered in Phase 1.
import * as THREE from 'three';
import { OrbitControls } from 'three/addons/controls/OrbitControls.js';
import { PLYLoader } from 'three/addons/loaders/PLYLoader.js';
// ── params ────────────────────────────────────────────────────
// Parse the page URL once; both overrides below read from its query string.
const u = new URL(window.location.href);
// `?id=` selects the sample. A missing or empty value falls back to the
// hard-coded default id.
const SAMPLE_ID = u.searchParams.get('id') || '-4eiU1SHpzY_319.4_323.4';
// `?base=` overrides where assets are fetched from; otherwise the per-sample
// folder under ../_dist/samples/ is used.
const ASSET_BASE = u.searchParams.get('base') || `../_dist/samples/${SAMPLE_ID}`;
// MANO 21-joint kinematic edges (matches exo2ego_pipeline/config.py).
// Joint 0 is the wrist. Each finger is a chain of four joints whose first
// index is 1, 5, 9, 13 or 17, giving four bones per finger:
// wrist→k, k→k+1, k+1→k+2, k+2→k+3.
const HAND_EDGES = [1, 5, 9, 13, 17].flatMap((root) => [
  [0, root],
  [root, root + 1],
  [root + 1, root + 2],
  [root + 2, root + 3],
]);
// 5 fingers × 4 bones each, deeper-saturated to read on light bg.
// One [r, g, b] triple (components in 0..1) per finger. buildHandObjects picks
// the entry with Math.floor(edgeIndex / 4), so the order here must follow the
// finger order of HAND_EDGES.
const FINGER_COLOURS = [
[0.85, 0.18, 0.20], // thumb · red
[0.95, 0.55, 0.10], // index · orange
[0.20, 0.65, 0.25], // middle · green
[0.85, 0.75, 0.05], // ring · yellow
[0.18, 0.45, 0.85], // pinky · blue
];
// ── state ─────────────────────────────────────────────────────
// Shared mutable viewer state. Fields start empty and are filled in by main(),
// initThree(), buildHandObjects() and buildObjectMeshes(); setFrame() and the
// UI handlers read from it.
const state = {
scene: null, // parsed scene.json (set in main)
fps: 10, // overwritten with scene.fps in main
T: 0, // frame count, taken from scene.stats.n_frames
joints: null, // Float32Array over the hand_joints buffer
jointsShape: null, // [T, MAX_HANDS, 21, 3]
verts: null, // Float32Array over the hand_verts buffer
vertsShape: null, // [T, MAX_HANDS, 778, 3]
faces: null, // Int32Array (1538 * 3)
hands: [], // [{ mesh, skeleton, dots }]
objects: [], // [{ pts, obb, oid, color_hex }]
poses: null, // Float32Array (T, N_obj, 4, 4) row-major
posesShape: null, // copied from scene.reconstruction.object_pose_shape
obbs: null, // Float32Array (N_obj, 8, 3)
three: null, // { scene, camera, renderer, controls }, set by initThree
// Layer visibility flags; setupGUI mirrors them to the t-* checkboxes.
toggle: { mesh: true, skeleton: true, joints: true, objects: true, obb: false },
};
// ── bootstrap ─────────────────────────────────────────────────
/**
 * Page entry point: loads scene.json and its binary buffers into `state`,
 * builds the Three.js objects, wires the playback controls, shows frame 0 and
 * starts the render loop.
 *
 * @returns {Promise<void>} Resolves once the render loop has been started.
 * @throws {Error} If scene.json or a required binary asset cannot be fetched.
 */
async function main() {
  document.getElementById('sub').textContent =
    `id = ${SAMPLE_ID} · base = ${ASSET_BASE}`;
  initThree();

  const scene = await fetchJSON(`${ASSET_BASE}/scene.json`);
  state.scene = scene;
  state.fps = scene.fps;
  state.T = scene.stats.n_frames;
  document.getElementById('scene-meta').textContent = JSON.stringify({
    id: scene.id,
    // video_source is optional (setupVideos guards it too); JSON.stringify
    // simply omits the key when it is undefined.
    youtube_id: scene.video_source?.youtube_id,
    fps: scene.fps,
    duration_s: scene.duration,
    n_frames: scene.stats.n_frames,
    n_objects: scene.stats.n_objects,
    has_grasp: scene.stats.has_grasp,
    license: scene._license,
  }, null, 2);

  // Load binary assets. Hand joints/verts are required; the rest are optional
  // and resolve to null when the manifest does not list them.
  const r = scene.reconstruction;
  const fetchOptional = (rel) => (rel ? fetchBin(`${ASSET_BASE}/${rel}`) : null);
  const [jBuf, vBuf, fBuf, pBuf, oBuf] = await Promise.all([
    fetchBin(`${ASSET_BASE}/${r.hand_joints}`),
    fetchBin(`${ASSET_BASE}/${r.hand_verts}`),
    fetchOptional(r.hand_faces),
    fetchOptional(r.object_pose_traj),
    fetchOptional(r.object_obb),
  ]);

  state.joints = new Float32Array(jBuf);
  state.jointsShape = [state.T, 2, 21, 3];
  state.verts = new Float32Array(vBuf);
  state.vertsShape = [state.T, 2, 778, 3];

  // A frame-count / buffer-size mismatch otherwise shows up only as hands that
  // silently never render, so flag it early.
  const expectedJointFloats = state.T * 2 * 21 * 3;
  if (state.joints.length !== expectedJointFloats) {
    console.warn(
      `hand_joints: expected ${expectedJointFloats} floats for ${state.T} frames, got ${state.joints.length}`,
    );
  }

  if (fBuf) state.faces = new Int32Array(fBuf);
  if (pBuf) {
    state.poses = new Float32Array(pBuf);
    state.posesShape = r.object_pose_shape;
  }
  if (oBuf) state.obbs = new Float32Array(oBuf);

  buildHandObjects();
  await buildObjectMeshes(scene);
  fitCameraToHand();

  // Wire video + slider
  setupVideos(scene);
  setupSlider();
  setupGUI();
  setFrame(0);
  animate();
}
// ── three boilerplate ─────────────────────────────────────────
/**
 * Create the renderer, scene, camera, orbit controls, lights and helpers for
 * the `#three-canvas` element and store them in `state.three`.
 * Also installs a window `resize` listener that keeps the renderer size and
 * camera aspect in step with the canvas's parent element.
 */
function initThree() {
  const canvas = document.getElementById('three-canvas');
  const stage = canvas.parentElement;
  const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: false });
  renderer.setPixelRatio(window.devicePixelRatio);

  const scene = new THREE.Scene();
  // Match viser's paper theme: warm off-white.
  scene.background = new THREE.Color('#f6f4ee');

  const camera = new THREE.PerspectiveCamera(35, 16 / 10, 0.005, 50);
  camera.position.set(0, -0.3, 1.2);
  // Gravity in our pipeline is roughly -Y; align Three.js up accordingly so
  // OrbitControls "up" feels right (fitCameraToHand re-targets later).
  camera.up.set(0, -1, 0);

  const controls = new OrbitControls(camera, canvas);
  controls.target.set(0, 0, 1.0);
  controls.enableDamping = true;
  controls.dampingFactor = 0.08;

  // Lighting: strong ambient plus a single soft directional light, so the
  // hand meshes stay readable on the light background.
  scene.add(new THREE.AmbientLight(0xffffff, 0.85));
  const dir = new THREE.DirectionalLight(0xffffff, 0.55);
  dir.position.set(0.5, -0.8, 0.5);
  scene.add(dir);

  // Ground grid (matches viser's gravity-aligned grid). XZ plane.
  const grid = new THREE.GridHelper(2, 20, 0x999999, 0xcccccc);
  grid.material.opacity = 0.6;
  grid.material.transparent = true;
  scene.add(grid);

  // Axes helper at world origin
  scene.add(new THREE.AxesHelper(0.15));

  const resize = () => {
    const w = stage.clientWidth;
    const h = stage.clientHeight;
    // A hidden or collapsed stage reports 0×0; w / h would then be NaN or
    // Infinity and corrupt the projection matrix, so keep the previous size.
    if (w <= 0 || h <= 0) return;
    renderer.setSize(w, h, false);
    camera.aspect = w / h;
    camera.updateProjectionMatrix();
  };
  window.addEventListener('resize', resize);
  resize();

  state.three = { scene, camera, renderer, controls };
}
/**
 * Create the per-hand render objects (surface mesh, coloured bone segments,
 * joint dots) for two hands, add them to the scene hidden, and append
 * `{ mesh, skeleton, dots }` to `state.hands`. Positions start zeroed;
 * setFrame fills them in.
 */
function buildHandObjects() {
  // Per-endpoint bone colours: 20 segments × 2 endpoints × 3 channels.
  // Both endpoints of a segment share the colour of the finger it belongs to.
  const segmentColours = Float32Array.from(
    HAND_EDGES.flatMap((_edge, edgeIdx) => {
      const rgb = FINGER_COLOURS[Math.floor(edgeIdx / 4) % 5];
      return [...rgb, ...rgb];
    }),
  );

  const { scene } = state.three;
  const zeroedPositions = (vertexCount) =>
    new THREE.BufferAttribute(new Float32Array(vertexCount * 3), 3);
  const addHidden = (object3d) => {
    object3d.visible = false;
    scene.add(object3d);
    return object3d;
  };

  // Hand 0 is skin-pink, hand 1 blue-grey.
  [0xc89089, 0x8aa3c0].forEach((tint) => {
    // Surface mesh
    const surfaceGeometry = new THREE.BufferGeometry();
    surfaceGeometry.setAttribute('position', zeroedPositions(778));
    if (state.faces) {
      surfaceGeometry.setIndex(new THREE.Uint32BufferAttribute(state.faces.slice(), 1));
    }
    const surfaceMaterial = new THREE.MeshStandardMaterial({
      color: tint,
      roughness: 0.9,
      metalness: 0.0,
      side: THREE.DoubleSide,
      transparent: true,
      opacity: 0.92,
      flatShading: false,
    });
    const mesh = addHidden(new THREE.Mesh(surfaceGeometry, surfaceMaterial));

    // Bone segments (each hand gets its own copy of the colour buffer)
    const boneGeometry = new THREE.BufferGeometry();
    boneGeometry.setAttribute('position', zeroedPositions(HAND_EDGES.length * 2));
    boneGeometry.setAttribute('color', new THREE.BufferAttribute(segmentColours.slice(), 3));
    const boneMaterial = new THREE.LineBasicMaterial({ vertexColors: true });
    const skeleton = addHidden(new THREE.LineSegments(boneGeometry, boneMaterial));

    // Joint dots
    const dotGeometry = new THREE.BufferGeometry();
    dotGeometry.setAttribute('position', zeroedPositions(21));
    const dotMaterial = new THREE.PointsMaterial({
      color: 0x222222,
      size: 0.013,
      sizeAttenuation: true,
    });
    const dots = addHidden(new THREE.Points(dotGeometry, dotMaterial));

    state.hands.push({ mesh, skeleton, dots });
  });
}
/**
 * Load each object's PLY, scale it by `scale_correction` (the factor that
 * brings canonical points into world units) and build a Points cloud plus an
 * OBB wireframe for it. Per frame, setFrame drives the Points matrix from the
 * pose buffer and pushes the mesh-local OBB corners through the same matrix.
 *
 * Entry i of `state.objects` always corresponds to entry i of
 * `scene.reconstruction.objects`: setFrame uses that index as the slot into
 * the pose and OBB buffers, so an object whose PLY fails to load gets an empty
 * placeholder cloud instead of being dropped (dropping it would shift every
 * later object onto the wrong pose).
 *
 * @param {object} scene - Parsed scene.json; reads `reconstruction.objects`,
 *   each with `id`, `ply`, `color_hex` and optional `scale_correction`.
 * @returns {Promise<void>} Resolves once every object has been added.
 */
async function buildObjectMeshes(scene) {
  const loader = new PLYLoader();
  const objs = scene.reconstruction.objects || [];
  const loadPly = (url) =>
    new Promise((res, rej) => loader.load(url, res, undefined, rej));

  // Fetch all PLYs concurrently; a failed load resolves to null so one bad
  // asset neither rejects the batch nor disturbs the ordering.
  const geometries = await Promise.all(objs.map(async (o) => {
    try {
      return await loadPly(`${ASSET_BASE}/${o.ply}`);
    } catch (e) {
      console.warn(`obj ${o.id}: PLY load failed`, e);
      return null;
    }
  }));

  objs.forEach((o, oi) => {
    let geom = geometries[oi];
    if (!geom) {
      // Placeholder with zero points: renders nothing but keeps the slot.
      geom = new THREE.BufferGeometry();
      geom.setAttribute('position', new THREE.BufferAttribute(new Float32Array(0), 3));
    }

    // Scale geometry vertices by scale_correction so the per-frame pose
    // (which expects canonical-scaled-to-world points) is applied correctly.
    const sc = (o.scale_correction != null) ? o.scale_correction : 1.0;
    if (Math.abs(sc - 1.0) > 1e-6) {
      const pos = geom.getAttribute('position');
      for (let i = 0; i < pos.count * 3; i++) pos.array[i] *= sc;
      pos.needsUpdate = true;
      geom.computeBoundingSphere();
    }

    const hasColor = geom.getAttribute('color') !== undefined;
    const mat = new THREE.PointsMaterial({
      size: 0.003,
      sizeAttenuation: true,
      vertexColors: hasColor,
      // With per-vertex colours the material colour must stay white so it
      // does not tint them.
      color: hasColor ? 0xffffff : new THREE.Color(o.color_hex),
    });
    const pts = new THREE.Points(geom, mat);
    // The matrix is written directly by setFrame.
    pts.matrixAutoUpdate = false;
    state.three.scene.add(pts);

    // OBB wireframe: 12 edges × 2 endpoints, filled in by setFrame.
    const obbGeom = new THREE.BufferGeometry();
    obbGeom.setAttribute('position', new THREE.BufferAttribute(new Float32Array(12 * 2 * 3), 3));
    const obbMat = new THREE.LineBasicMaterial({ color: new THREE.Color(o.color_hex) });
    const obb = new THREE.LineSegments(obbGeom, obbMat);
    obb.visible = false;
    state.three.scene.add(obb);

    state.objects.push({ pts, obb, oid: o.id, color_hex: o.color_hex });
  });
}
// Index pairs that join 8 box corners into the 12 wireframe edges of an OBB:
// the loop 0-1-2-3, the loop 4-5-6-7, and the four links k → k+4.
// NOTE(review): this assumes corners 0..3 and 4..7 are two opposite faces
// listed in matching order. The SAM3D obb_corners permutation depends on the
// producer — confirm it matches, otherwise edges will cut across the box.
const OBB_EDGES = [
  ...[0, 4].flatMap((first) =>
    [0, 1, 2, 3].map((k) => [first + k, first + ((k + 1) % 4)])),
  ...[0, 1, 2, 3].map((k) => [k, k + 4]),
];
/**
 * Aim the orbit controls at the per-axis median wrist position over all
 * frames and hands, and park the camera at a fixed offset from that point.
 * Does nothing when no wrist sample has three finite coordinates.
 */
function fitCameraToHand() {
  const [frameCount, handCount] = state.jointsShape;
  const floatsPerHand = 21 * 3;

  // Collect wrist (joint 0) coordinates per axis, skipping non-finite samples.
  const perAxis = [[], [], []];
  for (let slot = 0; slot < frameCount * handCount; slot++) {
    const base = slot * floatsPerHand;
    const wrist = [state.joints[base], state.joints[base + 1], state.joints[base + 2]];
    if (wrist.every((value) => Number.isFinite(value))) {
      wrist.forEach((value, axis) => perAxis[axis].push(value));
    }
  }
  if (perAxis[0].length === 0) return;

  // For an even count this picks the upper of the two middle values.
  const medianOf = (values) => {
    const ascending = [...values].sort((lo, hi) => lo - hi);
    return ascending[Math.floor(ascending.length / 2)];
  };
  const [cx, cy, cz] = perAxis.map((values) => medianOf(values));

  const { controls, camera } = state.three;
  controls.target.set(cx, cy, cz);
  // Position camera ~0.6m back along -Z, slightly above hand (in viser's
  // gravity-up frame, "above" = -Y).
  camera.position.set(cx, cy - 0.25, cz - 0.6);
  controls.update();
}
// ── per-frame update ──────────────────────────────────────────
/**
 * Show frame `t`: copy that frame's hand joints / vertices and object poses
 * into the Three.js buffers, apply the visibility toggles, and update the
 * frame slider and label.
 *
 * @param {number} t - Requested frame index. Rounded and clamped to
 *   [0, state.T - 1]; NaN is treated as frame 0.
 */
function setFrame(t) {
  const requested = Math.round(t);
  // NaN passes straight through Math.min/Math.max and would turn every buffer
  // offset (and the slider value) into NaN, so pin it to the first frame.
  t = Number.isNaN(requested) ? 0 : Math.max(0, Math.min(state.T - 1, requested));

  const H = state.jointsShape[1];
  const J = 21;
  const V = 778;
  const jointsPerHand = J * 3;
  const jointsPerFrame = jointsPerHand * H;
  const vertsPerHand = V * 3;
  const vertsPerFrame = vertsPerHand * H;
  const showMesh = state.toggle?.mesh !== false;
  const showSkel = state.toggle?.skeleton !== false;
  const showDots = state.toggle?.joints !== false;

  for (let h = 0; h < H; h++) {
    const off_j = t * jointsPerFrame + h * jointsPerHand;
    const off_v = t * vertsPerFrame + h * vertsPerHand;
    // A hand is treated as absent in this frame when its first joint
    // coordinate is not finite.
    const valid = Number.isFinite(state.joints[off_j]);
    const hand = state.hands[h];
    hand.mesh.visible = valid && showMesh;
    hand.skeleton.visible = valid && showSkel;
    hand.dots.visible = valid && showDots;
    if (!valid) continue;

    // skeleton edges: two endpoints (6 floats) per bone
    const skelAttr = hand.skeleton.geometry.getAttribute('position');
    HAND_EDGES.forEach(([a, b], ei) => {
      for (let c = 0; c < 3; c++) {
        skelAttr.array[ei * 6 + c] = state.joints[off_j + a * 3 + c];
        skelAttr.array[ei * 6 + 3 + c] = state.joints[off_j + b * 3 + c];
      }
    });
    skelAttr.needsUpdate = true;

    // joint dots
    const dotAttr = hand.dots.geometry.getAttribute('position');
    for (let i = 0; i < jointsPerHand; i++) dotAttr.array[i] = state.joints[off_j + i];
    dotAttr.needsUpdate = true;

    // mesh verts
    if (state.verts && state.faces) {
      const vAttr = hand.mesh.geometry.getAttribute('position');
      for (let i = 0; i < vertsPerHand; i++) vAttr.array[i] = state.verts[off_v + i];
      vAttr.needsUpdate = true;
      hand.mesh.geometry.computeVertexNormals();
      hand.mesh.geometry.computeBoundingSphere();
    }
  }

  // ── objects ──
  if (state.poses && state.objects.length) {
    const FLOATS_PER_POSE = 16;
    // Objects per frame in the pose buffer. Prefer the manifest shape; when it
    // is missing, derive the count from the buffer length.
    const slots = state.posesShape?.[1]
      ?? Math.floor(state.poses.length / (FLOATS_PER_POSE * Math.max(1, state.T)));
    const frameBase = t * slots * FLOATS_PER_POSE;
    const showObj = state.toggle.objects !== false;
    const showObb = state.toggle.obb !== false;
    // Scratch storage shared by all objects in this call.
    const corner = state.obbs ? new THREE.Vector3() : null;
    const moved = new Float32Array(8 * 3);

    state.objects.forEach((o, oi) => {
      const off = frameBase + oi * FLOATS_PER_POSE;
      // An index past the per-frame slot count would read another frame's
      // matrix (or run off the buffer); hide the object instead.
      if (oi >= slots || off + FLOATS_PER_POSE > state.poses.length) {
        o.pts.visible = false;
        o.obb.visible = false;
        return;
      }

      // Numpy float32 .tofile() is row-major; Matrix4.set takes row-major args.
      o.pts.matrix.set(...state.poses.subarray(off, off + FLOATS_PER_POSE));
      o.pts.matrixWorldNeedsUpdate = true;
      o.pts.visible = showObj;

      // OBB: 8 mesh-local corners → push through the same per-frame matrix.
      const cornersOff = oi * 24;
      if (state.obbs && cornersOff + 24 <= state.obbs.length) {
        for (let k = 0; k < 8; k++) {
          corner.set(
            state.obbs[cornersOff + k * 3 + 0],
            state.obbs[cornersOff + k * 3 + 1],
            state.obbs[cornersOff + k * 3 + 2],
          );
          corner.applyMatrix4(o.pts.matrix);
          moved[k * 3 + 0] = corner.x;
          moved[k * 3 + 1] = corner.y;
          moved[k * 3 + 2] = corner.z;
        }
        const obbAttr = o.obb.geometry.getAttribute('position');
        OBB_EDGES.forEach(([a, b], ei) => {
          for (let c = 0; c < 3; c++) {
            obbAttr.array[ei * 6 + c] = moved[a * 3 + c];
            obbAttr.array[ei * 6 + 3 + c] = moved[b * 3 + c];
          }
        });
        obbAttr.needsUpdate = true;
      }
      o.obb.visible = showObb;
    });
  }

  document.getElementById('frame').value = String(t);
  document.getElementById('frame-label').textContent = `${t + 1} / ${state.T}`;
}
// ── playback synchronisation ──────────────────────────────────
/**
 * Point the depth / flow / spare video panes at their assets, configure the
 * YouTube embed for the source clip (when the scene names one), and make the
 * depth video drive the 3D frame via its `timeupdate` events.
 *
 * @param {object} scene - Parsed scene.json; reads `fps`, `duration`,
 *   `reconstruction.depth_video`, `reconstruction.flow_video` and the
 *   optional `video_source` ({ youtube_id, start_seconds, end_seconds }).
 */
function setupVideos(scene) {
  const fps = scene.fps;
  const dv = document.getElementById('depth-video');
  const fv = document.getElementById('flow-video');
  const av = document.getElementById('annotated-video');
  const depthUrl = `${ASSET_BASE}/${scene.reconstruction.depth_video}`;
  dv.src = depthUrl;
  fv.src = `${ASSET_BASE}/${scene.reconstruction.flow_video}`;
  // Phase 1: no annotated mp4. Re-use depth.mp4 in the spare pane.
  av.src = depthUrl;

  const yt = document.getElementById('yt-iframe');
  const v = scene.video_source;
  if (v && v.youtube_id) {
    const start = Math.floor(v.start_seconds || 0);
    const end = Math.ceil(v.end_seconds || (start + scene.duration));
    // autoplay=1 + mute=1 → satisfies browser autoplay gate
    // loop=1 + playlist=<id> → YouTube quirk: single-video loops require playlist
    // Playback re-enters at `start` once it reaches `end`.
    const params = new URLSearchParams({
      autoplay: '1', mute: '1', loop: '1', playlist: v.youtube_id,
      start: String(start),
      controls: '1', rel: '0', modestbranding: '1', playsinline: '1',
    });
    // Without end_seconds and duration `end` is NaN; omit it rather than
    // sending end=NaN.
    if (Number.isFinite(end)) params.set('end', String(end));
    // The id comes from fetched JSON, so encode it before placing it in the path.
    yt.src = `https://www.youtube.com/embed/${encodeURIComponent(v.youtube_id)}?${params.toString()}`;
  }

  // Master clock = depth.mp4 (encoded at native fps to match real-time).
  dv.addEventListener('timeupdate', () => {
    // The tiny bias absorbs float error in time × fps (e.g. 0.29 × 100 =
    // 28.999999999999996), which would otherwise land one frame early after
    // the slider seeks to an exact frame time.
    setFrame(Math.floor(dv.currentTime * fps + 1e-6));
  });
}
/**
 * Wire the frame slider and the play/pause button.
 * Dragging the slider seeks the depth and flow videos to the frame's time and
 * shows that frame; the button toggles playback of both videos together.
 */
function setupSlider() {
  const slider = document.getElementById('frame');
  const videos = () =>
    ['depth-video', 'flow-video'].map((id) => document.getElementById(id));

  slider.max = String(state.T - 1);
  slider.addEventListener('input', () => {
    const t = Number.parseInt(slider.value, 10);
    const seconds = t / state.fps;
    // Assigning a non-finite currentTime throws, so skip the seek when fps is
    // 0 or missing and still show the requested frame.
    if (Number.isFinite(seconds)) {
      for (const video of videos()) video.currentTime = seconds;
    }
    setFrame(t);
  });

  document.getElementById('play').addEventListener('click', () => {
    const [dv, fv] = videos();
    if (dv.paused) {
      for (const video of [dv, fv]) {
        // play() returns a promise that rejects when playback is blocked or
        // interrupted; report it instead of leaving the rejection unhandled.
        Promise.resolve(video.play()).catch((err) => {
          console.warn('video play() rejected', err);
        });
      }
    } else {
      dv.pause();
      fv.pause();
    }
  });
}
/**
 * Bind the layer-visibility checkboxes to `state.toggle`.
 * Each checkbox present in the page is initialised from its toggle value;
 * any change copies all checkbox states back into `state.toggle` and redraws
 * the frame currently shown on the slider. Checkboxes missing from the page
 * are skipped and their toggle keeps its default.
 */
function setupGUI() {
  // state.toggle initialised in `state` declaration above.
  const ids = {
    mesh: 't-mesh', skeleton: 't-skeleton', joints: 't-joints',
    objects: 't-objects', obb: 't-obb',
  };
  // Resolve each checkbox once, and only keep those the page provides, so the
  // change handler cannot dereference a missing element.
  const boxes = Object.entries(ids)
    .map(([key, id]) => [key, document.getElementById(id)])
    .filter(([, el]) => el);

  const apply = () => {
    for (const [key, el] of boxes) state.toggle[key] = el.checked;
    setFrame(Number.parseInt(document.getElementById('frame').value, 10) || 0);
  };

  for (const [key, el] of boxes) {
    el.checked = state.toggle[key];
    el.addEventListener('change', apply);
  }
}
/**
 * Render loop: advance the orbit controls (needed for damping), draw the
 * scene, then schedule itself for the next animation frame.
 */
function animate() {
  const { controls, renderer, scene, camera } = state.three;
  controls.update();
  renderer.render(scene, camera);
  requestAnimationFrame(animate);
}
// ── helpers ──────────────────────────────────────────────────
/**
 * Fetch `url` and parse the response body as JSON.
 *
 * @param {string} url - Resource to fetch.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws {Error} `"<url>: <status>"` when the response status is not OK.
 */
async function fetchJSON(url) {
  const response = await fetch(url);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`${url}: ${response.status}`);
}
/**
 * Fetch `url` and return the raw response body.
 *
 * @param {string} url - Resource to fetch.
 * @returns {Promise<ArrayBuffer>} The response bytes.
 * @throws {Error} `"<url>: <status>"` when the response status is not OK.
 */
async function fetchBin(url) {
  const response = await fetch(url);
  if (response.ok) {
    return response.arrayBuffer();
  }
  throw new Error(`${url}: ${response.status}`);
}
// Start the viewer. Any failure during start-up is logged to the console and
// shown in the page subtitle.
main().catch((err) => {
  console.error(err);
  const subtitle = document.getElementById('sub');
  subtitle.textContent = `error: ${err.message}`;
});