"use client";

/**
 * Multi-track timeline for v3.1 language atoms — like a video-editing
 * scrubber, but stacked vertically by style and split into two banded
 * sections that mirror the two language columns:
 *
 * PERSISTENT (language_persistent — broadcast across every frame):
 *   - task_aug: task phrasings collapsed into a single full-episode bar
 *               (a rephrasing applies to the whole episode), with a ×N badge
 *               when there is more than one phrasing.
 *   - subtask:  filled spans from each emit time until the next subtask emit
 *               (or episode end). Numbered. Resizable edges; the empty
 *               subtask track also accepts drag-to-create.
 *   - plan:     filled (read-only) spans from each plan emit until the next
 *               plan refresh (or episode end) — a plan is the active state
 *               until superseded, so it reads as a span, not an
 *               instantaneous event.
 *   - memory:   tick marks (state snapshots captured at subtask boundaries).
 *
 * EVENTS (language_events — fire on a single frame):
 *   - interjections + speech: combined event track.
 *   - vqa: event track.
 *
 * Interactions:
 *   - Click a marker → seek + select (the selection is published through
 *     `selectAtom` on the annotations context, which the panel listens to).
 *   - Drag a subtask span's left edge → retime that subtask's start.
 *   - Drag a subtask span's right edge → retime the *next* subtask's start
 *     (since the right edge of subtask[i] *is* the start of subtask[i+1]).
 *   - Drag from an empty area on the subtask track → create a new subtask
 *     span; a label popup appears at the draw end so you can name it.
 *   - Drag the playhead handle (or click anywhere on the track band) → scrub
 *     the video time. Pauses the player while dragging.
 *   - Hover over any marker → custom tooltip shows the atom's content.
*/
import React, { useEffect, useMemo, useRef, useState } from "react";
import { useTime } from "../context/time-context";
import { useAnnotations } from "../context/annotations-context";
import {
  classifyVqa,
  isSpeechAtom,
  parseVqaAnswer,
  type LanguageAtom,
} from "../types/language.types";

/** Width (px) of the lane-label column to the left of the track band. */
const LABEL_WIDTH = 84;
/** Minimum horizontal travel (px) before a press on the subtask lane counts as a drag. */
const DRAG_THRESHOLD_PX = 4;

// `render` controls how a lane draws:
//   "task-aug"  = one collapsed full-episode bar with a ×N badge (task_aug),
//   "span-edit" = resizable spans + drag-to-create (subtask),
//   "span-ro"   = read-only spans (plan),
//   "tick"      = point markers (memory / speech / vqa).
const TRACK_GROUPS = [
  {
    column: "persistent",
    title: "Persistent",
    sub: "language_persistent · broadcast across every frame",
    tracks: [
      // task_aug applies to the whole episode (it's a rephrasing of the task,
      // stored at t0 but persistent across every frame), so it reads as a
      // full-episode span — matching how the annotation pipeline treats it.
      // We collapse all rephrasings into a single full-width bar with a ×N
      // badge; clicking selects the first phrasing and the hover tooltip
      // lists every phrasing.
      {
        key: "task_aug",
        label: "task aug",
        color: "#38bdf8",
        render: "task-aug",
      },
      {
        key: "subtask",
        label: "subtask",
        color: "#ffd21e",
        render: "span-edit",
      },
      { key: "plan", label: "plan", color: "#5b8cff", render: "span-ro" },
      { key: "memory", label: "memory", color: "#b78bff", render: "tick" },
    ],
  },
  {
    column: "events",
    title: "Events",
    sub: "language_events · fire on a single frame",
    tracks: [
      {
        key: "interjection",
        label: "speech",
        color: "#ef5350",
        render: "tick",
      },
      { key: "vqa", label: "vqa", color: "#34d399", render: "tick" },
    ],
  },
] as const;

interface Props {
  /** Episode duration in seconds. */
  duration: number;
}

/** Hover tooltip state; `x`/`y` are derived from the mouse event's clientX/clientY. */
interface Tooltip {
  x: number;
  y: number;
  meta: string;
  text: string;
}

/** In-flight pointer gesture on the timeline. */
interface DragState {
  kind: "edge" | "playhead" | "create";
  /** Index into `atoms` of the subtask whose timestamp is being moved (edge drags only). */
  atomIdx?: number;
  /** Episode-second timestamps captured at drag start (for cancel/clamp). */
  origTs?: number;
  prevTs?: number; // previous subtask's timestamp (lower bound)
  nextTs?: number; // next subtask's timestamp (upper bound, exclusive)
  /** For drag-to-create only. */
  startTs?: number;
  endTs?: number;
}

/** A drawn-but-not-yet-named subtask span awaiting its label. */
interface PendingCreate {
  start: number;
  end: number;
  /** Anchor for the label popup, in viewport px (computed from getBoundingClientRect). */
  anchorX: number;
  anchorY: number;
}

/**
 * Multi-lane annotation timeline with a draggable playhead, resizable subtask
 * spans and drag-to-create on the subtask lane.
 *
 * Renders nothing when `duration` is falsy.
 */
export const AnnotationsTimeline: React.FC<Props> = ({ duration }) => {
  const { atoms, addAtom, updateAtom, snap, selectAtom } = useAnnotations();
  const { currentTime, seek, setIsPlaying } = useTime();
  // NOTE(review): element type assumed to be a <div>; the JSX that attaches
  // this ref is not visible here — confirm.
  const trackBandRef = useRef<HTMLDivElement>(null);
  const [tooltip, setTooltip] = useState<Tooltip | null>(null);
  const [drag, setDrag] = useState<DragState | null>(null);
  const [pendingCreate, setPendingCreate] = useState<PendingCreate | null>(
    null,
  );
  const [createLabel, setCreateLabel] = useState("");

  // Pause + select helper
  const jumpAndSelect = React.useCallback(
    (ts: number, idx: number | null) => {
      seek(ts, "external");
      setIsPlaying(false);
      if (idx != null) selectAtom(idx);
    },
    [seek, setIsPlaying, selectAtom],
  );

  // ============ Lane derivation ============
  const lanes = useMemo(() => {
    type SpanMarker = {
      kind: "span";
      start: number;
      end: number;
      label: string;
      atom: LanguageAtom;
      atomIdx: number; // index of the *start* atom in atoms[]
    };
    type TickMarker = {
      kind: "tick";
      t: number;
      label: string;
      atom: LanguageAtom;
      atomIdx: number;
      subtype?: string;
    };

    // Atoms of one style, sorted by timestamp. The original index into
    // atoms[] is kept so drag operations can call updateAtom(idx, ...).
    const sortedByStyle = (style: string) =>
      atoms
        .map((a, i) => ({ a, i }))
        .filter(({ a }) => a.style === style)
        .sort((x, y) => x.a.timestamp - y.a.timestamp);

    // Each entry becomes a span running from its own timestamp until the next
    // entry's timestamp (or the episode end for the last one).
    const toSpans = (
      entries: ReturnType<typeof sortedByStyle>,
      fallbackLabel: string,
    ): SpanMarker[] =>
      entries.map(({ a, i }, k) => ({
        kind: "span",
        start: a.timestamp,
        end: k + 1 < entries.length ? entries[k + 1].a.timestamp : duration,
        label: a.content || fallbackLabel,
        atom: a,
        atomIdx: i,
      }));

    const subWithIdx = sortedByStyle("subtask");
    const subtask = toSpans(subWithIdx, "");

    // Plans → read-only spans: a plan is the active state from its emit time
    // until the next plan refresh (or episode end), exactly like a subtask
    // span. Rendering it as a span (not a tick) makes its persistent nature
    // visible — it isn't a point-in-time event.
    const plan = toSpans(sortedByStyle("plan"), "plan");

    const task_aug: SpanMarker[] = [];
    const memory: TickMarker[] = [];
    const interjection: TickMarker[] = [];
    const vqa: TickMarker[] = [];

    // Task augmentations → full-episode spans: each is a rephrasing of the
    // task and applies to the whole episode, so it spans [0, duration] rather
    // than sitting as a tick at the start.
    atoms.forEach((a, i) => {
      if (a.style === "task_aug") {
        task_aug.push({
          kind: "span",
          start: 0,
          end: duration,
          label: a.content || "task augmentation",
          atom: a,
          atomIdx: i,
        });
      }
    });

    atoms.forEach((a, i) => {
      if (a.style === "memory") {
        memory.push({
          kind: "tick",
          t: a.timestamp,
          label: a.content || "memory",
          atom: a,
          atomIdx: i,
        });
      } else if (a.style === "interjection" || isSpeechAtom(a)) {
        // User interjections and speech atoms share one lane; `subtype`
        // tells them apart in the tooltip.
        interjection.push({
          kind: "tick",
          t: a.timestamp,
          label: a.style === "interjection" ? a.content || "" : "say(…)",
          atom: a,
          atomIdx: i,
          subtype: a.style === "interjection" ? "user" : "speech",
        });
      } else if (a.style === "vqa" && a.role === "assistant") {
        // Only assistant-role vqa atoms get a tick.
        const parsed = parseVqaAnswer(a.content);
        const kind = parsed ? classifyVqa(parsed) : null;
        vqa.push({
          kind: "tick",
          t: a.timestamp,
          label: kind || "vqa",
          atom: a,
          atomIdx: i,
          subtype: kind || undefined,
        });
      }
    });

    return { task_aug, subtask, plan, memory, interjection, vqa, subWithIdx };
  }, [atoms, duration]);

  // ============ Pixel <-> time mapping ============
  // The full-width track band (no label margin) is `trackBandRef`. Convert
  // a mouse clientX to a 0..duration timestamp. Returns 0 when the band is
  // not mounted, has no width (division would yield NaN), or there is no
  // duration.
  const trackXToTs = (clientX: number): number => {
    const r = trackBandRef.current?.getBoundingClientRect();
    if (!r || !duration || r.width <= 0) return 0;
    const frac = Math.max(0, Math.min(1, (clientX - r.left) / r.width));
    return frac * duration;
  };

  // ============ Event-track click → seek + select ============
  const onTickClick = (e: React.MouseEvent, atomIdx: number, t: number) => {
    e.stopPropagation();
    jumpAndSelect(t, atomIdx);
  };

  // ============ task_aug collapsed-bar click ============
  // All phrasings share t0, so there is no spatial way to disambiguate them
  // on the track — clicking just selects the first one (the full list is
  // shown on hover). The inspector + rail still expose every rewording.
  const onTaskAugClick = (e: React.MouseEvent) => {
    e.stopPropagation();
    const augs = lanes.task_aug;
    if (augs.length === 0) return;
    jumpAndSelect(0, augs[0].atomIdx);
  };

  // ============ Subtask span drag ============
  const onSpanBodyClick = (
    e: React.MouseEvent,
    atomIdx: number,
    start: number,
  ) => {
    if (drag || pendingCreate) return;
    e.stopPropagation();
    jumpAndSelect(start, atomIdx);
  };

  const onEdgeDown = (
    e: React.PointerEvent,
    side: "l" | "r",
    spanK: number,
  ) => {
    e.stopPropagation();
    e.preventDefault();
    const sub = lanes.subWithIdx;
    // Left edge of span k  → moves sub[k] timestamp.
    // Right edge of span k → moves sub[k+1] timestamp (if exists).
    const idxToMove = side === "l" ? spanK : spanK + 1;
    // Bail out before capturing the pointer: with no boundary to move there
    // is no drag to own.
    if (idxToMove < 0 || idxToMove >= sub.length) return;
    (e.target as HTMLElement).setPointerCapture?.(e.pointerId);
    const target = sub[idxToMove];
    const lower = idxToMove > 0 ? sub[idxToMove - 1].a.timestamp : 0;
    const upper =
      idxToMove + 1 < sub.length ? sub[idxToMove + 1].a.timestamp : duration;
    setDrag({
      kind: "edge",
      atomIdx: target.i,
      origTs: target.a.timestamp,
      prevTs: lower,
      nextTs: upper,
    });
  };

  // ============ Drag-to-create new subtask span ============
  const onSubtaskTrackDown = (e: React.PointerEvent) => {
    // Only fire when the pointerdown lands on the track itself, not on a
    // child span/edge (those stop propagation in their own handlers).
    if (drag || pendingCreate) return;
    if (e.button !== 0) return;
    (e.target as HTMLElement).setPointerCapture?.(e.pointerId);
    const ts = snap(trackXToTs(e.clientX));
    setDrag({ kind: "create", startTs: ts, endTs: ts });
  };

  // ============ Playhead drag ============
  const onPlayheadDown = (e: React.PointerEvent) => {
    e.stopPropagation();
    e.preventDefault();
    (e.target as HTMLElement).setPointerCapture?.(e.pointerId);
    setIsPlaying(false);
    setDrag({ kind: "playhead" });
  };

  const onTrackBandClick = (e: React.MouseEvent) => {
    // Clicks anywhere on the track band that bubbled up: seek to that point.
    // NOTE(review): a ruler click that bubbles here repeats the seek the
    // ruler's own handler performs with the same timestamp, so it is
    // harmless; the previous empty `data-role === "ruler"` branch was a no-op.
    if (drag || pendingCreate) return;
    const ts = trackXToTs(e.clientX);
    seek(ts, "external");
    setIsPlaying(false);
  };

  // ============ Global pointermove / pointerup for drag commits ============
  useEffect(() => {
    if (!drag) return;

    const move = (e: PointerEvent) => {
      const ts = trackXToTs(e.clientX);
      if (drag.kind === "playhead") {
        seek(Math.max(0, Math.min(duration, ts)), "external");
        return;
      }
      if (drag.kind === "edge" && drag.atomIdx != null) {
        // Keep the boundary strictly between its neighbours, then snap.
        const lower = drag.prevTs ?? 0;
        const upper = drag.nextTs ?? duration;
        const clamped = Math.max(lower + 0.001, Math.min(upper - 0.001, ts));
        updateAtom(drag.atomIdx, { timestamp: snap(clamped) });
        return;
      }
      if (drag.kind === "create") {
        const snapped = snap(Math.max(0, Math.min(duration, ts)));
        setDrag((d) => (d ? { ...d, endTs: snapped } : d));
      }
    };

    const up = () => {
      if (drag.kind === "create") {
        const a = Math.min(drag.startTs ?? 0, drag.endTs ?? 0);
        const b = Math.max(drag.startTs ?? 0, drag.endTs ?? 0);
        const safeDuration = Math.max(0.001, duration);
        const r = trackBandRef.current?.getBoundingClientRect();
        // Need at least a few px of drag to count, otherwise treat as click.
        const distPx = (Math.abs(b - a) / safeDuration) * (r?.width ?? 1);
        if (distPx >= DRAG_THRESHOLD_PX) {
          // Anchor the label popup at the upper-right of the new span.
          if (r) {
            setPendingCreate({
              start: a,
              end: b,
              anchorX: r.left + (b / safeDuration) * r.width + 4,
              anchorY: r.top - 8,
            });
          }
        } else {
          // Tap, not drag — treat as a seek to that point.
          seek(a, "external");
          setIsPlaying(false);
        }
      }
      // Edge drags were already applied in `move`; nothing to commit.
      // Pointer capture is released implicitly once the pointer goes up.
      setDrag(null);
    };

    // The gesture was aborted (e.g. the browser took it over): undo an
    // in-flight edge retime using the timestamp captured at drag start and
    // drop the drag without opening the label popup.
    const cancel = () => {
      if (
        drag.kind === "edge" &&
        drag.atomIdx != null &&
        drag.origTs != null
      ) {
        updateAtom(drag.atomIdx, { timestamp: drag.origTs });
      }
      setDrag(null);
    };

    window.addEventListener("pointermove", move);
    window.addEventListener("pointerup", up);
    window.addEventListener("pointercancel", cancel);
    return () => {
      window.removeEventListener("pointermove", move);
      window.removeEventListener("pointerup", up);
      window.removeEventListener("pointercancel", cancel);
    };
  }, [drag, duration, seek, setIsPlaying, snap, updateAtom]);

  // ============ Tooltip helpers ============
  const showTip = (e: React.MouseEvent, meta: string, text: string) => {
    setTooltip({
      x: e.clientX + 12,
      y: e.clientY + 12,
      meta,
      text,
    });
  };
  const moveTip = (e: React.MouseEvent) => {
    setTooltip((t) => (t ? { ...t, x: e.clientX + 12, y: e.clientY + 12 } : t));
  };
  const hideTip = () => setTooltip(null);

  // ============ Pending-create label popup commit ============
  const commitPendingCreate = () => {
    if (!pendingCreate) return;
    const text = createLabel.trim();
    if (!text) {
      // An empty label discards the drawn span.
      setPendingCreate(null);
      setCreateLabel("");
      return;
    }
    addAtom({
      role: "assistant",
      content: text,
      style: "subtask",
      timestamp: snap(pendingCreate.start),
      camera: null,
      tool_calls: null,
    });
    // The next subtask boundary is implicit (next sibling's timestamp); if
    // the user wants a different end they can drag the right edge afterwards.
    setPendingCreate(null);
    setCreateLabel("");
  };
  const cancelPendingCreate = () => {
    setPendingCreate(null);
    setCreateLabel("");
  };

  // ============ Render ============
  if (!duration) return null;

  return (
Annotations timeline {currentTime.toFixed(2)}s / {duration.toFixed(2)}s
{/* Time-axis ruler — clicking it scrubs */}
{ const ts = trackXToTs(e.clientX); seek(ts, "external"); setIsPlaying(false); }} > {Array.from({ length: Math.floor(duration / 5) + 1 }).map((_, i) => { const t = i * 5; const left = (t / duration) * 100; return (
{t}s
); })}
{/* Tracks, grouped into Persistent / Events sections that mirror the two language columns. The whole region is position:relative so the playhead can span its full height via top/bottom (no brittle per-track pixel math that section headers would throw off). The playhead's x uses calc() to start at the track band's left edge (after the LABEL_WIDTH label column + 10px gap). */} {(() => { const bandLeft = `${LABEL_WIDTH + 10}px`; const playheadLeft = `calc(${bandLeft} + ${ duration ? currentTime / duration : 0 } * (100% - ${bandLeft}))`; return (
{TRACK_GROUPS.map((group) => (
{group.title} {group.sub}
{group.tracks.map((tk) => (
{tk.label}
{/* Editable subtask spans (resize + drag-to-create) */} {tk.render === "span-edit" && lanes.subtask.map((s, k) => { const left = (s.start / duration) * 100; const width = Math.max( 0.3, ((s.end - s.start) / duration) * 100, ); return (
onSpanBodyClick(e, s.atomIdx, s.start) } onMouseEnter={(e) => showTip( e, `subtask · ${s.start.toFixed(2)}s → ${s.end.toFixed(2)}s`, s.label, ) } onMouseMove={moveTip} onMouseLeave={hideTip} > {k} {s.label}
onEdgeDown(e, "l", k)} /> {k + 1 < lanes.subtask.length && (
onEdgeDown(e, "r", k)} /> )}
); })} {/* Drag-to-create preview rectangle (subtask only) */} {tk.render === "span-edit" && drag?.kind === "create" && (
)} {/* Collapsed task-augmentation bar: one full-width bar (rephrasings carry no temporal info), with a ×N badge when there is more than one. Hovering lists every phrasing in the tooltip. NOTE(review): `onTaskAugClick` only selects the first phrasing — no rewordings popover is visible in this file; confirm before relying on one. */} {tk.render === "task-aug" && lanes.task_aug.length > 0 && (() => { const augs = lanes.task_aug; const primary = augs[0]; const count = augs.length; return (
showTip( e, `task aug · ${count} phrasing${count > 1 ? "s" : ""}`, count > 1 ? augs.map((s) => `• ${s.label}`).join("\n") : primary.label, ) } onMouseMove={moveTip} onMouseLeave={hideTip} > {primary.label} {count > 1 && ( ×{count} )}
); })()} {/* Read-only persistent spans (plan is active until its next refresh). Click seeks + selects; no resize. */} {tk.render === "span-ro" && ( lanes[tk.key as "plan"] as Array<{ kind: "span"; start: number; end: number; label: string; atom: LanguageAtom; atomIdx: number; }> ).map((s, k) => { const left = (s.start / duration) * 100; const width = Math.max( 0.3, ((s.end - s.start) / duration) * 100, ); return (
onSpanBodyClick(e, s.atomIdx, s.start) } onMouseEnter={(e) => showTip( e, `${tk.label} · ${s.start.toFixed(2)}s → ${s.end.toFixed(2)}s`, s.label, ) } onMouseMove={moveTip} onMouseLeave={hideTip} > {s.label}
); })} {/* Point-in-time tick markers (memory / interjection + speech / vqa). task_aug is not a tick lane: it is drawn by the "task-aug" branch as a collapsed bar. */} {tk.render === "tick" && ( lanes[ tk.key as "memory" | "interjection" | "vqa" ] as Array<{ kind: "tick"; t: number; label: string; atom: LanguageAtom; atomIdx: number; subtype?: string; }> ).map((m, i) => { const left = (m.t / duration) * 100; return (
onTickClick(e, m.atomIdx, m.t)} onMouseEnter={(e) => showTip( e, `${tk.label}${m.subtype ? ` · ${m.subtype}` : ""} · ${m.t.toFixed(3)}s`, m.label, ) } onMouseMove={moveTip} onMouseLeave={hideTip} /> ); })}
))}
))} {/* Playhead — spans the full tracks region via top/bottom. */}
); })()} {/* Tooltip */} {tooltip && (
{tooltip.meta}
{tooltip.text}
)} {/* Drag-to-create label popup */} {pendingCreate && (
subtask {pendingCreate.start.toFixed(2)}s → {pendingCreate.end.toFixed(2)} s
setCreateLabel(e.target.value)} onKeyDown={(e) => { if (e.key === "Enter") commitPendingCreate(); if (e.key === "Escape") cancelPendingCreate(); }} />
)}
); };