"use client";

import "./annotations-skin.css";

/**
 * Editor UI for v3.1 language atoms.
 *
 * Three vertical sections:
 *   1. Inline quick-add bar above the timeline (style picker + label + Add).
 *   2. Annotations timeline (in `annotations-timeline.tsx`).
 *   3. Workspace below the timeline:
 *      - Left rail: full atom list grouped by style; click to select.
 *      - Right pane: editor for the selected atom (or empty state).
 *
 * Bbox / keypoint VQA atoms are still added through the canvas overlay's
 * quick-label popup; the inline quick-add covers task_aug / subtask / plan /
 * memory / interjection / speech / count / attribute / spatial.
 */

import React, { useMemo, useState } from "react";
import { useTime } from "../context/time-context";
import { useAnnotations } from "../context/annotations-context";
import {
  buildSpeechAtom,
  classifyVqa,
  isSpeechAtom,
  parseVqaAnswer,
  speechText,
  type LanguageAtom,
} from "../types/language.types";
import {
  exportDataset as apiExport,
  isAnnotateBackendEnabled,
} from "../utils/annotationsClient";

/** Component props. */
interface Props {
  cameraKeys: string[];
}

/** Formats a time in seconds with millisecond precision, e.g. `1.500s`. */
function fmtTime(s: number): string {
  return s.toFixed(3) + "s";
}

function StylePill({ style }: { style: string | null }) {
  const cls = style ?? "speech";
  // NOTE(review): the element markup that consumes `cls` is not visible in
  // this copy of the file; the statement below is reproduced unchanged —
  // confirm against the original JSX before shipping.
  return {style ?? "speech"};
}

/**
 * Highlight a row when its timestamp is within ~half a frame of currentTime.
 *
 * @param ts          Timestamp of the row, in seconds.
 * @param currentTime Current playback time, in seconds.
 * @param fps         Frame rate used to size the half-frame window (default 30).
 */
function isActiveAt(ts: number, currentTime: number, fps = 30): boolean {
  return Math.abs(ts - currentTime) < 0.5 / fps;
}

type QuickAddKind =
  | "task_aug"
  | "subtask"
  | "plan"
  | "memory"
  | "interjection"
  | "speech"
  | "count"
  | "attribute"
  | "spatial";

/** One input of a quick-add form; `name` is the key its value is stored under. */
interface QuickAddField {
  name: string;
  placeholder: string;
  type?: "text" | "number";
  width?: string;
  grow?: boolean;
}

/** Values supplied by the panel when a quick-add is built. */
interface QuickAddBuildCtx {
  ts: number;
  vqaCamera: string | null;
}

interface QuickAddDef {
  kind: QuickAddKind;
  label: string;
  /** When true, the displayed timestamp is 0 (atom is pinned to episode start). */
  atEpisodeStart?: boolean;
  fields: QuickAddField[];
  /**
   * Maps the raw field values to the atoms to add. Returns `null` when a
   * required field is missing or invalid, in which case nothing is added.
   */
  build: (
    values: Record<string, string>,
    ctx: QuickAddBuildCtx,
  ) => LanguageAtom[] | null;
}

/**
 * Normalizes one quick-add field value. Fields the user never touched are
 * absent from the values record, so a missing entry is treated as empty
 * instead of being dereferenced.
 */
function cleanField(value: string | undefined): string {
  return (value ?? "").trim();
}

/** Builds a single camera-agnostic text atom. */
function textAtom(
  role: LanguageAtom["role"],
  style: LanguageAtom["style"],
  timestamp: number,
  content: string,
): LanguageAtom {
  return { role, content, style, timestamp, camera: null, tool_calls: null };
}

/**
 * Builds the question (user) / answer (assistant) atom pair of a VQA
 * quick-add. The answer payload is stored JSON-encoded in `content`.
 */
function vqaPair(
  question: string,
  answer: object,
  { ts, vqaCamera }: QuickAddBuildCtx,
): LanguageAtom[] {
  return [
    {
      role: "user",
      content: question,
      style: "vqa",
      timestamp: ts,
      camera: vqaCamera,
      tool_calls: null,
    },
    {
      role: "assistant",
      content: JSON.stringify(answer),
      style: "vqa",
      timestamp: ts,
      camera: vqaCamera,
      tool_calls: null,
    },
  ];
}

// Each text-style atom kind (and the simpler VQA shapes) is one entry: how
// it appears in the dropdown, what fields the user fills, and how those
// values map to one or two language atoms.
const QUICK_ADD_DEFS: QuickAddDef[] = [
  {
    kind: "task_aug",
    label: "task augmentation",
    atEpisodeStart: true,
    fields: [
      {
        name: "label",
        placeholder: "pick up the blue cube and place it in the green box",
        grow: true,
      },
    ],
    build: ({ label }) => {
      const text = cleanField(label);
      // Task augmentations are always pinned to timestamp 0.
      return text ? [textAtom("user", "task_aug", 0, text)] : null;
    },
  },
  {
    kind: "subtask",
    label: "subtask",
    fields: [
      {
        name: "label",
        placeholder: "grasp the handle of the sponge",
        grow: true,
      },
    ],
    build: ({ label }, { ts }) => {
      const text = cleanField(label);
      return text ? [textAtom("assistant", "subtask", ts, text)] : null;
    },
  },
  {
    kind: "plan",
    label: "plan",
    fields: [
      {
        name: "label",
        placeholder: "1. grab sponge / 2. wipe / 3. tidy",
        grow: true,
      },
    ],
    build: ({ label }, { ts }) => {
      const text = cleanField(label);
      return text ? [textAtom("assistant", "plan", ts, text)] : null;
    },
  },
  {
    kind: "memory",
    label: "memory",
    fields: [
      {
        name: "label",
        placeholder: "sponge picked up; counter still dirty",
        grow: true,
      },
    ],
    build: ({ label }, { ts }) => {
      const text = cleanField(label);
      return text ? [textAtom("assistant", "memory", ts, text)] : null;
    },
  },
  {
    kind: "interjection",
    label: "interjection (user)",
    fields: [
      {
        name: "label",
        placeholder: "user: actually skip the wipe…",
        grow: true,
      },
    ],
    build: ({ label }, { ts }) => {
      const text = cleanField(label);
      return text ? [textAtom("user", "interjection", ts, text)] : null;
    },
  },
  {
    kind: "speech",
    label: "speech (robot say)",
    fields: [
      {
        name: "label",
        placeholder: "robot say: Got it, skipping the wipe.",
        grow: true,
      },
    ],
    build: ({ label }, { ts }) => {
      const text = cleanField(label);
      return text ? [buildSpeechAtom(ts, text)] : null;
    },
  },
  {
    kind: "count",
    label: "vqa: count",
    fields: [
      { name: "label", placeholder: "object label (e.g. cup)", grow: true },
      { name: "count", placeholder: "count", type: "number", width: "80px" },
    ],
    build: ({ label, count }, ctx) => {
      const text = cleanField(label);
      const rawCount = cleanField(count);
      const parsedCount = Number(rawCount);
      // Reject non-numeric input: NaN would be serialized as `null`.
      if (!text || !rawCount || !Number.isFinite(parsedCount)) return null;
      return vqaPair(
        `How many ${text}?`,
        { label: text, count: parsedCount },
        ctx,
      );
    },
  },
  {
    kind: "attribute",
    label: "vqa: attribute",
    fields: [
      { name: "label", placeholder: "label", width: "120px" },
      { name: "attribute", placeholder: "attribute (color)", width: "120px" },
      { name: "value", placeholder: "value (red)", grow: true },
    ],
    build: ({ label, attribute, value }, ctx) => {
      const text = cleanField(label);
      const attr = cleanField(attribute);
      const val = cleanField(value);
      if (!text || !attr || !val) return null;
      return vqaPair(
        `What ${attr} is the ${text}?`,
        { label: text, attribute: attr, value: val },
        ctx,
      );
    },
  },
  {
    kind: "spatial",
    label: "vqa: spatial relation",
    fields: [
      { name: "subject", placeholder: "subject", width: "100px" },
      { name: "relation", placeholder: "relation (right_of)", width: "130px" },
      { name: "object", placeholder: "object", grow: true },
    ],
    build: ({ subject, relation, object }, ctx) => {
      const subj = cleanField(subject);
      const rel = cleanField(relation);
      const obj = cleanField(object);
      if (!subj || !rel || !obj) return null;
      return vqaPair(
        `Where is the ${subj} relative to the ${obj}?`,
        { subject: subj, relation: rel, object: obj },
        ctx,
      );
    },
  },
];

/** Lookup of quick-add definitions keyed by their `kind`. */
const QUICK_ADD_DEFS_BY_KIND: Record<QuickAddKind, QuickAddDef> =
  QUICK_ADD_DEFS.reduce(
    (acc, def) => {
      acc[def.kind] = def;
      return acc;
    },
    {} as Record<QuickAddKind, QuickAddDef>,
  );
/** One section of the atom list rail: its identity, the language column it belongs to, a membership test, and a row-label formatter. */
interface RailGroupDef { key: string; title: string; dotClass: string; // Which v3.1 language column this style is written to. Used to group the // rail under "Persistent" vs "Events" headers so it's clear at a glance // that task_aug / subtask / plan / memory broadcast across the whole // episode (language_persistent) while interjection / speech / vqa fire on // a single frame (language_events). Mirrors columnForStyle() exactly. column: "persistent" | "events"; /** Returns true when the atom belongs in this group; `otherCamera` reports whether an atom is tied to a camera other than the active one. */ match: ( atom: LanguageAtom, otherCamera: (a: LanguageAtom) => boolean, ) => boolean; /** Produces the row text shown for the atom. */ label: ( atom: LanguageAtom, helpers: { activeCamera: string | null; firstLine: (s: string | null) => string; }, ) => string; } /** Rail sections; within each column they are rendered in this order. */ const RAIL_GROUPS: RailGroupDef[] = [ { key: "task_aug", title: "task aug", dotClass: "dot-task-aug", column: "persistent", match: (a) => a.style === "task_aug", label: (a) => a.content || "(empty)", }, { key: "subtask", title: "subtask", dotClass: "dot-subtask", column: "persistent", match: (a) => a.style === "subtask", label: (a) => a.content || "(empty)", }, { key: "plan", title: "plan", dotClass: "dot-plan", column: "persistent", match: (a) => a.style === "plan", label: (a, { firstLine }) => firstLine(a.content), }, { key: "memory", title: "memory", dotClass: "dot-memory", column: "persistent", match: (a) => a.style === "memory", label: (a, { firstLine }) => firstLine(a.content), }, { key: "interjection", title: "interjection", dotClass: "dot-interjection", column: "events", match: (a) => a.style === "interjection", label: (a) => a.content || "(empty)", }, { key: "speech", title: "speech", dotClass: "dot-speech", column: "events", match: (a) => isSpeechAtom(a), label: (a) => speechText(a) || "(empty)", }, { key: "vqa", title: "vqa", dotClass: "dot-vqa", column: "events", match: (a, otherCamera) => a.style === "vqa" && !otherCamera(a), label: (a, { activeCamera }) => { const role = a.role === "user" ? 
"Q" : "A"; const t = a.content || ""; const cameraSuffix = a.camera && a.camera !== activeCamera ? ` [${a.camera}]` : ""; return `${role}: ${t.slice(0, 60)}${t.length > 60 ? "…" : ""}${cameraSuffix}`; }, }, ]; /** Hook returning a memoized callback that seeks to `ts` (source tagged "external") and then pauses playback. */ function useJump(): (ts: number) => void { const { seek, setIsPlaying } = useTime(); return React.useCallback( (ts: number) => { seek(ts, "external"); setIsPlaying(false); }, [seek, setIsPlaying], ); } /** Annotation editor panel: quick-add form, grouped atom list, and the inspector for the selected atom. */ export const AnnotationsPanel: React.FC = ({ cameraKeys }) => { const { atoms, addAtoms, updateAtom, deleteAtom, snap, save, saving, dirty, backendEnabled, activeCamera, setActiveCamera, setDrawMode, selectedIdx, selectAtom, ident, } = useAnnotations(); const { currentTime } = useTime(); // ============ Inline quick-add state ============ const [qaKind, setQaKind] = useState("subtask"); const [qaValues, setQaValues] = useState>({}); const [exportStatus, setExportStatus] = useState(null); const qaDef = QUICK_ADD_DEFS_BY_KIND[qaKind]; // Initialize active camera once cameras arrive. React.useEffect(() => { if (!activeCamera && cameraKeys.length > 0) setActiveCamera(cameraKeys[0]); }, [activeCamera, cameraKeys, setActiveCamera]); // The Annotations tab keeps the canvas overlay in "auto" mode the whole // time — drag = bbox, click = keypoint. React.useEffect(() => { setDrawMode("auto"); return () => setDrawMode("off"); }, [setDrawMode]); // ============ Atom grouping for the rail ============ // The rail shows one section per atom-kind. Each kind is a single config // entry: how to detect atoms in this kind, and how to label them in the row. // VQA filters out other-camera answers when the dataset has multiple // cameras so the rail mirrors the active video. 
const groups = useMemo(() => { const firstLine = (s: string | null) => (s || "").split("\n")[0] || "(empty)"; const otherCamera = (a: LanguageAtom): boolean => !!activeCamera && cameraKeys.length > 1 && a.camera != null && a.camera !== activeCamera; return RAIL_GROUPS.map((def) => { const entries = atoms .map((atom, idx) => ({ atom, idx })) .filter(({ atom }) => def.match(atom, otherCamera)) .map(({ atom, idx }) => ({ atom, idx, label: def.label(atom, { activeCamera, firstLine }), })) .sort((a, b) => a.atom.timestamp - b.atom.timestamp); return { def, entries }; }); }, [atoms, activeCamera, cameraKeys.length]); // ============ Quick-add handler ============ // VQA quick-adds inherit the active camera so per-camera filtering shows // them in the right rail / overlay. Non-VQA atoms stay camera-agnostic // (the def's `build` ignores `vqaCamera` for those). const handleQuickAdd = () => { const ts = snap(currentTime); const vqaCamera = activeCamera ?? cameraKeys[0] ?? null; const newAtoms = qaDef.build(qaValues, { ts, vqaCamera }); if (!newAtoms || !newAtoms.length) return; addAtoms(newAtoms); // Select the freshly added atom (last one added) so the editor opens for it. selectAtom(atoms.length + newAtoms.length - 1); setQaValues({}); }; // ============ Save / export ============ /** Saves the episode through the context's save() and reports the outcome in the status line. */ const handleSave = async () => { const r = await save(); if (!r.ok) { setExportStatus(`Save failed: ${r.error || "unknown"}`); } else { setExportStatus( r.path ? `Saved episode to ${r.path}` : "Saved episode (backend did not report a path — update/restart backend/app.py).", ); } }; /** Exports the dataset through the annotate backend; stops with a hint in the status line when the backend is not enabled. */ const handleSaveDataset = async () => { if (!isAnnotateBackendEnabled()) { setExportStatus( "Backend not configured. 
Set NEXT_PUBLIC_ANNOTATE_BACKEND_URL and run backend/app.py.", ); return; } setExportStatus("Saving dataset…"); try { const r = await apiExport(ident); setExportStatus( `Saved dataset to ${r.output_dir} (persistent: ${r.persistent_rows}, events: ${r.event_rows}).`, ); } catch (e) { setExportStatus( `Save dataset failed: ${e instanceof Error ? e.message : String(e)}`, ); } }; /* Resolve the selection only when selectedIdx is a valid index into atoms; otherwise treat it as no selection. */ const selectedAtom = selectedIdx != null && selectedIdx >= 0 && selectedIdx < atoms.length ? atoms[selectedIdx] : null; // ============ Render ============ return (

Language annotations {dirty && unsaved}

Select an atom from the timeline or list, then edit it in the inspector.

{!backendEnabled && ( backend offline — edits saved to sessionStorage only )}
{exportStatus &&
{exportStatus}
}
Add text annotation

Adds task phrasing, subtask, plan, memory, speech, or non-spatial VQA atoms. Task phrasings are saved at episode start.

t = {qaDef.atEpisodeStart ? fmtTime(0) : fmtTime(currentTime)} {qaDef.fields.map((f, i) => ( setQaValues((v) => ({ ...v, [f.name]: e.target.value })) } onKeyDown={ i === qaDef.fields.length - 1 ? (e) => e.key === "Enter" && handleQuickAdd() : undefined } /> ))}
Annotations

{atoms.length} atoms in this episode

{fmtTime(currentTime)}
{atoms.length === 0 && (
No annotations yet.
Add text above or draw on the active video.
)} {(["persistent", "events"] as const).map((column) => { const colGroups = groups.filter(({ def }) => def.column === column); const total = colGroups.reduce( (n, { entries }) => n + entries.length, 0, ); if (total === 0) return null; return (
{column === "persistent" ? "Persistent" : "Events"} {column === "persistent" ? "language_persistent · broadcast across every frame" : "language_events · fire on a single frame"}
{colGroups.map(({ def, entries }) => ( ))}
); })}
{selectedAtom == null ? (
Inspector

Select an annotation from the list or timeline, or draw a new bbox/keypoint on the video.

) : ( updateAtom(selectedIdx as number, updates)} onDelete={() => deleteAtom(selectedAtom)} /> )}
); }; // --------------------------------------------------------------------------- // Rail group — one row per atom, click selects. // --------------------------------------------------------------------------- const RailGroup: React.FC<{ title: string; dotClass: string; entries: { atom: LanguageAtom; idx: number; label: string }[]; currentTime: number; }> = ({ title, dotClass, entries, currentTime }) => { const { selectedIdx, selectAtom } = useAnnotations(); const jump = useJump(); /* An empty group renders nothing. */ if (entries.length === 0) return null; return (
{title} {entries.length}
{entries.map(({ atom, idx, label }) => { const sel = idx === selectedIdx; const active = isActiveAt(atom.timestamp, currentTime); return (
{ selectAtom(idx); jump(atom.timestamp); }} > {fmtTime(atom.timestamp)} {label}
); })}
); }; // --------------------------------------------------------------------------- // AtomEditor — form for the currently selected atom. // --------------------------------------------------------------------------- const AtomEditor: React.FC<{ atom: LanguageAtom; cameraKeys: string[]; onChange: (updates: Partial) => void; onDelete: () => void; }> = ({ atom, cameraKeys, onChange, onDelete }) => { const jump = useJump(); const { snap } = useAnnotations(); const isSpeech = isSpeechAtom(atom); const cameraLabel = atom.camera ?? "all cameras"; const roleLabel = isSpeech ? "speech" : atom.role; const [timestampDraft, setTimestampDraft] = useState(() => String(atom.timestamp), ); React.useEffect(() => { setTimestampDraft(String(atom.timestamp)); }, [atom.timestamp]); const commitTimestamp = React.useCallback( (raw = timestampDraft) => { const next = Number(raw); if (!Number.isFinite(next) || next < 0) { setTimestampDraft(String(atom.timestamp)); return; } onChange({ timestamp: next }); setTimestampDraft(String(next)); }, [atom.timestamp, onChange, timestampDraft], ); const commitSnappedTimestamp = () => { const parsed = Number(timestampDraft); const next = snap(Number.isFinite(parsed) ? parsed : atom.timestamp); onChange({ timestamp: next }); setTimestampDraft(String(next)); }; return (
{fmtTime(atom.timestamp)} {roleLabel} · {cameraLabel}
setTimestampDraft(e.target.value)} onBlur={() => commitTimestamp()} onKeyDown={(e) => { if (e.key === "Enter") commitTimestamp(); if (e.key === "Escape") setTimestampDraft(String(atom.timestamp)); }} />
{/* Content / role-specific fields */} {(atom.style === "task_aug" || atom.style === "subtask" || atom.style === "plan" || atom.style === "memory" || atom.style === "interjection") && (
{atom.style === "task_aug" || atom.style === "subtask" || atom.style === "interjection" ? (