| { |
| "source": "results/episode_task_suite/summary_report.json", |
| "scope": { |
| "episode_count": 1, |
| "num_frames": 5821, |
| "num_windows": 1161, |
| "feature_dim": 8546, |
| "window_frames": 20, |
| "stride_frames": 5, |
| "warning": "These walkthroughs explain task contracts on one public sample episode; cross-episode performance requires held-out episodes." |
| }, |
| "shared_pipeline": [ |
| "Read annotation.hdf5 and synchronized video-derived features.", |
| "Slice the episode into 20-frame windows with stride 5.", |
| "Build an 8,546-dimensional aligned feature vector from the synchronized modality groups.", |
| "Construct a task-specific target from labels, future frames, paired windows, or modality splits.", |
| "Train a minimal head and, when enabled, a neural MLP head.", |
| "Write metrics, predictions, and model artifacts for downstream exploration." |
| ], |
| "tasks": { |
| "timeline_action": { |
| "plain_goal": "Look at one short multimodal window and name what action is happening now.", |
| "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.", |
| "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", |
| "middle_modules": [ |
| "Window builder slices the episode into short overlapping windows.", |
| "Feature assembler concatenates all current feature blocks.", |
| "Label builder reads the action annotation for the center of the window.", |
| "Classifier head maps the window vector to one action class.", |
| "Evaluator compares predicted action labels against the ground-truth labels of the held-out chronological segment." |
| ], |
| "output": "A single action class for the current window.", |
| "junior_tip": "This is like asking: given this tiny movie clip plus sensor readings, what is the person doing right now?", |
| "failure_mode": "The one-episode chronological split contains future action classes that were not present in training, so low test macro-F1 is expected.", |
| "display_name": "Action Recognition", |
| "research_name": "Egocentric Action Recognition", |
| "task_family": "supervised", |
| "architecture_family": "multiclass classifier", |
| "primary_direction": "C. Egocentric Vision & Interaction", |
| "card_blurb": "Recognize the current manipulation action from synchronized visual, motion, inertial, pose, and annotation context.", |
| "input_short": "20-frame multimodal window", |
| "process_short": "window features -> action label builder -> classifier", |
| "output_short": "current action class", |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "poster_modality": "video", |
| "task": "timeline_action", |
| "artifact_id": "timeline_action", |
| "metric": { |
| "key": "macro_f1", |
| "name": "macro-F1", |
| "direction": "higher", |
| "minimal": 0.05, |
| "neural_mlp": 0.014814814814814814 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "timeline_subtask": { |
| "plain_goal": "Predict the higher-level task stage for the current window.", |
| "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.", |
| "input": "The same all-modality window vector used by action recognition.", |
| "middle_modules": [ |
| "Window builder creates the current temporal slice.", |
| "Feature assembler keeps all available modality blocks.", |
| "Subtask label builder maps the current timestamp to a subtask annotation.", |
| "Classifier head predicts the subtask class.", |
| "Evaluator reports class-balanced scores so rare subtasks matter." |
| ], |
| "output": "A single subtask label for the current window.", |
| "junior_tip": "Action is the verb; subtask is the chapter of the activity.", |
| "failure_mode": "Single-episode ordering means some later subtasks appear only in test, so this is a pipeline check rather than a general benchmark.", |
| "display_name": "Procedure Step Recognition", |
| "research_name": "Temporal Subtask Recognition", |
| "task_family": "supervised", |
| "architecture_family": "multiclass classifier", |
| "primary_direction": "C. Egocentric Vision & Interaction", |
| "card_blurb": "Recognize the broader activity stage so fine actions become a readable procedure timeline.", |
| "input_short": "20-frame multimodal window", |
| "process_short": "window features -> subtask label builder -> classifier", |
| "output_short": "current procedure step", |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "poster_modality": "language", |
| "task": "timeline_subtask", |
| "artifact_id": "timeline_subtask", |
| "metric": { |
| "key": "macro_f1", |
| "name": "macro-F1", |
| "direction": "higher", |
| "minimal": 0.05056355513846935, |
| "neural_mlp": 0.02810810810810811 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "transition_detection": { |
| "plain_goal": "Detect whether the current window is near a boundary between actions.", |
| "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.", |
| "input": "One all-modality window vector plus labels derived from action-change timestamps.", |
| "middle_modules": [ |
| "Boundary builder scans action labels over time and marks windows near a change.", |
| "Feature assembler supplies all current modality features.", |
| "Binary classifier predicts steady vs boundary.", |
| "Boundary matcher checks whether predicted boundary times are close to true boundary times.", |
| "Evaluator reports macro-F1 and timing error, not just accuracy." |
| ], |
| "output": "A binary label: boundary or steady.", |
| "junior_tip": "This is the model's way of saying: something just changed here.", |
| "failure_mode": "Boundaries are rare, so high accuracy can be misleading if the model predicts steady too often.", |
| "display_name": "Action Boundary Detection", |
| "research_name": "Temporal Action Segmentation", |
| "task_family": "diagnostic", |
| "architecture_family": "binary classifier", |
| "primary_direction": "C. Egocentric Vision & Interaction", |
| "card_blurb": "Detect the local moment where the episode changes from one action segment to the next.", |
| "input_short": "current window with boundary target", |
| "process_short": "action changes -> boundary labels -> binary classifier", |
| "output_short": "boundary or steady", |
| "modalities": [ |
| "video", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "poster_modality": "pose_slam", |
| "task": "transition_detection", |
| "artifact_id": "transition_detection", |
| "metric": { |
| "key": "macro_f1", |
| "name": "macro-F1", |
| "direction": "higher", |
| "minimal": 0.6118237590630229, |
| "neural_mlp": 0.5862068965517241 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "next_action": { |
| "plain_goal": "Use the current window to guess the action that will happen shortly after it.", |
| "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.", |
| "input": "The current all-modality window vector at time t.", |
| "middle_modules": [ |
| "Window builder picks a current time window.", |
| "Future label builder shifts the action target by 20 frames.", |
| "Feature assembler uses only current information, not future features.", |
| "Classifier head predicts the future action class.", |
| "Evaluator checks whether the future action label is correct." |
| ], |
| "output": "A single action class for t+20 frames.", |
| "junior_tip": "This is short-horizon intention prediction: what will the person do next?", |
| "failure_mode": "The public sample has unseen future classes in the chronological test split, which makes this very hard with one episode.", |
| "display_name": "Next-Action Prediction", |
| "research_name": "Short-Horizon Intention Prediction", |
| "task_family": "supervised", |
| "architecture_family": "future-label classifier", |
| "primary_direction": "C. Egocentric Vision & Interaction", |
| "card_blurb": "Forecast the near-future action from the current observations only.", |
| "input_short": "current window at time t", |
| "process_short": "current features -> future label shift -> classifier", |
| "output_short": "action at t+20 frames", |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "poster_modality": "video", |
| "task": "next_action", |
| "artifact_id": "next_action", |
| "metric": { |
| "key": "macro_f1", |
| "name": "macro-F1", |
| "direction": "higher", |
| "minimal": 0.05925925925925927, |
| "neural_mlp": 0.04186046511627907 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "hand_trajectory_forecast": { |
| "plain_goal": "Predict where the hands will move over the next few frames.", |
| "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.", |
| "input": "The current all-modality window vector at time t.", |
| "middle_modules": [ |
| "Window builder chooses the current sensor window.", |
| "Target builder extracts future left/right hand 3D joints from motion capture.", |
| "Regression head predicts a continuous trajectory, not a class label.", |
| "Output reshaper interprets the vector as future frames and joints.", |
| "Evaluator computes MPJPE, the average 3D joint-position error." |
| ], |
| "output": "A future trajectory vector for left and right hand joints.", |
| "junior_tip": "Instead of naming an action, this task draws the next hand path in 3D.", |
| "failure_mode": "It is still a window-level forecast, not a full policy or long-horizon motion generator.", |
| "display_name": "Hand Trajectory Forecasting", |
| "research_name": "3D Hand Motion Forecasting", |
| "task_family": "forecast", |
| "architecture_family": "continuous regressor", |
| "primary_direction": "A. Human Modeling & Motion Understanding", |
| "card_blurb": "Predict the future 3D left/right hand path from the current multimodal state.", |
| "input_short": "current multimodal window", |
| "process_short": "current features -> future mocap target -> regression head", |
| "output_short": "future hand-joint trajectory", |
| "modalities": [ |
| "motion_capture", |
| "video", |
| "depth", |
| "pose_slam", |
| "inertial" |
| ], |
| "poster_modality": "motion_capture", |
| "task": "hand_trajectory_forecast", |
| "artifact_id": "hand_trajectory_forecast", |
| "metric": { |
| "key": "mpjpe", |
| "name": "MPJPE", |
| "direction": "lower", |
| "minimal": 0.8646570444107056, |
| "neural_mlp": 0.10785018652677536 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "contact_prediction": { |
| "plain_goal": "Predict whether the body or hand is in contact with something.", |
| "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.", |
| "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", |
| "middle_modules": [ |
| "Feature selector removes contact-label and caption-label blocks.", |
| "Target builder converts contact annotations into a binary label.", |
| "Binary classifier predicts contact vs no contact.", |
| "Evaluator reports macro-F1 and accuracy.", |
| "Degeneracy checker records whether only one class appears." |
| ], |
| "output": "A binary contact label.", |
| "junior_tip": "This is a simple physical-interaction probe: is the person touching something now?", |
| "failure_mode": "The current public sample is degenerate for this task because one class dominates, so a perfect score does not mean the model learned contact physics.", |
| "display_name": "Contact State Prediction", |
| "research_name": "Human-Object Contact Prediction", |
| "task_family": "supervised", |
| "architecture_family": "binary classifier", |
| "primary_direction": "A. Human Modeling & Motion Understanding", |
| "card_blurb": "Predict whether body or hand contact with the scene is occurring without leaking contact labels.", |
| "input_short": "non-contact, non-caption features", |
| "process_short": "feature filter -> contact target -> binary classifier", |
| "output_short": "contact or no contact", |
| "modalities": [ |
| "motion_capture", |
| "video", |
| "depth", |
| "inertial" |
| ], |
| "poster_modality": "motion_capture", |
| "task": "contact_prediction", |
| "artifact_id": "contact_prediction", |
| "metric": { |
| "key": "macro_f1", |
| "name": "macro-F1", |
| "direction": "higher", |
| "minimal": 1.0, |
| "neural_mlp": 1.0 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "object_relevance": { |
| "plain_goal": "Predict which objects matter in the current window.", |
| "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", |
| "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", |
| "middle_modules": [ |
| "Object vocabulary builder collects object labels from annotations.", |
| "Feature selector removes caption-derived label blocks.", |
| "Multi-label target builder creates a multi-hot object vector.", |
| "Sigmoid heads predict each object's relevance independently.", |
| "Evaluator reports micro-F1 and exact-match quality." |
| ], |
| "output": "A multi-label object set for the current window.", |
| "junior_tip": "A window can involve more than one object, so this is not a one-class classifier.", |
| "failure_mode": "Object labels are sparse and language-derived, so this is currently a weak object-centric probe.", |
| "display_name": "Object Relevance Prediction", |
| "research_name": "Object-Centric Interaction Recognition", |
| "task_family": "supervised", |
| "architecture_family": "multi-label classifier", |
| "primary_direction": "C. Egocentric Vision & Interaction", |
| "card_blurb": "Infer which objects are relevant to the current manipulation window from non-caption features.", |
| "input_short": "non-caption multimodal features", |
| "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", |
| "output_short": "relevant object set", |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "poster_modality": "video", |
| "task": "object_relevance", |
| "artifact_id": "object_relevance", |
| "metric": { |
| "key": "micro_f1", |
| "name": "micro-F1", |
| "direction": "higher", |
| "minimal": 0.18034382095361662, |
| "neural_mlp": 0.1679279279279279 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "caption_grounding": { |
| "plain_goal": "Given a text-like query from annotation, find the matching time window.", |
| "case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.", |
| "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", |
| "middle_modules": [ |
| "Query builder converts annotation words into a compact query representation.", |
| "Candidate builder gathers held-out sensor windows.", |
| "Projection head maps sensor windows into the query space.", |
| "Ranker scores candidates by cosine similarity.", |
| "Evaluator reports MRR and top-k retrieval accuracy." |
| ], |
| "output": "A ranked list of windows, with the correct matching window ideally near rank 1.", |
| "junior_tip": "This is search: type a description, retrieve the matching moment.", |
| "failure_mode": "Bag-of-objects text features are too simple for rich language grounding.", |
| "display_name": "Language Grounding", |
| "research_name": "Language-to-Moment Grounding", |
| "task_family": "retrieval", |
| "architecture_family": "retrieval ranker", |
| "primary_direction": "C. Egocentric Vision & Interaction", |
| "card_blurb": "Retrieve the matching time window for an annotation-derived text query.", |
| "input_short": "text-like query and candidate windows", |
| "process_short": "query features -> candidate index -> cosine ranker", |
| "output_short": "ranked matching moments", |
| "modalities": [ |
| "language", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "poster_modality": "language", |
| "task": "caption_grounding", |
| "artifact_id": "caption_grounding", |
| "metric": { |
| "key": "mrr", |
| "name": "MRR", |
| "direction": "higher", |
| "minimal": 0.016023479050338015, |
| "neural_mlp": 0.01684125567132316 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "cross_modal_retrieval": { |
| "plain_goal": "Use one group of modalities to retrieve the matching window from another group.", |
| "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", |
| "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.", |
| "middle_modules": [ |
| "Feature splitter separates query modalities from target modalities.", |
| "Projection head maps the query vector into target-modality space.", |
| "Candidate index stores target vectors from held-out windows.", |
| "Ranker retrieves nearest candidates by cosine similarity.", |
| "Evaluator reports MRR, top-1, top-5, and top-10 accuracy." |
| ], |
| "output": "A ranked list of candidate depth/video windows.", |
| "junior_tip": "This checks whether different sensors agree about the same moment in time.", |
| "failure_mode": "Good retrieval means useful alignment signal, but it is not yet 3D reconstruction or rendering.", |
| "display_name": "Cross-Modal Retrieval", |
| "research_name": "Multimodal Representation Retrieval", |
| "task_family": "retrieval", |
| "architecture_family": "two-tower retrieval head", |
| "primary_direction": "D. Scene Reconstruction & World Modeling", |
| "card_blurb": "Use motion, IMU, and camera-pose signals to retrieve the matching depth/video window.", |
| "input_short": "motion/IMU/pose query; depth/video candidates", |
| "process_short": "modality split -> projection -> nearest-neighbor ranker", |
| "output_short": "ranked visual windows", |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "poster_modality": "depth", |
| "task": "cross_modal_retrieval", |
| "artifact_id": "cross_modal_retrieval", |
| "metric": { |
| "key": "mrr", |
| "name": "MRR", |
| "direction": "higher", |
| "minimal": 0.26925966892956127, |
| "neural_mlp": 0.1299971898648288 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "modality_reconstruction": { |
| "plain_goal": "Predict one modality feature block from other modality blocks.", |
| "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.", |
| "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", |
| "middle_modules": [ |
| "Feature splitter defines source and target modality blocks.", |
| "Scaler normalizes source and target vectors using train statistics.", |
| "Regression head predicts the target feature vector.", |
| "Inverse scaler returns predictions to target scale.", |
| "Evaluator reports MSE, MAE, and R2." |
| ], |
| "output": "A reconstructed depth/video feature vector.", |
| "junior_tip": "This is feature-level imagination: can the model infer what another sensor would see?", |
| "failure_mode": "This reconstructs compressed features, not raw pixels, depth maps, meshes, NeRFs, or Gaussian splats.", |
| "display_name": "Cross-Modal Reconstruction", |
| "research_name": "Modality Feature Reconstruction", |
| "task_family": "forecast", |
| "architecture_family": "feature regressor", |
| "primary_direction": "B. 3D/4D Reconstruction & Neural Rendering", |
| "card_blurb": "Predict compressed depth/video feature vectors from motion, IMU, and camera-pose features.", |
| "input_short": "motion, IMU, and camera/pose features", |
| "process_short": "source-target split -> scaler -> regression head", |
| "output_short": "reconstructed depth/video vector", |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "poster_modality": "depth", |
| "task": "modality_reconstruction", |
| "artifact_id": "modality_reconstruction", |
| "metric": { |
| "key": "r2", |
| "name": "R2", |
| "direction": "higher", |
| "minimal": -0.015271898913936655, |
| "neural_mlp": -0.010171410134180991 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "temporal_order": { |
| "plain_goal": "Tell whether two nearby windows are in the correct time order.", |
| "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.", |
| "input": "A pair of adjacent window vectors, plus their difference vector.", |
| "middle_modules": [ |
| "Pair builder creates correct-order and reversed-order examples.", |
| "Feature combiner concatenates first window, second window, and their difference.", |
| "Binary classifier predicts correct vs reversed.", |
| "Evaluator reports F1, precision, and recall.", |
| "Diagnostic reader interprets whether features encode local time direction." |
| ], |
| "output": "A binary label: correct order or reversed order.", |
| "junior_tip": "This asks whether the representation knows which moment came first.", |
| "failure_mode": "It only tests local ordering, not long-term planning or causality.", |
| "display_name": "Temporal Order Verification", |
| "research_name": "Temporal Order Verification", |
| "task_family": "diagnostic", |
| "architecture_family": "pairwise classifier", |
| "primary_direction": "D. Scene Reconstruction & World Modeling", |
| "card_blurb": "Tell whether two neighboring windows are in chronological order or reversed.", |
| "input_short": "two adjacent windows plus difference vector", |
| "process_short": "pair builder -> feature combiner -> binary classifier", |
| "output_short": "correct or reversed", |
| "modalities": [ |
| "video", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "poster_modality": "video", |
| "task": "temporal_order", |
| "artifact_id": "temporal_order", |
| "metric": { |
| "key": "f1", |
| "name": "F1", |
| "direction": "higher", |
| "minimal": 0.5399515738498789, |
| "neural_mlp": 0.8520179372197308 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| }, |
| "misalignment_detection": { |
| "plain_goal": "Detect when modalities that should match are shifted out of sync.", |
| "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", |
| "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", |
| "middle_modules": [ |
| "Alignment builder creates positive pairs from the same time window.", |
| "Shift builder creates negative pairs by offsetting one modality group.", |
| "Feature combiner joins both sides into one example.", |
| "Binary classifier predicts aligned vs misaligned.", |
| "Evaluator reports F1 and accuracy." |
| ], |
| "output": "A binary label: aligned or shifted.", |
| "junior_tip": "This is a synchronization alarm for multimodal data.", |
| "failure_mode": "Synthetic shifts are useful diagnostics but do not solve calibration, reconstruction, or mapping by themselves.", |
| "display_name": "Multimodal Synchronization Detection", |
| "research_name": "Cross-Modal Misalignment Detection", |
| "task_family": "diagnostic", |
| "architecture_family": "pairwise classifier", |
| "primary_direction": "B. 3D/4D Reconstruction & Neural Rendering", |
| "card_blurb": "Detect whether motion and visual/depth streams have been artificially shifted out of sync.", |
| "input_short": "motion-side and visual/depth-side feature groups", |
| "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", |
| "output_short": "aligned or shifted", |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "poster_modality": "pose_slam", |
| "task": "misalignment_detection", |
| "artifact_id": "misalignment_detection", |
| "metric": { |
| "key": "f1", |
| "name": "F1", |
| "direction": "higher", |
| "minimal": 0.5051698670605613, |
| "neural_mlp": 0.7152682255845944 |
| }, |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" |
| } |
| } |
| } |
|
|