{ "baseline_summary": { "baseline_heads": "minimal and neural MLP heads", "current_use": "task design, data-contract validation, case studies, and baseline comparison", "split": "chronological single-episode split for public-sample diagnostics", "task_count": 12 }, "directions": [ { "code": "A", "counts": { "diagnostic": 0, "direct": 2, "proxy": 2, "total_links": 4 }, "current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.", "current_status": "partially implemented", "extension_tasks": [ { "current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior.", "family": "classification", "id": "body_motion_intensity", "metric_name": "macro-F1", "name": "Body and Hand Motion Intensity" } ], "focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.", "id": "human_motion", "name": "Human Modeling & Motion Understanding", "next_steps": [ "Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.", "Train sequence models over multi-episode motion trajectories instead of isolated windows.", "Evaluate affordance prediction on held-out objects and held-out episodes." 
], "preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.", "task_ids": [ "timeline_action", "hand_trajectory_forecast", "contact_prediction", "object_relevance" ], "tasks": [ { "architecture_family": "multiclass classifier", "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.", "current_limit": "Chronological single-episode split creates unseen future action classes.", "direction_roles": { "A": "proxy", "C": "direct" }, "display_name": "Action Recognition", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "timeline_action", "input": "One 
20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", "input_short": "20-frame multimodal window", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.05, "name": "macro-F1", "neural_mlp": 0.0148 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "current action class", "primary_direction": "C", "process_short": "window features -> action label builder -> classifier", "research_name": "Egocentric Action Recognition", "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout." }, { "architecture_family": "continuous regressor", "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.", "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.", "direction_roles": { "A": "direct", "C": "proxy" }, "display_name": "Hand Trajectory Forecasting", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", "label": "Neural metrics" } ], "family": "forecast", "id": "hand_trajectory_forecast", "input": "The current all-modality window vector at time t.", "input_short": "current multimodal window", "metric": { "better_baseline": "neural_mlp", "direction": 
"lower", "key": "mpjpe", "minimal": 0.8647, "name": "MPJPE", "neural_mlp": 0.1079 }, "modalities": [ "motion_capture", "video", "depth", "pose_slam", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "future hand-joint trajectory", "primary_direction": "A", "process_short": "current features -> future mocap target -> regression head", "research_name": "3D Hand Motion Forecasting", "why": "Directly predicts human hand motion and supports hand-object interaction modeling." }, { "architecture_family": "binary classifier", "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.", "current_limit": "The public sample is degenerate for this target because one class dominates.", "direction_roles": { "A": "direct", "C": "proxy" }, "display_name": "Contact State Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv", 
"label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "contact_prediction", "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", "input_short": "non-contact, non-caption features", "metric": { "better_baseline": "tie", "direction": "higher", "key": "macro_f1", "minimal": 1.0, "name": "macro-F1", "neural_mlp": 1.0 }, "modalities": [ "motion_capture", "video", "depth", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "contact or no contact", "primary_direction": "A", "process_short": "feature filter -> contact target -> binary classifier", "research_name": "Human-Object Contact Prediction", "why": "Targets physical interaction state, a core affordance and manipulation signal." 
}, { "architecture_family": "multi-label classifier", "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", "current_limit": "Object labels are language-derived and sparse in one episode.", "direction_roles": { "A": "proxy", "C": "direct", "D": "proxy" }, "display_name": "Object Relevance Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv", "label": "Neural predictions" } ], "family": "supervised", "id": "object_relevance", "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", "input_short": "non-caption multimodal features", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "micro_f1", "minimal": 0.1803, "name": "micro-F1", "neural_mlp": 0.1679 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "relevant object set", "primary_direction": "C", "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", 
"research_name": "Object-Centric Interaction Recognition", "why": "Connects egocentric activity to manipulated objects and early object-centric state." } ] }, { "code": "B", "counts": { "diagnostic": 1, "direct": 0, "proxy": 2, "total_links": 3 }, "current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.", "current_status": "proxy tasks only", "extension_tasks": [ { "current_limit": "This checks calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis.", "family": "retrieval", "id": "multi_view_consistency_retrieval", "metric_name": "MRR", "name": "Multi-View Consistency Retrieval" } ], "focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.", "id": "reconstruction_rendering", "name": "3D/4D Reconstruction & Neural Rendering", "next_steps": [ "Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.", "Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.", "Evaluate novel-view synthesis and temporal consistency across held-out views/time." 
], "preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.", "task_ids": [ "cross_modal_retrieval", "modality_reconstruction", "misalignment_detection" ], "tasks": [ { "architecture_family": "two-tower retrieval head", "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", "direction_roles": { "B": "proxy", "C": "diagnostic", "D": "proxy" }, "display_name": "Cross-Modal Retrieval", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", "label": "Neural metrics" } ], "family": "retrieval", "id": "cross_modal_retrieval", "input": "Query side: motion, IMU, and camera/pose features. 
Candidate side: depth and video features.", "input_short": "motion/IMU/pose query; depth/video candidates", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "mrr", "minimal": 0.2693, "name": "MRR", "neural_mlp": 0.13 }, "modalities": [ "motion_capture", "inertial", "pose_slam", "depth", "video" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "ranked visual windows", "primary_direction": "C", "process_short": "modality split -> projection -> nearest-neighbor ranker", "research_name": "Multimodal Representation Retrieval", "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling." }, { "architecture_family": "feature regressor", "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.", "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.", "direction_roles": { "B": "proxy", "D": "proxy" }, "display_name": "Cross-Modal Reconstruction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", "label": "Neural metrics" } ], "family": "forecast", "id": "modality_reconstruction", "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", "input_short": "motion, IMU, and camera/pose features", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "r2", 
"minimal": -0.0153, "name": "R2", "neural_mlp": -0.0102 }, "modalities": [ "motion_capture", "inertial", "pose_slam", "depth", "video" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "reconstructed depth/video vector", "primary_direction": "B", "process_short": "source-target split -> scaler -> regression head", "research_name": "Modality Feature Reconstruction", "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective." }, { "architecture_family": "pairwise classifier", "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", "direction_roles": { "B": "diagnostic", "C": "diagnostic", "D": "diagnostic" }, "display_name": "Multimodal Synchronization Detection", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv", "label": "Neural predictions" } ], "family": "diagnostic", "id": "misalignment_detection", "input": "A motion-side 
feature group and a visual/depth-side feature group, either aligned or artificially shifted.", "input_short": "motion-side and visual/depth-side feature groups", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "f1", "minimal": 0.5052, "name": "F1", "neural_mlp": 0.7153 }, "modalities": [ "motion_capture", "inertial", "video", "depth", "pose_slam" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "aligned or shifted", "primary_direction": "C", "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", "research_name": "Cross-Modal Misalignment Detection", "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models." } ] }, { "code": "C", "counts": { "diagnostic": 3, "direct": 6, "proxy": 2, "total_links": 11 }, "current_readout": "Eleven of the 12 tasks link to this direction, six of them directly; together they cover egocentric action, task state, interaction, grounding, and alignment.", "current_status": "strongest implemented track", "extension_tasks": [ { "current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks.", "family": "regression", "id": "action_phase_progress", "metric_name": "MAE", "name": "Action Phase Progress Estimation" } ], "focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.", "id": "egocentric_interaction", "name": "Egocentric Vision & Interaction", "next_steps": [ "Move from single-episode chronological splits to held-out-episode splits.", "Use the extracted AAC audio block with stronger multimodal backbones for action, intent, and grounding.", "Evaluate long-horizon task success prediction and action-conditioned generation."
], "preferred_background": "Video understanding, action recognition, or egocentric vision.", "task_ids": [ "timeline_action", "timeline_subtask", "transition_detection", "next_action", "hand_trajectory_forecast", "contact_prediction", "object_relevance", "caption_grounding", "cross_modal_retrieval", "temporal_order", "misalignment_detection" ], "tasks": [ { "architecture_family": "multiclass classifier", "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.", "current_limit": "Chronological single-episode split creates unseen future action classes.", "direction_roles": { "A": "proxy", "C": "direct" }, "display_name": "Action Recognition", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": 
"https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "timeline_action", "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", "input_short": "20-frame multimodal window", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.05, "name": "macro-F1", "neural_mlp": 0.0148 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "current action class", "primary_direction": "C", "process_short": "window features -> action label builder -> classifier", "research_name": "Egocentric Action Recognition", "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout." }, { "architecture_family": "multiclass classifier", "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. 
The model predicts that broader stage instead of a fine action.", "current_limit": "Single-episode ordering makes future subtasks hard to generalize.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Procedure Step Recognition", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "timeline_subtask", "input": "The same all-modality window vector used by action recognition.", "input_short": "20-frame multimodal window", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.0506, "name": "macro-F1", "neural_mlp": 0.0281 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target 
builder -> baseline head -> evaluator -> artifact files", "output_short": "current procedure step", "primary_direction": "C", "process_short": "window features -> subtask label builder -> classifier", "research_name": "Temporal Subtask Recognition", "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state." }, { "architecture_family": "binary classifier", "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.", "current_limit": "Boundary class is sparse, so accuracy alone is misleading.", "direction_roles": { "C": "direct", "D": "diagnostic" }, "display_name": "Action Boundary Detection", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv", 
"label": "Neural confusion matrix" } ], "family": "diagnostic", "id": "transition_detection", "input": "One all-modality window vector plus labels derived from action-change timestamps.", "input_short": "current window with boundary target", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.6118, "name": "macro-F1", "neural_mlp": 0.5862 }, "modalities": [ "video", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "boundary or steady", "primary_direction": "C", "process_short": "action changes -> boundary labels -> binary classifier", "research_name": "Temporal Action Segmentation", "why": "Localizes egocentric task boundaries and diagnoses temporal state changes." }, { "architecture_family": "future-label classifier", "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.", "current_limit": "Unseen future labels dominate the single-episode chronological test.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Next-Action Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv", "label": "Minimal predictions" }, { "href": 
"https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "next_action", "input": "The current all-modality window vector at time t.", "input_short": "current window at time t", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.0593, "name": "macro-F1", "neural_mlp": 0.0419 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "action at t+20 frames", "primary_direction": "C", "process_short": "current features -> future label shift -> classifier", "research_name": "Short-Horizon Intention Prediction", "why": "Tests action intention/task-flow prediction from egocentric context." 
}, { "architecture_family": "continuous regressor", "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.", "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.", "direction_roles": { "A": "direct", "C": "proxy" }, "display_name": "Hand Trajectory Forecasting", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", "label": "Neural metrics" } ], "family": "forecast", "id": "hand_trajectory_forecast", "input": "The current all-modality window vector at time t.", "input_short": "current multimodal window", "metric": { "better_baseline": "neural_mlp", "direction": "lower", "key": "mpjpe", "minimal": 0.8647, "name": "MPJPE", "neural_mlp": 0.1079 }, "modalities": [ "motion_capture", "video", "depth", "pose_slam", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "future hand-joint trajectory", "primary_direction": "A", "process_short": "current features -> future mocap target -> regression head", "research_name": "3D Hand Motion Forecasting", "why": "Directly predicts human hand motion and supports hand-object interaction modeling." }, { "architecture_family": "binary classifier", "case_study": "During manipulation, the hand may touch a cup, table, or bottle. 
The task asks whether any contact is happening.", "current_limit": "The public sample is degenerate for this target because one class dominates.", "direction_roles": { "A": "direct", "C": "proxy" }, "display_name": "Contact State Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "contact_prediction", "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", "input_short": "non-contact, non-caption features", "metric": { "better_baseline": "tie", "direction": "higher", "key": "macro_f1", "minimal": 1.0, "name": "macro-F1", "neural_mlp": 1.0 }, "modalities": [ "motion_capture", "video", "depth", "inertial" ], "module_summary": "input window 
-> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "contact or no contact", "primary_direction": "A", "process_short": "feature filter -> contact target -> binary classifier", "research_name": "Human-Object Contact Prediction", "why": "Targets physical interaction state, a core affordance and manipulation signal." }, { "architecture_family": "multi-label classifier", "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", "current_limit": "Object labels are language-derived and sparse in one episode.", "direction_roles": { "A": "proxy", "C": "direct", "D": "proxy" }, "display_name": "Object Relevance Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv", "label": "Neural predictions" } ], "family": "supervised", "id": "object_relevance", "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", "input_short": "non-caption multimodal features", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "micro_f1", "minimal": 0.1803, "name": "micro-F1", 
"neural_mlp": 0.1679 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "relevant object set", "primary_direction": "C", "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", "research_name": "Object-Centric Interaction Recognition", "why": "Connects egocentric activity to manipulated objects and early object-centric state." }, { "architecture_family": "retrieval ranker", "case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.", "current_limit": "Bag-of-objects language features are too weak for rich grounding.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Language Grounding", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", "label": "Neural metrics" } ], "family": "retrieval", "id": "caption_grounding", "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", "input_short": "text-like query and candidate windows", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "mrr", "minimal": 0.016, "name": "MRR", "neural_mlp": 0.0168 }, "modalities": [ "language", "video", "depth", "pose_slam" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "ranked matching moments", "primary_direction": "C", "process_short": 
"query features -> candidate index -> cosine ranker", "research_name": "Language-to-Moment Grounding", "why": "Grounds language annotation into egocentric sensor time and task state." }, { "architecture_family": "two-tower retrieval head", "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", "direction_roles": { "B": "proxy", "C": "diagnostic", "D": "proxy" }, "display_name": "Cross-Modal Retrieval", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", "label": "Neural metrics" } ], "family": "retrieval", "id": "cross_modal_retrieval", "input": "Query side: motion, IMU, and camera/pose features. 
Candidate side: depth and video features.", "input_short": "motion/IMU/pose query; depth/video candidates", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "mrr", "minimal": 0.2693, "name": "MRR", "neural_mlp": 0.13 }, "modalities": [ "motion_capture", "inertial", "pose_slam", "depth", "video" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "ranked visual windows", "primary_direction": "C", "process_short": "modality split -> projection -> nearest-neighbor ranker", "research_name": "Multimodal Representation Retrieval", "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling." }, { "architecture_family": "pairwise classifier", "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.", "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.", "direction_roles": { "C": "diagnostic", "D": "diagnostic" }, "display_name": "Temporal Order Verification", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv", "label": 
"Neural predictions" } ], "family": "diagnostic", "id": "temporal_order", "input": "A pair of adjacent window vectors, plus their difference vector.", "input_short": "two adjacent windows plus difference vector", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "f1", "minimal": 0.54, "name": "F1", "neural_mlp": 0.852 }, "modalities": [ "video", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "correct or reversed", "primary_direction": "C", "process_short": "pair builder -> feature combiner -> binary classifier", "research_name": "Temporal Order Verification", "why": "Checks whether features encode local time direction and task progression." }, { "architecture_family": "pairwise classifier", "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", "direction_roles": { "B": "diagnostic", "C": "diagnostic", "D": "diagnostic" }, "display_name": "Multimodal Synchronization Detection", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv", "label": "Minimal predictions" }, { "href": 
"https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv", "label": "Neural predictions" } ], "family": "diagnostic", "id": "misalignment_detection", "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", "input_short": "motion-side and visual/depth-side feature groups", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "f1", "minimal": 0.5052, "name": "F1", "neural_mlp": 0.7153 }, "modalities": [ "motion_capture", "inertial", "video", "depth", "pose_slam" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "aligned or shifted", "primary_direction": "C", "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", "research_name": "Cross-Modal Misalignment Detection", "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models." 
} ] }, { "code": "D", "counts": { "diagnostic": 3, "direct": 0, "proxy": 6, "total_links": 9 }, "current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.", "current_status": "early proxy tasks", "extension_tasks": [ { "current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model.", "family": "forecast", "id": "ego_motion_forecast", "metric_name": "MAE", "name": "Short-Horizon Ego-Motion Forecasting" } ], "focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.", "id": "world_modeling", "name": "Scene Reconstruction & World Modeling", "next_steps": [ "Convert windows into persistent object/scene-state nodes with timestamps and camera poses.", "Add map consistency, object permanence, and spatial relation prediction tasks.", "Train held-out-episode world models that predict future observations and task state." ], "preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.", "task_ids": [ "timeline_subtask", "transition_detection", "next_action", "object_relevance", "caption_grounding", "cross_modal_retrieval", "modality_reconstruction", "temporal_order", "misalignment_detection" ], "tasks": [ { "architecture_family": "multiclass classifier", "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. 
The model predicts that broader stage instead of a fine action.", "current_limit": "Single-episode ordering makes future subtasks hard to generalize.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Procedure Step Recognition", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "timeline_subtask", "input": "The same all-modality window vector used by action recognition.", "input_short": "20-frame multimodal window", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.0506, "name": "macro-F1", "neural_mlp": 0.0281 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target 
builder -> baseline head -> evaluator -> artifact files", "output_short": "current procedure step", "primary_direction": "C", "process_short": "window features -> subtask label builder -> classifier", "research_name": "Temporal Subtask Recognition", "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state." }, { "architecture_family": "binary classifier", "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.", "current_limit": "Boundary class is sparse, so accuracy alone is misleading.", "direction_roles": { "C": "direct", "D": "diagnostic" }, "display_name": "Action Boundary Detection", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv", 
"label": "Neural confusion matrix" } ], "family": "diagnostic", "id": "transition_detection", "input": "One all-modality window vector plus labels derived from action-change timestamps.", "input_short": "current window with boundary target", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.6118, "name": "macro-F1", "neural_mlp": 0.5862 }, "modalities": [ "video", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "boundary or steady", "primary_direction": "C", "process_short": "action changes -> boundary labels -> binary classifier", "research_name": "Temporal Action Segmentation", "why": "Localizes egocentric task boundaries and diagnoses temporal state changes." }, { "architecture_family": "future-label classifier", "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.", "current_limit": "Unseen future labels dominate the single-episode chronological test.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Next-Action Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv", "label": "Minimal predictions" }, { "href": 
"https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "next_action", "input": "The current all-modality window vector at time t.", "input_short": "current window at time t", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.0593, "name": "macro-F1", "neural_mlp": 0.0419 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "action at t+20 frames", "primary_direction": "C", "process_short": "current features -> future label shift -> classifier", "research_name": "Short-Horizon Intention Prediction", "why": "Tests action intention/task-flow prediction from egocentric context." 
}, { "architecture_family": "multi-label classifier", "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", "current_limit": "Object labels are language-derived and sparse in one episode.", "direction_roles": { "A": "proxy", "C": "direct", "D": "proxy" }, "display_name": "Object Relevance Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv", "label": "Neural predictions" } ], "family": "supervised", "id": "object_relevance", "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", "input_short": "non-caption multimodal features", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "micro_f1", "minimal": 0.1803, "name": "micro-F1", "neural_mlp": 0.1679 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "relevant object set", "primary_direction": "C", "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", 
"research_name": "Object-Centric Interaction Recognition", "why": "Connects egocentric activity to manipulated objects and early object-centric state." }, { "architecture_family": "retrieval ranker", "case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.", "current_limit": "Bag-of-objects language features are too weak for rich grounding.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Language Grounding", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", "label": "Neural metrics" } ], "family": "retrieval", "id": "caption_grounding", "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", "input_short": "text-like query and candidate windows", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "mrr", "minimal": 0.016, "name": "MRR", "neural_mlp": 0.0168 }, "modalities": [ "language", "video", "depth", "pose_slam" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "ranked matching moments", "primary_direction": "C", "process_short": "query features -> candidate index -> cosine ranker", "research_name": "Language-to-Moment Grounding", "why": "Grounds language annotation into egocentric sensor time and task state." 
}, { "architecture_family": "two-tower retrieval head", "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", "direction_roles": { "B": "proxy", "C": "diagnostic", "D": "proxy" }, "display_name": "Cross-Modal Retrieval", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", "label": "Neural metrics" } ], "family": "retrieval", "id": "cross_modal_retrieval", "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.", "input_short": "motion/IMU/pose query; depth/video candidates", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "mrr", "minimal": 0.2693, "name": "MRR", "neural_mlp": 0.13 }, "modalities": [ "motion_capture", "inertial", "pose_slam", "depth", "video" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "ranked visual windows", "primary_direction": "C", "process_short": "modality split -> projection -> nearest-neighbor ranker", "research_name": "Multimodal Representation Retrieval", "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling." 
}, { "architecture_family": "feature regressor", "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.", "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.", "direction_roles": { "B": "proxy", "D": "proxy" }, "display_name": "Cross-Modal Reconstruction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", "label": "Neural metrics" } ], "family": "forecast", "id": "modality_reconstruction", "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", "input_short": "motion, IMU, and camera/pose features", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "r2", "minimal": -0.0153, "name": "R2", "neural_mlp": -0.0102 }, "modalities": [ "motion_capture", "inertial", "pose_slam", "depth", "video" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "reconstructed depth/video vector", "primary_direction": "B", "process_short": "source-target split -> scaler -> regression head", "research_name": "Modality Feature Reconstruction", "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective." 
}, { "architecture_family": "pairwise classifier", "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.", "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.", "direction_roles": { "C": "diagnostic", "D": "diagnostic" }, "display_name": "Temporal Order Verification", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv", "label": "Neural predictions" } ], "family": "diagnostic", "id": "temporal_order", "input": "A pair of adjacent window vectors, plus their difference vector.", "input_short": "two adjacent windows plus difference vector", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "f1", "minimal": 0.54, "name": "F1", "neural_mlp": 0.852 }, "modalities": [ "video", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "correct or reversed", "primary_direction": "C", "process_short": "pair builder -> feature combiner -> binary classifier", "research_name": "Temporal Order Verification", "why": "Checks whether features encode local 
time direction and task progression." }, { "architecture_family": "pairwise classifier", "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", "direction_roles": { "B": "diagnostic", "C": "diagnostic", "D": "diagnostic" }, "display_name": "Multimodal Synchronization Detection", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv", "label": "Neural predictions" } ], "family": "diagnostic", "id": "misalignment_detection", "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", "input_short": "motion-side and visual/depth-side feature groups", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "f1", "minimal": 0.5052, "name": "F1", "neural_mlp": 0.7153 }, "modalities": [ "motion_capture", "inertial", "video", "depth", "pose_slam" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "aligned or 
shifted", "primary_direction": "C", "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", "research_name": "Cross-Modal Misalignment Detection", "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models." } ] } ], "generated_at_utc": "2026-06-03T12:47:15+00:00", "omni_plan": { "adapter": "LoRA rank 16, alpha 32, dropout 0.05", "backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct", "evaluation": [ "JSON validity", "action macro-F1", "subtask accuracy", "transition accuracy", "next-action accuracy", "contact accuracy", "object micro-F1", "held-out episode count" ], "first_pilot": "32 held-out-episode pilot after valid episodes are staged", "training_unit": "episode-level split, window-level supervised examples" }, "phases": [ { "completion_evidence": [ "PROJECT_STATUS.md", "EVALUATION_PROTOCOL.md", "RESEARCH_TAKEAWAYS.md", "docs/data/summary_metrics.json", "results/episode_task_suite/summary_report.json" ], "deliverables": [ "1161 aligned windows", "12 task contracts", "minimal baseline heads", "neural MLP heads", "modality atlas", "task walkthroughs", "derived figures" ], "entry_condition": "One public Xperience-10M sample episode is available.", "id": "public_sample_task_lab", "name": "Public-Sample Task Lab", "reader_takeaway": "The public sample supports task design, feature contracts, walkthroughs, and baseline comparisons.", "stage": "now", "status": "implemented" }, { "completion_evidence": [ "results/omni_finetune/DATA_ACCESS_STATUS.md", "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md", "results/omni_finetune/source_discovery.json" ], "deliverables": [ "32 valid episodes", "episode manifest", "missing-view manifest", "held-out episode split", "source-discovery report" ], "entry_condition": "Gated dataset access and enough storage for selected episodes.", "id": "multi_episode_data_staging", "name": "Multi-Episode Data Staging", "reader_takeaway": "The next scale decision is 
data staging, with train/test separation at the episode level.", "stage": "scale_up", "status": "active" }, { "completion_evidence": [ "dataset_manifest.json", "training_metadata.json", "progress.jsonl", "metrics.json", "predictions.jsonl", "RUN_REPORT.md" ], "deliverables": [ "dataset JSONL/media manifests", "LoRA adapter checkpoint", "progress logs", "held-out predictions", "metrics", "confusion matrices", "run report" ], "entry_condition": "At least 32 valid episodes are staged locally with no train/test episode leakage.", "id": "qwen3_omni_lora_pilot_32_episode", "name": "32-Episode Qwen3-Omni LoRA Pilot", "reader_takeaway": "The first omni-model pilot should establish a complete held-out-episode training and evaluation loop.", "stage": "omni", "status": "next" }, { "completion_evidence": [ "held-out metrics by session", "held-out metrics by task", "held-out metrics by modality", "ablation tables", "qualitative error analysis" ], "deliverables": [ "split-by-session metrics", "modality ablations", "calibration/object/language error analysis", "missing-view sensitivity analysis" ], "entry_condition": "The 32-episode pilot trains and evaluates cleanly.", "id": "robustness_run_64_128_episode", "name": "64-128 Episode Robustness Run", "reader_takeaway": "The robustness run tests whether the pilot conclusions survive broader sessions and missing modalities.", "stage": "future", "status": "planned" }, { "completion_evidence": [ "task-specific held-out evaluations", "qualitative inspection", "updated model cards" ], "deliverables": [ "audio encoder integration", "depth/image reconstruction", "SLAM/world modeling probes", "policy-style next-action tasks", "affordance and object-interaction tasks" ], "entry_condition": "Enough multi-episode data and compute budget for larger multimodal objectives.", "id": "foundation_world_model_extensions", "name": "Foundation and World-Model Extensions", "reader_takeaway": "The long-term direction is richer multimodal representation 
learning for embodied-AI reasoning.", "stage": "future", "status": "planned" } ], "scale_up": { "access_status": "Hugging Face returns 403 pending review for the full Xperience-10M gated dataset.", "candidate_scan_top_level_sessions": 64, "estimated_bytes": 72031620552, "exclude": [ "visualization.rrd" ], "selection_strategy": "stratified_round_robin_by_top_level_session", "status": "pending_huggingface_gated_access", "target_episodes": 32, "valid_candidates": 680 }, "scope": { "feature_blocks": 18, "feature_dim": 8546, "num_frames": 5821, "num_windows": 1161, "sample_episode_count": 1, "stride_frames": 5, "warning": "These walkthroughs explain task contracts on one public sample episode; cross-episode performance requires held-out episodes.", "window_frames": 20 }, "source_files": [ "docs/data/research_directions.json", "docs/data/task_walkthroughs.json", "docs/data/research_roadmap.json", "docs/data/summary_metrics.json", "docs/data/research_direction_extensions.json", "results/episode_task_suite/summary_report.json", "results/episode_task_suite/feature_manifest.json" ], "tasks": [ { "architecture_family": "multiclass classifier", "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.", "current_limit": "Chronological single-episode split creates unseen future action classes.", "direction_roles": { "A": "proxy", "C": "direct" }, "display_name": "Action Recognition", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json", "label": "Minimal metrics" }, { "href": 
"https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "timeline_action", "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", "input_short": "20-frame multimodal window", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.05, "name": "macro-F1", "neural_mlp": 0.0148 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "current action class", "primary_direction": "C", "process_short": "window features -> action label builder -> classifier", "research_name": "Egocentric Action Recognition", "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout." 
}, { "architecture_family": "multiclass classifier", "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.", "current_limit": "Single-episode ordering makes future subtasks hard to generalize.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Procedure Step Recognition", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "timeline_subtask", "input": "The same all-modality window vector used by action recognition.", "input_short": "20-frame multimodal window", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.0506, "name": "macro-F1", 
"neural_mlp": 0.0281 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "current procedure step", "primary_direction": "C", "process_short": "window features -> subtask label builder -> classifier", "research_name": "Temporal Subtask Recognition", "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state." }, { "architecture_family": "binary classifier", "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.", "current_limit": "Boundary class is sparse, so accuracy alone is misleading.", "direction_roles": { "C": "direct", "D": "diagnostic" }, "display_name": "Action Boundary Detection", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": 
"https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "diagnostic", "id": "transition_detection", "input": "One all-modality window vector plus labels derived from action-change timestamps.", "input_short": "current window with boundary target", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.6118, "name": "macro-F1", "neural_mlp": 0.5862 }, "modalities": [ "video", "pose_slam", "motion_capture", "inertial", "language" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "boundary or steady", "primary_direction": "C", "process_short": "action changes -> boundary labels -> binary classifier", "research_name": "Temporal Action Segmentation", "why": "Localizes egocentric task boundaries and diagnoses temporal state changes." 
}, { "architecture_family": "future-label classifier", "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.", "current_limit": "Unseen future labels dominate the single-episode chronological test.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Next-Action Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "next_action", "input": "The current all-modality window vector at time t.", "input_short": "current window at time t", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "macro_f1", "minimal": 0.0593, "name": "macro-F1", "neural_mlp": 0.0419 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", 
"inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "action at t+20 frames", "primary_direction": "C", "process_short": "current features -> future label shift -> classifier", "research_name": "Short-Horizon Intention Prediction", "why": "Tests action intention/task-flow prediction from egocentric context." }, { "architecture_family": "continuous regressor", "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.", "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.", "direction_roles": { "A": "direct", "C": "proxy" }, "display_name": "Hand Trajectory Forecasting", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", "label": "Neural metrics" } ], "family": "forecast", "id": "hand_trajectory_forecast", "input": "The current all-modality window vector at time t.", "input_short": "current multimodal window", "metric": { "better_baseline": "neural_mlp", "direction": "lower", "key": "mpjpe", "minimal": 0.8647, "name": "MPJPE", "neural_mlp": 0.1079 }, "modalities": [ "motion_capture", "video", "depth", "pose_slam", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "future hand-joint trajectory", "primary_direction": "A", "process_short": "current features -> future mocap target -> regression head", "research_name": "3D Hand Motion Forecasting", 
"why": "Directly predicts human hand motion and supports hand-object interaction modeling." }, { "architecture_family": "binary classifier", "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.", "current_limit": "The public sample is degenerate for this target because one class dominates.", "direction_roles": { "A": "direct", "C": "proxy" }, "display_name": "Contact State Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv", "label": "Neural predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv", "label": "Confusion matrix" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv", "label": "Neural confusion matrix" } ], "family": "supervised", "id": "contact_prediction", "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", "input_short": "non-contact, non-caption features", "metric": { 
"better_baseline": "tie", "direction": "higher", "key": "macro_f1", "minimal": 1.0, "name": "macro-F1", "neural_mlp": 1.0 }, "modalities": [ "motion_capture", "video", "depth", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "contact or no contact", "primary_direction": "A", "process_short": "feature filter -> contact target -> binary classifier", "research_name": "Human-Object Contact Prediction", "why": "Targets physical interaction state, a core affordance and manipulation signal." }, { "architecture_family": "multi-label classifier", "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", "current_limit": "Object labels are language-derived and sparse in one episode.", "direction_roles": { "A": "proxy", "C": "direct", "D": "proxy" }, "display_name": "Object Relevance Prediction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv", "label": "Neural predictions" } ], "family": "supervised", "id": "object_relevance", "input": "Non-caption feature blocks, so the model must infer objects from sensors 
rather than copying the caption words.", "input_short": "non-caption multimodal features", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "micro_f1", "minimal": 0.1803, "name": "micro-F1", "neural_mlp": 0.1679 }, "modalities": [ "video", "depth", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "relevant object set", "primary_direction": "C", "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", "research_name": "Object-Centric Interaction Recognition", "why": "Connects egocentric activity to manipulated objects and early object-centric state." }, { "architecture_family": "retrieval ranker", "case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.", "current_limit": "Bag-of-objects language features are too weak for rich grounding.", "direction_roles": { "C": "direct", "D": "proxy" }, "display_name": "Language Grounding", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", "label": "Neural metrics" } ], "family": "retrieval", "id": "caption_grounding", "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", "input_short": "text-like query and candidate windows", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "mrr", "minimal": 0.016, "name": "MRR", "neural_mlp": 0.0168 }, "modalities": [ "language", "video", 
"depth", "pose_slam" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "ranked matching moments", "primary_direction": "C", "process_short": "query features -> candidate index -> cosine ranker", "research_name": "Language-to-Moment Grounding", "why": "Grounds language annotation into egocentric sensor time and task state." }, { "architecture_family": "two-tower retrieval head", "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", "direction_roles": { "B": "proxy", "C": "diagnostic", "D": "proxy" }, "display_name": "Cross-Modal Retrieval", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", "label": "Neural metrics" } ], "family": "retrieval", "id": "cross_modal_retrieval", "input": "Query side: motion, IMU, and camera/pose features. 
Candidate side: depth and video features.", "input_short": "motion/IMU/pose query; depth/video candidates", "metric": { "better_baseline": "minimal", "direction": "higher", "key": "mrr", "minimal": 0.2693, "name": "MRR", "neural_mlp": 0.13 }, "modalities": [ "motion_capture", "inertial", "pose_slam", "depth", "video" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "ranked visual windows", "primary_direction": "C", "process_short": "modality split -> projection -> nearest-neighbor ranker", "research_name": "Multimodal Representation Retrieval", "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling." }, { "architecture_family": "feature regressor", "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.", "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.", "direction_roles": { "B": "proxy", "D": "proxy" }, "display_name": "Cross-Modal Reconstruction", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", "label": "Neural metrics" } ], "family": "forecast", "id": "modality_reconstruction", "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", "input_short": "motion, IMU, and camera/pose features", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "r2", 
"minimal": -0.0153, "name": "R2", "neural_mlp": -0.0102 }, "modalities": [ "motion_capture", "inertial", "pose_slam", "depth", "video" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "reconstructed depth/video vector", "primary_direction": "B", "process_short": "source-target split -> scaler -> regression head", "research_name": "Modality Feature Reconstruction", "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective." }, { "architecture_family": "pairwise classifier", "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.", "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.", "direction_roles": { "C": "diagnostic", "D": "diagnostic" }, "display_name": "Temporal Order Verification", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv", "label": "Neural predictions" } ], "family": "diagnostic", "id": "temporal_order", "input": "A pair of adjacent window vectors, plus their difference vector.", "input_short": "two adjacent windows plus difference 
vector", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "f1", "minimal": 0.54, "name": "F1", "neural_mlp": 0.852 }, "modalities": [ "video", "pose_slam", "motion_capture", "inertial" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "correct or reversed", "primary_direction": "C", "process_short": "pair builder -> feature combiner -> binary classifier", "research_name": "Temporal Order Verification", "why": "Checks whether features encode local time direction and task progression." }, { "architecture_family": "pairwise classifier", "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", "direction_roles": { "B": "diagnostic", "C": "diagnostic", "D": "diagnostic" }, "display_name": "Multimodal Synchronization Detection", "evidence_links": [ { "href": "data/task_walkthroughs.json", "label": "Task walkthrough" }, { "href": "single_episode_explorer.html", "label": "Single-episode explorer" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json", "label": "Minimal metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", "label": "Neural metrics" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv", "label": "Minimal predictions" }, { "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv", "label": "Neural predictions" } ], "family": "diagnostic", "id": 
"misalignment_detection", "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", "input_short": "motion-side and visual/depth-side feature groups", "metric": { "better_baseline": "neural_mlp", "direction": "higher", "key": "f1", "minimal": 0.5052, "name": "F1", "neural_mlp": 0.7153 }, "modalities": [ "motion_capture", "inertial", "video", "depth", "pose_slam" ], "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", "output_short": "aligned or shifted", "primary_direction": "C", "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", "research_name": "Cross-Modal Misalignment Detection", "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models." } ], "title": "Interactive Research Roadmap" }