| { |
| "title": "Ropedia Xperience-10M Task Suite Evaluation Protocol", |
| "status": "pass", |
| "version": "2026-06-01", |
| "generated_at_utc": "2026-06-03T12:47:15+00:00", |
| "source_files": [ |
| "docs/data/summary_metrics.json", |
| "results/episode_task_suite/summary_report.json", |
| "results/episode_task_suite/windows.csv", |
| "results/episode_task_suite/feature_manifest.json" |
| ], |
| "scope": { |
| "validated_episode_count": 1, |
| "annotation": "data/sample/xperience-10m-sample/annotation.hdf5", |
| "num_frames": 5821, |
| "num_windows": 1161, |
| "feature_dim": 8546, |
| "window_frames": 20, |
| "stride_frames": 5, |
| "audio_featurized": true, |
| "raw_data_redistributed": false |
| }, |
| "split_policy": { |
| "name": "single_episode_chronological", |
| "train_fraction": 0.7, |
| "test_fraction": 0.3, |
| "why": "The split preserves time order so future episode segments are not mixed randomly into the train set.", |
| "limitation": "All windows come from a single episode, so this split cannot measure cross-episode generalization; that is evaluated in the multi-episode stage." |
| }, |
| "feature_policy": { |
| "input_contract": "8,546-dimensional aligned multimodal window representation", |
| "source_manifest": "results/episode_task_suite/feature_manifest.json", |
| "normalization": "Scalers are fit on train windows only for the baseline heads.", |
| "audio_status": "Audio is one of the synchronized source modalities in the current task representation." |
| }, |
| "baselines": [ |
| { |
| "name": "minimal", |
| "heads": [ |
| "softmax", |
| "binary logistic", |
| "multi-label logistic", |
| "ridge regression", |
| "ridge projection plus cosine ranking" |
| ], |
| "purpose": "Keep each task contract interpretable and easy to debug." |
| }, |
| { |
| "name": "neural_mlp", |
| "heads": [ |
| "PyTorch MLP classifier", |
| "PyTorch MLP regressor", |
| "PyTorch MLP multi-label head" |
| ], |
| "purpose": "Check nonlinear gains before larger omni-model fine-tuning.", |
| "config": { |
| "name": "neural_mlp", |
| "type": "lightweight PyTorch MLP over shared window features", |
| "epochs": 80, |
| "hidden_dim": 128, |
| "batch_size": 128, |
| "learning_rate": 0.001, |
| "weight_decay": 0.0001, |
| "dropout": 0.1, |
| "device": "auto" |
| } |
| } |
| ], |
| "task_protocols": [ |
| { |
| "task": "timeline_action", |
| "family": "supervised classification", |
| "unit": "single window", |
| "input": "current 20-frame all-feature window", |
| "target": "current action label", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "leakage_rule": "No future labels enter the input. Chronological split exposes unseen later action labels.", |
| "counts": { |
| "num_windows": 1144, |
| "num_train_windows": 801, |
| "num_test_windows": 343 |
| }, |
| "minimal_primary_metric": 0.05, |
| "neural_primary_metric": 0.014814814814814814, |
| "minimal_metric_source": "results/episode_task_suite/timeline_action/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/timeline_action/metrics.json" |
| }, |
| { |
| "task": "timeline_subtask", |
| "family": "supervised classification", |
| "unit": "single window", |
| "input": "current 20-frame all-feature window", |
| "target": "current subtask label", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "leakage_rule": "No future labels enter the input. Chronological split exposes unseen later subtask labels.", |
| "counts": { |
| "num_windows": 1147, |
| "num_train_windows": 803, |
| "num_test_windows": 344 |
| }, |
| "minimal_primary_metric": 0.05056355513846935, |
| "neural_primary_metric": 0.02810810810810811, |
| "minimal_metric_source": "results/episode_task_suite/timeline_subtask/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json" |
| }, |
| { |
| "task": "transition_detection", |
| "family": "temporal diagnostic", |
| "unit": "single window", |
| "input": "current 20-frame all-feature window", |
| "target": "action boundary versus steady", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "leakage_rule": "Boundary labels are targets only. Boundary timing is evaluated after prediction.", |
| "counts": { |
| "num_windows": 1161, |
| "num_train_windows": 813, |
| "num_test_windows": 348 |
| }, |
| "minimal_primary_metric": 0.6118237590630229, |
| "neural_primary_metric": 0.5862068965517241, |
| "minimal_metric_source": "results/episode_task_suite/transition_detection/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/transition_detection/metrics.json" |
| }, |
| { |
| "task": "next_action", |
| "family": "short-horizon prediction", |
| "unit": "single window", |
| "input": "current 20-frame all-feature window at time t", |
| "target": "action label at t + 20 frames", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "leakage_rule": "Future labels are shifted into targets only; model inputs remain current-window features.", |
| "counts": { |
| "num_windows": 1161, |
| "num_train_windows": 813, |
| "num_test_windows": 348 |
| }, |
| "minimal_primary_metric": 0.05925925925925927, |
| "neural_primary_metric": 0.04186046511627907, |
| "minimal_metric_source": "results/episode_task_suite/next_action/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/next_action/metrics.json" |
| }, |
| { |
| "task": "hand_trajectory_forecast", |
| "family": "trajectory regression", |
| "unit": "single window", |
| "input": "current 20-frame all-feature window", |
| "target": "future left/right hand 3D joints for 10 frames", |
| "primary_metric": "mpjpe", |
| "higher_is_better": false, |
| "leakage_rule": "Future mocap coordinates are targets only, not inputs.", |
| "counts": { |
| "num_windows": 1159, |
| "num_train_windows": 811, |
| "num_test_windows": 348 |
| }, |
| "minimal_primary_metric": 0.8646570444107056, |
| "neural_primary_metric": 0.10785018652677536, |
| "minimal_metric_source": "results/episode_task_suite/hand_trajectory_forecast/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json" |
| }, |
| { |
| "task": "contact_prediction", |
| "family": "binary classification", |
| "unit": "single window", |
| "input": "non-contact and non-caption signals", |
| "target": "any body contact", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "leakage_rule": "Contact-derived fields and caption labels are excluded from inputs.", |
| "counts": { |
| "num_windows": 1161, |
| "num_train_windows": 813, |
| "num_test_windows": 348 |
| }, |
| "minimal_primary_metric": 1.0, |
| "neural_primary_metric": 1.0, |
| "minimal_metric_source": "results/episode_task_suite/contact_prediction/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/contact_prediction/metrics.json" |
| }, |
| { |
| "task": "object_relevance", |
| "family": "multi-label classification", |
| "unit": "single window", |
| "input": "non-caption signals", |
| "target": "current relevant object set", |
| "primary_metric": "micro_f1", |
| "higher_is_better": true, |
| "leakage_rule": "Caption/object-label fields are excluded from inputs.", |
| "counts": { |
| "num_windows": 1161, |
| "num_train_windows": 813, |
| "num_test_windows": 348 |
| }, |
| "minimal_primary_metric": 0.18034382095361662, |
| "neural_primary_metric": 0.1679279279279279, |
| "minimal_metric_source": "results/episode_task_suite/object_relevance/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/object_relevance/metrics.json" |
| }, |
| { |
| "task": "caption_grounding", |
| "family": "retrieval", |
| "unit": "caption query", |
| "input": "caption object/interaction query plus candidate sensor windows", |
| "target": "matching time window", |
| "primary_metric": "mrr", |
| "higher_is_better": true, |
| "leakage_rule": "Queries are ranked against held-out candidate windows; reported ranks are computed after model scoring.", |
| "counts": { |
| "num_queries": 348, |
| "num_train_windows": 813, |
| "num_test_windows": 348 |
| }, |
| "minimal_primary_metric": 0.016023479050338015, |
| "neural_primary_metric": 0.01684125567132316, |
| "minimal_metric_source": "results/episode_task_suite/caption_grounding/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/caption_grounding/metrics.json" |
| }, |
| { |
| "task": "cross_modal_retrieval", |
| "family": "retrieval", |
| "unit": "sensor query", |
| "input": "motion, IMU, and camera query features", |
| "target": "matching depth/video window", |
| "primary_metric": "top5_accuracy", |
| "higher_is_better": true, |
| "leakage_rule": "Query-side and candidate-side signals are split before projection/ranking.", |
| "counts": { |
| "num_queries": 348, |
| "num_train_windows": 813, |
| "num_test_windows": 348 |
| }, |
| "minimal_primary_metric": 0.367816091954023, |
| "neural_primary_metric": 0.19827586206896552, |
| "minimal_metric_source": "results/episode_task_suite/cross_modal_retrieval/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json" |
| }, |
| { |
| "task": "modality_reconstruction", |
| "family": "cross-modal regression", |
| "unit": "single window", |
| "input": "motion, IMU, and camera features", |
| "target": "depth/video feature vector", |
| "primary_metric": "r2", |
| "higher_is_better": true, |
| "leakage_rule": "Target-side signals are excluded from the input side.", |
| "counts": { |
| "num_train_windows": 813, |
| "num_test_windows": 348 |
| }, |
| "minimal_primary_metric": -0.015271898913936655, |
| "neural_primary_metric": -0.010171410134180991, |
| "minimal_metric_source": "results/episode_task_suite/modality_reconstruction/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json" |
| }, |
| { |
| "task": "temporal_order", |
| "family": "pairwise diagnostic", |
| "unit": "adjacent window pair", |
| "input": "two adjacent windows", |
| "target": "correct versus reversed order", |
| "primary_metric": "f1", |
| "higher_is_better": true, |
| "leakage_rule": "Pairs are built after windowing; labels are synthetic order labels, not input features.", |
| "counts": { |
| "num_samples": 2320, |
| "num_train_samples": 1624, |
| "num_test_samples": 696 |
| }, |
| "minimal_primary_metric": 0.5399515738498789, |
| "neural_primary_metric": 0.8520179372197308, |
| "minimal_metric_source": "results/episode_task_suite/temporal_order/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/temporal_order/metrics.json" |
| }, |
| { |
| "task": "misalignment_detection", |
| "family": "pairwise diagnostic", |
| "unit": "paired modality window", |
| "input": "motion side plus visual/depth side", |
| "target": "aligned versus shifted by 8 windows", |
| "primary_metric": "f1", |
| "higher_is_better": true, |
| "leakage_rule": "Shift labels are synthetic targets; shifted visual/depth blocks are generated after feature splitting.", |
| "counts": { |
| "num_samples": 2306, |
| "num_train_samples": 1614, |
| "num_test_samples": 692 |
| }, |
| "minimal_primary_metric": 0.5051698670605613, |
| "neural_primary_metric": 0.7152682255845944, |
| "minimal_metric_source": "results/episode_task_suite/misalignment_detection/metrics.json", |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json" |
| } |
| ], |
| "global_leakage_controls": [ |
| "Use chronological train/test splits instead of random window shuffling.", |
| "Fit scalers and learned projections on train windows only.", |
| "Keep future labels, future mocap, contact labels, object labels, and caption labels on the target side unless a task explicitly treats language as the query.", |
| "For cross-modal tasks, split query-side and candidate-side signals before training and ranking.", |
| "Report unseen test classes when the chronological split exposes labels absent from the train segment." |
| ], |
| "current_limitations": [ |
| "Cross-episode generalization is evaluated in the later multi-episode stage.", |
| "Feature-vector reconstruction is separate from pixel depth, mesh, NeRF, or Gaussian reconstruction.", |
| "Qwen3-Omni setup artifacts are preparation artifacts only, not evaluation results, until the selected held-out pilot has been run.", |
| "Full audio-visual representation learning still needs multi-episode training; the current report includes single-episode audio/no-audio ablations." |
| ], |
| "scale_up_gate": { |
| "required_before_full_omni_pilot": [ |
| "selected staged Xperience-10M episodes", |
| "held-out episode split with no train/test episode leakage", |
| "manifest, training metadata, progress logs, metrics, predictions, and run report", |
| "held-out evaluation on test episodes rather than train windows" |
| ], |
| "current_status": "prepared; selected data relay in progress", |
| "evidence": [ |
| "results/omni_finetune/DATA_ACCESS_STATUS.md", |
| "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md" |
| ] |
| } |
| } |
|
|