{ "omni_relay": { "status": "selected_relay_in_progress", "dataset": "ropedia-ai/xperience-10m", "staging": "accelerated_chunked_parallel_transfer_with_batch_prefetch", "training_target": "external_multi_gpu_training_host", "selection_strategy": "stratified_round_robin_by_top_level_session", "target_episodes": 128, "selected_sessions": 128, "candidate_scan_top_level_sessions": 802, "valid_candidates": 12102, "estimated_bytes": 298188841943, "exclude": [ "visualization.rrd" ], "access_status": "Full-dataset access is granted; selected multi-episode relay is in progress with chunked parallel transfer and overlapping batch prefetch.", "current_scope": "The selected-episode Qwen3-Omni fine-tune requires completed data staging and held-out evaluation; the 32-episode Qwen3-Omni fine-tune requires gated data staging before any real held-out metric is reported." }, "models": { "motion_action": { "accuracy": 0.9828178694158075, "balanced_accuracy": 0.9643518518518519, "macro_f1": 0.96884342657456, "weighted_f1": 0.9824311468352843, "num_eval_windows": 291, "num_classes": 18, "majority_baseline_accuracy": 0.13745704467353953, "train_final_accuracy": 1.0, "train_final_loss": 0.019042566418647766 }, "motion_subtask": { "accuracy": 0.9758620689655172, "balanced_accuracy": 0.9783924095954172, "macro_f1": 0.9528048001232955, "weighted_f1": 0.9778836359351952, "num_eval_windows": 290, "num_classes": 14, "majority_baseline_accuracy": 0.14482758620689656, "train_final_accuracy": 1.0, "train_final_loss": 0.02664567530155182 }, "all_modalities_action": { "accuracy": 0.9862542955326461, "balanced_accuracy": 0.9856481481481482, "macro_f1": 0.9828810433408773, "weighted_f1": 0.9862660597416385, "num_eval_windows": 291, "num_classes": 18, "majority_baseline_accuracy": 0.13745704467353953, "train_final_accuracy": 1.0, "train_final_loss": 0.014677195809781551, "feature_dim": 8546, "num_windows": 1144 }, "all_modalities_subtask": { "accuracy": 0.9827586206896551, "balanced_accuracy": 0.9505102040816327, "macro_f1": 0.9173189771658273, "weighted_f1": 0.9841228382209077, "num_eval_windows": 290, "num_classes": 14, "majority_baseline_accuracy": 0.14482758620689656, "train_final_accuracy": 1.0, "train_final_loss": 0.012834250926971436, "feature_dim": 8546, "num_windows": 1147 } }, "suite": { "annotation": "data/sample/xperience-10m-sample/annotation.hdf5", "num_frames": 5821, "num_windows": 1161, "feature_dim": 8546, "window_frames": 20, "stride_frames": 5, "tasks": { "timeline_action": { "accuracy": 0.029154518950437316, "balanced_accuracy": 0.03125, "macro_f1": 0.05, "weighted_f1": 0.04664723032069971, "num_eval_windows": 343, "num_classes": 18, "task": "timeline_action", "input": "all modalities -> current action label", "split": "chronological", "num_windows": 1144, "num_train_windows": 801, "num_test_windows": 343, "feature_dim": 8546, "majority_baseline_accuracy": 0.0, "train_final_accuracy": 1.0, "train_final_loss": 0.016824405640363693, "unseen_test_classes": [ "Place item on table", "Pour coffee", "Pour milk into coffee", "Wait/Prepare for pouring" ] }, "timeline_subtask": { "accuracy": 0.05813953488372093, "balanced_accuracy": 0.05376979652090881, "macro_f1": 0.05056355513846935, "weighted_f1": 0.06827161211620246, "num_eval_windows": 344, "num_classes": 14, "task": "timeline_subtask", "input": "all modalities -> current subtask label", "split": "chronological", "num_windows": 1147, "num_train_windows": 803, "num_test_windows": 344, "feature_dim": 8546, "majority_baseline_accuracy": 0.0, "train_final_accuracy": 1.0, "train_final_loss": 0.014138756319880486, "unseen_test_classes": [ "Move bottle to coffee equipment", "Pour coffee", "Pour milk into coffee", "Prepare for pouring" ] }, "transition_detection": { "accuracy": 0.9080459770114943, "balanced_accuracy": 0.6543674698795181, "macro_f1": 0.6118237590630229, "weighted_f1": 0.9197389592989339, "num_eval_windows": 348, "num_classes": 2, "task": "transition_detection", "input": "all modalities -> action boundary/steady", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 8546, "majority_baseline_accuracy": 0.9540229885057471, "train_final_accuracy": 1.0, "train_final_loss": 0.007154403254389763, "unseen_test_classes": [], "boundary_precision": 0.07142857142857142, "boundary_recall": 0.5, "boundary_f1": 0.125, "matched_boundaries": 2, "true_boundaries": 4, "predicted_boundaries": 28, "mean_abs_timing_error_frames": 3.5 }, "next_action": { "accuracy": 0.034482758620689655, "balanced_accuracy": 0.04, "macro_f1": 0.05925925925925927, "weighted_f1": 0.05108556832694764, "num_eval_windows": 348, "num_classes": 18, "task": "next_action", "input": "all modalities at t -> action at t+20 frames", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 8546, "majority_baseline_accuracy": 0.0, "train_final_accuracy": 1.0, "train_final_loss": 0.01754833571612835, "unseen_test_classes": [ "Place item on table", "Pour coffee", "Pour milk into coffee", "Wait/Prepare for pouring" ] }, "hand_trajectory_forecast": { "mse": 14.956222534179688, "mae": 0.420173317193985, "r2": -1763.3831383277447, "task": "hand_trajectory_forecast", "input": "all modalities at t -> future left/right hand 3D joints", "split": "chronological", "num_windows": 1159, "num_train_windows": 811, "num_test_windows": 348, "forecast_frames": 10, "mpjpe": 0.8646570444107056, "final_frame_mpjpe": 1.0330793857574463, "target_dim": 1260 }, "contact_prediction": { "accuracy": 1.0, "balanced_accuracy": 1.0, "macro_f1": 1.0, "weighted_f1": 1.0, "num_eval_windows": 348, "num_classes": 1, "task": "contact_prediction", "input": "all non-contact/non-caption-label modalities -> any body contact", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 7503, "majority_baseline_accuracy": 1.0, "train_final_accuracy": 1.0, "train_final_loss": 0.0006056802230887115, "unseen_test_classes": [] }, "object_relevance": { "micro_f1": 0.18034382095361662, "macro_f1": 0.06329638076675959, "exact_match": 0.005747126436781609, "precision": 0.16106604866743918, "recall": 0.20486366985998525, "task": "object_relevance", "input": "all non-caption modalities -> current relevant object set", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "num_objects": 34 }, "caption_grounding": { "mrr": 0.016023479050338015, "median_rank": 172.0, "mean_rank": 174.67816091954023, "num_queries": 348, "top1_accuracy": 0.0028735632183908046, "top5_accuracy": 0.011494252873563218, "top10_accuracy": 0.014367816091954023, "task": "caption_grounding", "input": "caption objects/interaction text query + candidate sensor windows", "output": "matching time window", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348 }, "cross_modal_retrieval": { "mrr": 0.26925966892956127, "median_rank": 14.0, "mean_rank": 43.34770114942529, "num_queries": 348, "top1_accuracy": 0.16379310344827586, "top5_accuracy": 0.367816091954023, "top10_accuracy": 0.47126436781609193, "task": "cross_modal_retrieval", "input": "motion/IMU/camera/audio query", "output": "matching depth/video window", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348 }, "modality_reconstruction": { "mse": 1358.1593017578125, "mae": 0.29572129249572754, "r2": -0.015271898913936655, "task": "modality_reconstruction", "input": "motion/IMU/camera/audio", "output": "depth/video feature vector", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348, "target_dim": 5096 }, "temporal_order": { "accuracy": 0.4540229885057471, "precision": 0.4665271966527197, "recall": 0.6408045977011494, "f1": 0.5399515738498789, "tp": 223, "tn": 93, "fp": 255, "fn": 125, "positive_rate_true": 0.5, "positive_rate_pred": 0.6867816091954023, "task": "temporal_order", "input": "two adjacent windows -> whether order is correct", "split": "chronological", "num_samples": 2320, "num_train_samples": 1624, "num_test_samples": 696, "train_final_accuracy": 0.5086206896551724 }, "misalignment_detection": { "accuracy": 0.5158959537572254, "precision": 0.5166163141993958, "recall": 0.49421965317919075, "f1": 0.5051698670605613, "tp": 171, "tn": 186, "fp": 160, "fn": 175, "positive_rate_true": 0.5, "positive_rate_pred": 0.47832369942196534, "task": "misalignment_detection", "input": "motion+visual/audio pair -> aligned vs shifted by 8 windows", "split": "chronological", "num_samples": 2306, "num_train_samples": 1614, "num_test_samples": 692, "train_final_accuracy": 0.49380421313506817 } }, "neural_model": { "name": "neural_mlp", "type": "lightweight PyTorch MLP over shared window features", "epochs": 80, "hidden_dim": 128, "batch_size": 128, "learning_rate": 0.001, "weight_decay": 0.0001, "dropout": 0.1, "device": "auto" }, "neural_tasks": { "timeline_action": { "accuracy": 0.008746355685131196, "balanced_accuracy": 0.009375, "macro_f1": 0.014814814814814814, "weighted_f1": 0.013821401576503616, "num_eval_windows": 343, "num_classes": 18, "task": "timeline_action", "input": "all modalities -> current action label", "split": "chronological", "num_windows": 1144, "num_train_windows": 801, "num_test_windows": 343, "feature_dim": 8546, "majority_baseline_accuracy": 0.0, "unseen_test_classes": [ "Place item on table", "Pour coffee", "Pour milk into coffee", "Wait/Prepare for pouring" ], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.04246756529782, "train_final_accuracy": 0.9875156054931336 }, "timeline_subtask": { "accuracy": 0.0377906976744186, "balanced_accuracy": 0.045614035087719294, "macro_f1": 0.02810810810810811, "weighted_f1": 0.023287240729101197, "num_eval_windows": 344, "num_classes": 14, "task": "timeline_subtask", "input": "all modalities -> current subtask label", "split": "chronological", "num_windows": 1147, "num_train_windows": 803, "num_test_windows": 344, "feature_dim": 8546, "majority_baseline_accuracy": 0.0, "unseen_test_classes": [ "Move bottle to coffee equipment", "Pour coffee", "Pour milk into coffee", "Prepare for pouring" ], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 5.4104819144748596e-05, "train_final_accuracy": 1.0 }, "transition_detection": { "accuracy": 0.8735632183908046, "balanced_accuracy": 0.666039156626506, "macro_f1": 0.5862068965517241, "weighted_f1": 0.8993261989694807, "num_eval_windows": 348, "num_classes": 2, "task": "transition_detection", "input": "all modalities -> action boundary/steady", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 8546, "majority_baseline_accuracy": 0.9540229885057471, "unseen_test_classes": [], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.029138497962572854, "train_final_accuracy": 0.990159901599016, "boundary_precision": 0.07142857142857142, "boundary_recall": 0.75, "boundary_f1": 0.13043478260869565, "matched_boundaries": 3, "true_boundaries": 4, "predicted_boundaries": 42, "mean_abs_timing_error_frames": 2.6666666666666665 }, "next_action": { "accuracy": 0.02586206896551724, "balanced_accuracy": 0.03, "macro_f1": 0.04186046511627907, "weighted_f1": 0.03608660785886127, "num_eval_windows": 348, "num_classes": 18, "task": "next_action", "input": "all modalities at t -> action at t+20 frames", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 8546, "majority_baseline_accuracy": 0.0, "unseen_test_classes": [ "Place item on table", "Pour coffee", "Pour milk into coffee", "Wait/Prepare for pouring" ], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.000416612956025105, "train_final_accuracy": 1.0 }, "hand_trajectory_forecast": { "mse": 0.004775360692292452, "mae": 0.05433763191103935, "r2": 0.43665148265771614, "task": "hand_trajectory_forecast", "input": "all modalities at t -> future left/right hand 3D joints", "split": "chronological", "num_windows": 1159, "num_train_windows": 811, "num_test_windows": 348, "forecast_frames": 10, "mpjpe": 0.10785018652677536, "final_frame_mpjpe": 0.11407545953989029, "target_dim": 1260, "model": "neural_mlp", "head": "z-score -> MLP regression", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.055699273420247435 }, "contact_prediction": { "accuracy": 1.0, "balanced_accuracy": 1.0, "macro_f1": 1.0, "weighted_f1": 1.0, "num_eval_windows": 348, "num_classes": 1, "task": "contact_prediction", "input": "all non-contact/non-caption-label modalities -> any body contact", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 7503, "majority_baseline_accuracy": 1.0, "unseen_test_classes": [], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.0, "train_final_accuracy": 1.0 }, "object_relevance": { "micro_f1": 0.1679279279279279, "macro_f1": 0.048883162556964774, "exact_match": 0.014367816091954023, "precision": 0.16431593794076163, "recall": 0.17170228445099484, "task": "object_relevance", "input": "all non-caption modalities -> current relevant object set", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "num_objects": 34, "feature_dim": 7650, "model": "neural_mlp", "head": "z-score -> MLP sigmoid multilabel", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.003651880362182214 }, "caption_grounding": { "mrr": 0.01684125567132316, "median_rank": 180.5, "mean_rank": 178.382183908046, "num_queries": 348, "top1_accuracy": 0.0028735632183908046, "top5_accuracy": 0.014367816091954023, "top10_accuracy": 0.020114942528735632, "task": "caption_grounding", "input": "caption objects/interaction text query + candidate sensor windows", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348, "target_dim": 896, "output": "matching time window", "model": "neural_mlp", "head": "z-score -> MLP projection/regression", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.06317874967483723 }, "cross_modal_retrieval": { "mrr": 0.1299971898648288, "median_rank": 40.0, "mean_rank": 66.60057471264368, "num_queries": 348, "top1_accuracy": 0.05172413793103448, "top5_accuracy": 0.19827586206896552, "top10_accuracy": 0.2413793103448276, "task": "cross_modal_retrieval", "input": "motion/IMU/camera/audio query", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348, "target_dim": 5096, "output": "matching depth/video window", "model": "neural_mlp", "head": "z-score -> MLP projection/regression", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.21891545446596464 }, "modality_reconstruction": { "mse": 1351.3363037109375, "mae": 0.10379635542631149, "r2": -0.010171410134180991, "task": "modality_reconstruction", "input": "motion/IMU/camera/audio", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348, "target_dim": 5096, "output": "depth/video feature vector", "model": "neural_mlp", "head": "z-score -> MLP projection/regression", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.21891545446596464 }, "temporal_order": { "accuracy": 0.8577586206896551, "precision": 0.8878504672897196, "recall": 0.8189655172413793, "f1": 0.8520179372197308, "tp": 285, "tn": 312, "fp": 36, "fn": 63, "positive_rate_true": 0.5, "positive_rate_pred": 0.46120689655172414, "task": "temporal_order", "input": "two adjacent windows -> whether order is correct", "split": "chronological", "num_samples": 2320, "num_train_samples": 1624, "num_test_samples": 696, "feature_dim": 25638, "model": "neural_mlp", "head": "z-score -> MLP binary softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.0005108328477586757, "train_final_accuracy": 1.0 }, "misalignment_detection": { "accuracy": 0.7008670520231214, "precision": 0.6824146981627297, "recall": 0.7514450867052023, "f1": 0.7152682255845944, "tp": 260, "tn": 225, "fp": 121, "fn": 86, "positive_rate_true": 0.5, "positive_rate_pred": 0.5505780346820809, "task": "misalignment_detection", "input": "motion+visual/audio pair -> aligned vs shifted by 8 windows", "split": "chronological", "num_samples": 2306, "num_train_samples": 1614, "num_test_samples": 692, "feature_dim": 7511, "model": "neural_mlp", "head": "z-score -> MLP binary softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.010604870708167664, "train_final_accuracy": 0.9956629491945477 } } }, "feature_manifest": [ { "name": "hand left joints", "start": 0, "end": 441, "dim": 441 }, { "name": "hand right joints", "start": 441, "end": 882, "dim": 441 }, { "name": "body joints", "start": 882, "end": 1974, "dim": 1092 }, { "name": "body contacts", "start": 1974, "end": 2121, "dim": 147 }, { "name": "camera translation", "start": 2121, "end": 2142, "dim": 21 }, { "name": "camera rotation matrix", "start": 2142, "end": 2205, "dim": 63 }, { "name": "imu accel gyro", "start": 2205, "end": 2247, "dim": 42 }, { "name": "depth confidence", "start": 2247, "end": 3227, "dim": 980 }, { "name": "video fisheye cam0", "start": 3227, "end": 3913, "dim": 686 }, { "name": "video fisheye cam1", "start": 3913, "end": 4599, "dim": 686 }, { "name": "video fisheye cam2", "start": 4599, "end": 5285, "dim": 686 }, { "name": "video fisheye cam3", "start": 5285, "end": 5971, "dim": 686 }, { "name": "video stereo left", "start": 5971, "end": 6657, "dim": 686 }, { "name": "video stereo right", "start": 6657, "end": 7343, "dim": 686 }, { "name": "audio", "start": 7343, "end": 7511, "dim": 168 }, { "name": "language text", "start": 7511, "end": 8407, "dim": 896 }, { "name": "slam point cloud", "start": 8407, "end": 8429, "dim": 22 }, { "name": "calibration", "start": 8429, "end": 8546, "dim": 117 } ] }