ropedia-xperience-10m-task-baselines / docs /data /research_direction_extensions.json

Publish Ropedia Xperience-10M task baseline cards

45c1706 verified about 10 hours ago

11.9 kB

	{
	"source": {
	"shared_windows": "results/episode_task_suite/shared_windows.npz",
	"windows_csv": "results/episode_task_suite/windows.csv",
	"feature_manifest": "results/episode_task_suite/feature_manifest.json"
	},
	"dataset_scope": {
	"sample_episode_count": 1,
	"num_windows": 1161,
	"feature_dim": 8546,
	"first_start_frame": 0,
	"last_end_frame": 5819,
	"warning": "Single public sample episode; these extension probes validate task design and pipeline mechanics, not cross-episode generalization."
	},
	"baselines": {
	"minimal": "Ridge classifiers/regressors/projections plus cosine retrieval on the committed feature tensor.",
	"neural_mlp": "Small one-hidden-layer PyTorch MLP heads using the same inputs, targets, chronological split, and evaluator."
	},
	"run_config": {
	"train_fraction": 0.7,
	"ridge_l2": 10.0,
	"seed": 7,
	"future_windows": 4,
	"neural_epochs": 25,
	"neural_hidden_dim": 128,
	"neural_batch_size": 128,
	"skip_neural": false
	},
	"task_specs": {
	"body_motion_intensity": {
	"direction": "A",
	"direction_name": "Human Modeling & Motion Understanding",
	"name": "Body and Hand Motion Intensity",
	"family": "classification",
	"case_study": "A window with a fast reach or pour should be classified as high motion; a steady holding window should be low motion.",
	"input": "Current non-mocap feature blocks: video, audio, depth, camera pose/rotation, IMU, SLAM, calibration, and language context.",
	"middle_process": "Compute the target from hand/body joint changes between neighboring windows, hide the mocap blocks from the input, then classify high versus low motion using the train-set median as the threshold.",
	"output": "Binary label: high_motion or low_motion.",
	"minimal_baseline": "Ridge classifier on standardized non-mocap features.",
	"neural_baseline": "One-hidden-layer MLP binary classifier on the same input features.",
	"metric_name": "macro-F1",
	"metric_key": "macro_f1",
	"metric_direction": "higher",
	"current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior."
	},
	"multi_view_consistency_retrieval": {
	"direction": "B",
	"direction_name": "3D/4D Reconstruction & Neural Rendering",
	"name": "Multi-View Consistency Retrieval",
	"family": "retrieval",
	"case_study": "Given the fisheye camera features for a pouring moment, retrieve the synchronized stereo-left view from the same time window.",
	"input": "Query side: fisheye_cam0 video feature block. Candidate side: stereo_left video feature block from held-out windows.",
	"middle_process": "Learn a projection from one camera-view feature space into another, then rank held-out candidate windows by cosine similarity.",
	"output": "Ranked candidate windows; the correct synchronized view should rank near the top.",
	"minimal_baseline": "Ridge projection followed by cosine nearest-neighbor retrieval.",
	"neural_baseline": "One-hidden-layer MLP projection followed by the same cosine retrieval evaluator.",
	"metric_name": "MRR",
	"metric_key": "mrr",
	"metric_direction": "higher",
	"current_limit": "This checks for a calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis."
	},
	"action_phase_progress": {
	"direction": "C",
	"direction_name": "Egocentric Vision & Interaction",
	"name": "Action Phase Progress Estimation",
	"family": "regression",
	"case_study": "Inside a 'Pour coffee' action segment, estimate whether the current window is near the beginning, middle, or end of that action.",
	"input": "Current non-caption multimodal feature vector, so the label text cannot be copied directly from the language block.",
	"middle_process": "Convert contiguous action-label runs into a normalized 0-to-1 progress target, train on earlier windows, and regress progress for later windows.",
	"output": "A scalar progress value between 0.0 and 1.0 for the current action segment.",
	"minimal_baseline": "Ridge regressor on standardized non-caption features.",
	"neural_baseline": "One-hidden-layer MLP regressor on the same input features.",
	"metric_name": "MAE",
	"metric_key": "mae",
	"metric_direction": "lower",
	"current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks."
	},
	"ego_motion_forecast": {
	"direction": "D",
	"direction_name": "Scene Reconstruction & World Modeling",
	"name": "Short-Horizon Ego-Motion Forecasting",
	"family": "forecast",
	"case_study": "From the current sensors, predict how the camera translation will change over the next 20 frames while the wearer moves through the scene.",
	"input": "Current multimodal features excluding the camera-translation block and caption text.",
	"middle_process": "Build a future target from the camera-translation difference at a four-window (20-frame) horizon, then regress that future ego-motion delta from the current sensors.",
	"output": "A future camera-translation delta vector.",
	"minimal_baseline": "Ridge regressor with a 20-frame forecast horizon.",
	"neural_baseline": "One-hidden-layer MLP regressor with the same horizon and split.",
	"metric_name": "MAE",
	"metric_key": "mae",
	"metric_direction": "lower",
	"current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model."
	}
	},
	"tasks": {
	"body_motion_intensity": {
	"train_windows": 812,
	"test_windows": 348,
	"target_threshold_train_median": 0.476467490196228,
	"input_dim": 6425,
	"target_source": "hand/body joint delta between neighboring windows",
	"minimal": {
	"accuracy": 0.7758620689655172,
	"macro_f1": 0.7658385093167701,
	"positive_rate_true": 0.35919540229885055,
	"positive_rate_pred": 0.4339080459770115,
	"num_test": 348
	},
	"neural_mlp": {
	"accuracy": 0.8304597701149425,
	"macro_f1": 0.8254423029509534,
	"positive_rate_true": 0.35919540229885055,
	"positive_rate_pred": 0.47126436781609193,
	"num_test": 348
	},
	"neural_training": {
	"available": true,
	"epochs": 25,
	"hidden_dim": 128,
	"loss_history": [
	0.37322977610996794,
	0.22245765099384515,
	0.1382973729242832,
	0.10363741681493562,
	0.0795709453523159,
	0.06539858697817244,
	0.055655122610735776,
	0.043255199022187385,
	0.03558319240001035,
	0.031631215764530776,
	0.029465350402711796,
	0.024383640274625695,
	0.02020622924740972,
	0.016222001351599624,
	0.018758724778523587,
	0.013199950316049196,
	0.014794624612432689,
	0.01013119441452505,
	0.009688532855040554,
	0.008956241283767622,
	0.006733611014469761,
	0.006677041435843618,
	0.0067647325489761795,
	0.005346325666556511,
	0.004052691048084588
	]
	}
	},
	"multi_view_consistency_retrieval": {
	"train_windows": 813,
	"test_windows": 348,
	"query_block": "video_fisheye_cam0",
	"target_block": "video_stereo_left",
	"query_dim": 686,
	"target_dim": 686,
	"minimal": {
	"mrr": 0.5533982515335083,
	"top1": 0.41954022988505746,
	"top5": 0.7068965517241379,
	"top10": 0.8304597701149425,
	"median_rank": 2.0,
	"num_test": 348
	},
	"neural_mlp": {
	"mrr": 0.34691643714904785,
	"top1": 0.23275862068965517,
	"top5": 0.46264367816091956,
	"top10": 0.5890804597701149,
	"median_rank": 7.0,
	"num_test": 348
	},
	"neural_training": {
	"available": true,
	"epochs": 25,
	"hidden_dim": 128,
	"loss_history": [
	0.9800718805740094,
	0.8296866191855803,
	0.7029420470986126,
	0.6089339927846948,
	0.5426930648814268,
	0.49323386093200644,
	0.45315542230600214,
	0.4240272395578551,
	0.3964498403400806,
	0.37567753094588696,
	0.3599070675332021,
	0.3417643405048023,
	0.32952829051721577,
	0.31516501450450657,
	0.3070896395824639,
	0.29752101269888553,
	0.287490411878072,
	0.2791558311654193,
	0.2707079971921693,
	0.2669465311998811,
	0.2603630047442728,
	0.2501040017656148,
	0.24714160980920216,
	0.24146720613060843,
	0.23866056472173036
	]
	}
	},
	"action_phase_progress": {
	"train_windows": 813,
	"test_windows": 348,
	"input_dim": 7650,
	"target_source": "normalized position inside contiguous action-label runs",
	"minimal": {
	"mse": 0.16943013668060303,
	"mae": 0.32674381136894226,
	"r2": -1.0236103208433347,
	"num_test": 348
	},
	"neural_mlp": {
	"mse": 0.14226463437080383,
	"mae": 0.301545649766922,
	"r2": -0.6991557278041094,
	"num_test": 348
	},
	"neural_training": {
	"available": true,
	"epochs": 25,
	"hidden_dim": 128,
	"loss_history": [
	3.236165899632162,
	1.2893786148831414,
	0.8036823107014429,
	0.5113777590120557,
	0.42643894586909153,
	0.37627028047965166,
	0.30304253713524504,
	0.25041163572526065,
	0.20774717810409096,
	0.18116216590630319,
	0.16409150619165191,
	0.14362229159397394,
	0.1277421933001991,
	0.1232111468672899,
	0.11924667946013724,
	0.11960234325310401,
	0.09951645682988572,
	0.08239271907825459,
	0.08965909919102401,
	0.07973466079503408,
	0.07939992471154765,
	0.07085797808340408,
	0.08331163055269188,
	0.068286436959313,
	0.07575550977814241
	]
	}
	},
	"ego_motion_forecast": {
	"train_windows": 810,
	"test_windows": 347,
	"forecast_horizon_windows": 4,
	"forecast_horizon_frames": 20,
	"input_dim": 7629,
	"target_dim": 21,
	"target_source": "future minus current camera_translation feature block",
	"minimal": {
	"mse": 3.3189830780029297,
	"mae": 0.16999860107898712,
	"r2": -5674.96718626448,
	"num_test": 347
	},
	"neural_mlp": {
	"mse": 0.547074019908905,
	"mae": 0.0833469107747078,
	"r2": -934.5800418396838,
	"num_test": 347
	},
	"neural_training": {
	"available": true,
	"epochs": 25,
	"hidden_dim": 128,
	"loss_history": [
	1.034793225188314,
	0.6070329941349265,
	0.459870958622591,
	0.3616479166495947,
	0.2902924293353234,
	0.23913239262722158,
	0.1907091121982645,
	0.17297036942140556,
	0.15034288657300265,
	0.13807891327657817,
	0.13084740807980666,
	0.12387588925567675,
	0.12085371225336451,
	0.11271689225126195,
	0.10728766140010622,
	0.10044601888936243,
	0.09365091108613544,
	0.09090288755150489,
	0.08592776887946658,
	0.0805281208805096,
	0.08014915316929051,
	0.07900124887625376,
	0.07897219589830917,
	0.07575527565714754,
	0.0774567014273302
	]
	}
	}
	}
	}