ropedia-xperience-10m-task-baselines / docs /data /foundation_model_plan.json
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
45c1706 verified
{
"title": "Xperience-10M Foundation Model Plan",
"status": "planning_artifact",
"current_boundary": "No held-out multi-episode foundation-model result has been completed in this repo. The current foundation-model artifacts are setup-stage until enough valid episodes are staged and evaluated.",
"decision": {
"immediate_trainable_backbone": "Qwen3-Omni",
"first_world_model_branch": "Cosmos 3",
"first_policy_branch_candidates": [
"OpenVLA / OpenVLA-OFT",
"openpi pi0/pi0.5",
"NVIDIA GR00T"
],
"external_reasoning_reference": "Gemini Robotics"
},
"model_families": [
{
"priority": 1,
"family": "Qwen3-Omni",
"category": "omni_instruction_model",
"openness": "open_weights_available_from_official_hf_repo",
"best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.",
"xperience10m_fit": [
"RGB/fisheye video, embedded audio, and language prompts can enter directly.",
"Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.",
"Matches current task outputs: labels, structured JSON, captions, and short decisions."
],
"current_decision": "keep_as_first_pilot",
"entry_condition": "Selected episodes staged with a held-out episode split.",
"public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
},
{
"priority": 2,
"family": "Cosmos 3",
"category": "world_foundation_model",
"openness": "track_official_nvidia_release_and_available_weights",
"best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.",
"xperience10m_fit": [
"Uses video streams as visual state.",
"Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.",
"Better aligned with prediction/generation objectives than simple label classification."
],
"current_decision": "add_as_first_world_model_branch_after_data_gate",
"entry_condition": "Multi-episode data plus enough storage/compute for generated or latent video-state outputs.",
"public_source": "https://www.nvidia.com/en-us/ai/cosmos/"
},
{
"priority": 3,
"family": "NVIDIA GR00T",
"category": "humanoid_policy_foundation_model",
"openness": "track_official_nvidia_release_and_tooling",
"best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.",
"xperience10m_fit": [
"Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.",
"Egocentric video plus human motion can support affordance and interaction tasks."
],
"current_decision": "track_as_humanoid_policy_branch",
"entry_condition": "Retargeting artifact and action-space definition exist.",
"public_source": "https://developer.nvidia.com/isaac/gr00t"
},
{
"priority": 4,
"family": "OpenVLA / OpenVLA-OFT",
"category": "vision_language_action_policy",
"openness": "open_project_and_weights",
"best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.",
"xperience10m_fit": [
"Good candidate when each window is expressed as visual observation, instruction/context, and action token.",
"Requires an explicit action target; current human egocentric labels are not robot controls by default."
],
"current_decision": "candidate_after_action_space_design",
"entry_condition": "Window-to-action-token conversion is implemented and audited.",
"public_source": "https://openvla.github.io/"
},
{
"priority": 5,
"family": "openpi pi0/pi0.5",
"category": "robot_policy_model",
"openness": "open_source_policy_training_stack",
"best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.",
"xperience10m_fit": [
"Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.",
"Better suited to the policy branch than to the current structured task JSON outputs."
],
"current_decision": "candidate_policy_branch",
"entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.",
"public_source": "https://github.com/Physical-Intelligence/openpi"
},
{
"priority": 6,
"family": "Gemini Robotics",
"category": "closed_embodied_reasoning_reference",
"openness": "closed_or_limited_access",
"best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.",
"xperience10m_fit": [
"Can help reason over egocentric scenes and task descriptions.",
"Not a local fine-tune target for this repo."
],
"current_decision": "external_reference_only",
"entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.",
"public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
},
{
"priority": 7,
"family": "Octo / SmolVLA-style lightweight policies",
"category": "lightweight_robot_policy_baselines",
"openness": "open_projects",
"best_role": "Cheaper policy baselines for observation-to-action experiments.",
"xperience10m_fit": [
"Useful after action target design.",
"Less directly omni-modal than Qwen3-Omni or Cosmos 3."
],
"current_decision": "optional_baseline_after_data_staging",
"entry_condition": "Action labels and baseline protocol exist.",
"public_source": "https://github.com/huggingface/lerobot"
}
],
"execution_order": [
{
"step": 1,
"name": "Data gate",
"action": "Stage at least 32 valid Xperience-10M episodes with a held-out episode split."
},
{
"step": 2,
"name": "First held-out baseline",
"action": "Run Qwen3-Omni LoRA to establish the full train/eval loop."
},
{
"step": 3,
"name": "Model-selection dry run",
"action": "Run dry runs of 3-8 episodes for Qwen3-Omni prompt/LoRA, Cosmos 3 preprocessing, and one policy candidate."
},
{
"step": 4,
"name": "World-model branch",
"action": "Promote Cosmos 3 if future-window/action-conditioned preprocessing fits storage and compute."
},
{
"step": 5,
"name": "Policy branch",
"action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable."
},
{
"step": 6,
"name": "Publication rule",
"action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples."
}
],
"evaluation_additions": [
{
"target": "structured_task_prediction",
"metrics": [
"JSON validity",
"macro-F1",
"accuracy",
"micro-F1"
],
"model_families": [
"Qwen3-Omni",
"Gemini Robotics reference"
]
},
{
"target": "future_state_prediction",
"metrics": [
"retrieval rank",
"temporal consistency",
"feature reconstruction",
"qualitative visual inspection"
],
"model_families": [
"Cosmos 3"
]
},
{
"target": "action_conditioned_dynamics",
"metrics": [
"transition accuracy",
"contact accuracy",
"next-action accuracy"
],
"model_families": [
"Cosmos 3",
"OpenVLA",
"openpi",
"GR00T"
]
},
{
"target": "cross_episode_generalization",
"metrics": [
"held-out episode metrics",
"held-out session metrics",
"leakage audit"
],
"model_families": [
"all trainable branches"
]
}
],
"source_links": [
{
"label": "Qwen3-Omni official HF model",
"url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
},
{
"label": "NVIDIA Cosmos",
"url": "https://www.nvidia.com/en-us/ai/cosmos/"
},
{
"label": "NVIDIA Isaac GR00T",
"url": "https://developer.nvidia.com/isaac/gr00t"
},
{
"label": "OpenVLA",
"url": "https://openvla.github.io/"
},
{
"label": "openpi",
"url": "https://github.com/Physical-Intelligence/openpi"
},
{
"label": "Gemini Robotics",
"url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
},
{
"label": "Octo",
"url": "https://octo-models.github.io/"
},
{
"label": "LeRobot / SmolVLA",
"url": "https://github.com/huggingface/lerobot"
}
]
}