| { |
| "title": "Xperience-10M Foundation Model Plan", |
| "status": "planning_artifact", |
| "current_boundary": "No held-out multi-episode foundation-model result has been completed in this repo. The current foundation-model artifacts are setup-stage until enough valid episodes are staged and evaluated.", |
| "decision": { |
| "immediate_trainable_backbone": "Qwen3-Omni", |
| "first_world_model_branch": "Cosmos 3", |
| "first_policy_branch_candidates": [ |
| "OpenVLA / OpenVLA-OFT", |
| "openpi pi0/pi0.5", |
| "NVIDIA GR00T" |
| ], |
| "external_reasoning_reference": "Gemini Robotics" |
| }, |
| "model_families": [ |
| { |
| "priority": 1, |
| "family": "Qwen3-Omni", |
| "category": "omni_instruction_model", |
| "openness": "open_weights_available_from_official_hf_repo", |
| "best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.", |
| "xperience10m_fit": [ |
| "RGB/fisheye video, embedded audio, and language prompts can enter directly.", |
| "Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.", |
| "Matches current task outputs: labels, structured JSON, captions, and short decisions." |
| ], |
| "current_decision": "keep_as_first_pilot", |
| "entry_condition": "Selected episodes staged with a held-out episode split.", |
| "public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" |
| }, |
| { |
| "priority": 2, |
| "family": "Cosmos 3", |
| "category": "world_foundation_model", |
| "openness": "track_official_nvidia_release_and_available_weights", |
| "best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.", |
| "xperience10m_fit": [ |
| "Uses video streams as visual state.", |
| "Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.", |
| "Better aligned with prediction/generation objectives than simple label classification." |
| ], |
| "current_decision": "add_as_first_world_model_branch_after_data_gate", |
| "entry_condition": "Multi-episode data plus enough storage/compute for generated or latent video-state outputs.", |
| "public_source": "https://www.nvidia.com/en-us/ai/cosmos/" |
| }, |
| { |
| "priority": 3, |
| "family": "NVIDIA GR00T", |
| "category": "humanoid_policy_foundation_model", |
| "openness": "track_official_nvidia_release_and_tooling", |
| "best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.", |
| "xperience10m_fit": [ |
| "Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.", |
| "Egocentric video plus human motion can support affordance and interaction tasks." |
| ], |
| "current_decision": "track_as_humanoid_policy_branch", |
| "entry_condition": "Retargeting artifact and action-space definition exist.", |
| "public_source": "https://developer.nvidia.com/isaac/gr00t" |
| }, |
| { |
| "priority": 4, |
| "family": "OpenVLA / OpenVLA-OFT", |
| "category": "vision_language_action_policy", |
| "openness": "open_project_and_weights", |
| "best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.", |
| "xperience10m_fit": [ |
| "Good candidate when each window is expressed as visual observation, instruction/context, and action token.", |
| "Requires an explicit action target; current human egocentric labels are not robot controls by default." |
| ], |
| "current_decision": "candidate_after_action_space_design", |
| "entry_condition": "Window-to-action-token conversion is implemented and audited.", |
| "public_source": "https://openvla.github.io/" |
| }, |
| { |
| "priority": 5, |
| "family": "openpi pi0/pi0.5", |
| "category": "robot_policy_model", |
| "openness": "open_source_policy_training_stack", |
| "best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.", |
| "xperience10m_fit": [ |
| "Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.", |
| "Better suited to the policy branch than to the current structured-task JSON outputs." |
| ], |
| "current_decision": "candidate_policy_branch", |
| "entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.", |
| "public_source": "https://github.com/Physical-Intelligence/openpi" |
| }, |
| { |
| "priority": 6, |
| "family": "Gemini Robotics", |
| "category": "closed_embodied_reasoning_reference", |
| "openness": "closed_or_limited_access", |
| "best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.", |
| "xperience10m_fit": [ |
| "Can help reason over egocentric scenes and task descriptions.", |
| "Not a local fine-tune target for this repo." |
| ], |
| "current_decision": "external_reference_only", |
| "entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.", |
| "public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" |
| }, |
| { |
| "priority": 7, |
| "family": "Octo / SmolVLA-style lightweight policies", |
| "category": "lightweight_robot_policy_baselines", |
| "openness": "open_projects", |
| "best_role": "Cheaper policy baselines for observation-to-action experiments.", |
| "xperience10m_fit": [ |
| "Useful after action target design.", |
| "Less directly omni-modal than Qwen3-Omni or Cosmos 3." |
| ], |
| "current_decision": "optional_baseline_after_data_staging", |
| "entry_condition": "Action labels and baseline protocol exist.", |
| "public_source": "https://github.com/huggingface/lerobot" |
| } |
| ], |
| "execution_order": [ |
| { |
| "step": 1, |
| "name": "Data gate", |
| "action": "Stage at least 32 valid Xperience-10M episodes with a held-out episode split." |
| }, |
| { |
| "step": 2, |
| "name": "First held-out baseline", |
| "action": "Run Qwen3-Omni LoRA to establish the full train/eval loop." |
| }, |
| { |
| "step": 3, |
| "name": "Model-selection dry run", |
| "action": "Run 3-8 episode dry runs for Qwen3-Omni prompt/LoRA, Cosmos 3 preprocessing, and one policy candidate." |
| }, |
| { |
| "step": 4, |
| "name": "World-model branch", |
| "action": "Promote Cosmos 3 if future-window/action-conditioned preprocessing fits within the available storage and compute." |
| }, |
| { |
| "step": 5, |
| "name": "Policy branch", |
| "action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable." |
| }, |
| { |
| "step": 6, |
| "name": "Publication rule", |
| "action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples." |
| } |
| ], |
| "evaluation_additions": [ |
| { |
| "target": "structured_task_prediction", |
| "metrics": [ |
| "JSON validity", |
| "macro-F1", |
| "accuracy", |
| "micro-F1" |
| ], |
| "model_families": [ |
| "Qwen3-Omni", |
| "Gemini Robotics reference" |
| ] |
| }, |
| { |
| "target": "future_state_prediction", |
| "metrics": [ |
| "retrieval rank", |
| "temporal consistency", |
| "feature reconstruction", |
| "qualitative visual inspection" |
| ], |
| "model_families": [ |
| "Cosmos 3" |
| ] |
| }, |
| { |
| "target": "action_conditioned_dynamics", |
| "metrics": [ |
| "transition accuracy", |
| "contact accuracy", |
| "next-action accuracy" |
| ], |
| "model_families": [ |
| "Cosmos 3", |
| "OpenVLA", |
| "openpi", |
| "GR00T" |
| ] |
| }, |
| { |
| "target": "cross_episode_generalization", |
| "metrics": [ |
| "held-out episode metrics", |
| "held-out session metrics", |
| "leakage audit" |
| ], |
| "model_families": [ |
| "all trainable branches" |
| ] |
| } |
| ], |
| "source_links": [ |
| { |
| "label": "Qwen3-Omni official HF model", |
| "url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" |
| }, |
| { |
| "label": "NVIDIA Cosmos", |
| "url": "https://www.nvidia.com/en-us/ai/cosmos/" |
| }, |
| { |
| "label": "NVIDIA Isaac GR00T", |
| "url": "https://developer.nvidia.com/isaac/gr00t" |
| }, |
| { |
| "label": "OpenVLA", |
| "url": "https://openvla.github.io/" |
| }, |
| { |
| "label": "openpi", |
| "url": "https://github.com/Physical-Intelligence/openpi" |
| }, |
| { |
| "label": "Gemini Robotics", |
| "url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" |
| }, |
| { |
| "label": "Octo", |
| "url": "https://octo-models.github.io/" |
| }, |
| { |
| "label": "LeRobot / SmolVLA", |
| "url": "https://github.com/huggingface/lerobot" |
| } |
| ] |
| } |
|
|