Publish Ropedia Xperience-10M task baseline cards

45c1706 verified about 5 hours ago

8.88 kB

	{
	"title": "Xperience-10M Foundation Model Plan",
	"status": "planning_artifact",
	"current_boundary": "No held-out multi-episode foundation-model result has been completed in this repo. The current foundation-model artifacts are setup-stage until enough valid episodes are staged and evaluated.",
	"decision": {
	"immediate_trainable_backbone": "Qwen3-Omni",
	"first_world_model_branch": "Cosmos 3",
	"first_policy_branch_candidates": [
	"OpenVLA / OpenVLA-OFT",
	"openpi pi0/pi0.5",
	"NVIDIA GR00T"
	],
	"external_reasoning_reference": "Gemini Robotics"
	},
	"model_families": [
	{
	"priority": 1,
	"family": "Qwen3-Omni",
	"category": "omni_instruction_model",
	"openness": "open_weights_available_from_official_hf_repo",
	"best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.",
	"xperience10m_fit": [
	"RGB/fisheye video, embedded audio, and language prompts can enter directly.",
	"Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.",
	"Matches current task outputs: labels, structured JSON, captions, and short decisions."
	],
	"current_decision": "keep_as_first_pilot",
	"entry_condition": "Selected episodes staged with held-out episode split.",
	"public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
	},
	{
	"priority": 2,
	"family": "Cosmos 3",
	"category": "world_foundation_model",
	"openness": "track_official_nvidia_release_and_available_weights",
	"best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.",
	"xperience10m_fit": [
	"Uses video streams as visual state.",
	"Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.",
	"Better aligned with prediction/generation objectives than simple label classification."
	],
	"current_decision": "add_as_first_world_model_branch_after_data_gate",
	"entry_condition": "Multi-episode data plus enough storage/compute for generated or latent video-state outputs.",
	"public_source": "https://www.nvidia.com/en-us/ai/cosmos/"
	},
	{
	"priority": 3,
	"family": "NVIDIA GR00T",
	"category": "humanoid_policy_foundation_model",
	"openness": "track_official_nvidia_release_and_tooling",
	"best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.",
	"xperience10m_fit": [
	"Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.",
	"Egocentric video plus human motion can support affordance and interaction tasks."
	],
	"current_decision": "track_as_humanoid_policy_branch",
	"entry_condition": "Retargeting artifact and action-space definition exist.",
	"public_source": "https://developer.nvidia.com/isaac/gr00t"
	},
	{
	"priority": 4,
	"family": "OpenVLA / OpenVLA-OFT",
	"category": "vision_language_action_policy",
	"openness": "open_project_and_weights",
	"best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.",
	"xperience10m_fit": [
	"Good candidate when each window is expressed as visual observation, instruction/context, and action token.",
	"Requires an explicit action target; current human egocentric labels are not robot controls by default."
	],
	"current_decision": "candidate_after_action_space_design",
	"entry_condition": "Window-to-action-token conversion is implemented and audited.",
	"public_source": "https://openvla.github.io/"
	},
	{
	"priority": 5,
	"family": "openpi pi0/pi0.5",
	"category": "robot_policy_model",
	"openness": "open_source_policy_training_stack",
	"best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.",
	"xperience10m_fit": [
	"Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.",
	"Better for policy branch than for current structured task JSON outputs."
	],
	"current_decision": "candidate_policy_branch",
	"entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.",
	"public_source": "https://github.com/Physical-Intelligence/openpi"
	},
	{
	"priority": 6,
	"family": "Gemini Robotics",
	"category": "closed_embodied_reasoning_reference",
	"openness": "closed_or_limited_access",
	"best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.",
	"xperience10m_fit": [
	"Can help reason over egocentric scenes and task descriptions.",
	"Not a local fine-tune target for this repo."
	],
	"current_decision": "external_reference_only",
	"entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.",
	"public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
	},
	{
	"priority": 7,
	"family": "Octo / SmolVLA-style lightweight policies",
	"category": "lightweight_robot_policy_baselines",
	"openness": "open_projects",
	"best_role": "Cheaper policy baselines for observation-to-action experiments.",
	"xperience10m_fit": [
	"Useful after action target design.",
	"Less directly omni-modal than Qwen3-Omni or Cosmos 3."
	],
	"current_decision": "optional_baseline_after_data_staging",
	"entry_condition": "Action labels and baseline protocol exist.",
	"public_source": "https://github.com/huggingface/lerobot"
	}
	],
	"execution_order": [
	{
	"step": 1,
	"name": "Data gate",
	"action": "Stage at least 32 valid Xperience-10M episodes with held-out episode split."
	},
	{
	"step": 2,
	"name": "First held-out baseline",
	"action": "Run Qwen3-Omni LoRA to establish the full train/eval loop."
	},
	{
	"step": 3,
	"name": "Model-selection dry run",
	"action": "Run 3-8 episode dry runs for Qwen3-Omni prompt/LoRA, Cosmos 3 preprocessing, and one policy candidate."
	},
	{
	"step": 4,
	"name": "World-model branch",
	"action": "Promote Cosmos 3 if future-window/action-conditioned preprocessing fits storage and compute."
	},
	{
	"step": 5,
	"name": "Policy branch",
	"action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable."
	},
	{
	"step": 6,
	"name": "Publication rule",
	"action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples."
	}
	],
	"evaluation_additions": [
	{
	"target": "structured_task_prediction",
	"metrics": [
	"JSON validity",
	"macro-F1",
	"accuracy",
	"micro-F1"
	],
	"model_families": [
	"Qwen3-Omni",
	"Gemini Robotics reference"
	]
	},
	{
	"target": "future_state_prediction",
	"metrics": [
	"retrieval rank",
	"temporal consistency",
	"feature reconstruction",
	"qualitative visual inspection"
	],
	"model_families": [
	"Cosmos 3"
	]
	},
	{
	"target": "action_conditioned_dynamics",
	"metrics": [
	"transition accuracy",
	"contact accuracy",
	"next-action accuracy"
	],
	"model_families": [
	"Cosmos 3",
	"OpenVLA",
	"openpi",
	"GR00T"
	]
	},
	{
	"target": "cross_episode_generalization",
	"metrics": [
	"held-out episode metrics",
	"held-out session metrics",
	"leakage audit"
	],
	"model_families": [
	"all trainable branches"
	]
	}
	],
	"source_links": [
	{
	"label": "Qwen3-Omni official HF model",
	"url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
	},
	{
	"label": "NVIDIA Cosmos",
	"url": "https://www.nvidia.com/en-us/ai/cosmos/"
	},
	{
	"label": "NVIDIA Isaac GR00T",
	"url": "https://developer.nvidia.com/isaac/gr00t"
	},
	{
	"label": "OpenVLA",
	"url": "https://openvla.github.io/"
	},
	{
	"label": "openpi",
	"url": "https://github.com/Physical-Intelligence/openpi"
	},
	{
	"label": "Gemini Robotics",
	"url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
	},
	{
	"label": "Octo",
	"url": "https://octo-models.github.io/"
	},
	{
	"label": "LeRobot / SmolVLA",
	"url": "https://github.com/huggingface/lerobot"
	}
	]
	}