{
  "model_type": "cosmos-policy-planning",
  "architecture": "diffusion-transformer",
  "base_model": "nvidia/Cosmos-Policy-ALOHA-Predict2-2B",
  "num_parameters": "2B",
  "input_spec": {
    "text": {
      "type": "string",
      "description": "Natural language task description"
    },
    "images": {
      "format": "RGB",
      "resolution": [224, 224],
      "views": ["top_down", "left_wrist", "right_wrist"]
    },
    "proprioception": {
      "dim": 14,
      "components": ["left_arm_joints", "right_arm_joints"],
      "joints_per_arm": 7
    },
    "actions": {
      "dim": 14,
      "horizon": 50,
      "description": "Candidate action sequence to evaluate"
    }
  },
  "output_spec": {
    "future_proprioception": {
      "dim": 14
    },
    "future_images": {
      "resolution": [224, 224],
      "views": 3
    },
    "value": {
      "dim": 1,
      "description": "Expected cumulative reward for action sequence"
    }
  },
  "diffusion_config": {
    "denoising_steps": 10,
    "sigma_min": 4.0,
    "sigma_max": 80.0
  },
  "planning_config": {
    "ensemble_world_model_queries": 3,
    "ensemble_value_queries": 5,
    "total_predictions_per_action": 15,
    "best_of_n_search": 8
  },
  "training": {
    "dataset": "ALOHA policy rollouts",
    "num_episodes": 648,
    "hardware": "8x H100",
    "batch_split": {
      "policy": 0.1,
      "world_model": 0.45,
      "value_function": 0.45
    }
  },
  "benchmark_results": {
    "put_candies_in_bowl": 0.60,
    "put_candy_in_ziploc_bag": 0.84,
    "average": 0.72,
    "improvement_over_base": 0.125
  },
  "inference": {
    "precision": "bf16",
    "latency_seconds": 4.9,
    "recommended_gpus": 8
  },
  "robot_platform": "ALOHA 2 (ViperX 300 S dual arms)",
  "control_frequency_hz": 25
}