{
  "model_type": "cosmos-policy",
  "architecture": "diffusion-transformer",
  "base_model": "nvidia/Cosmos-Predict2-2B-Video2World",
  "num_parameters": "2B",
  "input_spec": {
    "text": {
      "type": "string",
      "description": "Natural language task description"
    },
    "images": {
      "format": "RGB",
      "resolution": [224, 224],
      "views": ["top_down", "left_wrist", "right_wrist"]
    },
    "proprioception": {
      "dim": 14,
      "components": ["left_arm_joints", "right_arm_joints"],
      "joints_per_arm": 7
    }
  },
  "output_spec": {
    "actions": {
      "dim": 14,
      "horizon": 50,
      "components": ["left_arm_6dof", "left_gripper", "right_arm_6dof", "right_gripper"],
      "control_frequency_hz": 25
    },
    "future_proprioception": {
      "dim": 14
    },
    "future_images": {
      "resolution": [224, 224],
      "views": 3
    },
    "value": {
      "dim": 1
    }
  },
  "diffusion_config": {
    "denoising_steps": 10,
    "sigma_min": 4.0,
    "sigma_max": 80.0,
    "generation_mode": "parallel"
  },
  "training": {
    "dataset": "ALOHA-Cosmos-Policy",
    "gradient_steps": 50000,
    "batch_size": 200,
    "hardware": "8x H100",
    "action_chunk_size": 50,
    "num_demonstrations": 185
  },
  "benchmark_results": {
    "put_x_on_plate": 1.0,
    "fold_shirt": 0.995,
    "put_candies_in_bowl": 0.896,
    "put_candy_in_ziploc_bag": 0.854,
    "average": 0.936
  },
  "inference": {
    "precision": "bf16",
    "vram_gb": 6.0
  },
  "robot_platform": "ALOHA 2 (ViperX 300 S dual arms)"
}