{
  "model_type": "cosmos-policy",
  "architecture": "diffusion-transformer",
  "base_model": "nvidia/Cosmos-Predict2-2B-Video2World",
  "num_parameters": "2B",
  "input_spec": {
    "text": {
      "type": "string",
      "description": "Natural language task description"
    },
    "images": {
      "format": "RGB",
      "resolution": [224, 224],
      "views": ["agentview_left", "agentview_right", "eye_in_hand"]
    },
    "proprioception": {
      "dim": 9,
      "components": ["gripper_joints", "end_effector_position", "quaternion"]
    }
  },
  "output_spec": {
    "actions": {
      "dim": 7,
      "horizon": 32,
      "execution_horizon": 16,
      "components": ["end_effector_6dof", "gripper"]
    },
    "future_proprioception": {
      "dim": 9
    },
    "future_images": {
      "resolution": [224, 224],
      "views": 3
    },
    "value": {
      "dim": 1
    }
  },
  "diffusion_config": {
    "denoising_steps": 5,
    "sigma_min": 4.0,
    "sigma_max": 80.0,
    "generation_mode": "parallel"
  },
  "training": {
    "dataset": "RoboCasa-Cosmos-Policy",
    "gradient_steps": 45000,
    "batch_size": 800,
    "hardware": "32x H100",
    "action_chunk_size": 32,
    "num_tasks": 24,
    "demos_per_task": 50
  },
  "benchmark_results": {
    "robocasa_average": 0.671
  },
  "inference": {
    "precision": "bf16",
    "vram_gb": 8.9
  },
  "simulation_environment": "RoboCasa"
}