{ "model_type": "cosmos-policy", "architecture": "diffusion-transformer", "base_model": "nvidia/Cosmos-Predict2-2B-Video2World", "num_parameters": "2B", "input_spec": { "text": { "type": "string", "description": "Natural language task description" }, "images": { "format": "RGB", "resolution": [224, 224], "views": ["agentview_left", "agentview_right", "eye_in_hand"] }, "proprioception": { "dim": 9, "components": ["gripper_joints", "end_effector_position", "quaternion"] } }, "output_spec": { "actions": { "dim": 7, "horizon": 32, "execution_horizon": 16, "components": ["end_effector_6dof", "gripper"] }, "future_proprioception": { "dim": 9 }, "future_images": { "resolution": [224, 224], "views": 3 }, "value": { "dim": 1 } }, "diffusion_config": { "denoising_steps": 5, "sigma_min": 4.0, "sigma_max": 80.0, "generation_mode": "parallel" }, "training": { "dataset": "RoboCasa-Cosmos-Policy", "gradient_steps": 45000, "batch_size": 800, "hardware": "32x H100", "action_chunk_size": 32, "num_tasks": 24, "demos_per_task": 50 }, "benchmark_results": { "robocasa_average": 0.671 }, "inference": { "precision": "bf16", "vram_gb": 8.9 }, "simulation_environment": "RoboCasa" }