{
  "model_type": "cosmos-policy",
  "architecture": "diffusion-transformer",
  "base_model": "nvidia/Cosmos-Predict2-2B-Video2World",
  "num_parameters": "2B",
  "input_spec": {
    "text": {
      "type": "string",
      "description": "Natural language task description"
    },
    "images": {
      "format": "RGB",
      "resolution": [224, 224],
      "views": ["agentview", "eye_in_hand"]
    },
    "proprioception": {
      "dim": 9,
      "components": ["gripper_joints", "end_effector_position", "quaternion"]
    }
  },
  "output_spec": {
    "actions": {
      "dim": 7,
      "horizon": 16,
      "components": ["end_effector_6dof", "gripper"]
    },
    "future_proprioception": {
      "dim": 9
    },
    "future_images": {
      "resolution": [224, 224]
    },
    "value": {
      "dim": 1
    }
  },
  "diffusion_config": {
    "denoising_steps": 5,
    "sigma_min": 4.0,
    "sigma_max": 80.0,
    "generation_mode": "parallel"
  },
  "training": {
    "dataset": "LIBERO-Cosmos-Policy",
    "gradient_steps": 40000,
    "batch_size": 1920,
    "hardware": "64x H100",
    "action_chunk_size": 16
  },
  "benchmark_results": {
    "libero_spatial": 0.981,
    "libero_object": 1.0,
    "libero_goal": 0.982,
    "libero_long": 0.976,
    "average": 0.985
  },
  "inference": {
    "precision": "bf16",
    "vram_gb": 6.8
  }
}