{
"model_type": "cosmos-policy",
"architecture": "diffusion-transformer",
"base_model": "nvidia/Cosmos-Predict2-2B-Video2World",
"num_parameters": "2B",
"input_spec": {
"text": {
"type": "string",
"description": "Natural language task description"
},
"images": {
"format": "RGB",
"resolution": [224, 224],
"views": ["agentview_left", "agentview_right", "eye_in_hand"]
},
"proprioception": {
"dim": 9,
"components": ["gripper_joints", "end_effector_position", "quaternion"]
}
},
"output_spec": {
"actions": {
"dim": 7,
"horizon": 32,
"execution_horizon": 16,
"components": ["end_effector_6dof", "gripper"]
},
"future_proprioception": {
"dim": 9
},
"future_images": {
"resolution": [224, 224],
"views": 3
},
"value": {
"dim": 1
}
},
"diffusion_config": {
"denoising_steps": 5,
"sigma_min": 4.0,
"sigma_max": 80.0,
"generation_mode": "parallel"
},
"training": {
"dataset": "RoboCasa-Cosmos-Policy",
"gradient_steps": 45000,
"batch_size": 800,
"hardware": "32x H100",
"action_chunk_size": 32,
"num_tasks": 24,
"demos_per_task": 50
},
"benchmark_results": {
"robocasa_average": 0.671
},
"inference": {
"precision": "bf16",
"vram_gb": 8.9
},
"simulation_environment": "RoboCasa"
}