Cosmos
Diffusers
Safetensors
cosmos3_omni
nvidia
cosmos3
vllm
vllm-omni
text, image, video, audio, and action generation
omnimodel
Instructions to use nvidia/Cosmos3-Nano with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use nvidia/Cosmos3-Nano with Cosmos:
```
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
```
- Diffusers
How to use nvidia/Cosmos3-Nano with Diffusers:
pip install -U diffusers transformers accelerate
```python
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("nvidia/Cosmos3-Nano", dtype=torch.bfloat16, device_map="cuda")

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "allow_patterns_overrides": [ | |
| "*/*.safetensors" | |
| ], | |
| "architectures": [ | |
| "Cosmos3ForConditionalGeneration" | |
| ], | |
| "image_token_id": 151655, | |
| "model": { | |
| "_recursive_": false, | |
| "_target": "omni_mot_model", | |
| "config": { | |
| "_type": "omni_mot_model_config", | |
| "action_gen": true, | |
| "activation_checkpointing": { | |
| "_type": "activation_checkpointing_config", | |
| "determinism_check": "default", | |
| "mode": "selective", | |
| "preserve_rng_state": true, | |
| "save_ops_regex": [ | |
| "fmha" | |
| ] | |
| }, | |
| "causal_training_strategy": "none", | |
| "diffusion_expert_config": { | |
| "_type": "diffusion_expert_config", | |
| "base_fps": 24, | |
| "enable_fps_modulation": true, | |
| "load_weights_from_pretrained": false, | |
| "max_vae_latent_side_after_patchify": 20, | |
| "patch_spatial": 2, | |
| "position_embedding_type": "unified_3d_mrope", | |
| "rope_h_extrapolation_ratio": 1.0, | |
| "rope_t_extrapolation_ratio": 1.0, | |
| "rope_w_extrapolation_ratio": 1.0, | |
| "timestep_range": 1.0, | |
| "unified_3d_mrope_reset_spatial_ids": true, | |
| "unified_3d_mrope_temporal_modality_margin": 15000 | |
| }, | |
| "ema": { | |
| "_type": "ema_config", | |
| "enabled": false, | |
| "iteration_shift": 0, | |
| "rate": 0.1 | |
| }, | |
| "fixed_step_sampler_config": null, | |
| "input_caption_key": "ai_caption", | |
| "input_image_key": "images", | |
| "input_video_key": "video", | |
| "joint_attn_implementation": "two_way", | |
| "latent_downsample_factor": 16, | |
| "lbl": { | |
| "_type": "lbl_config", | |
| "coeff_gen": null, | |
| "coeff_und": null, | |
| "method": "local" | |
| }, | |
| "log_enc_time_every_n": 100, | |
| "lora_alpha": 32, | |
| "lora_enabled": false, | |
| "lora_rank": 16, | |
| "lora_target_modules": "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen", | |
| "max_action_dim": 64, | |
| "max_num_tokens_after_packing": 74000, | |
| "natten_parameter_list": null, | |
| "net": null, | |
| "num_embodiment_domains": 32, | |
| "parallelism": { | |
| "_type": "parallelism_config", | |
| "cfg_parallel_shard_degree": 1, | |
| "compile_dynamic": true, | |
| "compiled_region": "language", | |
| "context_parallel_shard_degree": 1, | |
| "coordinate_descent_tuning": false, | |
| "data_parallel_replicate_degree": 1, | |
| "data_parallel_shard_degree": 8, | |
| "enable_inference_mode": false, | |
| "max_autotune_pointwise": false, | |
| "precision": "bfloat16", | |
| "use_cuda_graphs": false, | |
| "use_torch_compile": true | |
| }, | |
| "rectified_flow_inference_config": { | |
| "_type": "rectified_flow_inference_config", | |
| "num_train_timesteps": 1000, | |
| "scheduler_type": "unipc", | |
| "shift": 1, | |
| "use_dynamic_shifting": false | |
| }, | |
| "rectified_flow_training_config": { | |
| "_type": "rectified_flow_training_config", | |
| "action_loss_weight": 10.0, | |
| "high_sigma_ratio": 0.05, | |
| "high_sigma_timesteps_max": 1000, | |
| "high_sigma_timesteps_min": 995, | |
| "image_loss_scale": null, | |
| "independent_action_schedule": false, | |
| "independent_sound_schedule": false, | |
| "loss_scale": 10.0, | |
| "normalize_loss_by_active": false, | |
| "shift": { | |
| "256": 3, | |
| "480": 5, | |
| "720": 10 | |
| }, | |
| "shift_action": null, | |
| "shift_sound": null, | |
| "sound_loss_scale": 2.0, | |
| "train_time_action_distribution": "logitnormal", | |
| "train_time_image_distribution": "logitnormal", | |
| "train_time_sound_distribution": "logitnormal", | |
| "train_time_video_distribution": "waver", | |
| "train_time_weight": "uniform", | |
| "use_discrete_rf": false, | |
| "use_dynamic_shift": false, | |
| "use_high_sigma_strategy": false, | |
| "use_high_sigma_strategy_action": false, | |
| "use_high_sigma_strategy_sound": false | |
| }, | |
| "resolution": "720", | |
| "sound_dim": 64, | |
| "sound_gen": true, | |
| "sound_latent_fps": 25, | |
| "sound_tokenizer": { | |
| "_target": "avae_interface", | |
| "audio_channels": 2, | |
| "avae_config_path": "", | |
| "avae_path": "pretrained/tokenizers/audio/avae/avae_48k_noncausal_25hz_64ch.ckpt", | |
| "bucket_name": "bucket", | |
| "hop_size": 1920, | |
| "io_channels": 64, | |
| "latent_mean": null, | |
| "latent_std": null, | |
| "normalization_type": "none", | |
| "normalize_latents": false, | |
| "object_store_credential_path_pretrained": "credentials/gcp_training.secret", | |
| "sample_rate": 48000, | |
| "tanh_clamp": 0.995, | |
| "tanh_input_scale": 1.5, | |
| "tanh_output_scale": 3.5 | |
| }, | |
| "state_ch": 48, | |
| "state_t": 300, | |
| "tokenizer": { | |
| "_target": "wan2pt2_vae_interface", | |
| "bucket_name": "bucket", | |
| "chunk_duration": 93, | |
| "encode_bucket_multiple": null, | |
| "encode_chunk_frames": { | |
| "256": 68, | |
| "480": 24, | |
| "720": 12 | |
| }, | |
| "encode_exact_durations": [ | |
| 17, | |
| 61, | |
| 73 | |
| ], | |
| "keep_decoder_cache": false, | |
| "object_store_credential_path_pretrained": "credentials/gcp_training.secret", | |
| "spatial_compression_factor": 16, | |
| "temporal_compression_factor": 4, | |
| "temporal_window": null, | |
| "use_streaming_encode": false, | |
| "vae_path": "pretrained/tokenizers/video/wan2pt2/Wan2.2_VAE.pth" | |
| }, | |
| "video_temporal_causal": false, | |
| "vision_gen": true, | |
| "vlm_config": { | |
| "_type": "vlm_config", | |
| "layer_module": null, | |
| "model_instance": { | |
| "_target": "qwen3_vl_text_for_causal_lm", | |
| "config": { | |
| "_target": "create_vlm_config", | |
| "base_config": { | |
| "_target": "qwen3_vl_mot_config_from_json_file", | |
| "json_file": "cosmos3://vfm/models/vlm/qwen3_vl/configs/Qwen3-VL-8B-Instruct.json" | |
| }, | |
| "qk_norm_for_text": true | |
| } | |
| }, | |
| "model_name": "nvidia/Cosmos3-Nano-Reasoner", | |
| "pretrained_weights": { | |
| "_type": "pretrained_weights_config", | |
| "backbone_path": "s3://bucket/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Nano-Reasoner-bb9c6f5/", | |
| "checkpoint_format": null, | |
| "credentials_path": "credentials/gcp_checkpoint.secret", | |
| "enable_gcs_patch_in_boto3": true, | |
| "enabled": false | |
| }, | |
| "qk_norm": false, | |
| "tie_word_embeddings": false, | |
| "tokenizer": { | |
| "_target": "create_qwen2_tokenizer_with_download", | |
| "config_variant": "gcp", | |
| "pretrained_model_name": "Qwen/Qwen3-VL-8B-Instruct" | |
| }, | |
| "use_system_prompt": false | |
| } | |
| } | |
| }, | |
| "model_type": "cosmos3_omni", | |
| "text_config": { | |
| "attention_bias": false, | |
| "attention_dropout": 0.0, | |
| "bos_token_id": 151643, | |
| "dtype": "bfloat16", | |
| "eos_token_id": 151645, | |
| "head_dim": 128, | |
| "hidden_act": "silu", | |
| "hidden_size": 4096, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 12288, | |
| "max_position_embeddings": 262144, | |
| "model_type": "qwen3_vl_text", | |
| "num_attention_heads": 32, | |
| "num_hidden_layers": 36, | |
| "num_key_value_heads": 8, | |
| "rms_norm_eps": 1e-06, | |
| "rope_scaling": { | |
| "mrope_interleaved": true, | |
| "mrope_section": [ | |
| 24, | |
| 20, | |
| 20 | |
| ], | |
| "rope_type": "default" | |
| }, | |
| "rope_theta": 5000000, | |
| "use_cache": true, | |
| "vocab_size": 151936 | |
| }, | |
| "tie_word_embeddings": false, | |
| "transformers_version": "4.57.0.dev0", | |
| "video_token_id": 151656, | |
| "vision_config": { | |
| "deepstack_visual_indexes": [ | |
| 8, | |
| 16, | |
| 24 | |
| ], | |
| "depth": 27, | |
| "hidden_act": "gelu_pytorch_tanh", | |
| "hidden_size": 1152, | |
| "in_channels": 3, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 4304, | |
| "model_type": "qwen3_vl", | |
| "num_heads": 16, | |
| "num_position_embeddings": 2304, | |
| "out_hidden_size": 4096, | |
| "patch_size": 16, | |
| "spatial_merge_size": 2, | |
| "temporal_patch_size": 2 | |
| }, | |
| "vision_end_token_id": 151653, | |
| "vision_start_token_id": 151652 | |
| } | |