{
"allow_patterns_overrides": [
"*/*.safetensors"
],
"architectures": [
"Cosmos3ForConditionalGeneration"
],
"image_token_id": 151655,
"model": {
"_recursive_": false,
"_target": "omni_mot_model",
"config": {
"_type": "omni_mot_model_config",
"action_gen": true,
"activation_checkpointing": {
"_type": "activation_checkpointing_config",
"determinism_check": "default",
"mode": "selective",
"preserve_rng_state": true,
"save_ops_regex": [
"fmha"
]
},
"causal_training_strategy": "none",
"diffusion_expert_config": {
"_type": "diffusion_expert_config",
"base_fps": 24,
"enable_fps_modulation": true,
"load_weights_from_pretrained": false,
"max_vae_latent_side_after_patchify": 20,
"patch_spatial": 2,
"position_embedding_type": "unified_3d_mrope",
"rope_h_extrapolation_ratio": 1.0,
"rope_t_extrapolation_ratio": 1.0,
"rope_w_extrapolation_ratio": 1.0,
"timestep_range": 1.0,
"unified_3d_mrope_reset_spatial_ids": true,
"unified_3d_mrope_temporal_modality_margin": 15000
},
"ema": {
"_type": "ema_config",
"enabled": false,
"iteration_shift": 0,
"rate": 0.1
},
"fixed_step_sampler_config": null,
"input_caption_key": "ai_caption",
"input_image_key": "images",
"input_video_key": "video",
"joint_attn_implementation": "two_way",
"latent_downsample_factor": 16,
"lbl": {
"_type": "lbl_config",
"coeff_gen": null,
"coeff_und": null,
"method": "local"
},
"log_enc_time_every_n": 100,
"lora_alpha": 32,
"lora_enabled": false,
"lora_rank": 16,
"lora_target_modules": "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen",
"max_action_dim": 64,
"max_num_tokens_after_packing": 74000,
"natten_parameter_list": null,
"net": null,
"num_embodiment_domains": 32,
"parallelism": {
"_type": "parallelism_config",
"cfg_parallel_shard_degree": 1,
"compile_dynamic": true,
"compiled_region": "language",
"context_parallel_shard_degree": 1,
"coordinate_descent_tuning": false,
"data_parallel_replicate_degree": 1,
"data_parallel_shard_degree": 8,
"enable_inference_mode": false,
"max_autotune_pointwise": false,
"precision": "bfloat16",
"use_cuda_graphs": false,
"use_torch_compile": true
},
"rectified_flow_inference_config": {
"_type": "rectified_flow_inference_config",
"num_train_timesteps": 1000,
"scheduler_type": "unipc",
"shift": 1,
"use_dynamic_shifting": false
},
"rectified_flow_training_config": {
"_type": "rectified_flow_training_config",
"action_loss_weight": 10.0,
"high_sigma_ratio": 0.05,
"high_sigma_timesteps_max": 1000,
"high_sigma_timesteps_min": 995,
"image_loss_scale": null,
"independent_action_schedule": false,
"independent_sound_schedule": false,
"loss_scale": 10.0,
"normalize_loss_by_active": false,
"shift": {
"256": 3,
"480": 5,
"720": 10
},
"shift_action": null,
"shift_sound": null,
"sound_loss_scale": 2.0,
"train_time_action_distribution": "logitnormal",
"train_time_image_distribution": "logitnormal",
"train_time_sound_distribution": "logitnormal",
"train_time_video_distribution": "waver",
"train_time_weight": "uniform",
"use_discrete_rf": false,
"use_dynamic_shift": false,
"use_high_sigma_strategy": false,
"use_high_sigma_strategy_action": false,
"use_high_sigma_strategy_sound": false
},
"resolution": "720",
"sound_dim": 64,
"sound_gen": true,
"sound_latent_fps": 25,
"sound_tokenizer": {
"_target": "avae_interface",
"audio_channels": 2,
"avae_config_path": "",
"avae_path": "pretrained/tokenizers/audio/avae/avae_48k_noncausal_25hz_64ch.ckpt",
"bucket_name": "bucket",
"hop_size": 1920,
"io_channels": 64,
"latent_mean": null,
"latent_std": null,
"normalization_type": "none",
"normalize_latents": false,
"object_store_credential_path_pretrained": "credentials/gcp_training.secret",
"sample_rate": 48000,
"tanh_clamp": 0.995,
"tanh_input_scale": 1.5,
"tanh_output_scale": 3.5
},
"state_ch": 48,
"state_t": 300,
"tokenizer": {
"_target": "wan2pt2_vae_interface",
"bucket_name": "bucket",
"chunk_duration": 93,
"encode_bucket_multiple": null,
"encode_chunk_frames": {
"256": 68,
"480": 24,
"720": 12
},
"encode_exact_durations": [
17,
61,
73
],
"keep_decoder_cache": false,
"object_store_credential_path_pretrained": "credentials/gcp_training.secret",
"spatial_compression_factor": 16,
"temporal_compression_factor": 4,
"temporal_window": null,
"use_streaming_encode": false,
"vae_path": "pretrained/tokenizers/video/wan2pt2/Wan2.2_VAE.pth"
},
"video_temporal_causal": false,
"vision_gen": true,
"vlm_config": {
"_type": "vlm_config",
"layer_module": null,
"model_instance": {
"_target": "qwen3_vl_text_for_causal_lm",
"config": {
"_target": "create_vlm_config",
"base_config": {
"_target": "qwen3_vl_mot_config_from_json_file",
"json_file": "cosmos3://vfm/models/vlm/qwen3_vl/configs/Qwen3-VL-8B-Instruct.json"
},
"qk_norm_for_text": true
}
},
"model_name": "nvidia/Cosmos3-Nano-Reasoner",
"pretrained_weights": {
"_type": "pretrained_weights_config",
"backbone_path": "s3://bucket/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Nano-Reasoner-bb9c6f5/",
"checkpoint_format": null,
"credentials_path": "credentials/gcp_checkpoint.secret",
"enable_gcs_patch_in_boto3": true,
"enabled": false
},
"qk_norm": false,
"tie_word_embeddings": false,
"tokenizer": {
"_target": "create_qwen2_tokenizer_with_download",
"config_variant": "gcp",
"pretrained_model_name": "Qwen/Qwen3-VL-8B-Instruct"
},
"use_system_prompt": false
}
}
},
"model_type": "cosmos3_omni",
"text_config": {
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"dtype": "bfloat16",
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 12288,
"max_position_embeddings": 262144,
"model_type": "qwen3_vl_text",
"num_attention_heads": 32,
"num_hidden_layers": 36,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"mrope_interleaved": true,
"mrope_section": [
24,
20,
20
],
"rope_type": "default"
},
"rope_theta": 5000000,
"use_cache": true,
"vocab_size": 151936
},
"tie_word_embeddings": false,
"transformers_version": "4.57.0.dev0",
"video_token_id": 151656,
"vision_config": {
"deepstack_visual_indexes": [
8,
16,
24
],
"depth": 27,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"in_channels": 3,
"initializer_range": 0.02,
"intermediate_size": 4304,
"model_type": "qwen3_vl",
"num_heads": 16,
"num_position_embeddings": 2304,
"out_hidden_size": 4096,
"patch_size": 16,
"spatial_merge_size": 2,
"temporal_patch_size": 2
},
"vision_end_token_id": 151653,
"vision_start_token_id": 151652
}