Super-squash branch 'main' using huggingface_hub

138d071 1 day ago

8.17 kB

	{
	"allow_patterns_overrides": [
	"/.safetensors"
	],
	"architectures": [
	"Cosmos3ForConditionalGeneration"
	],
	"image_token_id": 151655,
	"model": {
	"_recursive_": false,
	"_target": "omni_mot_model",
	"config": {
	"_type": "omni_mot_model_config",
	"action_gen": true,
	"activation_checkpointing": {
	"_type": "activation_checkpointing_config",
	"determinism_check": "default",
	"mode": "selective",
	"preserve_rng_state": true,
	"save_ops_regex": [
	"fmha"
	]
	},
	"causal_training_strategy": "none",
	"diffusion_expert_config": {
	"_type": "diffusion_expert_config",
	"base_fps": 24,
	"enable_fps_modulation": true,
	"load_weights_from_pretrained": false,
	"max_vae_latent_side_after_patchify": 20,
	"patch_spatial": 2,
	"position_embedding_type": "unified_3d_mrope",
	"rope_h_extrapolation_ratio": 1.0,
	"rope_t_extrapolation_ratio": 1.0,
	"rope_w_extrapolation_ratio": 1.0,
	"timestep_range": 1.0,
	"unified_3d_mrope_reset_spatial_ids": true,
	"unified_3d_mrope_temporal_modality_margin": 15000
	},
	"ema": {
	"_type": "ema_config",
	"enabled": false,
	"iteration_shift": 0,
	"rate": 0.1
	},
	"fixed_step_sampler_config": null,
	"input_caption_key": "ai_caption",
	"input_image_key": "images",
	"input_video_key": "video",
	"joint_attn_implementation": "two_way",
	"latent_downsample_factor": 16,
	"lbl": {
	"_type": "lbl_config",
	"coeff_gen": null,
	"coeff_und": null,
	"method": "local"
	},
	"log_enc_time_every_n": 100,
	"lora_alpha": 32,
	"lora_enabled": false,
	"lora_rank": 16,
	"lora_target_modules": "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen",
	"max_action_dim": 64,
	"max_num_tokens_after_packing": 74000,
	"natten_parameter_list": null,
	"net": null,
	"num_embodiment_domains": 32,
	"parallelism": {
	"_type": "parallelism_config",
	"cfg_parallel_shard_degree": 1,
	"compile_dynamic": true,
	"compiled_region": "language",
	"context_parallel_shard_degree": 1,
	"coordinate_descent_tuning": false,
	"data_parallel_replicate_degree": 1,
	"data_parallel_shard_degree": 8,
	"enable_inference_mode": false,
	"max_autotune_pointwise": false,
	"precision": "bfloat16",
	"use_cuda_graphs": false,
	"use_torch_compile": true
	},
	"rectified_flow_inference_config": {
	"_type": "rectified_flow_inference_config",
	"num_train_timesteps": 1000,
	"scheduler_type": "unipc",
	"shift": 1,
	"use_dynamic_shifting": false
	},
	"rectified_flow_training_config": {
	"_type": "rectified_flow_training_config",
	"action_loss_weight": 10.0,
	"high_sigma_ratio": 0.05,
	"high_sigma_timesteps_max": 1000,
	"high_sigma_timesteps_min": 995,
	"image_loss_scale": null,
	"independent_action_schedule": false,
	"independent_sound_schedule": false,
	"loss_scale": 10.0,
	"normalize_loss_by_active": false,
	"shift": {
	"256": 3,
	"480": 5,
	"720": 10
	},
	"shift_action": null,
	"shift_sound": null,
	"sound_loss_scale": 2.0,
	"train_time_action_distribution": "logitnormal",
	"train_time_image_distribution": "logitnormal",
	"train_time_sound_distribution": "logitnormal",
	"train_time_video_distribution": "waver",
	"train_time_weight": "uniform",
	"use_discrete_rf": false,
	"use_dynamic_shift": false,
	"use_high_sigma_strategy": false,
	"use_high_sigma_strategy_action": false,
	"use_high_sigma_strategy_sound": false
	},
	"resolution": "720",
	"sound_dim": 64,
	"sound_gen": true,
	"sound_latent_fps": 25,
	"sound_tokenizer": {
	"_target": "avae_interface",
	"audio_channels": 2,
	"avae_config_path": "",
	"avae_path": "pretrained/tokenizers/audio/avae/avae_48k_noncausal_25hz_64ch.ckpt",
	"bucket_name": "bucket",
	"hop_size": 1920,
	"io_channels": 64,
	"latent_mean": null,
	"latent_std": null,
	"normalization_type": "none",
	"normalize_latents": false,
	"object_store_credential_path_pretrained": "credentials/gcp_training.secret",
	"sample_rate": 48000,
	"tanh_clamp": 0.995,
	"tanh_input_scale": 1.5,
	"tanh_output_scale": 3.5
	},
	"state_ch": 48,
	"state_t": 300,
	"tokenizer": {
	"_target": "wan2pt2_vae_interface",
	"bucket_name": "bucket",
	"chunk_duration": 93,
	"encode_bucket_multiple": null,
	"encode_chunk_frames": {
	"256": 68,
	"480": 24,
	"720": 12
	},
	"encode_exact_durations": [
	17,
	61,
	73
	],
	"keep_decoder_cache": false,
	"object_store_credential_path_pretrained": "credentials/gcp_training.secret",
	"spatial_compression_factor": 16,
	"temporal_compression_factor": 4,
	"temporal_window": null,
	"use_streaming_encode": false,
	"vae_path": "pretrained/tokenizers/video/wan2pt2/Wan2.2_VAE.pth"
	},
	"video_temporal_causal": false,
	"vision_gen": true,
	"vlm_config": {
	"_type": "vlm_config",
	"layer_module": null,
	"model_instance": {
	"_target": "qwen3_vl_text_for_causal_lm",
	"config": {
	"_target": "create_vlm_config",
	"base_config": {
	"_target": "qwen3_vl_mot_config_from_json_file",
	"json_file": "cosmos3://vfm/models/vlm/qwen3_vl/configs/Qwen3-VL-8B-Instruct.json"
	},
	"qk_norm_for_text": true
	}
	},
	"model_name": "nvidia/Cosmos3-Nano-Reasoner",
	"pretrained_weights": {
	"_type": "pretrained_weights_config",
	"backbone_path": "s3://bucket/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Nano-Reasoner-bb9c6f5/",
	"checkpoint_format": null,
	"credentials_path": "credentials/gcp_checkpoint.secret",
	"enable_gcs_patch_in_boto3": true,
	"enabled": false
	},
	"qk_norm": false,
	"tie_word_embeddings": false,
	"tokenizer": {
	"_target": "create_qwen2_tokenizer_with_download",
	"config_variant": "gcp",
	"pretrained_model_name": "Qwen/Qwen3-VL-8B-Instruct"
	},
	"use_system_prompt": false
	}
	}
	},
	"model_type": "cosmos3_omni",
	"text_config": {
	"attention_bias": false,
	"attention_dropout": 0.0,
	"bos_token_id": 151643,
	"dtype": "bfloat16",
	"eos_token_id": 151645,
	"head_dim": 128,
	"hidden_act": "silu",
	"hidden_size": 4096,
	"initializer_range": 0.02,
	"intermediate_size": 12288,
	"max_position_embeddings": 262144,
	"model_type": "qwen3_vl_text",
	"num_attention_heads": 32,
	"num_hidden_layers": 36,
	"num_key_value_heads": 8,
	"rms_norm_eps": 1e-06,
	"rope_scaling": {
	"mrope_interleaved": true,
	"mrope_section": [
	24,
	20,
	20
	],
	"rope_type": "default"
	},
	"rope_theta": 5000000,
	"use_cache": true,
	"vocab_size": 151936
	},
	"tie_word_embeddings": false,
	"transformers_version": "4.57.0.dev0",
	"video_token_id": 151656,
	"vision_config": {
	"deepstack_visual_indexes": [
	8,
	16,
	24
	],
	"depth": 27,
	"hidden_act": "gelu_pytorch_tanh",
	"hidden_size": 1152,
	"in_channels": 3,
	"initializer_range": 0.02,
	"intermediate_size": 4304,
	"model_type": "qwen3_vl",
	"num_heads": 16,
	"num_position_embeddings": 2304,
	"out_hidden_size": 4096,
	"patch_size": 16,
	"spatial_merge_size": 2,
	"temporal_patch_size": 2
	},
	"vision_end_token_id": 151653,
	"vision_start_token_id": 151652
	}