{
  "model_type": "s1-omni-image",
  "architectures": [
    "S1OmniImageModel"
  ],
  "prefixes": {
    "qwen3_vl": "qwen3_vl.",
    "transformer": "transformer.",
    "vae": "vae.",
    "alignment_mlp": "alignment_mlp."
  },
  "qwen3_vl_config": {
    "architectures": [
      "Qwen3VLForConditionalGeneration"
    ],
    "dtype": "bfloat16",
    "eos_token_id": 151645,
    "hidden_size": 4096,
    "image_token_id": 151655,
    "model_type": "qwen3_vl",
    "pad_token_id": 151643,
    "text_config": {
      "attention_bias": false,
      "attention_dropout": 0.0,
      "bos_token_id": 151643,
      "dtype": "bfloat16",
      "eos_token_id": 151645,
      "head_dim": 128,
      "hidden_act": "silu",
      "hidden_size": 4096,
      "initializer_range": 0.02,
      "intermediate_size": 12288,
      "max_position_embeddings": 262144,
      "model_type": "qwen3_vl_text",
      "num_attention_heads": 32,
      "num_hidden_layers": 36,
      "num_key_value_heads": 8,
      "pad_token_id": 151643,
      "rms_norm_eps": 1e-06,
      "rope_scaling": {
        "mrope_interleaved": true,
        "mrope_section": [
          24,
          20,
          20
        ],
        "rope_type": "default"
      },
      "rope_theta": 5000000,
      "use_cache": false,
      "vocab_size": 151936
    },
    "tie_word_embeddings": false,
    "transformers_version": "4.57.6",
    "video_token_id": 151656,
    "vision_config": {
      "deepstack_visual_indexes": [
        8,
        16,
        24
      ],
      "depth": 27,
      "dtype": "bfloat16",
      "hidden_act": "gelu_pytorch_tanh",
      "hidden_size": 1152,
      "in_channels": 3,
      "initializer_range": 0.02,
      "intermediate_size": 4304,
      "model_type": "qwen3_vl",
      "num_heads": 16,
      "num_position_embeddings": 2304,
      "out_hidden_size": 4096,
      "pad_token_id": 151643,
      "patch_size": 16,
      "spatial_merge_size": 2,
      "temporal_patch_size": 2
    },
    "vision_end_token_id": 151653,
    "vision_start_token_id": 151652
  },
  "transformer_config": {
    "_class_name": "QwenImageTransformer2DModel",
    "_diffusers_version": "0.36.0.dev0",
    "attention_head_dim": 128,
    "axes_dims_rope": [
      16,
      56,
      56
    ],
    "guidance_embeds": false,
    "in_channels": 64,
    "joint_attention_dim": 3584,
    "num_attention_heads": 24,
    "num_layers": 60,
    "out_channels": 16,
    "patch_size": 2,
    "zero_cond_t": true
  },
  "vae_config": {
    "_class_name": "AutoencoderKLQwenImage",
    "_diffusers_version": "0.36.0.dev0",
    "attn_scales": [],
    "base_dim": 96,
    "dim_mult": [
      1,
      2,
      4,
      4
    ],
    "dropout": 0.0,
    "latents_mean": [
      -0.7571,
      -0.7089,
      -0.9113,
      0.1075,
      -0.1745,
      0.9653,
      -0.1517,
      1.5508,
      0.4134,
      -0.0715,
      0.5517,
      -0.3632,
      -0.1922,
      -0.9497,
      0.2503,
      -0.2921
    ],
    "latents_std": [
      2.8184,
      1.4541,
      2.3275,
      2.6558,
      1.2196,
      1.7708,
      2.6052,
      2.0743,
      3.2687,
      2.1526,
      2.8652,
      1.5579,
      1.6382,
      1.1253,
      2.8251,
      1.916
    ],
    "num_res_blocks": 2,
    "temperal_downsample": [
      false,
      true,
      true
    ],
    "z_dim": 16
  },
  "scheduler_config": {
    "_class_name": "FlowMatchEulerDiscreteScheduler",
    "_diffusers_version": "0.36.0.dev0",
    "base_image_seq_len": 256,
    "base_shift": 0.5,
    "invert_sigmas": false,
    "max_image_seq_len": 8192,
    "max_shift": 0.9,
    "num_train_timesteps": 1000,
    "shift": 1.0,
    "shift_terminal": 0.02,
    "stochastic_sampling": false,
    "time_shift_type": "exponential",
    "use_beta_sigmas": false,
    "use_dynamic_shifting": true,
    "use_exponential_sigmas": false,
    "use_karras_sigmas": false
  },
  "alignment_mlp": {
    "input_dim": 4096,
    "output_dim": 3584,
    "hidden_dim": 4096,
    "type": "mlp"
  },
  "special_tokens": {
    "think_start": "<think>",
    "think_end": "</think>",
    "image_gen_start": "<image_gen>",
    "image_gen_end": "</image_gen>",
    "image_edit_start": "<image_edit>",
    "image_edit_end": "</image_edit>"
  },
  "generation": {
    "max_new_tokens": 2048,
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": true
  },
  "image_generation": {
    "default_height": 1024,
    "default_width": 1024,
    "num_inference_steps": 50,
    "guidance_scale": 1.0,
    "true_cfg_scale": 4.0
  }
}