{ "model_type": "s1-omni-image", "architectures": [ "S1OmniImageModel" ], "prefixes": { "qwen3_vl": "qwen3_vl.", "transformer": "transformer.", "vae": "vae.", "alignment_mlp": "alignment_mlp." }, "qwen3_vl_config": { "architectures": [ "Qwen3VLForConditionalGeneration" ], "dtype": "bfloat16", "eos_token_id": 151645, "hidden_size": 4096, "image_token_id": 151655, "model_type": "qwen3_vl", "pad_token_id": 151643, "text_config": { "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 151643, "dtype": "bfloat16", "eos_token_id": 151645, "head_dim": 128, "hidden_act": "silu", "hidden_size": 4096, "initializer_range": 0.02, "intermediate_size": 12288, "max_position_embeddings": 262144, "model_type": "qwen3_vl_text", "num_attention_heads": 32, "num_hidden_layers": 36, "num_key_value_heads": 8, "pad_token_id": 151643, "rms_norm_eps": 1e-06, "rope_scaling": { "mrope_interleaved": true, "mrope_section": [ 24, 20, 20 ], "rope_type": "default" }, "rope_theta": 5000000, "use_cache": false, "vocab_size": 151936 }, "tie_word_embeddings": false, "transformers_version": "4.57.6", "video_token_id": 151656, "vision_config": { "deepstack_visual_indexes": [ 8, 16, 24 ], "depth": 27, "dtype": "bfloat16", "hidden_act": "gelu_pytorch_tanh", "hidden_size": 1152, "in_channels": 3, "initializer_range": 0.02, "intermediate_size": 4304, "model_type": "qwen3_vl", "num_heads": 16, "num_position_embeddings": 2304, "out_hidden_size": 4096, "pad_token_id": 151643, "patch_size": 16, "spatial_merge_size": 2, "temporal_patch_size": 2 }, "vision_end_token_id": 151653, "vision_start_token_id": 151652 }, "transformer_config": { "_class_name": "QwenImageTransformer2DModel", "_diffusers_version": "0.36.0.dev0", "attention_head_dim": 128, "axes_dims_rope": [ 16, 56, 56 ], "guidance_embeds": false, "in_channels": 64, "joint_attention_dim": 3584, "num_attention_heads": 24, "num_layers": 60, "out_channels": 16, "patch_size": 2, "zero_cond_t": true }, "vae_config": { "_class_name": 
"AutoencoderKLQwenImage", "_diffusers_version": "0.36.0.dev0", "attn_scales": [], "base_dim": 96, "dim_mult": [ 1, 2, 4, 4 ], "dropout": 0.0, "latents_mean": [ -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921 ], "latents_std": [ 2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.916 ], "num_res_blocks": 2, "temperal_downsample": [ false, true, true ], "z_dim": 16 }, "scheduler_config": { "_class_name": "FlowMatchEulerDiscreteScheduler", "_diffusers_version": "0.36.0.dev0", "base_image_seq_len": 256, "base_shift": 0.5, "invert_sigmas": false, "max_image_seq_len": 8192, "max_shift": 0.9, "num_train_timesteps": 1000, "shift": 1.0, "shift_terminal": 0.02, "stochastic_sampling": false, "time_shift_type": "exponential", "use_beta_sigmas": false, "use_dynamic_shifting": true, "use_exponential_sigmas": false, "use_karras_sigmas": false }, "alignment_mlp": { "input_dim": 4096, "output_dim": 3584, "hidden_dim": 4096, "type": "mlp" }, "special_tokens": { "think_start": "<think>", "think_end": "</think>", "image_gen_start": "<image_gen>", "image_gen_end": "</image_gen>", "image_edit_start": "<image_edit>", "image_edit_end": "</image_edit>" }, "generation": { "max_new_tokens": 2048, "temperature": 0.7, "top_p": 0.9, "do_sample": true }, "image_generation": { "default_height": 1024, "default_width": 1024, "num_inference_steps": 50, "guidance_scale": 1.0, "true_cfg_scale": 4.0 } }