{
  "pipeline_type": "ZImageMLXPipeline",
  "base_model": "Tongyi-MAI/Z-Image-Turbo",
  "framework": "mlx",
  "model": {
    "total_params": "10.26B",
    "text_encoder": {
      "type": "Qwen3",
      "params": "4.02B",
      "hidden_size": 2560,
      "num_layers": 36,
      "num_attention_heads": 32,
      "num_key_value_heads": 8,
      "dtype": "bfloat16"
    },
    "transformer": {
      "type": "ZImageTransformer (S3-DiT)",
      "params": "6.15B",
      "dim": 3840,
      "n_heads": 30,
      "head_dim": 128,
      "n_layers": 30,
      "n_refiner_layers": 2,
      "ffn_dim": 10240,
      "in_channels": 16,
      "patch_size": 2,
      "dtype": "bfloat16"
    },
    "vae": {
      "type": "AutoencoderKL Decoder",
      "params": "84M",
      "latent_channels": 16,
      "block_out_channels": [128, 256, 512, 512],
      "scaling_factor": 0.3611,
      "shift_factor": 0.1159,
      "dtype": "float32"
    },
    "scheduler": {
      "type": "FlowMatchEulerDiscrete",
      "shift": 3.0,
      "num_train_timesteps": 1000
    }
  },
  "quantization": {
    "supported_bits": [4, 8, 16],
    "default_bits": 16,
    "group_size": 64,
    "min_quantize_dim": 1024
  },
  "generation_defaults": {
    "width": 512,
    "height": 512,
    "num_steps": 8,
    "guidance_scale": 0.0,
    "max_text_len": 256
  }
}