{
  "_gradient_checkpointing": false,
  "architectures": [
    "MetaQuery"
  ],
  "attn_implementation": null,
  "connector_num_hidden_layers": 24,
  "diffusion_model_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
  "discard_mllm_image_feature": false,
  "dtype": "bfloat16",
  "in_channels": 32,
  "input_size": [
    20,
    15
  ],
  "loss_type": "flow",
  "max_input_text_tokens": 256,
  "max_pixels": 1003520,
  "min_pixels": 200740,
  "mllm_id": "google/gemma-2-2b-it",
  "model_type": "metaquery",
  "modules_to_freeze": [
    "vae",
    "model.mllm_backbone"
  ],
  "modules_to_unfreeze": [],
  "noise_scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
  "num_metaqueries": 64,
  "scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
  "source_vae_feature": true,
  "system_prompt": "You are a robot and should focus on your actions. Generate a new image that meets the user's instruction while maintaining consistency with the original input where appropriate.",
  "transformers_version": "4.57.1",
  "vae_downsample_f": 32,
  "vae_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers"
}
|