{
  "_gradient_checkpointing": false,
  "architectures": [
    "VisualForesight"
  ],
  "attn_implementation": null,
  "diffusion_model_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
  "dtype": "bfloat16",
  "in_channels": 32,
  "input_size": [
    15,
    20
  ],
  "max_input_text_tokens": 256,
  "mllm_id": "google/gemma-2-2b-it",
  "model_type": "visualforesight",
  "modules_to_freeze": [
    "vae",
    "mllm_backbone"
  ],
  "modules_to_unfreeze": [],
  "noise_scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
  "scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers",
  "system_prompt": "You are a robot and should focus on your actions. Generate a new image that meets the user's instruction while maintaining consistency with the original input where appropriate.",
  "transformers_version": "4.57.1",
  "vae_downsample_f": 32,
  "vae_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers"
}