{ "_gradient_checkpointing": false, "architectures": [ "VisualForesight" ], "attn_implementation": null, "diffusion_model_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers", "dtype": "bfloat16", "in_channels": 32, "input_size": [ 15, 20 ], "max_input_text_tokens": 256, "mllm_id": "google/gemma-2-2b-it", "model_type": "visualforesight", "modules_to_freeze": [ "vae", "mllm_backbone" ], "modules_to_unfreeze": [], "noise_scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers", "scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers", "system_prompt": "You are a robot and should focus on your actions. Generate a new image that meets the user's instruction while maintaining consistency with the original input where appropriate.", "transformers_version": "4.57.1", "vae_downsample_f": 32, "vae_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers" }