{ "architectures": [ "LAMModel" ], "encoder_height": 224, "encoder_width": 304, "fsq_levels": [ 7, 5, 5, 5, 5 ], "initializer_range": 0.02, "is_diffusion": true, "latent_channels": 5, "max_tokens": 80, "min_tokens": 1, "model_type": "lam", "null_latent": 0, "num_tokens_in": 333, "pool_pick_tokens": 80, "torch_dtype": "float32", "transformer_config": { "_class_name": "SD3Transformer2DModel", "_diffusers_version": "0.31.0.dev0", "_name_or_path": "stabilityai/stable-diffusion-3.5-medium", "attention_head_dim": 64, "caption_projection_dim": 1536, "dual_attention_layers": [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ], "in_channels": 32, "joint_attention_dim": 5, "num_attention_heads": 24, "num_layers": 24, "out_channels": 16, "patch_size": 2, "pooled_projection_dim": 400, "pos_embed_max_size": 384, "qk_norm": "rms_norm", "sample_size": 128 }, "transformers_version": "4.50.0", "use_empty_prompt": true, "use_tail_drop": true, "videomae_from_pretrained": "OpenGVLab/VideoMAEv2-Large", "vocab_size": 4375 }