{ "architectures": [ "LegatoModel" ], "encoder_pretrained_model_name_or_path": "./vision_encoder", "image_token_index": 4096, "model_type": "legato", "pad_token_id": 4, "text_config": { "bos_token_id": 1, "cross_attention_layers": [ 3, 7, 11, 15 ], "dropout": 0, "eos_token_id": [ 2 ], "hidden_act": "silu", "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 1526, "max_position_embeddings": 131072, "model_type": "mllama_text_model", "num_attention_heads": 12, "num_hidden_layers": 18, "num_key_value_heads": 6, "pad_token_id": 4, "rms_norm_eps": 1e-05, "rope_scaling": { "factor": 8.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3" }, "rope_theta": 500000, "torch_dtype": "float32", "use_cache": true, "vocab_size": 4097 }, "torch_dtype": "float32", "transformers_version": "4.50.0", "vision_config": { "_attn_implementation_autoset": true, "_name_or_path": "meta-llama/Llama-3.2-11B-Vision", "attention_heads": 16, "hidden_act": "gelu", "hidden_size": 1280, "image_size": 448, "initializer_range": 0.02, "intermediate_layers_indices": [ 3, 7, 15, 23, 30 ], "intermediate_size": 5120, "max_num_tiles": 4, "model_type": "mllama_vision_model", "norm_eps": 1e-05, "num_channels": 3, "num_global_layers": 8, "num_hidden_layers": 32, "patch_size": 14, "supported_aspect_ratios": [ [ 1, 1 ], [ 1, 2 ], [ 1, 3 ], [ 1, 4 ], [ 2, 1 ], [ 2, 2 ], [ 3, 1 ], [ 4, 1 ] ], "torch_dtype": "float32", "vision_output_dim": 7680 } }