qingy2024's picture
Extract Mllama vision encoder from unsloth/Llama-3.2-11B-Vision
621808b verified
Raw
History Blame Contribute Delete
860 Bytes
{
"_attn_implementation_autoset": false,
"architectures": [
"MllamaVisionModel"
],
"attention_heads": 16,
"dtype": "float32",
"hidden_act": "gelu",
"hidden_size": 1280,
"image_size": 448,
"initializer_range": 0.02,
"intermediate_layers_indices": [
3,
7,
15,
23,
30
],
"intermediate_size": 5120,
"max_num_tiles": 4,
"model_type": "mllama_vision_model",
"norm_eps": 1e-05,
"num_channels": 3,
"num_global_layers": 8,
"num_hidden_layers": 32,
"patch_size": 14,
"supported_aspect_ratios": [
[
1,
1
],
[
1,
2
],
[
1,
3
],
[
1,
4
],
[
2,
1
],
[
2,
2
],
[
3,
1
],
[
4,
1
]
],
"transformers_version": "4.57.6",
"vision_output_dim": 7680
}