{
"architectures": [
"VTPModel"
],
"decoder_depth": 24,
"decoder_embed_dim": 1024,
"decoder_ffn_layer": "swiglu",
"decoder_init_values": null,
"decoder_norm_layer": "layernorm",
"decoder_num_heads": 16,
"decoder_use_qk_norm": false,
"dtype": "float32",
"image_size": 256,
"init_logit_bias": null,
"init_logit_scale": null,
"model_type": "vtp",
"nonscalar_logit_scale": false,
"text_context_length": 77,
"text_depth": 12,
"text_embed_cls": false,
"text_embed_dim": 768,
"text_ls_init_value": null,
"text_mlp_ratio": 4.0,
"text_no_causal_mask": false,
"text_num_heads": 12,
"text_output_tokens": false,
"text_pad_id": 0,
"text_pool_type": "argmax",
"text_proj_bias": false,
"text_proj_type": "linear",
"text_quick_gelu": false,
"text_vocab_size": 49408,
"train_clip": true,
"train_reconstruction": true,
"transformers_version": "4.56.0.dev0",
"vision_bottleneck_ae_only": true,
"vision_clip_feat": "cls",
"vision_depth": 24,
"vision_embed_dim": 1024,
"vision_feature_bottleneck": 64,
"vision_ffn_layer": "swiglu",
"vision_init_values": null,
  "vision_mlp_ratio": 4.0,
"vision_norm_layer": "rmsnorm",
"vision_num_heads": 16,
"vision_patch_size": 16,
"vision_use_qk_norm": false
}