{
  "architectures": [
    "VTPModel"
  ],
  "decoder_depth": 24,
  "decoder_embed_dim": 1024,
  "decoder_ffn_layer": "swiglu",
  "decoder_init_values": null,
  "decoder_norm_layer": "layernorm",
  "decoder_num_heads": 16,
  "decoder_use_qk_norm": false,
  "dtype": "float32",
  "image_size": 256,
  "init_logit_bias": null,
  "init_logit_scale": null,
  "model_type": "vtp",
  "nonscalar_logit_scale": false,
  "text_context_length": 77,
  "text_depth": 12,
  "text_embed_cls": false,
  "text_embed_dim": 768,
  "text_ls_init_value": null,
  "text_mlp_ratio": 4.0,
  "text_no_causal_mask": false,
  "text_num_heads": 12,
  "text_output_tokens": false,
  "text_pad_id": 0,
  "text_pool_type": "argmax",
  "text_proj_bias": false,
  "text_proj_type": "linear",
  "text_quick_gelu": false,
  "text_vocab_size": 49408,
  "train_clip": true,
  "train_reconstruction": true,
  "transformers_version": "4.56.0.dev0",
  "vision_bottleneck_ae_only": true,
  "vision_clip_feat": "cls",
  "vision_depth": 24,
  "vision_embed_dim": 1024,
  "vision_feature_bottleneck": 64,
  "vision_ffn_layer": "swiglu",
  "vision_init_values": null,
  "vision_mlp_ratio": 4,
  "vision_norm_layer": "rmsnorm",
  "vision_num_heads": 16,
  "vision_patch_size": 16,
  "vision_use_qk_norm": false
}