// config.json
{
  "_name_or_path": "clip-vit-base-patch16",
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "encoder": {
    "model_type": "clip_vision_model",
    "hidden_size": 768,
    "image_size": 224
  },
  "decoder": {
    "model_type": "gpt2",
    "hidden_size": 768,
    "n_layer": 8,
    "n_head": 12
  },
  "model_type": "vision-encoder-decoder",
  "max_length": 50,
  "num_beams": 4,
  "initializer_factor": 1.0
}
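This is the shape of config that transformers' VisionEncoderDecoderConfig serializes: a CLIP vision tower as the encoder, a GPT-2 decoder, and generation defaults (max_length, num_beams) at the top level. As a minimal sketch, the equivalent model can be built programmatically; the patch_size=16 below is an assumption inferred from the "clip-vit-base-patch16" name, not a field in the config above.

# build_model.py (illustrative sketch, not the exact script that produced config.json)
from transformers import (
    CLIPVisionConfig,
    GPT2Config,
    VisionEncoderDecoderConfig,
    VisionEncoderDecoderModel,
)

# Encoder: CLIP ViT-B/16 vision tower (width 768, 224x224 inputs).
encoder_config = CLIPVisionConfig(hidden_size=768, image_size=224, patch_size=16)

# Decoder: a small GPT-2 (8 layers, 12 heads, width 768). Note this is
# narrower than the stock 12-layer gpt2 checkpoint, so it is a custom decoder.
decoder_config = GPT2Config(n_embd=768, n_layer=8, n_head=12)

# Combining the configs sets decoder.is_decoder=True and
# decoder.add_cross_attention=True so the decoder can attend to image features.
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(
    encoder_config, decoder_config
)
config.max_length = 50
config.num_beams = 4

# Randomly initialized model matching the config above.
model = VisionEncoderDecoderModel(config=config)

If pretrained weights are wanted instead, VisionEncoderDecoderModel.from_encoder_decoder_pretrained("openai/clip-vit-base-patch16", "gpt2") wires up both checkpoints directly, but that yields the stock 12-layer GPT-2 rather than the 8-layer decoder this config describes.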