// config.json
{
  "_name_or_path": "clip-vit-base-patch16",
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "encoder": {
    "model_type": "clip_vision_model",
    "hidden_size": 768,
    "image_size": 224
  },
  "decoder": {
    "model_type": "gpt2",
    "hidden_size": 768,
    "n_layer": 8,
    "n_head": 12
  },
  "model_type": "vision-encoder-decoder",
  "max_length": 50,
  "num_beams": 4,
  "initializer_factor": 1.0
}
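This is the shape of config that transformers' VisionEncoderDecoderConfig serializes: a CLIP vision tower as the encoder, a GPT-2 decoder, and generation defaults (max_length, num_beams) at the top level. As a minimal sketch, the equivalent model can be built programmatically; the patch_size=16 below is an assumption inferred from the "clip-vit-base-patch16" name, not a field in the config above.

# build_model.py (illustrative sketch, not the exact script that produced config.json)
from transformers import (
    CLIPVisionConfig,
    GPT2Config,
    VisionEncoderDecoderConfig,
    VisionEncoderDecoderModel,
)

# Encoder: CLIP ViT-B/16 vision tower (width 768, 224x224 inputs).
encoder_config = CLIPVisionConfig(hidden_size=768, image_size=224, patch_size=16)

# Decoder: a small GPT-2 (8 layers, 12 heads, width 768). Note this is
# narrower than the stock 12-layer gpt2 checkpoint, so it is a custom decoder.
decoder_config = GPT2Config(n_embd=768, n_layer=8, n_head=12)

# Combining the configs sets decoder.is_decoder=True and
# decoder.add_cross_attention=True so the decoder can attend to image features.
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(
    encoder_config, decoder_config
)
config.max_length = 50
config.num_beams = 4

# Randomly initialized model matching the config above.
model = VisionEncoderDecoderModel(config=config)

If pretrained weights are wanted instead, VisionEncoderDecoderModel.from_encoder_decoder_pretrained("openai/clip-vit-base-patch16", "gpt2") wires up both checkpoints directly, but that yields the stock 12-layer GPT-2 rather than the 8-layer decoder this config describes.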