{
  "_name_or_path": "clip-vit-base-patch16",
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "encoder": {
    "model_type": "clip_vision_model",
    "hidden_size": 768,
    "image_size": 224
  },
  "decoder": {
    "model_type": "gpt2",
    "hidden_size": 768,
    "n_layer": 8,
    "n_head": 12
  },
  "model_type": "encoder_decoder",
  "max_length": 50,
  "num_beams": 4,
  "initializer_factor": 1.0
}