```json
{
  "model_type": "multimodal",
  "base_model": "Qwen/Qwen2.5-7B-Instruct",
  "vision_encoder": "openai/clip-vit-large-patch14",
  "modalities": {
    "input": ["image", "text"],
    "output": ["text"]
  },
  "projection": {
    "type": "mlp",
    "hidden_size": 1024,
    "num_layers": 2,
    "dropout": 0.1
  },
  "max_seq_length": 4096,
  "special_tokens": {
    "image_start": "<img>",
    "image_end": "</img>"
  }
}
```
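
To make the `projection` block concrete, here is a minimal PyTorch sketch (not this project's actual code) of how such a config might be consumed: a two-layer MLP with dropout that maps vision-encoder features into the language model's embedding space. The 1024 input width (CLIP ViT-L/14 vision hidden size) and 3584 output width (Qwen2.5-7B hidden size) are assumptions based on the named checkpoints, not values taken from this config.

```python
import torch
import torch.nn as nn

class MLPProjector(nn.Module):
    """Sketch of an MLP projector built from the "projection" config block."""

    def __init__(self, in_dim: int, hidden_size: int, out_dim: int,
                 num_layers: int, dropout: float):
        super().__init__()
        layers = []
        dim = in_dim
        # num_layers - 1 hidden layers, each followed by activation and dropout
        for _ in range(num_layers - 1):
            layers += [nn.Linear(dim, hidden_size), nn.GELU(), nn.Dropout(dropout)]
            dim = hidden_size
        # final linear layer projects into the LLM embedding space
        layers.append(nn.Linear(dim, out_dim))
        self.mlp = nn.Sequential(*layers)

    def forward(self, vision_features: torch.Tensor) -> torch.Tensor:
        # vision_features: (batch, num_patches, in_dim)
        return self.mlp(vision_features)

proj_cfg = {"type": "mlp", "hidden_size": 1024, "num_layers": 2, "dropout": 0.1}
projector = MLPProjector(
    in_dim=1024,   # CLIP ViT-L/14 vision hidden size (assumed)
    hidden_size=proj_cfg["hidden_size"],
    out_dim=3584,  # Qwen2.5-7B hidden size (assumed)
    num_layers=proj_cfg["num_layers"],
    dropout=proj_cfg["dropout"],
)
image_tokens = projector(torch.randn(1, 256, 1024))  # -> (1, 256, 3584)
```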
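
The `special_tokens` entries presumably delimit the span where the projected image embeddings are spliced into the text sequence. A hypothetical prompt-assembly helper under that assumption:

```python
def build_prompt(user_text: str, num_images: int = 1) -> str:
    # "<img></img>" is a placeholder span later replaced by projected
    # image embeddings; the marker semantics here are an assumption.
    image_slots = "".join("<img></img>" for _ in range(num_images))
    return f"{image_slots}\n{user_text}"

print(build_prompt("Describe the scene."))
# <img></img>
# Describe the scene.
```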