{ "model_type": "multimodal", "base_model": "Qwen/Qwen2.5-7B-Instruct", "vision_encoder": "openai/clip-vit-large-patch14", "modalities": { "input": [ "image", "text" ], "output": [ "text" ] }, "projection": { "type": "mlp", "hidden_size": 1024, "num_layers": 2, "dropout": 0.1 }, "max_seq_length": 4096, "special_tokens": { "image_start": "<|image_start|>", "image_end": "<|image_end|>" } }