{ "model_type": "multimodal", "base_model": "Qwen/Qwen2.5-7B-Instruct", "vision_encoder": "openai/clip-vit-large-patch14", "modalities": { "input": [ "image", "text" ], "output": [ "text" ] }, "projection": { "type": "mlp", "hidden_size": 1024, "num_layers": 2, "dropout": 0.1 }, "max_seq_length": 4096, "special_tokens": { "image_start": "<|image_start|>", "image_end": "<|image_end|>" } }