# Helpers for building a single-image visual question-answering prompt.
from __future__ import annotations

from typing import Any

from shared.prompt_enhancer.qwen35_vl import _prepare_multimodal_vllm_prompt

# Default system instruction, used when the caller supplies no (or a falsy)
# system prompt.
VISION_QA_SYSTEM_PROMPT = "Answer the user's question about the provided image accurately and concisely. If the answer is uncertain, say so."


def build_image_question_prompt(caption_model: Any, processor: Any, image: Any, question: str, system_prompt: str | None = None):
    """Build the prompt for asking one question about one image.

    The question and system prompt are coerced to ``str`` and stripped. A chat
    conversation is assembled (optional system turn, then a user turn holding
    the image followed by the question text), rendered through
    ``processor.apply_chat_template``, run through ``processor`` itself, and
    the resulting inputs are handed to ``_prepare_multimodal_vllm_prompt``.

    Args:
        caption_model: Forwarded unchanged to
            ``_prepare_multimodal_vllm_prompt``.
        processor: Object exposing ``apply_chat_template(...)`` and callable
            with ``text=``/``images=`` keyword arguments (presumably a
            Hugging Face multimodal processor -- confirm against callers).
        image: The image placed in the user turn and passed to ``processor``.
        question: The question text; must be non-empty after stripping.
        system_prompt: Optional system instruction. Falsy values (``None``,
            ``""``) fall back to ``VISION_QA_SYSTEM_PROMPT``. A value that is
            non-empty but strips to ``""`` results in no system turn at all.

    Returns:
        Whatever ``_prepare_multimodal_vllm_prompt(caption_model, inputs)``
        returns.

    Raises:
        ValueError: If ``question`` is falsy or contains only whitespace.
    """
    cleaned_question = str(question or "").strip()
    if not cleaned_question:
        raise ValueError("Vision question is empty.")

    system_text = str(system_prompt or VISION_QA_SYSTEM_PROMPT).strip()

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": cleaned_question},
        ],
    }
    # The system turn is only included when there is text left after stripping.
    system_turns = [{"role": "system", "content": system_text}] if system_text else []
    conversation = system_turns + [user_turn]

    # Render to a plain string (tokenize=False); tokenisation happens in the
    # processor call below. NOTE(review): enable_thinking=False is passed
    # through to the chat template -- presumably turns off the model's
    # reasoning mode; confirm against the template in use.
    rendered_prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    processed_inputs = processor(
        text=[rendered_prompt],
        images=[image],
        return_tensors="pt",
        padding=True,
        return_mm_token_type_ids=True,
    )
    return _prepare_multimodal_vllm_prompt(caption_model, processed_inputs)


__all__ = ["VISION_QA_SYSTEM_PROMPT", "build_image_question_prompt"]