File size: 1,380 Bytes
7344bef | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | from __future__ import annotations
from typing import Any
from shared.prompt_enhancer.qwen35_vl import _prepare_multimodal_vllm_prompt
# System message used by build_image_question_prompt when the caller does not
# supply a system prompt of their own.
VISION_QA_SYSTEM_PROMPT = (
    "Answer the user's question about the provided image accurately and concisely. "
    "If the answer is uncertain, say so."
)
def build_image_question_prompt(caption_model: Any, processor: Any, image: Any, question: str, system_prompt: str | None = None):
    """Build the model prompt for asking one question about one image.

    The question is coerced to ``str`` and stripped. A chat conversation is
    assembled from an optional system message followed by a user message
    holding the image and the question text, rendered through
    ``processor.apply_chat_template`` and then encoded by calling
    ``processor`` on the rendered text together with the image. The encoded
    inputs are handed to ``_prepare_multimodal_vllm_prompt``.

    Args:
        caption_model: Passed through unchanged to
            ``_prepare_multimodal_vllm_prompt``.
        processor: Object exposing ``apply_chat_template(...)`` and callable
            as ``processor(text=..., images=..., ...)``.
        image: The image to ask about; placed in the user message and also
            passed to ``processor`` via ``images=[image]``.
        question: The question text. Falsy values are treated as empty.
        system_prompt: Optional system message. A falsy value selects
            ``VISION_QA_SYSTEM_PROMPT``. A value that is truthy but strips to
            an empty string (e.g. whitespace only) yields no system message.

    Returns:
        Whatever ``_prepare_multimodal_vllm_prompt(caption_model, model_inputs)``
        returns.

    Raises:
        ValueError: If the question is empty after stripping.
    """
    question_text = str(question or "").strip()
    if not question_text:
        raise ValueError("Vision question is empty.")

    system_text = str(system_prompt or VISION_QA_SYSTEM_PROMPT).strip()

    messages = []
    if system_text:
        messages.append({"role": "system", "content": system_text})
    messages.append(
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question_text},
            ],
        }
    )

    # Render to a plain string first (tokenize=False); the processor call
    # below does the actual encoding of text and image together.
    chat_text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    model_inputs = processor(
        text=[chat_text],
        images=[image],
        return_tensors="pt",
        padding=True,
        return_mm_token_type_ids=True,
    )
    return _prepare_multimodal_vllm_prompt(caption_model, model_inputs)
# Names exported by `from <this module> import *`.
__all__ = [
    "VISION_QA_SYSTEM_PROMPT",
    "build_image_question_prompt",
]
|