File size: 1,380 Bytes
7344bef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from __future__ import annotations

from typing import Any

from shared.prompt_enhancer.qwen35_vl import _prepare_multimodal_vllm_prompt


# Default system instruction for image question answering; used by
# build_image_question_prompt when the caller passes no system prompt.
VISION_QA_SYSTEM_PROMPT = (
    "Answer the user's question about the provided image accurately and concisely. "
    "If the answer is uncertain, say so."
)


def build_image_question_prompt(
    caption_model: Any,
    processor: Any,
    image: Any,
    question: str,
    system_prompt: str | None = None,
):
    """Build a single-image question-answering prompt for the caption model.

    The question and the system prompt are each coerced to ``str`` and
    stripped. A conversation is assembled from an optional system message
    followed by one user message carrying the image and the question, rendered
    through ``processor.apply_chat_template`` and then passed back through
    ``processor`` together with the image.

    Args:
        caption_model: Forwarded unchanged to
            ``_prepare_multimodal_vllm_prompt``.
        processor: Object providing ``apply_chat_template`` and callable with
            ``text``/``images`` keyword arguments (presumably a Hugging Face
            multimodal processor -- confirm against the caller).
        image: Image placed in the user message and handed to ``processor``.
        question: The user's question about the image.
        system_prompt: Optional override for ``VISION_QA_SYSTEM_PROMPT``. A
            falsy value (``None`` or ``""``) selects the default; a value that
            is whitespace-only strips to empty and yields no system message.

    Returns:
        Whatever ``_prepare_multimodal_vllm_prompt`` returns for the processed
        model inputs.

    Raises:
        ValueError: If ``question`` is falsy or blank after stripping.
    """
    question_text = str(question or "").strip()
    if not question_text:
        raise ValueError("Vision question is empty.")

    system_text = str(system_prompt or VISION_QA_SYSTEM_PROMPT).strip()

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": question_text},
        ],
    }
    # The system turn is only included when it has content after stripping.
    conversation = (
        [{"role": "system", "content": system_text}, user_turn]
        if system_text
        else [user_turn]
    )

    rendered_prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    processed = processor(
        text=[rendered_prompt],
        images=[image],
        return_tensors="pt",
        padding=True,
        return_mm_token_type_ids=True,
    )
    return _prepare_multimodal_vllm_prompt(caption_model, processed)


# Names exported by ``from <module> import *``.
__all__ = [
    "VISION_QA_SYSTEM_PROMPT",
    "build_image_question_prompt",
]