Wan2GP / shared /deepy /vision.py
Egnalkram's picture
Upload folder using huggingface_hub
4689c2b verified
from __future__ import annotations
from typing import Any
from shared.prompt_enhancer.qwen35_vl import _prepare_multimodal_vllm_prompt
# Default system instruction for image question answering; used by
# build_image_question_prompt when the caller supplies no system prompt.
VISION_QA_SYSTEM_PROMPT = (
    "Answer the user's question about the provided image accurately and concisely. "
    "If the answer is uncertain, say so."
)
def build_image_question_prompt(caption_model: Any, processor: Any, image: Any, question: str, system_prompt: str | None = None):
    """Build the multimodal prompt for asking one question about one image.

    Args:
        caption_model: Forwarded unchanged to ``_prepare_multimodal_vllm_prompt``.
        processor: Object exposing ``apply_chat_template`` and being callable
            with ``text``/``images`` keyword arguments (presumably a Hugging
            Face multimodal processor -- confirm against the caller).
        image: The image to ask about; placed in the user message and passed
            to ``processor`` as the only entry of ``images``.
        question: The user's question. Surrounding whitespace is stripped.
        system_prompt: Optional override for the system message. Any falsy
            value (``None``, ``""``) falls back to ``VISION_QA_SYSTEM_PROMPT``.
            A value that is non-empty but strips to nothing results in no
            system message being sent at all.

    Returns:
        Whatever ``_prepare_multimodal_vllm_prompt`` returns for the processed
        inputs.

    Raises:
        ValueError: If ``question`` is falsy or contains only whitespace.
    """
    cleaned_question = str(question or "").strip()
    if not cleaned_question:
        raise ValueError("Vision question is empty.")

    instructions = str(system_prompt or VISION_QA_SYSTEM_PROMPT).strip()
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": cleaned_question},
        ],
    }
    # The system turn is only included when there is something to say.
    if instructions:
        conversation = [{"role": "system", "content": instructions}, user_turn]
    else:
        conversation = [user_turn]

    # Render the conversation to plain text first; tokenization happens in the
    # processor call below together with the image.
    chat_text = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    processor_inputs = processor(
        text=[chat_text],
        images=[image],
        return_tensors="pt",
        padding=True,
        return_mm_token_type_ids=True,
    )
    return _prepare_multimodal_vllm_prompt(caption_model, processor_inputs)
# Public names exported by this module.
__all__ = [
    "VISION_QA_SYSTEM_PROMPT",
    "build_image_question_prompt",
]