# File size: 2,662 Bytes
# b0c0df0
from typing import List

from PIL import Image

# System prompt for the reasoning mode. It tells the model to solve the problem
# step by step, optionally emitting Python code (wrapped in <code> tags with a
# fenced python block) that an external sandbox executes, with the output
# returned inside <sandbox_output> tags.
REASONING_SYS_PROMPT = "".join(
    (
        # Persona line and overall task description.
        "You are a helpful assistant.\n\n",
        "Solve the following problem step by step, ",
        "and optionally write Python code for image manipulation ",
        "to enhance your reasoning process. ",
        "The Python code will be executed by an external sandbox, ",
        "and the processed image or result ",
        "(wrapped in <sandbox_output></sandbox_output>) ",
        "can be returned to aid your reasoning ",
        "and help you arrive at the final answer.\n\n",
        # Section header for the bullet list of rules.
        "**Reasoning & Image Manipulation (Optional but Encouraged):**\n",
        # Bullet 1: what the code may be used for.
        "    * You have the capability to write executable Python code ",
        "to perform image manipulations ",
        "(e.g., cropping to a Region of Interest (ROI), resizing, rotation, ",
        "adjusting contrast) ",
        "or perform calculation for better reasoning.\n",
        # Bullet 2: where the code runs.
        "    * The code will be executed in a secure sandbox, ",
        "and its output will be provided back to you for further analysis.\n",
        # Bullet 3: required wrapping of every code snippet.
        "    * All Python code snippets **must** be wrapped as follows:\n",
        "    <code>\n",
        "    ```python\n",
        "    # your code.\n",
        "    ```\n",
        "    </code>\n",
        # Bullet 4: what the snippet must print (no trailing newline).
        "    * At the end of the code, ",
        "print the path of the processed image (processed_path) ",
        "or the result for further processing in a sandbox environment.",
    )
)


# Minimal system prompt: the persona line only, with no tool instructions.
SIMPLE_SYS_PROMPT = "You are a helpful assistant."


def generate_prompt_simple_qa(user_question: str) -> str:
    """Return a direct-answer VQA prompt for ``user_question``.

    The prompt tells the model that no image manipulation or reasoning is
    needed and asks for the answer directly. The question is interpolated
    as-is, wrapped in double quotes, after a ``**User's Question:**`` label
    on its own line.
    """
    # The three instruction sentences sit on one line, separated by spaces.
    instructions = " ".join(
        (
            "You are an advanced AI assistant specializing in visual question "
            "answering (VQA).",
            "You don't need to perform any image manipulation or reasoning.",
            "Give the answer to the following question directly.",
        )
    )
    return f'{instructions}\n**User\'s Question:** "{user_question}"'


def generate_prompt_final_qa(user_question: str, user_image_path: str) -> str:
    """Return the reasoning-mode VQA prompt for a question about an image.

    The image at ``user_image_path`` is opened only to read its width and
    height, which are embedded in the prompt as ``"<width>x<height>"``. If
    opening the image raises ``OSError`` (which includes
    ``FileNotFoundError``; ``IOError`` is an alias of ``OSError``), a fixed
    placeholder message is embedded instead and the prompt is still returned.

    The prompt starts with an ``<image>`` line and the question, then lists
    the image path and size, and ends with the required
    ``<think>...</think>`` / ``<answer>...</answer>`` output format.
    """
    try:
        with Image.open(user_image_path) as picture:
            dimensions = f"{picture.width}x{picture.height}"
    except OSError:
        # Best effort: an unreadable image must not prevent prompt creation.
        dimensions = "Unable to determine (error reading image)"

    # One entry per output line; empty strings are blank lines, and the final
    # empty entry yields the trailing newline.
    prompt_lines = (
        "<image>",
        f"{user_question}",
        "",
        f'### User Image Path: "{user_image_path}"',
        f'### User Image Size: "{dimensions}"',
        "",
        "### **Output Format (strict adherence required):**",
        "",
        "<think>Your detailed reasoning process, including any code, "
        "should go here.</think>",
        "<answer>Your final answer to the user's question goes here.</answer>",
        "",
    )
    return "\n".join(prompt_lines)


# Closing tags of the <code> and <answer> sections that the prompts in this
# module ask the model to emit.
# NOTE(review): presumably used elsewhere as stop/marker strings — confirm
# against the callers.
SPECIAL_STRING_LIST: List[str] = [
    "</code>",
    "</answer>",
]