"""
tools/visual_qa.py —— 工具⑥：看图回答问题

有的题目附带一张图片（比如国际象棋棋盘截图），需要"看懂图"才能答。
这个工具把图片交给会"看图"的模型（gpt-4o），并附上一个具体问题，让它看图作答。

关键技巧：图片不能直接当文字发送，要先把它编码成一长串文本（Base64 编码），
再拼成一种叫 data URI 的特殊格式，模型才能"收到"这张图。下面的代码就是在做这件事。
"""

import os
import base64      # 用于把图片转成可传输的文本编码
import mimetypes   # 用于猜测图片的具体类型（png/jpeg...）

from langchain_core.tools import tool

from config import LLM_BASE_URL, LLM_API_KEY, VLM_MODEL_ID


@tool
def visual_qa(file_path: str, question: str) -> str:
    """Answer a question about a local image file using a vision-language model. Pass the
    image path and a precise question, e.g. 'What chess move should black play? Answer in
    algebraic notation.' or 'Transcribe all text in this image.'"""
    # NOTE: the docstring above is intentionally left byte-for-byte unchanged. LangChain's
    # `@tool` uses it as the tool description shown to the agent, so it is runtime text.
    #
    # Contract (for maintainers):
    #   file_path -- path to an image on the local disk.
    #   question  -- the text prompt sent alongside the image.
    #   returns   -- the model's answer, or a human-readable error string. This function
    #                never raises: every failure is reported as text so the calling agent
    #                can read it and react.
    if not os.path.exists(file_path):
        return f"File not found: {file_path}"
    try:
        # Imported inside the try block so that a missing `openai` package is reported
        # as an error string instead of an exception.
        from openai import OpenAI

        # Guess the MIME type from the file name (e.g. image/png); fall back to PNG
        # when it cannot be guessed.
        mime = mimetypes.guess_type(file_path)[0] or "image/png"
        # Read the raw bytes, Base64-encode them and wrap them in a data URI, which is
        # how the image is embedded in the request payload.
        with open(file_path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        data_uri = f"data:{mime};base64,{b64}"

        client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY)
        # A single user message carries both parts: the question (text) and the
        # picture (image_url).
        response = client.chat.completions.create(
            model=VLM_MODEL_ID,
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {"type": "image_url", "image_url": {"url": data_uri}},
                    ],
                }
            ],
        )
        message = response.choices[0].message
        # `content` is optional in the API response (e.g. on a refusal). Returning None
        # would break the declared `-> str` contract, so report it as an error string,
        # including the refusal text when the SDK exposes one.
        if message.content is None:
            reason = getattr(message, "refusal", None) or "model returned no text content"
            return f"Error analysing image '{file_path}': {reason}"
        return message.content
    except Exception as e:
        # Deliberate catch-all: the tool reports failures as text rather than raising.
        return f"Error analysing image '{file_path}': {e}"