| """ |
| tools/visual_qa.py —— 工具⑥:看图回答问题 |
| |
| 有的题目附带一张图片(比如国际象棋棋盘截图),需要"看懂图"才能答。 |
| 这个工具把图片交给会"看图"的视觉语言模型(具体模型由 config 中的 VLM_MODEL_ID 指定,例如 gpt-4o),并附上一个具体问题,让它看图作答。 |
| |
| 关键技巧:图片不能直接当文字发送,要先把它编码成一长串文本(Base64 编码), |
| 再拼成一种叫 data URI 的特殊格式,模型才能"收到"这张图。下面的代码就是在做这件事。 |
| """ |
|
|
| import os |
| import base64 |
| import mimetypes |
|
|
| from langchain_core.tools import tool |
|
|
| from config import LLM_BASE_URL, LLM_API_KEY, VLM_MODEL_ID |
|
|
|
|
@tool
def visual_qa(file_path: str, question: str) -> str:
    """Answer a question about a local image file using a vision-language model. Pass the
    image path and a precise question, e.g. 'What chess move should black play? Answer in
    algebraic notation.' or 'Transcribe all text in this image.'"""
    # NOTE: the docstring above is left untouched on purpose: the @tool
    # decorator publishes it as the tool description the agent reads, so it is
    # effectively runtime text rather than ordinary documentation.
    #
    # Parameters:
    #   file_path -- path to a local image file.
    #   question  -- the question to ask about the image.
    # Returns:
    #   Always a string: the model's answer, or a human-readable error message.
    #   Failures are reported as text instead of raised so the calling agent
    #   can read them and react.
    if not os.path.exists(file_path):
        return f"File not found: {file_path}"
    # A directory passes the existence check but cannot be read as an image;
    # report that clearly instead of surfacing a raw OS error later.
    if not os.path.isfile(file_path):
        return f"Not a regular file: {file_path}"

    try:
        # Imported lazily (inside the try) so a missing `openai` package is
        # reported as an error string rather than breaking module import.
        from openai import OpenAI

        # Fall back to PNG when the extension is unknown or absent.
        mime = mimetypes.guess_type(file_path)[0] or "image/png"

        # Images cannot be sent as plain text: Base64-encode the raw bytes and
        # embed them in a data URI that goes into the chat message.
        with open(file_path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        data_uri = f"data:{mime};base64,{b64}"

        client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY)
        response = client.chat.completions.create(
            model=VLM_MODEL_ID,
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {"type": "image_url", "image_url": {"url": data_uri}},
                    ],
                }
            ],
        )

        # The API may return no choices, or a message whose content is None;
        # never hand None back to the caller, which expects a string.
        if not response.choices:
            return f"Error analysing image '{file_path}': model returned no choices"
        answer = response.choices[0].message.content
        if not answer:
            return f"Error analysing image '{file_path}': model returned an empty answer"
        return answer
    except Exception as e:
        # Deliberately broad: this is the tool boundary, and any failure
        # (I/O, network, API) should reach the agent as text.
        return f"Error analysing image '{file_path}': {e}"
|
|