Spaces:

BiGuan
/

Agent

Sleeping

App Files Files Community

Agent / tools /visual_qa.py

BiGuan

Upload 13 files

e15103f verified 26 days ago

Raw

History Blame Contribute Delete

2.44 kB

	"""
	tools/visual_qa.py —— 工具⑥：看图回答问题

	有的题目附带一张图片（比如国际象棋棋盘截图），需要"看懂图"才能答。
	这个工具把图片交给会"看图"的模型（gpt-4o），并附上一个具体问题，让它看图作答。

	关键技巧：图片不能直接当文字发送，要先把它编码成一长串文本（Base64 编码），
	再拼成一种叫 data URI 的特殊格式，模型才能"收到"这张图。下面的代码就是在做这件事。
	"""

	import os
	import base64 # 用于把图片转成可传输的文本编码
	import mimetypes # 用于猜测图片的具体类型（png/jpeg...）

	from langchain_core.tools import tool

	from config import LLM_BASE_URL, LLM_API_KEY, VLM_MODEL_ID


	# Tool: ask a vision-language model a question about a local image file.
	#
	# Parameters:
	#   file_path: path of the image on the local filesystem.
	#   question:  the question to ask about that image.
	# Returns:
	#   The model's text answer. Every failure is reported as a plain string
	#   ("File not found: ..." or "Error analysing image ...") instead of being
	#   raised, so the caller always receives a str.
	#
	# NOTE(review): the docstring below is kept verbatim on purpose. Presumably the
	# @tool decorator exposes it to the agent as the tool description, which makes
	# it runtime text rather than ordinary documentation -- confirm before editing.
	@tool
	def visual_qa(file_path: str, question: str) -> str:
	    """Answer a question about a local image file using a vision-language model. Pass the
	    image path and a precise question, e.g. 'What chess move should black play? Answer in
	    algebraic notation.' or 'Transcribe all text in this image.'"""
	    if not os.path.exists(file_path):
	        return f"File not found: {file_path}"
	    try:
	        # Imported inside the try, so an import failure is also returned as an
	        # error string by the handler below.
	        from openai import OpenAI

	        # Guess the MIME type from the file name (e.g. image/png); fall back to
	        # PNG when it cannot be guessed.
	        mime = mimetypes.guess_type(file_path)[0] or "image/png"
	        # Read the image as raw bytes, Base64-encode it, and wrap it in a data
	        # URI so the picture can be embedded in the request as text.
	        with open(file_path, "rb") as f:
	            b64 = base64.b64encode(f.read()).decode("utf-8")
	        data_uri = f"data:{mime};base64,{b64}"

	        client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY)
	        # A single user message carries both parts: the question (text) and the
	        # image (image_url).
	        response = client.chat.completions.create(
	            model=VLM_MODEL_ID,
	            max_tokens=1024,
	            messages=[
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "text", "text": question},
	                        {"type": "image_url", "image_url": {"url": data_uri}},
	                    ],
	                }
	            ],
	        )
	        answer = response.choices[0].message.content
	        # The API allows `content` to be null; never hand None back to the
	        # caller, since this function promises a str.
	        if answer is None:
	            return f"Error analysing image '{file_path}': model returned no text content"
	        return answer
	    except Exception as e:
	        # Deliberate catch-all: any failure (I/O, import, network, malformed
	        # response) becomes a readable message instead of an exception.
	        return f"Error analysing image '{file_path}': {e}"