Spaces:

BiGuan
/

LangGraph_GAIA

Sleeping

App Files Files Community

LangGraph_GAIA / tools /image_caption.py

BiGuan

Update tools/image_caption.py

f2c08d2 verified 28 days ago

Raw

History Blame Contribute Delete

2.85 kB

	import os
	import requests
	import base64
	from openai import OpenAI

	def image_caption(image_url: str) -> str:
	"""
	Describe an image or answer a specific question about it.
	If OPENAI_API_KEY is set, uses GPT-4V (or configured VLM_MODEL_ID) for visual QA.
	Otherwise falls back to HuggingFace BLIP captioning.
	"""
	# 检查是否使用 OpenAI VLM
	openai_api_key = os.getenv("OPENAI_API_KEY")
	vlm_model = os.getenv("VLM_MODEL_ID", "gpt-4-vision-preview")
	use_openai = openai_api_key and vlm_model.startswith("gpt-4")

	if use_openai:
	try:
	# 下载图片并转为 base64
	headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"} if "huggingface.co" in image_url else {}
	img_response = requests.get(image_url, headers=headers, timeout=15)
	img_response.raise_for_status()
	img_b64 = base64.b64encode(img_response.content).decode()

	client = OpenAI(api_key=openai_api_key)
	response = client.chat.completions.create(
	model=vlm_model,
	messages=[
	{
	"role": "user",
	"content": [
	{"type": "text", "text": "Describe the chess position and provide the best next move for black (algebraic notation). If it's not chess, just describe the image."},
	{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}}
	]
	}
	],
	max_tokens=256,
	temperature=0.0,
	)
	answer = response.choices[0].message.content.strip()
	return answer if answer else "No answer from VLM."
	except Exception as e:
	return f"VLM error: {str(e)}"
	else:
	# 降级到 BLIP 描述
	try:
	headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
	img_data = requests.get(image_url, headers=headers, timeout=10).content
	img_b64 = base64.b64encode(img_data).decode()
	api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
	api_headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
	response = requests.post(api_url, headers=api_headers, json={"inputs": img_b64}, timeout=30)
	if response.status_code == 200:
	result = response.json()
	if isinstance(result, list) and len(result) > 0:
	return result[0].get('generated_text', 'No caption')
	return str(result)
	else:
	return f"Caption API error: {response.status_code}"
	except Exception as e:
	return f"Image processing error: {str(e)}"