LangGraph_GAIA / tools /image_caption.py
BiGuan's picture
Update tools/image_caption.py
f2c08d2 verified
Raw
History Blame Contribute Delete
2.85 kB
import base64
import os
from urllib.parse import urlparse

import requests
from openai import OpenAI
# Hosted BLIP captioning endpoint used when no OpenAI vision model is configured.
_BLIP_API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"

# Fixed instruction sent to the vision model alongside the image.
_VLM_PROMPT = (
    "Describe the chess position and provide the best next move for black "
    "(algebraic notation). If it's not chess, just describe the image."
)


def _hf_auth_headers() -> dict:
    """Return a Hugging Face bearer header, or {} when HF_TOKEN is unset."""
    token = os.getenv("HF_TOKEN")
    # Omit the header entirely rather than sending the literal "Bearer None".
    return {"Authorization": f"Bearer {token}"} if token else {}


def _download_image(image_url: str, timeout: float) -> bytes:
    """Fetch the image bytes, raising on invalid URLs and HTTP errors."""
    parsed = urlparse(image_url or "")
    if parsed.scheme not in ("http", "https") or not parsed.hostname:
        raise ValueError("image_url must be an http(s) URL")
    host = parsed.hostname
    # Only attach the HF token when the request really goes to huggingface.co
    # (or a subdomain); a substring test on the whole URL would leak the token
    # to hosts such as https://evil.test/huggingface.co/x.png.
    on_hf = host == "huggingface.co" or host.endswith(".huggingface.co")
    headers = _hf_auth_headers() if on_hf else {}
    response = requests.get(image_url, headers=headers, timeout=timeout)
    # Fail here so an HTML error page is never forwarded as if it were an image.
    response.raise_for_status()
    return response.content


def _guess_image_mime(data: bytes) -> str:
    """Guess the image MIME type from magic bytes, defaulting to image/png."""
    if data.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    return "image/png"


def _caption_with_openai(image_url: str, api_key: str, model: str) -> str:
    """Send the image plus the fixed prompt to an OpenAI vision model."""
    try:
        # Download the image and embed it as a base64 data URL.
        img_bytes = _download_image(image_url, timeout=15)
        img_b64 = base64.b64encode(img_bytes).decode()
        data_url = f"data:{_guess_image_mime(img_bytes)};base64,{img_b64}"
        client = OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": _VLM_PROMPT},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
            max_tokens=256,
            temperature=0.0,
        )
        # `content` can be None; treat that the same as an empty reply.
        answer = (response.choices[0].message.content or "").strip()
        return answer if answer else "No answer from VLM."
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"VLM error: {str(e)}"


def _caption_with_blip(image_url: str) -> str:
    """Caption the image through the hosted BLIP inference endpoint."""
    try:
        img_bytes = _download_image(image_url, timeout=10)
        img_b64 = base64.b64encode(img_bytes).decode()
        response = requests.post(
            _BLIP_API_URL,
            headers=_hf_auth_headers(),
            json={"inputs": img_b64},
            timeout=30,
        )
        if response.status_code != 200:
            return f"Caption API error: {response.status_code}"
        result = response.json()
        if isinstance(result, list) and result and isinstance(result[0], dict):
            return result[0].get('generated_text', 'No caption')
        # Unexpected payload shape: surface it verbatim for debugging.
        return str(result)
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Image processing error: {str(e)}"


def image_caption(image_url: str) -> str:
    """
    Describe the image found at ``image_url``.

    If OPENAI_API_KEY is set and VLM_MODEL_ID (default "gpt-4-vision-preview")
    starts with "gpt-4", the image is sent to that OpenAI model with a fixed
    prompt asking for a chess-position analysis, or a plain description when
    the image is not chess. Otherwise the function falls back to the
    Hugging Face hosted BLIP captioning model.

    HF_TOKEN, when set, is sent only to huggingface.co hosts and to the BLIP
    inference endpoint.

    Args:
        image_url: http(s) URL of the image to describe.

    Returns:
        The model's answer or caption. Failures are never raised; they are
        returned as a string starting with "VLM error: ",
        "Image processing error: " or "Caption API error: ".
    """
    # Decide whether the OpenAI vision model should be used.
    openai_api_key = os.getenv("OPENAI_API_KEY")
    vlm_model = os.getenv("VLM_MODEL_ID", "gpt-4-vision-preview")
    if openai_api_key and vlm_model.startswith("gpt-4"):
        return _caption_with_openai(image_url, openai_api_key, vlm_model)
    # Fall back to BLIP captioning.
    return _caption_with_blip(image_url)