import os
import requests
import base64
from openai import OpenAI
def _is_http_url(url) -> bool:
    """Return True when *url* is a string holding an absolute http(s) URL with a host."""
    from urllib.parse import urlparse  # function-scope import keeps the block self-contained

    if not isinstance(url, str):
        return False
    try:
        parts = urlparse(url.strip())
        return parts.scheme in ("http", "https") and bool(parts.hostname)
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. an unterminated IPv6 literal).
        return False


def _hf_auth_headers(url: str) -> dict:
    """Return an HF bearer-token header only for huggingface.co hosts, else an empty dict.

    The token is attached only when HF_TOKEN is set (never "Bearer None") and
    only when the URL's *hostname* is huggingface.co or one of its subdomains,
    so the secret is never forwarded to an arbitrary third-party image host.
    """
    from urllib.parse import urlparse

    token = os.getenv("HF_TOKEN")
    if not token:
        return {}
    host = urlparse(url).hostname or ""
    if host == "huggingface.co" or host.endswith(".huggingface.co"):
        return {"Authorization": f"Bearer {token}"}
    return {}


def _caption_with_openai(image_url: str, openai_api_key: str, vlm_model: str) -> str:
    """Download the image and ask the OpenAI chat-completions model about it."""
    try:
        # Download the image and convert it to base64 for an inline data URL.
        img_response = requests.get(
            image_url, headers=_hf_auth_headers(image_url), timeout=15
        )
        img_response.raise_for_status()

        # Use the server-reported image type when it has one; fall back to PNG.
        content_type = (
            img_response.headers.get("Content-Type", "").split(";")[0].strip().lower()
        )
        mime_type = content_type if content_type.startswith("image/") else "image/png"
        img_b64 = base64.b64encode(img_response.content).decode("ascii")

        client = OpenAI(api_key=openai_api_key)
        response = client.chat.completions.create(
            model=vlm_model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe the chess position and provide the best next move for black (algebraic notation). If it's not chess, just describe the image."},
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_b64}"}},
                    ],
                }
            ],
            max_tokens=256,
            temperature=0.0,
        )
        # message.content can be None; treat that the same as an empty answer.
        answer = (response.choices[0].message.content or "").strip()
        return answer or "No answer from VLM."
    except Exception as e:
        # Boundary handler: this function reports failures as strings, never raises.
        return f"VLM error: {e}"


def _caption_with_blip(image_url: str) -> str:
    """Download the image and caption it with the hosted BLIP model."""
    try:
        img_response = requests.get(
            image_url, headers=_hf_auth_headers(image_url), timeout=10
        )
        # Fail early instead of captioning the body of an HTTP error page.
        img_response.raise_for_status()
        img_b64 = base64.b64encode(img_response.content).decode("ascii")

        api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
        response = requests.post(
            api_url,
            headers=_hf_auth_headers(api_url),
            json={"inputs": img_b64},
            timeout=30,
        )
        if response.status_code != 200:
            return f"Caption API error: {response.status_code}"

        result = response.json()
        if isinstance(result, list) and result and isinstance(result[0], dict):
            return result[0].get("generated_text", "No caption")
        return str(result)
    except Exception as e:
        # Boundary handler: this function reports failures as strings, never raises.
        return f"Image processing error: {e}"


def image_caption(image_url: str) -> str:
    """Describe the image at *image_url*.

    The image is downloaded over http(s) and sent to one of two backends:

    * If OPENAI_API_KEY is set and VLM_MODEL_ID (default
      "gpt-4-vision-preview") starts with "gpt-4", the image is sent to the
      OpenAI chat-completions API with a fixed prompt asking for a chess
      position description and the best move for black, or a plain
      description if the image is not chess.
    * Otherwise the image is sent to the Hugging Face Inference API for the
      Salesforce/blip-image-captioning-base model.

    HF_TOKEN, when set, is sent only to huggingface.co hosts.

    Args:
        image_url: Absolute http(s) URL of the image.

    Returns:
        The model's answer or caption. Failures are not raised; they are
        returned as a string starting with "Image processing error:",
        "VLM error:" or "Caption API error:".
    """
    # Reject unusable input up front, before any network call is attempted.
    if not _is_http_url(image_url):
        return f"Image processing error: invalid image URL: {image_url!r}"
    image_url = image_url.strip()

    # Decide whether to use the OpenAI VLM or fall back to BLIP captioning.
    openai_api_key = os.getenv("OPENAI_API_KEY")
    vlm_model = os.getenv("VLM_MODEL_ID", "gpt-4-vision-preview")
    if openai_api_key and vlm_model.startswith("gpt-4"):
        return _caption_with_openai(image_url, openai_api_key, vlm_model)
    return _caption_with_blip(image_url)