|
|
import base64
import io
import json
import os
import re
import time
from typing import Any, Dict, List, Optional

from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
|
|
|
|
|
|
|
|
def _b64_to_pil(data_url: str) -> Image.Image: |
|
|
if not isinstance(data_url, str) or not data_url.startswith("data:"): |
|
|
raise ValueError("Expected a data URL starting with 'data:'") |
|
|
header, b64data = data_url.split(',', 1) |
|
|
raw = base64.b64decode(b64data) |
|
|
img = Image.open(io.BytesIO(raw)) |
|
|
img.load() |
|
|
return img |
|
|
|
|
|
|
|
|
class EndpointHandler:
    """Custom handler for Hugging Face Inference Endpoints (Qwen2.5-VL).

    Accepts either a compact payload::

        {"system": "...", "user": "...", "image": "data:..."}

    or a legacy OpenAI-style payload::

        {"messages": [{"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "data:..."}},
            {"type": "text", "text": "..."}]}]}

    Either form may be wrapped in the Endpoints envelope ``{"inputs": ...}``
    (dict or JSON string/bytes).

    Output: ``{"points": [{"x": nx, "y": ny}], "raw": str}`` with coordinates
    normalized to [0, 1], or ``{"error": str}`` for invalid input.
    """

    def __init__(self, path: str = "") -> None:
        """Configure environment defaults and load the processor.

        The vLLM engine itself is created lazily by ``_ensure_llm`` so that
        endpoint start-up stays fast and GPU memory is only claimed on the
        first request. ``path`` is accepted for Endpoints interface
        compatibility but unused; the model comes from the ``MODEL_ID``
        environment variable.
        """
        model_id = os.environ.get("MODEL_ID") or "HelloKKMe/GTA1-7B"

        # Conservative runtime defaults; setdefault preserves any values the
        # operator has already chosen.
        os.environ.setdefault("OMP_NUM_THREADS", "1")
        os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

        # Faster model downloads from the Hub.
        os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
        os.environ.setdefault("HF_HUB_ENABLE_QUIC", "1")

        # Accept any of the conventional token variables...
        hub_token = (
            os.environ.get("HUGGINGFACE_HUB_TOKEN")
            or os.environ.get("HF_HUB_TOKEN")
            or os.environ.get("HF_TOKEN")
        )
        # ...and mirror it under HF_TOKEN, which newer libraries read.
        if hub_token and not os.environ.get("HF_TOKEN"):
            os.environ["HF_TOKEN"] = hub_token

        os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

        self._model_id = model_id
        self._tp = self._tensor_parallel_size()
        self.llm = None  # lazily-built vLLM engine (see _ensure_llm)
        self.processor = AutoProcessor.from_pretrained(
            model_id, trust_remote_code=True, token=hub_token
        )

    @staticmethod
    def _tensor_parallel_size() -> int:
        """Choose the tensor-parallel degree: one rank per visible GPU, min 1."""
        visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible and visible.strip():
            try:
                gpus = [d for d in visible.split(",") if d.strip() and d.strip() != "-1"]
                return max(1, len(gpus))
            except Exception:
                return 1
        try:
            import torch

            return max(1, int(torch.cuda.device_count())) if torch.cuda.is_available() else 1
        except Exception:
            # torch missing or CUDA probing failed; fall back to a single rank.
            return 1

    def _ensure_llm(self) -> None:
        """Create the vLLM engine on first use (idempotent)."""
        if self.llm is not None:
            return
        self.llm = LLM(
            model=self._model_id,
            tensor_parallel_size=self._tp,
            pipeline_parallel_size=1,
            gpu_memory_utilization=0.95,
            dtype="auto",
            distributed_executor_backend="mp",
            enforce_eager=True,
            trust_remote_code=True,
        )

    @staticmethod
    def _unwrap_inputs(data: Any) -> Any:
        """Unwrap the Endpoints ``{"inputs": ...}`` envelope, if present.

        The envelope value may be a dict, or a JSON string/bytes that decodes
        to a dict. Anything unparseable is returned unchanged so downstream
        validation can produce the proper error response.
        """
        if not (isinstance(data, dict) and "inputs" in data):
            return data
        inputs_val = data.get("inputs")
        if isinstance(inputs_val, dict):
            return inputs_val
        if isinstance(inputs_val, (str, bytes, bytearray)):
            try:
                if isinstance(inputs_val, (bytes, bytearray)):
                    inputs_val = inputs_val.decode("utf-8")
                parsed = json.loads(inputs_val)
                if isinstance(parsed, dict):
                    return parsed
            except Exception:
                pass  # best-effort unwrap; fall through with original payload
        return data

    @staticmethod
    def _extract_xy(s: str):
        """Parse the first ``(x, y)`` pair from model output, or None."""
        try:
            m = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", s)
            if not m:
                return None
            x_str, y_str = m[0]
            return float(x_str), float(y_str)
        except Exception:
            return None

    def __call__(self, data: Dict[str, Any]) -> Any:
        """Run one grounding request and return normalized click coordinates."""
        data = self._unwrap_inputs(data)

        system_prompt: Optional[str] = None
        user_text: Optional[str] = None
        img_for_dims: Optional["Image.Image"] = None

        if isinstance(data, dict) and ("system" in data or "user" in data or "image" in data):
            # Compact payload: explicit system/user/image fields.
            system_prompt = data.get("system")
            user_text = data.get("user")
            image_data_url = data.get("image")
            if not isinstance(image_data_url, str) or not image_data_url.startswith("data:"):
                return {"error": "image must be a data URL (data:...)"}
            try:
                img_for_dims = _b64_to_pil(image_data_url)
            except Exception as e:
                return {"error": f"Failed to decode image: {e}"}
        else:
            # Legacy OpenAI-style payload: scan messages for the pieces.
            messages = data.get("messages")
            if not messages:
                return {"error": "Provide 'system','user','image' or legacy 'messages'"}
            first_img: Optional["Image.Image"] = None
            for msg in messages:
                if msg.get("role") == "system" and system_prompt is None:
                    content = msg.get("content")
                    system_prompt = content if isinstance(content, str) else None
                if msg.get("role") != "user":
                    continue
                image_url: Optional[str] = None
                text_piece: Optional[str] = None
                for part in msg.get("content", []):
                    if part.get("type") == "image_url":
                        image_url = part.get("image_url", {}).get("url")
                    elif part.get("type") == "text":
                        text_piece = part.get("text")
                if not image_url or not text_piece:
                    return {"error": "Content must include image_url (data URL) and text."}
                if not isinstance(image_url, str) or not image_url.startswith("data:"):
                    return {"error": "image_url.url must be a data URL (data:...)"}
                try:
                    img_for_dims = _b64_to_pil(image_url)
                    first_img = first_img or img_for_dims
                except Exception:
                    img_for_dims = None  # tolerated here; validated below
                # First text piece wins across multiple user messages.
                user_text = user_text or text_piece
            if first_img is not None:
                img_for_dims = first_img

        width = getattr(img_for_dims, "width", None)
        height = getattr(img_for_dims, "height", None)
        if width and height:
            try:
                print(f"[gta1-endpoint] Received image size: {width}x{height}")
            except Exception:
                pass

        if not isinstance(img_for_dims, Image.Image) or not isinstance(user_text, str):
            return {"error": "Failed to prepare image/text for inference."}

        # Build the chat exactly once; this is the structure actually fed to
        # both the vision preprocessor and the chat template.
        system_message = {"role": "system", "content": system_prompt or ""}
        user_message = {
            "role": "user",
            "content": [
                {"type": "image", "image": img_for_dims},
                {"type": "text", "text": user_text},
            ],
        }

        image_inputs, _video_inputs = process_vision_info([system_message, user_message])

        text = self.processor.apply_chat_template(
            [system_message, user_message], tokenize=False, add_generation_prompt=True
        )

        request: Dict[str, Any] = {"prompt": text}
        if image_inputs:
            request["multi_modal_data"] = {"image": image_inputs}

        t_start = time.time()
        self._ensure_llm()
        # Greedy, short decode: the model emits a single "(x, y)" answer.
        sampling_params = SamplingParams(max_tokens=32, temperature=0.0, top_p=1.0)
        outputs = self.llm.generate([request], sampling_params=sampling_params, use_tqdm=False)
        out_text = outputs[0].outputs[0].text
        t_infer = time.time() - t_start

        pred = self._extract_xy(out_text)

        def _start_end(s: Optional[str], n: int = 120):
            # First/last n characters for compact log lines.
            if not s:
                return ("", "")
            return (s[:n], s[-n:] if len(s) > n else s)

        sys_start, sys_end = _start_end(system_prompt)
        try:
            print(f"[gta1-endpoint] System prompt (start): {sys_start}")
            print(f"[gta1-endpoint] System prompt (end): {sys_end}")
            print(f"[gta1-endpoint] User prompt (full): {user_text}")
            print(f"[gta1-endpoint] Raw output: {out_text}")
            print(f"[gta1-endpoint] Inference time: {t_infer:.3f}s")
        except Exception:
            pass  # logging must never break the request

        if pred is None or not (width and height):
            return {"error": "Failed to parse coordinates or missing image dimensions."}

        # Clamp predicted pixels to the image bounds, then normalize to [0, 1].
        px = max(0.0, min(float(pred[0]), float(width)))
        py = max(0.0, min(float(pred[1]), float(height)))

        nx = px / float(width)
        ny = py / float(height)
        return {
            "points": [{"x": nx, "y": ny}],
            "raw": out_text,
        }
|
|
|
|
|
|
|
|
|