Forrest Wargo committed on
Commit
92c5d3d
·
0 Parent(s):

Add vLLM Qwen3-VL endpoint

Browse files
Files changed (2) hide show
  1. handler.py +213 -0
  2. requirements.txt +5 -0
handler.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import json
4
+ import os
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+
7
+ from PIL import Image
8
+
9
+ from transformers import AutoProcessor
10
+ from vllm import LLM, SamplingParams
11
+
12
+
13
def _b64_to_pil(data_url: str) -> Image.Image:
    """Decode a base64 ``data:`` URL into a fully-loaded PIL image.

    Raises ValueError for anything that is not a data URL; base64 or
    image-decoding errors propagate from the underlying libraries.
    """
    if not (isinstance(data_url, str) and data_url.startswith("data:")):
        raise ValueError("Expected a data URL starting with 'data:'")
    # Everything after the first comma is the base64 payload; the header
    # (mime type / encoding) is ignored because PIL sniffs the format.
    _header, payload = data_url.split(",", 1)
    image = Image.open(io.BytesIO(base64.b64decode(payload)))
    image.load()  # force the full decode now so errors surface here, not later
    return image
21
+
22
+
23
class EndpointHandler:
    """HF Inference Endpoint handler for Qwen3-VL chat-to-point.

    Input:
        - { system, user, image(data URL) }
        - or legacy OpenAI-style messages with image_url + text

    Output:
        - { points: [{x,y}], raw: <string> }
          where x,y are normalized [0,1]
    """

    def __init__(self, path: str = "") -> None:
        """Configure the environment and load the processor.

        The vLLM engine itself is built lazily on first request (see
        ``_ensure_llm``) so container startup stays fast.

        Args:
            path: Unused; present for HF endpoint handler compatibility.
        """
        model_id = os.environ.get("MODEL_ID") or "Qwen/Qwen3-VL-8B-Instruct"

        # setdefault keeps any operator-supplied overrides intact.
        os.environ.setdefault("OMP_NUM_THREADS", "1")
        os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
        os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
        os.environ.setdefault("HF_HUB_ENABLE_QUIC", "1")
        os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

        # Auto TP detection from visible GPUs
        visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible and visible.strip():
            try:
                # "-1" means "no GPUs" in CUDA_VISIBLE_DEVICES; ignore it.
                candidates = [d for d in visible.split(",") if d.strip() and d.strip() != "-1"]
                tp = max(1, len(candidates))
            except Exception:
                tp = 1
        else:
            try:
                import torch  # local import to avoid global dependency if CPU-only
                tp = max(1, int(torch.cuda.device_count())) if torch.cuda.is_available() else 1
            except Exception:
                tp = 1

        self._model_id = model_id
        self._tp = tp
        self.llm = None  # type: ignore  # built lazily in _ensure_llm()

        # Accept the common token env-var spellings and mirror the winner
        # into HF_TOKEN so downstream hub calls pick it up.
        hub_token = (
            os.environ.get("HUGGINGFACE_HUB_TOKEN")
            or os.environ.get("HF_HUB_TOKEN")
            or os.environ.get("HF_TOKEN")
        )
        if hub_token and not os.environ.get("HF_TOKEN"):
            try:
                os.environ["HF_TOKEN"] = hub_token
            except Exception:
                pass

        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, token=hub_token)

    def _ensure_llm(self) -> None:
        """Build the vLLM engine on first use; no-op afterwards.

        NOTE(review): not thread-safe — concurrent first requests could
        build two engines. Assumes the endpoint serializes requests;
        confirm against the serving runtime.
        """
        if self.llm is not None:
            return
        self.llm = LLM(
            model=self._model_id,
            tensor_parallel_size=self._tp,
            pipeline_parallel_size=1,
            gpu_memory_utilization=0.95,
            dtype="auto",
            distributed_executor_backend="mp",
            enforce_eager=True,
            trust_remote_code=True,
        )

    @staticmethod
    def _parse_legacy_messages(messages: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Extract (system prompt, first user text, first image data URL)
        from OpenAI-style chat messages.

        Non-dict messages and non-dict content parts are skipped instead of
        raising, since this payload comes from untrusted callers.
        """
        system_prompt: Optional[str] = None
        first_image_data_url: Optional[str] = None
        first_text: Optional[str] = None
        for msg in messages:
            if not isinstance(msg, dict):
                # Malformed entry from the caller; ignore rather than 500.
                continue
            if msg.get("role") == "system" and system_prompt is None:
                content = msg.get("content")
                if isinstance(content, str):
                    system_prompt = content
            if msg.get("role") == "user":
                content = msg.get("content", [])
                if not isinstance(content, list):
                    continue
                for part in content:
                    if not isinstance(part, dict):
                        # Malformed content part; ignore rather than 500.
                        continue
                    if part.get("type") == "image_url" and not first_image_data_url:
                        url = part.get("image_url", {}).get("url")
                        if isinstance(url, str) and url.startswith("data:"):
                            first_image_data_url = url
                    if part.get("type") == "text" and not first_text:
                        t = part.get("text")
                        if isinstance(t, str):
                            first_text = t
        return system_prompt, first_text, first_image_data_url

    def __call__(self, data: Dict[str, Any]) -> Any:
        """Handle one request: decode the image, run the model, and return
        the first (x, y) point parsed from the output, normalized to [0,1].

        Errors are reported as {"error": ...} dicts rather than raised.
        """
        # Normalize HF toolkit payloads: the toolkit may wrap the real
        # payload under "inputs", possibly as a JSON string or bytes.
        if isinstance(data, dict) and "inputs" in data:
            inputs_val = data.get("inputs")
            if isinstance(inputs_val, dict):
                data = inputs_val
            elif isinstance(inputs_val, (str, bytes, bytearray)):
                try:
                    if isinstance(inputs_val, (bytes, bytearray)):
                        inputs_val = inputs_val.decode("utf-8")
                    parsed = json.loads(inputs_val)
                    if isinstance(parsed, dict):
                        data = parsed
                except Exception:
                    pass  # fall through and treat the original dict as-is

        system_prompt: Optional[str] = None
        user_text: Optional[str] = None
        image_data_url: Optional[str] = None

        if isinstance(data, dict) and ("system" in data or "user" in data or "image" in data):
            # Preferred flat payload shape.
            system_prompt = data.get("system")
            user_text = data.get("user")
            image_data_url = data.get("image")
            if not isinstance(image_data_url, str) or not image_data_url.startswith("data:"):
                return {"error": "image must be a data URL (data:...)"}
        else:
            # Legacy OpenAI-style "messages" payload.
            messages = data.get("messages") if isinstance(data, dict) else None
            if not messages:
                return {"error": "Provide 'system','user','image' or legacy 'messages'"}
            system_prompt, user_text, image_data_url = self._parse_legacy_messages(messages)
            if not isinstance(image_data_url, str) or not image_data_url.startswith("data:"):
                return {"error": "messages.content image_url.url must be a data URL (data:...)"}

        try:
            pil = _b64_to_pil(image_data_url)
        except Exception as e:
            return {"error": f"Failed to decode image: {e}"}

        width = getattr(pil, "width", None)
        height = getattr(pil, "height", None)
        if isinstance(width, int) and isinstance(height, int):
            try:
                print(f"[qwen3-vl-endpoint] Received image size: {width}x{height}")
            except Exception:
                pass  # logging must never fail the request

        if not isinstance(user_text, str):
            return {"error": "user text must be provided"}

        system_message = {"role": "system", "content": system_prompt or ""}
        user_message = {
            "role": "user",
            "content": [
                {"type": "image", "image": pil},
                {"type": "text", "text": user_text},
            ],
        }

        prompt = self.processor.apply_chat_template(
            [system_message, user_message], tokenize=False, add_generation_prompt=True
        )

        # vLLM takes the raw image separately from the templated prompt.
        request: Dict[str, Any] = {"prompt": prompt}
        request["multi_modal_data"] = {"image": [pil]}

        import time
        t0 = time.time()
        self._ensure_llm()
        # Greedy decoding: pointing answers should be deterministic and short.
        sampling_params = SamplingParams(max_tokens=32, temperature=0.0, top_p=1.0)
        outputs = self.llm.generate([request], sampling_params=sampling_params, use_tqdm=False)
        out_text = outputs[0].outputs[0].text
        t1 = time.time()

        try:
            print(f"[qwen3-vl-endpoint] Prompt: {user_text}")
            print(f"[qwen3-vl-endpoint] Raw output: {out_text}")
            print(f"[qwen3-vl-endpoint] Inference time: {t1 - t0:.3f}s")
        except Exception:
            pass  # logging must never fail the request

        try:
            import re
            # First "(x, y)" pair in the output; assumes the model answers
            # in pixel coordinates of the input image — TODO confirm.
            m = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", out_text)
            if not m:
                return {"error": "Failed to parse coordinates from model output."}
            x_str, y_str = m[0]
            px, py = float(x_str), float(y_str)
            if not isinstance(width, int) or not isinstance(height, int):
                return {"error": "Missing image dimensions for normalization."}
            w, h = float(width), float(height)
            # Clamp to the image bounds, then normalize to [0,1].
            px = max(0.0, min(px, w))
            py = max(0.0, min(py, h))
            nx, ny = px / w, py / h
            return {"points": [{"x": nx, "y": ny}], "raw": out_text}
        except Exception as e:
            return {"error": f"Postprocessing failed: {e}"}
212
+
213
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch==2.8.0
2
+ Pillow
3
+ transformers==4.57.1
4
+ vllm==0.11.0
5
+