Spaces:

mlbench123
/

mudflap_LLM

Sleeping

App Files Files Community

mlbench123 committed on Apr 8

Commit

f5c4e2c

verified ·

1 Parent(s): a6e7e36

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -64

app.py CHANGED Viewed

@@ -19,20 +19,20 @@ import os
 import re
 import io
-import requests
 from PIL import Image
 # ──────────────────────────────────────────────────────────────────────────────
 # MODELS  — ordered by reliability on HF free tier (most reliable first)
 # ──────────────────────────────────────────────────────────────────────────────
 MODELS = [
-    "meta-llama/Llama-3.2-11B-Vision-Instruct",   # Best free vision model on HF
-    "Qwen/Qwen2.5-VL-7B-Instruct",                # Good fallback
-    "google/gemma-3-4b-it",                        # Smaller, faster fallback
 ]
 # HF Serverless Inference — new router endpoint (api-inference.huggingface.co is deprecated as of 2026)
-HF_CHAT_URL = "https://router.huggingface.co/hf-inference/models/{model}/v1/chat/completions"
 # ──────────────────────────────────────────────────────────────────────────────
 # DETECTION PROMPT
@@ -165,72 +165,41 @@ def validate_result(data: dict) -> dict | None:
 def call_model(img: Image.Image, model: str, token: str) -> dict:
     """
-    Call one HF vision model via the chat-completions endpoint.
     Returns validated result dict on success.
     Raises RuntimeError with a clear message on failure.
     """
     b64 = pil_to_b64(img)
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {token}",
-    }
-    payload = {
-        "model": model,
-        "messages": [
-            {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
-                    },
-                    {
-                        "type": "text",
-                        "text": DETECTION_PROMPT,
-                    },
                 ],
-            }
-        ],
-        "max_tokens": 512,
-        "temperature": 0.05,
-        "stream": False,
-    }
-    url = HF_CHAT_URL.format(model=model)
-    short = model.split("/")[-1]
-    try:
-        resp = requests.post(url, headers=headers, json=payload, timeout=90)
-    except requests.exceptions.Timeout:
-        raise RuntimeError(f"{short}: request timed out (90s)")
-    except requests.exceptions.ConnectionError as e:
-        raise RuntimeError(f"{short}: connection error — {e}")
-    # ── HTTP-level error handling ────────────────────────────────────────────
-    if resp.status_code == 401:
-        raise RuntimeError(f"{short}: 401 Unauthorized — HF_TOKEN is missing or invalid")
-    if resp.status_code == 403:
-        raise RuntimeError(f"{short}: 403 Forbidden — token may not have access to this model")
-    if resp.status_code == 404:
-        raise RuntimeError(f"{short}: 404 Not Found — model not available on serverless endpoint")
-    if resp.status_code == 422:
-        raise RuntimeError(f"{short}: 422 Unprocessable — model may not support vision input")
-    if resp.status_code == 429:
-        raise RuntimeError(f"{short}: 429 Rate Limited — try again in ~60 seconds")
-    if resp.status_code in (502, 503):
-        raise RuntimeError(f"{short}: {resp.status_code} Service Unavailable — model is loading")
-    if resp.status_code != 200:
-        body_preview = resp.text[:200].replace("\n", " ")
-        raise RuntimeError(f"{short}: HTTP {resp.status_code} — {body_preview}")
-    # ── Parse response ───────────────────────────────────────────────────────
-    try:
-        body = resp.json()
-        content = body["choices"][0]["message"]["content"]
-    except (KeyError, IndexError, json.JSONDecodeError) as e:
-        raise RuntimeError(f"{short}: unexpected response shape — {e} | body: {resp.text[:200]}")
     print(f"[{short}] raw LLM output: {content[:300]}")  # visible in Space logs
@@ -487,7 +456,7 @@ print("=" * 60)
 print("  Amazon Trailer Inspector — startup")
 print(f"  HF_TOKEN : {'SET (' + str(len(_tok)) + ' chars)' if _tok else 'NOT SET ← add to Space Secrets!'}")
 print(f"  Models   : {[m.split('/')[-1] for m in MODELS]}")
-print(f"  Endpoint : {HF_CHAT_URL[:60]}...")
 print("=" * 60)
 # ──────────────────────────────────────────────────────────────────────────────

 import re
 import io
 from PIL import Image
+from huggingface_hub import InferenceClient
 # ──────────────────────────────────────────────────────────────────────────────
 # MODELS  — ordered by reliability on HF free tier (most reliable first)
 # ──────────────────────────────────────────────────────────────────────────────
+# Verify live status: huggingface.co/models?pipeline_tag=image-text-to-text&inference=warm
 MODELS = [
+    "meta-llama/Llama-3.2-11B-Vision-Instruct",   # Primary
+    "Qwen/Qwen2.5-VL-3B-Instruct",                # Smaller Qwen — more likely warm
+    "microsoft/Phi-3.5-vision-instruct",           # Fallback
 ]
 # HF Serverless Inference — accessed through huggingface_hub.InferenceClient with provider="hf-inference" (api-inference.huggingface.co is deprecated as of 2026)
 # ──────────────────────────────────────────────────────────────────────────────
 # DETECTION PROMPT
 def call_model(img: Image.Image, model: str, token: str) -> dict:
     """
+    Call one HF vision model via InferenceClient with provider='hf-inference'.
+    This is the official HF-recommended approach after api-inference deprecation.
     Returns validated result dict on success.
     Raises RuntimeError with a clear message on failure.
     """
     b64 = pil_to_b64(img)
+    short = model.split("/")[-1]
+    try:
+        client = InferenceClient(provider="hf-inference", api_key=token)
+        resp = client.chat_completion(
+            model=model,
+            messages=[{
                 "role": "user",
                 "content": [
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
+                    {"type": "text", "text": DETECTION_PROMPT},
                 ],
+            }],
+            max_tokens=512,
+            temperature=0.05,
+        )
+        raw_content = resp.choices[0].message.content
+    except Exception as e:
+        err = str(e)
+        if "401" in err or "403" in err:
+            raise RuntimeError(f"{short}: auth error — check HF_TOKEN ({err[:120]})")
+        elif "404" in err:
+            raise RuntimeError(f"{short}: 404 — model not on free serverless tier ({err[:120]})")
+        elif "429" in err:
+            raise RuntimeError(f"{short}: rate limited — retry in ~60s")
+        elif "503" in err or "502" in err:
+            raise RuntimeError(f"{short}: model loading/unavailable — retry shortly")
+        else:
+            raise RuntimeError(f"{short}: {err[:200]}")
     print(f"[{short}] raw LLM output: {content[:300]}")  # visible in Space logs
 print("  Amazon Trailer Inspector — startup")
 print(f"  HF_TOKEN : {'SET (' + str(len(_tok)) + ' chars)' if _tok else 'NOT SET ← add to Space Secrets!'}")
 print(f"  Models   : {[m.split('/')[-1] for m in MODELS]}")
+print(f"  Method   : InferenceClient(provider='hf-inference')")
 print("=" * 60)
 # ──────────────────────────────────────────────────────────────────────────────