"""
Amazon Trailer Inspector — app.py
HuggingFace Spaces · Gradio 5.x · Free vision LLMs
FIXES over previous version:
- Uses requests directly (avoids huggingface_hub API version breakage)
- Correct chat-completions endpoint format for HF Serverless Inference
- Updated model list to currently-working free vision models
- Removed blocking whoami() startup check
- Robust JSON extraction with multi-pass recovery
- Detailed per-model error logging to Space logs
"""
import gradio as gr
import base64
import concurrent.futures
import json
import os
import re
import io
from PIL import Image
from huggingface_hub import InferenceClient
# ──────────────────────────────────────────────────────────────────────────────
# MODELS — ordered by reliability on HF free tier (most reliable first)
# ──────────────────────────────────────────────────────────────────────────────
# Confirmed DEPLOYED vision models — verified from HF inference/models table April 9 2026
# google/gemma-4-31B-it → novita (cheapest) + together (fastest) ✅ VISION
# google/gemma-4-26B-A4B-it → novita ✅ VISION (MoE: faster/cheaper)
# Qwen/Qwen3-VL-8B-Instruct → novita + together ✅ VISION (VL = Vision-Language)
# Ordered fallback cascade: analyze_one() tries each model top-to-bottom and
# returns the first successful, validated result. Keep the most reliable
# model first — later entries are only hit after an earlier one errors.
MODELS = [
    "google/gemma-4-31B-it",       # Primary — best quality, novita + together
    "google/gemma-4-26B-A4B-it",   # Fallback 1 — MoE variant, faster (4B active params)
    "Qwen/Qwen3-VL-8B-Instruct",   # Fallback 2 — dedicated VL model, novita + together
]
# HF Serverless Inference — new router endpoint (api-inference.huggingface.co is deprecated as of 2026)
# ──────────────────────────────────────────────────────────────────────────────
# DETECTION PROMPT
# ──────────────────────────────────────────────────────────────────────────────
# NOTE(review): the previous prompt string contained leftover diff/merge
# artifacts — a duplicated GPS_DEVICE description plus lines prefixed with
# literal "+" markers — pasted straight into the runtime text. They are merged
# below into a single coherent GPS_DEVICE entry; all other wording is unchanged.
DETECTION_PROMPT = """You are a precise visual inspector for Amazon trailer fleets.
Carefully examine the full trailer image and locate these 4 components:
1. SENSORS — Exactly TWO silver/beige DIAMOND (rhombus/rotated-square) shaped metal plates.
They are mounted near the lower-rear area on the back doors of the trailer.
2. GPS_DEVICE — A small rectangular electronic tracking box mounted at the upper
corner of the trailer rear face. About the size of a paperback book.
It may:
- be white, gray, or black
- include cables, mounts, or connectors
- appear inside a recessed panel or metal frame
- not be a perfect rectangle
3. PRIME_LOGO — The Amazon Prime branding logo: the word "prime" OR "amazon" OR the Amazon arrow/smile logo
OR both. Can be full or partially visible, on rear or side of trailer. Find it carefully. It can be partial, small/tiny, large etc.
4. TRAILER_ID — A vertical fluorescent-green or yellow-green label strip on the corner post/pillar,
showing an alphanumeric code like "SV2602705".
IMPORTANT: Reply ONLY with valid JSON — absolutely no extra text before or after, no markdown fences:
{
"sensors": {"found": true, "confidence": "high", "notes": "two diamond plates visible lower-left"},
"gps_device": {"found": false, "confidence": "medium", "notes": "top corner not visible in this angle"},
"prime_logo": {"found": true, "confidence": "high", "notes": "prime word visible on rear panel"},
"trailer_id": {"found": true, "confidence": "high", "notes": "SV2602705 on right corner post"}
}"""
# Required keys of every model response — also the merge/render order downstream.
KEYS = ["sensors", "gps_device", "prime_logo", "trailer_id"]
# ──────────────────────────────────────────────────────────────────────────────
# IMAGE HELPERS
# ──────────────────────────────────────────────────────────────────────────────
def pil_to_b64(img: Image.Image, max_side: int = 1024) -> str:
    """Encode *img* as a base64 JPEG string, shrinking it so its longest side <= max_side."""
    rgb = img.copy().convert("RGB")
    # thumbnail() resizes in place, preserving aspect ratio; only shrink, never upscale.
    if max(rgb.size) > max_side:
        rgb.thumbnail((max_side, max_side), Image.LANCZOS)
    buffer = io.BytesIO()
    rgb.save(buffer, format="JPEG", quality=82)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
def load_images(file_paths) -> list[Image.Image]:
"""Load PIL images from Gradio 5.x file paths (str or filepath objects)."""
imgs = []
if not file_paths:
return imgs
if isinstance(file_paths, str):
file_paths = [file_paths]
for p in file_paths:
try:
path = p if isinstance(p, str) else getattr(p, "name", str(p))
imgs.append(Image.open(path).convert("RGB"))
except Exception as e:
print(f"[load_images] skipped {p}: {e}")
return imgs
# ──────────────────────────────────────────────────────────────────────────────
# JSON EXTRACTION — multi-pass recovery
# ──────────────────────────────────────────────────────────────────────────────
def extract_json(text: str) -> dict | None:
"""Try multiple strategies to pull valid JSON from LLM output."""
if not text:
return None
# Strip markdown code fences
text = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).replace("```", "").strip()
# Find outermost { ... } block
m = re.search(r"\{[\s\S]*\}", text)
if not m:
return None
raw = m.group()
# Pass 1: direct parse
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
# Pass 2: fix trailing commas
fixed = re.sub(r",\s*([}\]])", r"\1", raw)
try:
return json.loads(fixed)
except json.JSONDecodeError:
pass
# Pass 3: extract only the lines containing our keys
try:
rebuilt = {
key: json.loads(
re.search(
rf'"{key}"\s*:\s*(\{{[^}}]+\}})', raw, re.DOTALL
).group(1)
)
for key in KEYS
if re.search(rf'"{key}"\s*:\s*\{{', raw)
}
if rebuilt:
return rebuilt
except Exception:
pass
return None
def validate_result(data: dict) -> dict | None:
    """Normalize a parsed detection dict.

    Returns None (hard fail) when *data* is falsy or any required key is not a
    dict. Otherwise coerces each entry: "found" becomes a real bool (accepting
    "true"/"yes"/"1" strings), "confidence" defaults to "low", and "notes" is
    stripped text.
    """
    if not data:
        return None
    normalized = {}
    for key in KEYS:
        entry = data.get(key)
        if not isinstance(entry, dict):
            return None  # missing or malformed required key
        flag = entry.get("found", False)
        if isinstance(flag, str):
            # Models sometimes emit "true"/"yes" strings instead of JSON booleans.
            flag = flag.lower() in ("true", "yes", "1")
        normalized[key] = {
            "found": bool(flag),
            "confidence": entry.get("confidence", "low") or "low",
            "notes": (entry.get("notes") or "").strip(),
        }
    return normalized
# ──────────────────────────────────────────────────────────────────────────────
# LLM CALL — direct requests, no huggingface_hub dependency for inference
# ──────────────────────────────────────────────────────────────────────────────
def call_model(img: Image.Image, model: str, token: str) -> dict:
    """
    Run one HF vision model against *img* via InferenceClient with provider="auto".

    Returns the validated detection dict on success.
    Raises RuntimeError with a short, status-specific message on any failure
    (auth, missing model, rate limit, cold start, or unparseable output).
    """
    encoded = pil_to_b64(img)
    short = model.split("/")[-1]
    try:
        # provider="auto": the HF router picks an available provider for this
        # model (vision LLMs are not served by the CPU-only hf-inference backend).
        client = InferenceClient(provider="auto", api_key=token)
        resp = client.chat_completion(
            model=model,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}},
                    {"type": "text", "text": DETECTION_PROMPT},
                ],
            }],
            max_tokens=512,
            temperature=0.05,
        )
        raw_content = resp.choices[0].message.content
    except Exception as e:
        # Translate the raw error into an actionable message by matching HTTP
        # status substrings; checked in priority order, first match wins.
        err = str(e)
        if "401" in err or "403" in err:
            raise RuntimeError(f"{short}: auth error — check HF_TOKEN ({err[:120]})")
        if "404" in err:
            raise RuntimeError(f"{short}: 404 — model not on free serverless tier ({err[:120]})")
        if "429" in err:
            raise RuntimeError(f"{short}: rate limited — retry in ~60s")
        if "503" in err or "502" in err:
            raise RuntimeError(f"{short}: model loading/unavailable — retry shortly")
        raise RuntimeError(f"{short}: {err[:200]}")
    print(f"[{short}] raw LLM output: {raw_content[:300]}")  # visible in Space logs
    parsed = extract_json(raw_content)
    result = validate_result(parsed)
    if result is None:
        raise RuntimeError(
            f"{short}: could not extract valid JSON.\n"
            f"Raw output (first 300 chars): {raw_content[:300]}"
        )
    return result
# ──────────────────────────────────────────────────────────────────────────────
# PER-IMAGE ANALYSIS — try each model in order
# ──────────────────────────────────────────────────────────────────────────────
def analyze_one(img: Image.Image, token: str) -> tuple[dict | None, str]:
    """
    Run the MODELS cascade on a single image.

    Returns (result_dict, model_short_name) from the first model that
    succeeds, or (None, " | "-joined error messages) when every model fails.
    """
    failures: list[str] = []
    for model in MODELS:
        short = model.split("/")[-1]
        try:
            result = call_model(img, model, token)
        except RuntimeError as exc:
            message = str(exc)
            print(f"[analyze_one] FAIL {message}")
            failures.append(message)
            continue  # fall through to the next model in the cascade
        print(f"[analyze_one] SUCCESS with {short}")
        return result, short
    return None, " | ".join(failures)
# ──────────────────────────────────────────────────────────────────────────────
# RESULT MERGING
# ──────────────────────────────────────────────────────────────────────────────
# Ordering for confidence strings; unknown values rank 0 (never win).
CONF_RANK = {"high": 3, "medium": 2, "low": 1, "": 0}
def merge(results: list[dict]) -> dict:
    """Combine per-image detections into one dict keyed by KEYS.

    Any image reporting found=True wins; the highest confidence seen wins
    (even from a not-found image); the first non-empty note is kept.
    """
    combined = {key: {"found": False, "confidence": "low", "notes": ""} for key in KEYS}
    for result in results:
        if not result:
            continue
        for key in KEYS:
            candidate = result.get(key, {})
            if candidate.get("found"):
                combined[key]["found"] = True
            candidate_rank = CONF_RANK.get(candidate.get("confidence", ""), 0)
            if candidate_rank > CONF_RANK.get(combined[key]["confidence"], 0):
                combined[key]["confidence"] = candidate["confidence"]
            if candidate.get("notes") and not combined[key]["notes"]:
                combined[key]["notes"] = candidate["notes"]
    return combined
# ──────────────────────────────────────────────────────────────────────────────
# MAIN GRADIO CALLBACK
# ──────────────────────────────────────────────────────────────────────────────
def analyze(file_paths):
    """
    Main Gradio callback: run the detection cascade over every uploaded image
    in parallel and return (results_html, status_html).

    NOTE(review): this function's source arrived whitespace/markup-mangled —
    raw newlines inside single-quoted string literals and stripped HTML tags
    made it a syntax error. The user-facing strings below are reconstructed
    with <br> separators, since the output is rendered through gr.HTML;
    confirm the exact markup against the deployed Space.
    """
    token = os.environ.get("HF_TOKEN", "").strip()
    # ── Token guard — fail early with an actionable setup message ───────────
    if not token:
        return (
            _error(
                "Setup required: HF_TOKEN not set.<br>"
                "Go to your Space → Settings → Repository Secrets "
                "→ add a secret named HF_TOKEN with your "
                "HuggingFace Read token.<br>"
                "Get a free token at huggingface.co/settings/tokens"
            ),
            _status("error"),
        )
    images = load_images(file_paths)
    if not images:
        return _placeholder(), _status("idle")
    n = len(images)
    print(f"[analyze] processing {n} image(s)")
    all_results, all_errors, models_used = [], [], set()
    # Parallel: one worker thread per image, capped at 4 to stay within the
    # free-tier rate limits (more concurrency mostly yields 429s).
    with concurrent.futures.ThreadPoolExecutor(max_workers=min(n, 4)) as pool:
        futs = {pool.submit(analyze_one, img, token): i for i, img in enumerate(images)}
        for fut in concurrent.futures.as_completed(futs):
            res, meta = fut.result()
            if res is not None:
                all_results.append(res)
                models_used.add(meta)   # meta = short model name on success
            else:
                all_errors.append(meta) # meta = joined error string on failure
    # Total failure: surface exact per-model errors plus likely remedies.
    if not all_results:
        err_lines = "<br>".join(f"{e}" for e in all_errors) or "Unknown error"
        return (
            _error(
                f"All models failed for all images.<br>"
                f"Exact errors:<br>{err_lines}<br>"
                f"Most likely fixes:<br>"
                f"• 401/403 → HF_TOKEN is wrong or expired — regenerate at "
                f"hf.co/settings/tokens<br>"
                f"• 429 → Rate limited — wait 60 seconds and retry<br>"
                f"• 404 → Model temporarily unavailable — retry or report as issue<br>"
                f"• 503 → Model is loading (cold start) — wait 30s and retry"
            ),
            _status("error"),
        )
    merged = merge(all_results)
    model_str = " · ".join(sorted(models_used)) or "AI"
    # Partial failure: still show merged results, but append a warning banner
    # carrying the first error (truncated) so the user knows coverage is partial.
    warn = ""
    if all_errors:
        warn = (
            f"<br>⚠️ {len(all_errors)} image(s) failed — "
            f"{all_errors[0][:100]}"
        )
    return build_cards(merged, n, model_str, warn), _status("done", n, len(all_results))
# ──────────────────────────────────────────────────────────────────────────────
# HTML BUILDERS
# ──────────────────────────────────────────────────────────────────────────────
# Per-component render metadata:
# (key, icon, display name, description, accent colour, pill colour).
# Keys match KEYS / the JSON schema produced by the detection prompt.
COMP_META = [
    ("sensors", "🔷", "Sensors", "Two diamond-shaped sensor plates", "#f59e0b", "#fef3c7"),
    ("gps_device", "📡", "GPS Device", "White electronic box — upper corner", "#3b82f6", "#dbeafe"),
    ("prime_logo", "🔶", "Prime Logo", "Amazon Prime logo (full or partial)", "#f97316", "#fff7ed"),
    ("trailer_id", "🏷️", "Trailer ID Label", "Vertical strip on the corner post", "#10b981", "#d1fae5"),
]
# Confidence level → text colour used when rendering result cards.
CONF_COLOR = {"high": "#15803d", "medium": "#b45309", "low": "#b91c1c"}
def build_cards(merged: dict, img_n: int, model_str: str, warn: str) -> str:
found_n = sum(1 for k, *_ in COMP_META if merged.get(k, {}).get("found"))
total = len(COMP_META)
all_ok = found_n == total
# Banner colours
if all_ok:
sc, sb, se, si, sl = "#16a34a", "#f0fdf4", "#86efac", "✅", "All Clear — All Components Found"
elif found_n >= 3:
sc, sb, se, si, sl = "#d97706", "#fffbeb", "#fde68a", "⚠️", "Mostly Complete"
elif found_n >= 2:
sc, sb, se, si, sl = "#ea580c", "#fff7ed", "#fed7aa", "⚠️", "Partially Complete"
else:
sc, sb, se, si, sl = "#dc2626", "#fef2f2", "#fca5a5", "❌", "Missing Components"
rows = ""
for key, icon, name, desc, accent, pill in COMP_META:
d = merged.get(key, {})
found = d.get("found", False)
conf = d.get("confidence", "low")
notes = d.get("notes", "")
rbg = "#f0fdf4" if found else "#fef2f2"
rbd = "#bbf7d0" if found else "#fecaca"
stc = "#15803d" if found else "#b91c1c"
stx = "✅ Found" if found else "❌ Missing"
cdc = CONF_COLOR.get(conf, "#9ca3af")
note_html = (
f'
HF_TOKEN = your Read token from '
'huggingface.co/settings/tokens'
'AI-powered verification of required trailer components from photos
💡 Upload front, rear, or side views — more angles = better accuracy
""") analyze_btn = gr.Button( "🔍 Analyze Trailer", variant="primary", size="lg", elem_id="analyze-btn", ) status_html = gr.HTML(_status("idle")) # RIGHT COLUMN — results with gr.Column(scale=1, min_width=320): result_html = gr.HTML(_placeholder()) gr.HTML("""