Spaces:

mlbench123
/

mudflap_LLM

Sleeping

App Files Files Community

mudflap_LLM / app.py

mlbench123

Update app.py

14785c0 verified about 1 month ago

raw

history blame contribute delete

27.4 kB

	"""
	Amazon Trailer Inspector — app.py
	HuggingFace Spaces · Gradio 5.x · Free vision LLMs

	FIXES over previous version:
	- Uses huggingface_hub's InferenceClient(provider="auto") for inference (see call_model)
	- Correct chat-completions endpoint format for HF Serverless Inference
	- Updated model list to currently-working free vision models
	- Removed blocking whoami() startup check
	- Robust JSON extraction with multi-pass recovery
	- Detailed per-model error logging to Space logs
	"""

	import gradio as gr
	import base64
	import concurrent.futures
	import json
	import os
	import re
	import io

	from PIL import Image
	from huggingface_hub import InferenceClient

	# ──────────────────────────────────────────────────────────────────────────────
	# MODELS — ordered by reliability on HF free tier (most reliable first)
	# ──────────────────────────────────────────────────────────────────────────────
	# Confirmed DEPLOYED vision models — verified from HF inference/models table April 9 2026
	# google/gemma-4-31B-it → novita (cheapest) + together (fastest) ✅ VISION
	# google/gemma-4-26B-A4B-it → novita ✅ VISION (MoE: faster/cheaper)
	# Qwen/Qwen3-VL-8B-Instruct → novita + together ✅ VISION (VL = Vision-Language)
	# Vision models to try, in fallback order (first entry is attempted first).
	MODELS = [
	    # Primary — best quality, novita + together
	    "google/gemma-4-31B-it",
	    # Fallback 1 — MoE variant, faster (4B active params)
	    "google/gemma-4-26B-A4B-it",
	    # Fallback 2 — dedicated VL model, novita + together
	    "Qwen/Qwen3-VL-8B-Instruct",
	]

	# HF Serverless Inference — new router endpoint (api-inference.huggingface.co is deprecated as of 2026)

	# ──────────────────────────────────────────────────────────────────────────────
	# DETECTION PROMPT
	# ──────────────────────────────────────────────────────────────────────────────
	# Instruction text sent with every image. It describes the four components
	# and demands a bare JSON reply whose top-level keys are exactly KEYS.
	# The GPS_DEVICE entry is a single merged description: an earlier revision
	# carried pasted diff markers ("- ...", "+ ...") and two conflicting
	# descriptions, which were being sent to the model verbatim.
	DETECTION_PROMPT = """You are a precise visual inspector for Amazon trailer fleets.
	Carefully examine the full trailer image and locate these 4 components:

	1. SENSORS — Exactly TWO silver/beige DIAMOND (rhombus/rotated-square) shaped metal plates.
	They are mounted near the lower-rear area on the back doors of the trailer.
	2. GPS_DEVICE — A tracking device mounted on the upper rear area of the trailer, usually a small
	rectangular electronic box at an upper corner of the rear face, about the size of a paperback book.
	It may:
	- be white, gray, or black
	- include cables, mounts, or connectors
	- appear inside a recessed panel or metal frame
	- not be a perfect rectangle
	3. PRIME_LOGO — The Amazon Prime branding logo: the word "prime" OR "amazon" OR the Amazon arrow/smile logo
	OR both. Can be full or partially visible, on rear or side of trailer. Find it carefully. It can be partial, small/tiny, large etc.
	4. TRAILER_ID — A vertical fluorescent-green or yellow-green label strip on the corner post/pillar,
	showing an alphanumeric code like "SV2602705".

	IMPORTANT: Reply ONLY with valid JSON — absolutely no extra text before or after, no markdown fences:
	{
	"sensors": {"found": true, "confidence": "high", "notes": "two diamond plates visible lower-left"},
	"gps_device": {"found": false, "confidence": "medium", "notes": "top corner not visible in this angle"},
	"prime_logo": {"found": true, "confidence": "high", "notes": "prime word visible on rear panel"},
	"trailer_id": {"found": true, "confidence": "high", "notes": "SV2602705 on right corner post"}
	}"""

	# Top-level JSON keys the model must return, one per inspected component.
	KEYS = ["sensors", "gps_device", "prime_logo", "trailer_id"]

	# ──────────────────────────────────────────────────────────────────────────────
	# IMAGE HELPERS
	# ──────────────────────────────────────────────────────────────────────────────

	def pil_to_b64(img: Image.Image, max_side: int = 1024) -> str:
	    """Encode *img* as a base64 JPEG string, shrinking it first if needed.

	    The work is done on an RGB copy, so the caller's image is left untouched.
	    When the longer edge exceeds ``max_side`` the copy is reduced in place
	    with ``thumbnail`` (LANCZOS filter) before being saved as JPEG at
	    quality 82.
	    """
	    rgb = img.copy().convert("RGB")
	    longest_edge = max(rgb.size)
	    if longest_edge > max_side:
	        rgb.thumbnail((max_side, max_side), Image.LANCZOS)
	    with io.BytesIO() as sink:
	        rgb.save(sink, format="JPEG", quality=82)
	        jpeg_bytes = sink.getvalue()
	    return base64.b64encode(jpeg_bytes).decode("utf-8")


	def load_images(file_paths) -> list[Image.Image]:
	    """Load RGB PIL images from the value of the Gradio file input.

	    Args:
	        file_paths: ``None``/empty, a single path string, or an iterable of
	            path strings or objects exposing the path as ``.name``.

	    Returns:
	        One RGB image per readable file, in input order. Files that cannot
	        be opened or decoded are logged and skipped (best effort), so the
	        list may be shorter than the input or empty.
	    """
	    imgs = []
	    if not file_paths:
	        return imgs
	    if isinstance(file_paths, str):
	        file_paths = [file_paths]
	    for p in file_paths:
	        try:
	            path = p if isinstance(p, str) else getattr(p, "name", str(p))
	            # Image.open keeps the file open until the pixel data is read.
	            # convert() forces the read and returns an independent image,
	            # so the handle can be closed as soon as the copy exists.
	            with Image.open(path) as handle:
	                imgs.append(handle.convert("RGB"))
	        except Exception as e:  # deliberate best effort: one bad upload must not abort the batch
	            print(f"[load_images] skipped {p}: {e}")
	    return imgs


	# ──────────────────────────────────────────────────────────────────────────────
	# JSON EXTRACTION — multi-pass recovery
	# ──────────────────────────────────────────────────────────────────────────────

	def extract_json(text: str) -> dict \| None:
	"""Try multiple strategies to pull valid JSON from LLM output."""
	if not text:
	return None

	# Strip markdown code fences
	text = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).replace("```", "").strip()

	# Find outermost { ... } block
	m = re.search(r"\{[\s\S]*\}", text)
	if not m:
	return None
	raw = m.group()

	# Pass 1: direct parse
	try:
	return json.loads(raw)
	except json.JSONDecodeError:
	pass

	# Pass 2: fix trailing commas
	fixed = re.sub(r",\s*([}\]])", r"\1", raw)
	try:
	return json.loads(fixed)
	except json.JSONDecodeError:
	pass

	# Pass 3: extract only the lines containing our keys
	try:
	rebuilt = {
	key: json.loads(
	re.search(
	rf'"{key}"\s:\s(\{{[^}}]+\}})', raw, re.DOTALL
	).group(1)
	)
	for key in KEYS
	if re.search(rf'"{key}"\s:\s\{{', raw)
	}
	if rebuilt:
	return rebuilt
	except Exception:
	pass

	return None


	def validate_result(data: dict) -> dict \| None:
	"""Ensure result has all keys and correct types; coerce where possible."""
	if not data:
	return None
	out = {}
	for key in KEYS:
	item = data.get(key)
	if not isinstance(item, dict):
	return None # hard fail — missing a required key
	found = item.get("found", False)
	if isinstance(found, str):
	found = found.lower() in ("true", "yes", "1")
	out[key] = {
	"found": bool(found),
	"confidence": item.get("confidence", "low") or "low",
	"notes": (item.get("notes") or "").strip(),
	}
	return out


	# ──────────────────────────────────────────────────────────────────────────────
	# LLM CALL — huggingface_hub InferenceClient (provider="auto")
	# ──────────────────────────────────────────────────────────────────────────────

	def call_model(img: Image.Image, model: str, token: str) -> dict:
	    """Run one vision model on one image and return the validated detection.

	    The image is sent as a base64 JPEG data URL together with
	    DETECTION_PROMPT through ``InferenceClient(provider="auto")``.

	    Args:
	        img: Image to inspect.
	        model: Full model id, e.g. "org/name"; only the part after the last
	            "/" is used in log and error messages.
	        token: HuggingFace API token.

	    Returns:
	        The dict produced by ``validate_result``.

	    Raises:
	        RuntimeError: On any failure — request error (classified by the
	            status code found in the error text), empty reply, or a reply
	            that does not contain usable JSON. The underlying exception, if
	            any, is chained as ``__cause__``.
	    """
	    b64 = pil_to_b64(img)
	    short = model.split("/")[-1]

	    try:
	        client = InferenceClient(provider="auto", api_key=token)
	        resp = client.chat_completion(
	            model=model,
	            messages=[{
	                "role": "user",
	                "content": [
	                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
	                    {"type": "text", "text": DETECTION_PROMPT},
	                ],
	            }],
	            max_tokens=512,
	            temperature=0.05,
	        )
	        content = resp.choices[0].message.content
	    except Exception as e:
	        err = str(e)
	        if "401" in err or "403" in err:
	            raise RuntimeError(f"{short}: auth error — check HF_TOKEN ({err[:120]})") from e
	        elif "404" in err:
	            raise RuntimeError(f"{short}: 404 — model not on free serverless tier ({err[:120]})") from e
	        elif "429" in err:
	            raise RuntimeError(f"{short}: rate limited — retry in ~60s") from e
	        elif "503" in err or "502" in err:
	            raise RuntimeError(f"{short}: model loading/unavailable — retry shortly") from e
	        else:
	            raise RuntimeError(f"{short}: {err[:200]}") from e

	    # The reply content may be None; normalise to text so the slicing below
	    # cannot raise a TypeError that would bypass the RuntimeError contract.
	    raw_content = "" if content is None else str(content)

	    print(f"[{short}] raw LLM output: {raw_content[:300]}")  # visible in Space logs

	    try:
	        result = validate_result(extract_json(raw_content))
	    except (AttributeError, TypeError, ValueError):
	        # Oddly-typed fields in the model's JSON count as "no valid JSON".
	        result = None
	    if result is None:
	        raise RuntimeError(
	            f"{short}: could not extract valid JSON.\n"
	            f"Raw output (first 300 chars): {raw_content[:300]}"
	        )

	    return result


	# ──────────────────────────────────────────────────────────────────────────────
	# PER-IMAGE ANALYSIS — try each model in order
	# ──────────────────────────────────────────────────────────────────────────────

	def analyze_one(img: Image.Image, token: str) -> tuple[dict | None, str]:
	    """Try each entry of MODELS in order until one yields a result.

	    Args:
	        img: Image to inspect.
	        token: HuggingFace API token passed through to ``call_model``.

	    Returns:
	        ``(result_dict, model_short_name)`` from the first model that
	        succeeds, or ``(None, errors)`` where ``errors`` is every model's
	        failure message joined with " | ".
	    """
	    errors = []
	    for model in MODELS:
	        short = model.split("/")[-1]
	        try:
	            result = call_model(img, model, token)
	        except Exception as e:
	            # call_model reports expected failures as RuntimeError. Anything
	            # else is still only this model's failure, so record it and fall
	            # through to the next model instead of aborting the whole image.
	            if isinstance(e, RuntimeError):
	                msg = str(e)
	            else:
	                msg = f"{short}: unexpected {type(e).__name__}: {e}"
	            print(f"[analyze_one] FAIL {msg}")
	            errors.append(msg)
	        else:
	            print(f"[analyze_one] SUCCESS with {short}")
	            return result, short

	    return None, " | ".join(errors)


	# ──────────────────────────────────────────────────────────────────────────────
	# RESULT MERGING
	# ──────────────────────────────────────────────────────────────────────────────

	CONF_RANK = {"high": 3, "medium": 2, "low": 1, "": 0}

	def merge(results: list[dict]) -> dict:
	"""found=True wins across images; highest confidence wins."""
	merged = {k: {"found": False, "confidence": "low", "notes": ""} for k in KEYS}
	for res in results:
	if not res:
	continue
	for k in KEYS:
	src = res.get(k, {})
	if src.get("found"):
	merged[k]["found"] = True
	if CONF_RANK.get(src.get("confidence", ""), 0) > CONF_RANK.get(merged[k]["confidence"], 0):
	merged[k]["confidence"] = src["confidence"]
	if src.get("notes") and not merged[k]["notes"]:
	merged[k]["notes"] = src["notes"]
	return merged


	# ──────────────────────────────────────────────────────────────────────────────
	# MAIN GRADIO CALLBACK
	# ──────────────────────────────────────────────────────────────────────────────

	def analyze(file_paths):
	    """Gradio click handler: inspect the uploaded images and render results.

	    Args:
	        file_paths: Value of the file input (see ``load_images``).

	    Returns:
	        ``(result_html, status_html)``:
	          * setup error + "error" status when HF_TOKEN is not set;
	          * placeholder + "idle" status when no image could be loaded;
	          * error panel listing every failure when no image produced a
	            result;
	          * otherwise the merged component cards and a "done" status.
	    """
	    import html  # stdlib; local so this block does not depend on the file's import list

	    token = os.environ.get("HF_TOKEN", "").strip()

	    # ── Token guard — show actionable message ───────────────────────────────
	    if not token:
	        return (
	            _error(
	                "<b>Setup required: HF_TOKEN not set.</b><br><br>"
	                "Go to your Space → <b>Settings → Repository Secrets</b> "
	                "→ add a secret named <code>HF_TOKEN</code> with your "
	                "HuggingFace Read token.<br>"
	                "Get a free token at "
	                "<a href='https://huggingface.co/settings/tokens' target='_blank'>"
	                "huggingface.co/settings/tokens</a>"
	            ),
	            _status("error"),
	        )

	    images = load_images(file_paths)
	    if not images:
	        return _placeholder(), _status("idle")

	    n = len(images)
	    print(f"[analyze] processing {n} image(s)")

	    all_results, all_errors, models_used = [], [], set()

	    # Parallel: one thread per image (up to 4)
	    with concurrent.futures.ThreadPoolExecutor(max_workers=min(n, 4)) as pool:
	        futs = [pool.submit(analyze_one, img, token) for img in images]
	        for fut in concurrent.futures.as_completed(futs):
	            try:
	                res, meta = fut.result()
	            except Exception as e:
	                # A crash in one worker is recorded as that image's failure
	                # so the remaining images are still reported.
	                res, meta = None, f"unexpected {type(e).__name__}: {e}"
	            if res is not None:
	                all_results.append(res)
	                models_used.add(meta)
	            else:
	                all_errors.append(meta)

	    if not all_results:
	        # Error text can embed raw model output, so escape it before it is
	        # placed into HTML.
	        err_lines = "<br>".join(
	            f"<code style='font-size:11px;'>{html.escape(e)}</code>" for e in all_errors
	        ) or "<code>Unknown error</code>"

	        return (
	            _error(
	                f"<b>All models failed for all images.</b><br><br>"
	                f"<b>Exact errors:</b><br>{err_lines}<br><br>"
	                f"<b>Most likely fixes:</b><br>"
	                f"• <b>401/403</b> → HF_TOKEN is wrong or expired — regenerate at "
	                f"<a href='https://huggingface.co/settings/tokens' target='_blank'>hf.co/settings/tokens</a><br>"
	                f"• <b>429</b> → Rate limited — wait 60 seconds and retry<br>"
	                f"• <b>404</b> → Model temporarily unavailable — retry or report as issue<br>"
	                f"• <b>503</b> → Model is loading (cold start) — wait 30s and retry"
	            ),
	            _status("error"),
	        )

	    merged = merge(all_results)
	    model_str = " · ".join(sorted(models_used)) or "AI"
	    warn = ""
	    if all_errors:
	        warn = (
	            f"<br><small style='color:#d97706;'>⚠️ {len(all_errors)} image(s) failed — "
	            f"{html.escape(all_errors[0][:100])}</small>"
	        )

	    return build_cards(merged, n, model_str, warn), _status("done", n, len(all_results))


	# ──────────────────────────────────────────────────────────────────────────────
	# HTML BUILDERS
	# ──────────────────────────────────────────────────────────────────────────────

	# One row per inspected component:
	# (result key, icon, display name, description, accent colour, icon pill colour)
	COMP_META = [
	    ("sensors", "🔷", "Sensors", "Two diamond-shaped sensor plates", "#f59e0b", "#fef3c7"),
	    ("gps_device", "📡", "GPS Device", "White electronic box — upper corner", "#3b82f6", "#dbeafe"),
	    ("prime_logo", "🔶", "Prime Logo", "Amazon Prime logo (full or partial)", "#f97316", "#fff7ed"),
	    ("trailer_id", "🏷️", "Trailer ID Label", "Vertical strip on the corner post", "#10b981", "#d1fae5"),
	]

	# Text colour of the confidence label; unknown labels fall back to grey.
	CONF_COLOR = {"high": "#15803d", "medium": "#b45309", "low": "#b91c1c"}


	def build_cards(merged: dict, img_n: int, model_str: str, warn: str) -> str:
	    """Render the merged verdict as an HTML summary banner plus one card per component.

	    Args:
	        merged: ``{key: {"found", "confidence", "notes"}}``; missing keys
	            render as not found with "low" confidence.
	        img_n: Number of images submitted (shown in the banner).
	        model_str: Display string naming the model(s) used.
	        warn: Pre-built HTML fragment appended to the banner subtitle; it is
	            inserted verbatim, so the caller must escape any untrusted text.

	    Returns:
	        An HTML string. ``notes`` and ``confidence`` originate from model
	        output and are HTML-escaped before insertion.
	    """
	    import html  # stdlib; local so this block does not depend on the file's import list

	    found_n = sum(1 for k, *_ in COMP_META if merged.get(k, {}).get("found"))
	    total = len(COMP_META)
	    all_ok = found_n == total

	    # Banner colours
	    if all_ok:
	        sc, sb, se, si, sl = "#16a34a", "#f0fdf4", "#86efac", "✅", "All Clear — All Components Found"
	    elif found_n >= 3:
	        sc, sb, se, si, sl = "#d97706", "#fffbeb", "#fde68a", "⚠️", "Mostly Complete"
	    elif found_n >= 2:
	        sc, sb, se, si, sl = "#ea580c", "#fff7ed", "#fed7aa", "⚠️", "Partially Complete"
	    else:
	        sc, sb, se, si, sl = "#dc2626", "#fef2f2", "#fca5a5", "❌", "Missing Components"

	    rows = ""
	    for key, icon, name, desc, accent, pill in COMP_META:
	        d = merged.get(key, {})
	        found = d.get("found", False)
	        # Coerce to str: a non-string value must not crash .capitalize()
	        # or the colour lookup.
	        conf = str(d.get("confidence") or "low")
	        notes = html.escape(str(d.get("notes") or ""))
	        conf_label = html.escape(conf.capitalize())

	        rbg = "#f0fdf4" if found else "#fef2f2"
	        rbd = "#bbf7d0" if found else "#fecaca"
	        stc = "#15803d" if found else "#b91c1c"
	        stx = "✅ Found" if found else "❌ Missing"
	        cdc = CONF_COLOR.get(conf, "#9ca3af")
	        note_html = (
	            f'<div style="margin-top:8px;padding-top:8px;border-top:1px solid {rbd};'
	            f'font-size:12px;color:#4b5563;font-style:italic;line-height:1.5;">"{notes}"</div>'
	        ) if notes else ""

	        rows += f"""
	<div style="background:{rbg};border:1.5px solid {rbd};border-radius:12px;
	padding:14px 16px;margin-bottom:10px;">
	<div style="display:flex;align-items:flex-start;gap:12px;">
	<div style="background:{pill};border-radius:10px;padding:9px 11px;
	font-size:22px;line-height:1;flex-shrink:0;">{icon}</div>
	<div style="flex:1;min-width:0;">
	<div style="font-weight:700;font-size:14px;color:#111827;">{name}</div>
	<div style="font-size:11px;color:#9ca3af;margin-top:1px;">{desc}</div>
	{note_html}
	</div>
	<div style="text-align:right;flex-shrink:0;padding-left:8px;">
	<div style="font-weight:700;color:{stc};font-size:13px;white-space:nowrap;">{stx}</div>
	<div style="font-size:11px;color:{cdc};margin-top:3px;">● {conf_label}</div>
	</div>
	</div>
	</div>"""

	    return f"""
	<div style="font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;max-width:600px;">
	<div style="background:{sb};border:2px solid {se};border-radius:14px;
	padding:16px 20px;margin-bottom:18px;
	display:flex;align-items:center;justify-content:space-between;gap:12px;">
	<div>
	<div style="font-size:18px;font-weight:800;color:{sc};">{si} {found_n}/{total} — {sl}</div>
	<div style="font-size:12px;color:#6b7280;margin-top:3px;">
	{img_n} image{'s' if img_n > 1 else ''} · {model_str}{warn}
	</div>
	</div>
	<div style="font-size:36px;">🚛</div>
	</div>
	{rows}
	</div>"""


	def _placeholder() -> str:
	return """
	<div style="text-align:center;padding:60px 20px;color:#94a3b8;
	font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;">
	<div style="font-size:48px;margin-bottom:14px;">📷</div>
	<div style="font-size:15px;font-weight:600;color:#64748b;">Upload trailer images to begin</div>
	<div style="font-size:13px;margin-top:6px;">Front view, rear view, or both — all work</div>
	</div>"""


	def _status(state: str, total: int = 0, ok: int = 0) -> str:
	msgs = {
	"idle": ("🟡", "#d97706", "Waiting for images"),
	"done": ("🟢", "#16a34a", f"{ok}/{total} image{'s' if total > 1 else ''} processed"),
	"error": ("🔴", "#dc2626", "See error details →"),
	}
	icon, color, text = msgs.get(state, msgs["idle"])
	return (
	f'<div style="font-size:12px;color:{color};text-align:center;padding:6px 0 2px;">'
	f'{icon} {text}</div>'
	)


	def _error(msg: str) -> str:
	return (
	f'<div style="background:#fef2f2;border:1.5px solid #fca5a5;border-radius:12px;'
	f'padding:18px 20px;color:#b91c1c;font-family:-apple-system,sans-serif;'
	f'font-size:13px;line-height:1.8;">{msg}</div>'
	)


	# ──────────────────────────────────────────────────────────────────────────────
	# STARTUP LOG
	# ──────────────────────────────────────────────────────────────────────────────

	# Token as seen at import time; also decides whether the UI shows a setup banner.
	_tok = os.environ.get("HF_TOKEN", "")


	def _print_startup_banner(token: str) -> None:
	    """Print a startup summary to stdout; only the token's length is shown, never its value."""
	    if token:
	        token_state = "SET (" + str(len(token)) + " chars)"
	    else:
	        token_state = "NOT SET ← add to Space Secrets!"
	    short_names = [m.split("/")[-1] for m in MODELS]
	    rule = "=" * 60
	    for line in (
	        rule,
	        " Amazon Trailer Inspector — startup",
	        f" HF_TOKEN : {token_state}",
	        f" Models : {short_names}",
	        " Method : InferenceClient(provider='auto') — router selects best provider",
	        rule,
	    ):
	        print(line)


	_print_startup_banner(_tok)

	# ──────────────────────────────────────────────────────────────────────────────
	# GRADIO UI
	# ──────────────────────────────────────────────────────────────────────────────

	# Setup warning embedded in the page header; empty when a token was present at import time.
	if _tok:
	    TOKEN_BANNER = ""
	else:
	    TOKEN_BANNER = (
	        '<div style="background:#fef3c7;border:1.5px solid #fde68a;border-radius:10px;'
	        'padding:12px 16px;margin-bottom:14px;font-size:13px;color:#92400e;'
	        'font-family:-apple-system,sans-serif;">'
	        '⚠️ <b>HF_TOKEN not set.</b> Space Settings → Repository Secrets → add '
	        '<code>HF_TOKEN</code> = your Read token from '
	        '<a href="https://huggingface.co/settings/tokens" target="_blank">huggingface.co/settings/tokens</a>'
	        '</div>'
	    )

	# Page-level CSS overrides: container width, Analyze button look, hidden footer.
	CSS = """
	.gradio-container { max-width: 980px !important; margin: auto !important; }
	#analyze-btn { font-size: 15px !important; font-weight: 700 !important;
	letter-spacing: .02em !important; border-radius: 10px !important; }
	footer { display: none !important; }
	"""

	# Soft theme with blue primary / slate neutral hues and the DM Sans web font.
	THEME = gr.themes.Soft(
	    font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
	    neutral_hue=gr.themes.colors.slate,
	    primary_hue=gr.themes.colors.blue,
	)

	# Page layout: header, a two-column row (upload + checklist on the left,
	# results on the right), a footer, and the click wiring for the Analyze button.
	with gr.Blocks(title="🚛 Amazon Trailer Inspector", theme=THEME, css=CSS) as demo:

	    gr.HTML(f"""
	<div style="text-align:center;padding:30px 0 18px;
	font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;">
	<div style="font-size:46px;margin-bottom:10px;">🚛</div>
	<h1 style="font-size:26px;font-weight:800;color:#0f172a;margin:0 0 6px;">
	Amazon Trailer Inspector
	</h1>
	<p style="color:#64748b;font-size:14px;margin:0;">
	AI-powered verification of required trailer components from photos
	</p>
	</div>
	{TOKEN_BANNER}""")

	    with gr.Row(equal_height=False):

	        # LEFT COLUMN — upload + checklist
	        with gr.Column(scale=1, min_width=280):
	            gr.HTML("""
	<div style="background:#f8fafc;border:1px solid #e2e8f0;border-radius:14px;
	padding:16px 18px;margin-bottom:14px;
	font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;">
	<div style="font-weight:700;font-size:12px;color:#475569;
	letter-spacing:.06em;text-transform:uppercase;margin-bottom:12px;">
	What we check
	</div>
	<div style="display:grid;gap:9px;font-size:13px;color:#000000;">
	<div style="display:flex;align-items:center;gap:10px;">
	<span style="background:#fef3c7;border-radius:7px;padding:4px 9px;">🔷</span>
	<span style="color:#000000 !important;">Sensors — two diamond-shaped plates</span>
	</div>
	<div style="display:flex;align-items:center;gap:10px;">
	<span style="background:#dbeafe;border-radius:7px;padding:4px 9px;">📡</span>
	<span style="color:#000000 !important;">GPS Device — white box, top corner</span>
	</div>
	<div style="display:flex;align-items:center;gap:10px;">
	<span style="background:#fff7ed;border-radius:7px;padding:4px 9px;">🔶</span>
	<span style="color:#000000 !important;">Prime Logo — Amazon Prime mark</span>
	</div>
	<div style="display:flex;align-items:center;gap:10px;">
	<span style="background:#d1fae5;border-radius:7px;padding:4px 9px;">🏷️</span>
	<span style="color:#000000 !important;">Trailer ID — corner post label strip</span>
	</div>
	</div>
	</div>""")

	            file_input = gr.File(
	                label="Upload Trailer Image(s)",
	                file_count="multiple",
	                file_types=["image"],
	                type="filepath",
	            )

	            gr.HTML("""
	<p style="font-size:12px;color:#94a3b8;text-align:center;margin:8px 0 14px;
	font-family:-apple-system,sans-serif;">
	💡 Upload front, rear, or side views — more angles = better accuracy
	</p>""")

	            analyze_btn = gr.Button(
	                "🔍 Analyze Trailer",
	                variant="primary",
	                size="lg",
	                elem_id="analyze-btn",
	            )

	            status_html = gr.HTML(_status("idle"))

	        # RIGHT COLUMN — results
	        with gr.Column(scale=1, min_width=320):
	            result_html = gr.HTML(_placeholder())

	    # Footer lists the models actually configured in MODELS, so it cannot
	    # drift from the fallback list the way a hard-coded line would.
	    footer_models = " · ".join(m.split("/")[-1] for m in MODELS)
	    gr.HTML(f"""
	<div style="text-align:center;padding:20px 0 10px;color:#94a3b8;
	font-size:12px;font-family:-apple-system,sans-serif;">
	{footer_models}  |
	Images processed in parallel  |  No data stored
	</div>""")

	    # analyze returns (result_html, status_html) in this order.
	    analyze_btn.click(
	        fn=analyze,
	        inputs=[file_input],
	        outputs=[result_html, status_html],
	    )

	demo.launch()