Spaces:

prithivMLmods
/

Multimodal-Edge-Node

Running on Zero

App Files Files Community

prithivMLmods committed 27 days ago

Commit

385889c

verified ·

1 Parent(s): fd39af9

Update app.py

Browse files

Files changed (1) hide show

app.py +416 -424

app.py CHANGED Viewed

@@ -4,14 +4,13 @@ import json
 import ast
 import re
 import uuid
-import base64
 import threading
-import numpy as np
 from pathlib import Path
 from typing import Optional
 import spaces
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from gradio import Server
@@ -59,7 +58,8 @@ try:
     print("Qwen3-VL-2B model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
-    qwen_vl_2b_model = None; qwen_vl_2b_processor = None
 # ── Qwen3-VL-4B-Instruct ────────────────────────────────
 print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
@@ -71,7 +71,8 @@ try:
     print("Qwen3-VL-4B model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
-    qwen_vl_4b_model = None; qwen_vl_4b_processor = None
 # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
 print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
@@ -83,7 +84,8 @@ try:
     print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
-    qwen_4b_unredacted_model = None; qwen_4b_unredacted_processor = None
 # ── Qwen3.5-4B ──────────────────────────────────────────
 print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
@@ -95,7 +97,8 @@ try:
     print("Qwen3.5-4B model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
-    qwen_4b_model = None; qwen_4b_processor = None
 # ── Qwen3.5-2B ──────────────────────────────────────────
 print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
@@ -107,7 +110,8 @@ try:
     print("Qwen3.5-2B model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
-    qwen_2b_model = None; qwen_2b_processor = None
 # ── LFM2.5-VL-450M ──────────────────────────────────────
 print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
@@ -119,7 +123,8 @@ try:
     print("LFM-450M model loaded successfully.")
 except Exception as e:
     print(f"Warning: LFM-450M model loading failed. Error: {e}")
-    lfm_450_model = None; lfm_450_processor = None
 # ── Gemma4-E2B-it ───────────────────────────────────────
 print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
@@ -134,7 +139,8 @@ try:
     print("Gemma4-E2B-it model loaded successfully.")
 except Exception as e:
     print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
-    gemma4_e2b_model = None; gemma4_e2b_processor = None
 # ── LFM2.5-VL-1.6B ──────────────────────────────────────
 print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
@@ -146,7 +152,8 @@ try:
     print("LFM-1.6B model loaded successfully.")
 except Exception as e:
     print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
-    lfm_16_model = None; lfm_16_processor = None
 # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
 print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
@@ -158,7 +165,8 @@ try:
     print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
-    qwen_unredacted_model = None; qwen_unredacted_processor = None
 # ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
 print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
@@ -170,42 +178,17 @@ try:
     print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
-    qwen25_vl_3b_model = None; qwen25_vl_3b_processor = None
-# ─────────────────────────────────────────────────────────────────────────────
-#  SERVER-SIDE ANNOTATION  (mirrors the reference app exactly)
-# ─────────────────────────────────────────────────────────────────────────────
-PALETTE_RGB = [
-    (78,  205, 196),   # teal
-    (124, 106, 247),   # purple
-    (255, 107, 107),   # red
-    (255, 217,  61),   # yellow
-    (107, 203, 119),   # green
-    (255, 146,  43),   # orange
-    (204,  93, 232),   # violet
-    (51,  154, 240),   # blue
-]
-def _get_font(size: int = 14):
-    """Try to load a TrueType font; fall back to PIL default."""
-    for name in ["DejaVuSans-Bold.ttf", "arial.ttf", "Arial.ttf",
-                 "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"]:
-        try:
-            return ImageFont.truetype(name, size)
-        except (IOError, OSError):
-            pass
-    return ImageFont.load_default()
 def safe_parse_json(text: str):
-    """Strip markdown fences + <think> blocks, then parse JSON."""
-    # Remove <think>…</think>
-    text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
     text = text.strip()
-    # Strip markdown fences
     text = re.sub(r"^```(json)?", "", text)
     text = re.sub(r"```$", "", text)
     text = text.strip()
@@ -213,204 +196,210 @@ def safe_parse_json(text: str):
         return json.loads(text)
     except json.JSONDecodeError:
         pass
-    # Try to find the first [...] or {...} block
-    for ch_open, ch_close in [('[', ']'), ('{', '}')]:
-        idx = text.find(ch_open)
-        if idx != -1:
-            depth, in_str, esc = 0, False, False
-            for i in range(idx, len(text)):
-                c = text[i]
-                if esc:          esc = False;  continue
-                if c == '\\':    esc = True;   continue
-                if c == '"':     in_str = not in_str; continue
-                if in_str:       continue
-                if c == ch_open: depth += 1
-                if c == ch_close:
-                    depth -= 1
-                    if depth == 0:
-                        try:
-                            return json.loads(text[idx:i+1])
-                        except Exception:
-                            break
     try:
         return ast.literal_eval(text)
     except Exception:
-        return {}
-def annotate_detections(image: Image.Image, parsed) -> Image.Image:
     """
-    Draw bounding boxes on image.
-    parsed: list of dicts with 'bbox_2d' ([x1,y1,x2,y2] in 0-1000 scale)
-            and optional 'label'.
-    Mirrors reference _run_detection_on_frame output → annotate_image.
     """
-    image = image.convert("RGB")
-    ow, oh = image.size
     draw = ImageDraw.Draw(image, "RGBA")
-    font_lbl = _get_font(max(12, min(ow // 35, 22)))
-    items = parsed if isinstance(parsed, list) else [parsed]
-    drawn = 0
-    for i, item in enumerate(items):
-        if not isinstance(item, dict):
-            continue
-        bbox = (item.get("bbox_2d") or item.get("bbox") or item.get("box"))
-        if not bbox or len(bbox) != 4:
             continue
-        col = PALETTE_RGB[i % len(PALETTE_RGB)]
-        # ── Normalise coordinates (0-1000 → pixels) ──────────────────────
-        x1, y1, x2, y2 = [float(v) for v in bbox]
-        max_v = max(x1, y1, x2, y2)
-        if max_v <= 1.0:                        # 0-1 fraction
-            x1, y1, x2, y2 = x1*ow, y1*oh, x2*ow, y2*oh
-        elif max_v <= 1000.0:                   # 0-1000 Qwen scale
-            x1, y1, x2, y2 = x1/1000*ow, y1/1000*oh, x2/1000*ow, y2/1000*oh
-        # else already in pixels
-        if x2 < x1: x1, x2 = x2, x1
-        if y2 < y1: y1, y2 = y2, y1
-        x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
-        # ── Fill (semi-transparent) ───────────────────────────────────────
-        draw.rectangle([x1, y1, x2, y2], fill=(*col, 46))
-        # ── Border ───────────────────────────────────────────────────────
-        lw = max(2, ow // 200)
         for t in range(lw):
-            draw.rectangle([x1+t, y1+t, x2-t, y2-t], outline=(*col, 255))
-        # ── Corner accent marks ───────────────────────────────────────────
-        clen = max(10, min(int((x2-x1)*0.18), int((y2-y1)*0.18), 24))
-        corners = [(x1,y1,1,1),(x2,y1,-1,1),(x2,y2,-1,-1),(x1,y2,1,-1)]
-        for cx, cy, sx, sy in corners:
-            draw.line([(cx, cy),(cx+sx*clen, cy)], fill=col, width=lw+1)
-            draw.line([(cx, cy),(cx, cy+sy*clen)], fill=col, width=lw+1)
-        # ── Label ─────────────────────────────────────────────────────────
-        label = str(item.get("label") or item.get("class_name") or item.get("name") or f"obj {i+1}")
         try:
             bb = font_lbl.getbbox(label)
             tw, th = bb[2]-bb[0], bb[3]-bb[1]
-        except AttributeError:
-            tw, th = font_lbl.getsize(label)
         pad = 5
-        lx = max(0, min(x1, ow - tw - pad*2))
-        ly = max(0, y1 - th - pad*2) if y1 - th - pad*2 >= 0 else y1 + 2
-        draw.rectangle([lx, ly, lx+tw+pad*2, ly+th+pad*2], fill=(*col, 230))
         draw.text((lx+pad, ly+pad), label, fill=(255,255,255,255), font=font_lbl)
-        drawn += 1
     return image
-def annotate_points(image: Image.Image, parsed) -> Image.Image:
     """
-    Draw point markers on image.
-    parsed: list of dicts with 'point_2d' ([x,y] in 0-1000 scale)
-            and optional 'label'.
-    Mirrors reference _run_point_detection_on_frame → annotate_image_red_points.
     """
-    image = image.convert("RGB")
-    ow, oh = image.size
     draw = ImageDraw.Draw(image, "RGBA")
-    font_lbl = _get_font(max(12, min(ow // 35, 22)))
-    items = parsed if isinstance(parsed, list) else [parsed]
-    drawn = 0
-    for i, item in enumerate(items):
-        if not isinstance(item, dict):
-            continue
-        pt = (item.get("point_2d") or item.get("point") or item.get("coord"))
-        if not pt or len(pt) != 2:
-            continue
-        col = PALETTE_RGB[i % len(PALETTE_RGB)]
-        # ── Normalise coordinates ─────────────────────────────────────────
-        x, y = float(pt[0]), float(pt[1])
-        max_v = max(x, y)
-        if max_v <= 1.0:
-            x, y = x*ow, y*oh
-        elif max_v <= 1000.0:
-            x, y = x/1000*ow, y/1000*oh
-        cx, cy = int(x), int(y)
-        r = max(7, min(ow // 55, 18))
-        # ── Glow rings ───────────────────────────────────────────────────
-        draw.ellipse([cx-r*2, cy-r*2, cx+r*2, cy+r*2], fill=(*col, 38))
-        draw.ellipse([cx-int(r*1.4), cy-int(r*1.4), cx+int(r*1.4), cy+int(r*1.4)],
-                     fill=(*col, 64))
-        # ── Core dot ─────────────────────────────────────────────────────
-        draw.ellipse([cx-r, cy-r, cx+r, cy+r], fill=(*col, 255),
-                     outline=(255,255,255,255), width=max(2, r//4))
-        # ── Centre pip ───────────────────────────────────────────────────
-        rp = max(2, r//4)
-        draw.ellipse([cx-rp, cy-rp, cx+rp, cy+rp], fill=(255,255,255,255))
-        # ── Label ─────────────────────────────────────────────────────────
-        label = str(item.get("label") or item.get("name") or f"pt {i+1}")
-        try:
-            bb = font_lbl.getbbox(label)
-            tw, th = bb[2]-bb[0], bb[3]-bb[1]
-        except AttributeError:
-            tw, th = font_lbl.getsize(label)
-        pad = 5
-        lx = min(cx + r + 8, ow - tw - pad*2)
-        ly = max(0, cy - th//2 - pad)
-        draw.rectangle([lx, ly, lx+tw+pad*2, ly+th+pad*2], fill=(*col, 220))
-        draw.text((lx+pad, ly+pad), label, fill=(255,255,255,255), font=font_lbl)
-        drawn += 1
     return image
-def image_to_b64(img: Image.Image, fmt: str = "PNG") -> str:
-    """Convert PIL image → base64 data-URI."""
-    buf = io.BytesIO()
-    img.save(buf, format=fmt)
-    buf.seek(0)
-    return "data:image/png;base64," + base64.b64encode(buf.read()).decode()
-# ─────────────────────────────────────────────────────────────────────────────
-#  NEW ENDPOINT: /api/annotate
-#  Receives the image + raw model output text + category,
-#  runs server-side annotation, returns base64 PNG.
-# ─────────────────────────────────────────────────────────────────────────────
-@app.post("/api/annotate")
-async def annotate_endpoint(
-    image:    UploadFile = File(...),
-    text:     str        = Form(...),
-    category: str        = Form(...),
-):
-    try:
-        img_bytes = await image.read()
-        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-        img.thumbnail((512, 512))
-        parsed = safe_parse_json(text)
-        if not parsed:
-            return JSONResponse({"error": "no_json", "b64": None})
-        if category == "Detect":
-            annotated = annotate_detections(img, parsed)
-        elif category == "Point":
-            annotated = annotate_points(img, parsed)
-        else:
-            return JSONResponse({"error": "unsupported_category", "b64": None})
-        return JSONResponse({"b64": image_to_b64(annotated)})
-    except Exception as e:
-        return JSONResponse({"error": str(e), "b64": None}, status_code=500)
-# ─────────────────────────────────────────────────────────────────────────────
-#  STREAMING INFERENCE
-# ─────────────────────────────────────────────────────────────────────────────
 @spaces.GPU(duration=120)
 def generate_inference_stream(
     image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
@@ -432,19 +421,17 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
-        text_input = qwen_vl_2b_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
-        inputs = qwen_vl_2b_processor(
-            text=[text_input], images=[image], return_tensors="pt", padding=True
-        ).to(qwen_vl_2b_model.device)
-        streamer = TextIteratorStreamer(qwen_vl_2b_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=qwen_vl_2b_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True, temperature=1.0, do_sample=True)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── Qwen3-VL-4B ─────────────────────────────────────
     elif model_id == "qwen_vl_4b":
@@ -452,19 +439,17 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
-        text_input = qwen_vl_4b_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
-        inputs = qwen_vl_4b_processor(
-            text=[text_input], images=[image], return_tensors="pt", padding=True
-        ).to(qwen_vl_4b_model.device)
-        streamer = TextIteratorStreamer(qwen_vl_4b_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=qwen_vl_4b_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True, temperature=1.0, do_sample=True)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
     elif model_id == "qwen_4b_unredacted":
@@ -472,19 +457,17 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
-        text_input = qwen_4b_unredacted_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
-        inputs = qwen_4b_unredacted_processor(
-            text=[text_input], images=[image], return_tensors="pt", padding=True
-        ).to(qwen_4b_unredacted_model.device)
-        streamer = TextIteratorStreamer(qwen_4b_unredacted_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=qwen_4b_unredacted_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True, temperature=1.5, min_p=0.1)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── Qwen3.5-4B ──────────────────────────────────────
     elif model_id == "qwen_4b":
@@ -492,19 +475,17 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
-        text_input = qwen_4b_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
-        inputs = qwen_4b_processor(
-            text=[text_input], images=[image], return_tensors="pt", padding=True
-        ).to(qwen_4b_model.device)
-        streamer = TextIteratorStreamer(qwen_4b_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=qwen_4b_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True, temperature=1.5, min_p=0.1)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── Qwen3.5-2B ──────────────────────────────────────
     elif model_id == "qwen_2b":
@@ -512,19 +493,17 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
-        text_input = qwen_2b_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
-        inputs = qwen_2b_processor(
-            text=[text_input], images=[image], return_tensors="pt", padding=True
-        ).to(qwen_2b_model.device)
-        streamer = TextIteratorStreamer(qwen_2b_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=qwen_2b_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True, temperature=1.5, min_p=0.1)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── LFM-450M ────────────────────────────────────────
     elif model_id == "lfm_450":
@@ -532,18 +511,19 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         conversation = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
         inputs = lfm_450_processor.apply_chat_template(
             conversation, add_generation_prompt=True,
             return_tensors="pt", return_dict=True, tokenize=True,
         ).to(lfm_450_model.device)
-        streamer = TextIteratorStreamer(lfm_450_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=lfm_450_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── Gemma4-E2B-it ───────────────────────────────────
     elif model_id == "gemma4_e2b":
@@ -551,19 +531,17 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
-        text_input = gemma4_e2b_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
-        inputs = gemma4_e2b_processor(
-            text=[text_input], images=[image], return_tensors="pt", padding=True,
-        ).to(gemma4_e2b_model.device)
-        streamer = TextIteratorStreamer(gemma4_e2b_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=gemma4_e2b_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True, temperature=1.0, do_sample=True)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── LFM-1.6B ────────────────────────────────────────
     elif model_id == "lfm_16":
@@ -571,18 +549,19 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         conversation = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
         inputs = lfm_16_processor.apply_chat_template(
             conversation, add_generation_prompt=True,
             return_tensors="pt", return_dict=True, tokenize=True,
         ).to(lfm_16_model.device)
-        streamer = TextIteratorStreamer(lfm_16_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=lfm_16_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
     elif model_id == "qwen_unredacted":
@@ -590,19 +569,17 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
-        text_input = qwen_unredacted_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
-        inputs = qwen_unredacted_processor(
-            text=[text_input], images=[image], return_tensors="pt", padding=True
-        ).to(qwen_unredacted_model.device)
-        streamer = TextIteratorStreamer(qwen_unredacted_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=qwen_unredacted_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True, temperature=1.5, min_p=0.1)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     # ── Qwen2.5-VL-3B-Instruct ──────────────────────────
     elif model_id == "qwen25_vl_3b":
@@ -610,28 +587,51 @@ def generate_inference_stream(
             yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
-            {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
-        text_input = qwen25_vl_3b_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(messages)
         inputs = qwen25_vl_3b_processor(
             text=[text_input], images=image_inputs, videos=video_inputs,
             return_tensors="pt", padding=True,
         ).to(qwen25_vl_3b_model.device)
-        streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer,
-                                       skip_prompt=True, skip_special_tokens=True, timeout=120)
-        threading.Thread(target=qwen25_vl_3b_model.generate,
-                         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                                     use_cache=True, temperature=1.0, do_sample=True)).start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
     yield "data: [DONE]\n\n"
-# ─────────────────────────────────────────────────────────────────────────────
-#  FastAPI Endpoints
-# ─────────────────────────────────────────────────────────────────────────────
 @app.post("/api/run")
 async def run_inference(
     image:    UploadFile = File(...),
@@ -651,12 +651,12 @@ async def run_inference(
         return JSONResponse({"error": str(e)}, status_code=500)
-# ─────────────────────────────────────────────────────────────────────────────
-#  Frontend UI
-# ─────────────────────────────────────────────────────────────────────────────
 @app.get("/", response_class=HTMLResponse)
 async def homepage(request: Request):
-    return """
 <!DOCTYPE html>
 <html lang="en">
 <head>
@@ -697,18 +697,17 @@ async def homepage(request: Request):
         /* ── Top Bar ── */
         .top-bar {
             position: sticky; top: 0; left: 0; right: 0; height: 42px;
-            background: rgba(13,13,15,0.95);
-            border-bottom: 1px solid var(--node-border);
             display: flex; align-items: center; padding: 0 20px;
             gap: 12px; z-index: 1000; backdrop-filter: blur(12px);
         }
-        .top-bar .logo { font-size: 13px; font-weight: 700; color: var(--accent); letter-spacing: 0.05em; }
-        .top-bar .sep  { color: var(--node-border); }
-        .top-bar .sub  { font-size: 11px; color: var(--muted); }
         .top-bar .badge {
-            margin-left: auto;
-            background: rgba(124,106,247,0.15); border: 1px solid rgba(124,106,247,0.3);
-            padding: 3px 10px; border-radius: 20px; font-size: 10px; color: var(--accent);
         }
         /* ── Canvas ── */
         #canvas {
@@ -797,9 +796,9 @@ async def homepage(request: Request):
             border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
         }
         .img-chip.visible { display: flex; }
-        .img-chip .chip-dot { width:5px;height:5px;border-radius:50%;background:var(--accent2);flex-shrink:0;box-shadow:0 0 4px var(--accent2); }
-        .img-chip .chip-name { overflow:hidden;text-overflow:ellipsis;white-space:nowrap;flex:1;color:var(--text);font-size:9px; }
-        .img-chip .chip-size { color:var(--muted);flex-shrink:0;font-size:9px; }
         select, textarea {
             width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
             color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
@@ -826,18 +825,17 @@ async def homepage(request: Request):
         .icon-btn {
             display: flex; align-items: center; gap: 5px;
             background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
-            border-radius: 5px; padding: 3px 8px;
-            font-size: 9px; font-weight: 700; font-family: 'JetBrains Mono', monospace;
-            color: var(--accent); cursor: pointer; letter-spacing: 0.05em;
-            transition: background 0.18s, border-color 0.18s, transform 0.1s; flex-shrink: 0;
-            text-decoration: none; border: 1px solid rgba(124,106,247,0.25);
         }
         .icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
         .icon-btn:active { transform: scale(0.95); }
-        .icon-btn.teal { background:rgba(78,205,196,0.10);border-color:rgba(78,205,196,0.25);color:var(--accent2); }
-        .icon-btn.teal:hover { background:rgba(78,205,196,0.22);border-color:var(--accent2); }
-        .icon-btn.copied { background:rgba(78,205,196,0.15);border-color:var(--accent2);color:var(--accent2); }
-        .icon-btn svg { pointer-events:none;flex-shrink:0; }
         .output-box {
             background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
             border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
@@ -852,8 +850,10 @@ async def homepage(request: Request):
             border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
             display: flex; align-items: center; justify-content: center;
         }
-        .ground-img-wrap img {
-            width: 100%; height: 100%; object-fit: contain; display: block;
         }
         .ground-placeholder {
             position: absolute; inset: 0; display: flex; align-items: center;
@@ -866,24 +866,24 @@ async def homepage(request: Request):
             animation: spin 0.7s linear infinite; display: none;
         }
         @keyframes spin { to { transform: rotate(360deg); } }
-        .status-dot { width:6px;height:6px;border-radius:50%;background:var(--muted);display:inline-block;margin-right:6px; }
-        .status-dot.active { background:var(--accent2);box-shadow:0 0 5px var(--accent2); }
         /* ── Model badges ── */
         .model-badge {
-            display:inline-block;padding:2px 7px;border-radius:4px;
-            font-size:9px;font-weight:700;letter-spacing:0.06em;text-transform:uppercase;
         }
-        .model-badge.qvl2b    { background:rgba(255,150,50,0.15); color:#ff9632;        border:1px solid rgba(255,150,50,0.35); }
-        .model-badge.qvl4b    { background:rgba(255,100,80,0.15); color:#ff6450;        border:1px solid rgba(255,100,80,0.35); }
-        .model-badge.q4bunred { background:rgba(255,80,80,0.18);  color:#ff5050;        border:1px solid rgba(255,80,80,0.40); }
-        .model-badge.q4b      { background:rgba(255,200,80,0.15); color:#ffc850;        border:1px solid rgba(255,200,80,0.35); }
-        .model-badge.q2b      { background:rgba(124,106,247,0.2); color:var(--accent);  border:1px solid rgba(124,106,247,0.3); }
-        .model-badge.lfm450   { background:rgba(78,205,196,0.15); color:var(--accent2); border:1px solid rgba(78,205,196,0.3); }
-        .model-badge.g4e2b    { background:rgba(66,197,107,0.15); color:#42c56b;        border:1px solid rgba(66,197,107,0.35); }
-        .model-badge.lfm16    { background:rgba(107,203,119,0.15);color:#6bcb77;        border:1px solid rgba(107,203,119,0.35); }
-        .model-badge.qunred   { background:rgba(255,80,160,0.15); color:#ff50a0;        border:1px solid rgba(255,80,160,0.35); }
-        .model-badge.q25vl3b  { background:rgba(80,180,255,0.15); color:#50b4ff;        border:1px solid rgba(80,180,255,0.35); }
-        .model-info-box { border-radius:6px;padding:9px;font-size:10px;color:var(--muted);line-height:1.55;flex-shrink:0; }
         .canvas-footer { height: 36px; }
     </style>
 </head>
@@ -1049,9 +1049,8 @@ async def homepage(request: Request):
                     SAVE
                 </a>
             </div>
-            <div class="ground-img-wrap">
-                <!-- Server-rendered annotated image displayed here -->
-                <img id="groundImg" src="" alt="" style="display:none;" />
                 <div class="ground-placeholder" id="groundPlaceholder">
                     Active for Point / Detect tasks.<br>Run inference to visualise.
                 </div>
@@ -1103,8 +1102,7 @@ document.querySelectorAll('.node').forEach(node => {
     });
     document.addEventListener('mousemove', e => {
         if (!drag) return;
-        node.style.left=`${il+e.clientX-sx}px`;
-        node.style.top=`${it+e.clientY-sy}px`;
         updateWires();
     });
     document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
@@ -1134,27 +1132,24 @@ function formatBytes(b) {
     return (b/1048576).toFixed(1)+' MB';
 }
 function handleFile(file) {
-    if (!file || !file.type.startsWith('image/')) return;
-    currentFile = file;
-    imgPreview.src = URL.createObjectURL(file);
     previewWrap.classList.add('visible');
-    dropZone.style.display = 'none';
-    chipName.textContent = file.name;
-    chipSize.textContent = formatBytes(file.size);
     imgChip.classList.add('visible');
     dotImg.classList.add('active');
     requestAnimationFrame(updateWires);
 }
 function clearImage() {
-    currentFile = null;
-    imgPreview.src = '';
     previewWrap.classList.remove('visible');
-    dropZone.style.display = '';
     imgChip.classList.remove('visible');
-    chipName.textContent = '—';
-    chipSize.textContent = '';
-    fileInput.value = '';
-    dotImg.classList.remove('active');
     requestAnimationFrame(updateWires);
 }
 dropZone.onclick     = () => fileInput.click();
@@ -1203,7 +1198,7 @@ const MODEL_INFO = {
     qwen_2b: {
         html: `<span class="model-badge q2b">QWEN 3.5 · 2B</span><br><br>
                Qwen3.5 2B multimodal model by Alibaba Cloud.
-               Lightweight &amp; fast — ideal for quick tasks.`,
         bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
     },
     lfm_450: {
@@ -1256,9 +1251,7 @@ const PLACEHOLDERS = {
     Point:   'e.g., The gun held by the person.',
     Detect:  'e.g., The headlight of the car.',
 };
-categorySelect.onchange = e => {
-    promptInput.placeholder = PLACEHOLDERS[e.target.value] || '';
-};
 // ══════════════════════════════════════════════
 //  COPY BUTTON
@@ -1291,35 +1284,37 @@ copyBtn.onclick = () => {
     }).catch(() => {
         const ta = document.createElement('textarea');
         ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0';
-        document.body.appendChild(ta); ta.select();
-        document.execCommand('copy'); document.body.removeChild(ta);
     });
 };
 // ══════════════════════════════════════════════
-//  GROUNDING IMAGE  (server-rendered, base64)
 // ══════════════════════════════════════════════
-const groundImg         = document.getElementById('groundImg');
 const groundPlaceholder = document.getElementById('groundPlaceholder');
 const downloadBtn       = document.getElementById('downloadBtn');
 const dotGnd            = document.getElementById('dot-gnd');
-function showGroundingImage(b64DataUri) {
-    groundImg.src = b64DataUri;
-    groundImg.style.display = 'block';
     groundPlaceholder.style.display = 'none';
-    // Wire up download button
     const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
-    downloadBtn.href     = b64DataUri;
     downloadBtn.download = `grounding_${ts}.png`;
     downloadBtn.style.display = 'flex';
-    dotGnd.classList.add('active');
 }
-function resetGrounding(msg) {
-    groundImg.src = '';
-    groundImg.style.display = 'none';
-    groundPlaceholder.textContent = msg || 'Active for Point / Detect tasks. Run inference to visualise.';
     groundPlaceholder.style.display = 'flex';
     downloadBtn.style.display = 'none';
     dotGnd.classList.remove('active');
@@ -1339,7 +1334,7 @@ runBtn.onclick = async () => {
     const promptStr = promptInput.value.trim();
     if (!promptStr)  { alert('Please enter a prompt directive.'); return; }
-    // ── Reset UI ─────────────────────────────────────────
     runBtn.disabled = true;
     btnLoader.style.display = 'inline-block';
     outputBox.innerText = '';
@@ -1348,23 +1343,21 @@ runBtn.onclick = async () => {
     dotOut.classList.remove('active');
     allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
     resetCopyBtn();
-    const cat = categorySelect.value;
-    if (cat === 'Point' || cat === 'Detect') {
-        resetGrounding('Running inference…');
-    }
-    // ── Build FormData ────────────────────────────────────
     const formData = new FormData();
     formData.append('image',    currentFile);
-    formData.append('category', cat);
     formData.append('prompt',   promptStr);
-    formData.append('model_id', modelSelect.value);
     let fullText = '';
     try {
-        // ── 1. Stream inference ───────────────────────────
         const response = await fetch('/api/run', { method: 'POST', body: formData });
         if (!response.ok) {
             const err = await response.json();
@@ -1373,15 +1366,14 @@ runBtn.onclick = async () => {
         const reader  = response.body.getReader();
         const decoder = new TextDecoder('utf-8');
-        let   buffer  = '';
         while (true) {
             const { value, done } = await reader.read();
             if (done) break;
             buffer += decoder.decode(value, { stream: true });
-            const lines = buffer.split('\\n\\n');
-            buffer = lines.pop();          // keep incomplete chunk
             for (const line of lines) {
                 if (!line.startsWith('data: ')) continue;
                 const payload = line.slice(6);
@@ -1399,42 +1391,42 @@ runBtn.onclick = async () => {
         dotOut.classList.add('active');
-        // ── 2. Server-side annotation for Point / Detect ──
-        if ((cat === 'Point' || cat === 'Detect') && fullText.trim()) {
-            resetGrounding('Annotating image…');
             try {
-                const annForm = new FormData();
-                annForm.append('image',    currentFile);
-                annForm.append('text',     fullText);
-                annForm.append('category', cat);
-                const annResp = await fetch('/api/annotate', {
-                    method: 'POST', body: annForm,
                 });
-                const annData = await annResp.json();
-                if (annData.b64) {
-                    showGroundingImage(annData.b64);
                 } else {
-                    resetGrounding(
-                        annData.error === 'no_json'
-                            ? 'No grounding coordinates found in model output.'
-                            : `Annotation error: ${annData.error || 'unknown'}`
-                    );
                 }
-            } catch (annErr) {
-                resetGrounding(`Annotation failed: ${annErr.message}`);
             }
-        } else if (cat !== 'Point' && cat !== 'Detect') {
-            resetGrounding('Active for Point / Detect tasks. Run inference to visualise.');
         }
     } catch (err) {
         outputBox.innerText = `[Error] ${err.message}`;
         outputBox.style.color = '#ff6b6b';
-        if (cat === 'Point' || cat === 'Detect') {
-            resetGrounding('Inference error — see Output Stream node.');
-        }
     } finally {
         runBtn.disabled = false;
         btnLoader.style.display = 'none';

 import ast
 import re
 import uuid
 import threading
 from pathlib import Path
 from typing import Optional
 import spaces
 import torch
+import numpy as np
 from PIL import Image, ImageDraw, ImageFont
 from gradio import Server
     print("Qwen3-VL-2B model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
+    qwen_vl_2b_model = None
+    qwen_vl_2b_processor = None
 # ── Qwen3-VL-4B-Instruct ────────────────────────────────
 print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
     print("Qwen3-VL-4B model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
+    qwen_vl_4b_model = None
+    qwen_vl_4b_processor = None
 # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
 print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
     print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
+    qwen_4b_unredacted_model = None
+    qwen_4b_unredacted_processor = None
 # ── Qwen3.5-4B ──────────────────────────────────────────
 print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
     print("Qwen3.5-4B model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
+    qwen_4b_model = None
+    qwen_4b_processor = None
 # ── Qwen3.5-2B ──────────────────────────────────────────
 print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
     print("Qwen3.5-2B model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
+    qwen_2b_model = None
+    qwen_2b_processor = None
 # ── LFM2.5-VL-450M ──────────────────────────────────────
 print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
     print("LFM-450M model loaded successfully.")
 except Exception as e:
     print(f"Warning: LFM-450M model loading failed. Error: {e}")
+    lfm_450_model = None
+    lfm_450_processor = None
 # ── Gemma4-E2B-it ───────────────────────────────────────
 print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
     print("Gemma4-E2B-it model loaded successfully.")
 except Exception as e:
     print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
+    gemma4_e2b_model = None
+    gemma4_e2b_processor = None
 # ── LFM2.5-VL-1.6B ──────────────────────────────────────
 print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
     print("LFM-1.6B model loaded successfully.")
 except Exception as e:
     print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
+    lfm_16_model = None
+    lfm_16_processor = None
 # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
 print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
     print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
+    qwen_unredacted_model = None
+    qwen_unredacted_processor = None
 # ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
 print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
     print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
 except Exception as e:
     print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
+    qwen25_vl_3b_model = None
+    qwen25_vl_3b_processor = None
# ---------------------------------------------------------------------------
# Utility: safe JSON parser (strips <think> blocks and markdown fences,
# falls back to ast.literal_eval, then to the first embedded JSON value)
# ---------------------------------------------------------------------------
def safe_parse_json(text: str):
    """Best-effort extraction of a JSON value from raw model output.

    Parsing is attempted in this order, returning the first success:

    1. ``json.loads`` on the text once ``<think>...</think>`` blocks and a
       leading / trailing Markdown code fence have been removed.
    2. ``ast.literal_eval`` on the same text (accepts Python-style literals
       such as single-quoted dicts).
    3. The first JSON array or object embedded in surrounding prose.

    Returns the parsed value, or ``None`` when nothing could be parsed
    (including empty or ``None`` input).
    """
    if not text:
        return None
    text = text.strip()
    # strip <think>…</think>
    text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE).strip()
    text = re.sub(r"^```(json)?", "", text)
    text = re.sub(r"```$", "", text)
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass
    # Scan for the first position where a complete JSON array/object starts.
    # raw_decode() stops at the end of that value, so trailing prose or a
    # second JSON fragment no longer breaks parsing, and an object that
    # contains an array is returned whole instead of just its inner array.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[\[{]", text):
        try:
            value, _end = decoder.raw_decode(text, opener.start())
        except json.JSONDecodeError:
            continue
        return value
    return None
# ---------------------------------------------------------------------------
# Server-side annotation helpers
# ---------------------------------------------------------------------------
# RGB triples for annotation colours. The drawing helpers pick an entry with
# ``i % len(PALETTE_COLORS)``, so colours repeat after the eighth item.
PALETTE_COLORS = [
    (78, 205, 196), (124, 106, 247),   # teal, purple
    (255, 107, 107), (255, 217, 61),   # red, yellow
    (107, 203, 119), (255, 146, 43),   # green, orange
    (204, 93, 232), (51, 154, 240),    # magenta, blue
]
def _get_font(size: int = 14):
    """Return a font of roughly ``size`` pixels for drawing labels.

    A short list of common TrueType font names / paths is tried in order and
    the first one that loads is returned. If none loads, Pillow's built-in
    default font is used instead; the requested size is passed along when the
    installed Pillow accepts it, otherwise the fixed-size default is returned.
    """
    candidates = (
        "arial.ttf",
        "Arial.ttf",
        "DejaVuSans.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
    )
    for font_name in candidates:
        try:
            return ImageFont.truetype(font_name, size)
        except OSError:  # IOError is an alias of OSError
            continue
    try:
        # Honour the requested size so labels stay legible on large images.
        return ImageFont.load_default(size=size)
    except TypeError:
        # Pillow releases whose load_default() takes no size argument.
        return ImageFont.load_default()
def annotate_detections(image: Image.Image, objects: list) -> Image.Image:
    """
    Draw bounding boxes + labels on an RGB copy of the image.

    objects: list of {label, x_min, y_min, x_max, y_max}; each coordinate is a
    fraction that is multiplied by the image width / height.
    Entries with missing or non-numeric coordinates, or whose box is empty
    after clamping to the image bounds, are skipped.
    Returns the annotated copy.
    """
    image = image.convert("RGB").copy()
    W, H = image.size
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, W // 40))
    lw = max(2, W // 200)   # border thickness in pixels, grows with width
    cw = max(2, lw + 1)     # corner accents are drawn slightly thicker
    pad = 5
    for i, obj in enumerate(objects):
        col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
        col_rgba_fill = col + (46,)    # ~18% opacity fill
        col_rgba_solid = col + (255,)
        try:
            x1 = int(obj["x_min"] * W)
            y1 = int(obj["y_min"] * H)
            x2 = int(obj["x_max"] * W)
            y2 = int(obj["y_max"] * H)
        except (KeyError, TypeError, ValueError, OverflowError):
            # Malformed entry (missing key, non-number, NaN/inf): skip it
            # rather than abort the whole annotation.
            continue
        # clamp
        x1, x2 = max(0, x1), min(W, x2)
        y1, y2 = max(0, y1), min(H, y2)
        if x2 <= x1 or y2 <= y1:
            continue
        # Filled rectangle
        draw.rectangle([x1, y1, x2, y2], fill=col_rgba_fill)
        # Border: lw concentric 1px outlines. Stop once the inset would
        # invert the rectangle, because Pillow rejects coordinates where
        # x1 < x0 or y1 < y0 (small boxes on wide images hit this).
        for t in range(lw):
            if x2 - t < x1 + t or y2 - t < y1 + t:
                break
            draw.rectangle([x1 + t, y1 + t, x2 - t, y2 - t], outline=col_rgba_solid)
        # Corner accents
        ca = min(18, (x2 - x1) // 4, (y2 - y1) // 4)
        for (cx, cy, dx, dy) in [(x1, y1, 1, 1), (x2, y1, -1, 1), (x2, y2, -1, -1), (x1, y2, 1, -1)]:
            draw.line([cx, cy, cx + dx * ca, cy], fill=col_rgba_solid, width=cw)
            draw.line([cx, cy, cx, cy + dy * ca], fill=col_rgba_solid, width=cw)
        # Label pill. Labels come from parsed model output and may not be
        # strings, so coerce before measuring and drawing.
        raw_label = obj.get("label", "object")
        label = "object" if raw_label is None else str(raw_label)
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2] - bb[0], bb[3] - bb[1]
        except Exception:
            # Best-effort size estimate when the font cannot measure text.
            tw, th = len(label) * 7, 12
        pw, ph = tw + pad * 2, th + pad * 2
        lx = max(0, min(x1, W - pw))
        # Above the box when there is room, otherwise just inside its top edge.
        ly = y1 - ph if y1 - ph >= 0 else y1 + 2
        draw.rounded_rectangle([lx, ly, lx + pw, ly + ph], radius=4, fill=col_rgba_solid)
        draw.text((lx + pad, ly + pad), label, fill=(255, 255, 255, 255), font=font_lbl)
    return image
def annotate_points(image: Image.Image, points: list) -> Image.Image:
    """
    Draw point markers + labels on an RGB copy of the image.

    points: list of {label, x, y}; x / y are fractions that are multiplied by
    the image width / height. Entries with missing or non-numeric coordinates
    are skipped. Marker centres are clamped so the core dot stays inside the
    image.
    Returns the annotated copy.
    """
    image = image.convert("RGB").copy()
    W, H = image.size
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, W // 40))
    r = max(7, W // 55)         # core dot radius, grows with image width
    mid_r = int(r * 1.4)        # radius of the translucent middle ring
    cr = max(2, r // 3)         # centre dot radius, also the core outline width
    pad = 5
    for i, pt in enumerate(points):
        col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
        col_rgba = col + (255,)
        glow_rgba = col + (40,)
        mid_rgba = col + (64,)
        try:
            cx = int(pt["x"] * W)
            cy = int(pt["y"] * H)
        except (KeyError, TypeError, ValueError, OverflowError):
            # Malformed entry (missing key, non-number, NaN/inf): skip it
            # rather than abort the whole annotation.
            continue
        cx = max(r, min(W - r, cx))
        cy = max(r, min(H - r, cy))
        # Outer glow
        draw.ellipse([cx - r * 2, cy - r * 2, cx + r * 2, cy + r * 2], fill=glow_rgba)
        # Mid ring
        draw.ellipse([cx - mid_r, cy - mid_r, cx + mid_r, cy + mid_r], fill=mid_rgba)
        # Core dot
        draw.ellipse([cx - r, cy - r, cx + r, cy + r], fill=col_rgba,
                     outline=(255, 255, 255, 255), width=cr)
        # Centre white dot
        draw.ellipse([cx - cr, cy - cr, cx + cr, cy + cr], fill=(255, 255, 255, 255))
        # Label. Values come from parsed model output and may not be strings.
        raw_label = pt.get("label", "")
        label = "" if raw_label is None else str(raw_label)
        if not label:
            continue
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2] - bb[0], bb[3] - bb[1]
        except Exception:
            # Best-effort size estimate when the font cannot measure text.
            tw, th = len(label) * 7, 12
        pw, ph = tw + pad * 2, th + pad * 2
        lx = cx + r + 6
        if lx + pw > W:
            # No room on the right: prefer the left side of the marker so the
            # pill does not cover the point; otherwise slide it back in.
            left_side = cx - r - 6 - pw
            lx = left_side if left_side >= 0 else W - pw
        lx = max(0, lx)
        # Centre vertically on the marker, kept inside the top and bottom edges.
        ly = max(0, min(cy - ph // 2, H - ph))
        draw.rounded_rectangle([lx, ly, lx + pw, ly + ph], radius=4, fill=col_rgba)
        draw.text((lx + pad, ly + pad), label, fill=(255, 255, 255, 255), font=font_lbl)
    return image
def _scaled_coords(raw, expected_len: int, scale: float = 1000.0):
    """Return ``raw`` as ``expected_len`` finite floats divided by ``scale``, or None if malformed."""
    import math

    if not isinstance(raw, (list, tuple)) or len(raw) != expected_len:
        return None
    try:
        values = [float(v) for v in raw]
    except (TypeError, ValueError):
        return None
    if not all(math.isfinite(v) for v in values):
        return None
    return [v / scale for v in values]


def parse_and_annotate(image: Image.Image, full_text: str, category: str):
    """
    Parse model output and return (annotated PIL image, structured result dict).

    category "Point":  items carrying "point_2d" ([x, y]) become
                       {"points": [{label, x, y}, ...]}.
    category "Detect": items carrying "bbox_2d" ([xmin, ymin, xmax, ymax])
                       become {"objects": [{label, x_min, y_min, x_max, y_max}, ...]}.
    Any other category returns the input image and an empty dict.

    When no JSON can be parsed, the input image is returned together with
    {"error": ..., "raw": first 500 characters of the output}. Items that are
    not dicts, lack the expected key, or hold non-numeric / non-finite
    coordinates are skipped instead of failing the whole request.
    """
    parsed = safe_parse_json(full_text)
    if parsed is None:
        return image, {"error": "No JSON found in model output", "raw": full_text[:500]}
    if category not in ("Point", "Detect"):
        return image, {}

    items = parsed if isinstance(parsed, list) else [parsed]
    # Coordinates are divided by 1000 before drawing.
    # NOTE(review): this assumes every selectable model emits 0-1000 scaled
    # coordinates; a model emitting pixel or 0-1 values would be misplaced.
    # Confirm per model.
    if category == "Point":
        points = []
        for item in items:
            if not isinstance(item, dict):
                continue
            xy = _scaled_coords(item.get("point_2d"), 2)
            if xy is None:
                continue
            points.append({"label": item.get("label", ""), "x": xy[0], "y": xy[1]})
        # The annotate helper draws on its own copy, so no extra copy here.
        return annotate_points(image, points), {"points": points}

    objects = []
    for item in items:
        if not isinstance(item, dict):
            continue
        box = _scaled_coords(item.get("bbox_2d"), 4)
        if box is None:
            continue
        objects.append({
            "label": item.get("label", "object"),
            "x_min": box[0],
            "y_min": box[1],
            "x_max": box[2],
            "y_max": box[3],
        })
    return annotate_detections(image, objects), {"objects": objects}
def pil_to_png_bytes(image: Image.Image) -> bytes:
    """Encode ``image`` as PNG in memory and return the encoded bytes."""
    with io.BytesIO() as stream:
        image.save(stream, format="PNG")
        return stream.getvalue()
+# ---------------------------------------------------------------------------
+# Inference Generator (Streaming)
+# ---------------------------------------------------------------------------
 @spaces.GPU(duration=120)
 def generate_inference_stream(
     image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
+        text_input = qwen_vl_2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = qwen_vl_2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_vl_2b_model.device)
+        streamer = TextIteratorStreamer(qwen_vl_2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=qwen_vl_2b_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── Qwen3-VL-4B ─────────────────────────────────────
     elif model_id == "qwen_vl_4b":
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
+        text_input = qwen_vl_4b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = qwen_vl_4b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_vl_4b_model.device)
+        streamer = TextIteratorStreamer(qwen_vl_4b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=qwen_vl_4b_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
     elif model_id == "qwen_4b_unredacted":
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
+        text_input = qwen_4b_unredacted_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = qwen_4b_unredacted_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_4b_unredacted_model.device)
+        streamer = TextIteratorStreamer(qwen_4b_unredacted_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=qwen_4b_unredacted_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── Qwen3.5-4B ──────────────────────────────────────
     elif model_id == "qwen_4b":
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
+        text_input = qwen_4b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = qwen_4b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_4b_model.device)
+        streamer = TextIteratorStreamer(qwen_4b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=qwen_4b_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── Qwen3.5-2B ──────────────────────────────────────
     elif model_id == "qwen_2b":
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
+        text_input = qwen_2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = qwen_2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_2b_model.device)
+        streamer = TextIteratorStreamer(qwen_2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=qwen_2b_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── LFM-450M ────────────────────────────────────────
     elif model_id == "lfm_450":
             yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         conversation = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
         inputs = lfm_450_processor.apply_chat_template(
             conversation, add_generation_prompt=True,
             return_tensors="pt", return_dict=True, tokenize=True,
         ).to(lfm_450_model.device)
+        streamer = TextIteratorStreamer(lfm_450_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=lfm_450_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── Gemma4-E2B-it ──────────────��────────────────────
     elif model_id == "gemma4_e2b":
             yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
+        text_input = gemma4_e2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = gemma4_e2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(gemma4_e2b_model.device)
+        streamer = TextIteratorStreamer(gemma4_e2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=gemma4_e2b_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── LFM-1.6B ────────────────────────────────────────
     elif model_id == "lfm_16":
             yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         conversation = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
         inputs = lfm_16_processor.apply_chat_template(
             conversation, add_generation_prompt=True,
             return_tensors="pt", return_dict=True, tokenize=True,
         ).to(lfm_16_model.device)
+        streamer = TextIteratorStreamer(lfm_16_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=lfm_16_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
     elif model_id == "qwen_unredacted":
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
+        text_input = qwen_unredacted_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = qwen_unredacted_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_unredacted_model.device)
+        streamer = TextIteratorStreamer(qwen_unredacted_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=qwen_unredacted_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── Qwen2.5-VL-3B-Instruct ──────────────────────────
     elif model_id == "qwen25_vl_3b":
             yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"; return
         messages = [{"role": "user", "content": [
+            {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
+        ]}]
+        text_input = qwen25_vl_3b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(messages)
         inputs = qwen25_vl_3b_processor(
             text=[text_input], images=image_inputs, videos=video_inputs,
             return_tensors="pt", padding=True,
         ).to(qwen25_vl_3b_model.device)
+        streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
+        thread = threading.Thread(target=qwen25_vl_3b_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
+        thread.start()
         for tok in streamer:
             if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     yield "data: [DONE]\n\n"
# ---------------------------------------------------------------------------
# New endpoint: /api/annotate  — receives image + model output text + category
# Returns JSON holding the annotated PNG (base64) + the structured result
# ---------------------------------------------------------------------------
@app.post("/api/annotate")
async def annotate_endpoint(
    image:    UploadFile = File(...),
    text:     str        = Form(...),
    category: str        = Form(...),
):
    """Annotate an uploaded image from model output text.

    Form fields:
        image:    the uploaded image file.
        text:     raw model output to parse for coordinates.
        category: passed through to ``parse_and_annotate``.

    Responses:
        200: {"image_b64": <base64 PNG>, "result": <structured result dict>}
        400: {"error": ...} when the upload cannot be decoded as an image.
        500: {"error": ...} for any other failure.
    """
    # Function-scope import: this commit drops ``import base64`` from the
    # module header, so the name is brought in here.
    import base64

    try:
        img_bytes = await image.read()
        try:
            img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        except (OSError, ValueError) as e:
            # Unreadable / truncated upload is a client error, not a server fault.
            return JSONResponse({"error": f"Invalid image upload: {e}"}, status_code=400)
        annotated_img, result_dict = parse_and_annotate(img, text, category)
        png_bytes = pil_to_png_bytes(annotated_img)
        return JSONResponse({
            "image_b64": base64.b64encode(png_bytes).decode(),
            "result": result_dict,
        })
    except Exception as e:
        # Top-level boundary: report the failure as JSON instead of raising.
        return JSONResponse({"error": str(e)}, status_code=500)
+# ---------------------------------------------------------------------------
+# Main inference endpoint
+# ---------------------------------------------------------------------------
 @app.post("/api/run")
 async def run_inference(
     image:    UploadFile = File(...),
         return JSONResponse({"error": str(e)}, status_code=500)
+# ---------------------------------------------------------------------------
+# Frontend
+# ---------------------------------------------------------------------------
 @app.get("/", response_class=HTMLResponse)
 async def homepage(request: Request):
+    return r"""
 <!DOCTYPE html>
 <html lang="en">
 <head>
         /* ── Top Bar ── */
         .top-bar {
             position: sticky; top: 0; left: 0; right: 0; height: 42px;
+            background: rgba(13,13,15,0.95); border-bottom: 1px solid var(--node-border);
             display: flex; align-items: center; padding: 0 20px;
             gap: 12px; z-index: 1000; backdrop-filter: blur(12px);
         }
+        .top-bar .logo  { font-size: 13px; font-weight: 700; color: var(--accent); letter-spacing: 0.05em; }
+        .top-bar .sep   { color: var(--node-border); }
+        .top-bar .sub   { font-size: 11px; color: var(--muted); }
         .top-bar .badge {
+            margin-left: auto; background: rgba(124,106,247,0.15);
+            border: 1px solid rgba(124,106,247,0.3); padding: 3px 10px;
+            border-radius: 20px; font-size: 10px; color: var(--accent);
         }
         /* ── Canvas ── */
         #canvas {
             border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
         }
         .img-chip.visible { display: flex; }
+        .img-chip .chip-dot { width: 5px; height: 5px; border-radius: 50%; background: var(--accent2); flex-shrink: 0; box-shadow: 0 0 4px var(--accent2); }
+        .img-chip .chip-name { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; flex: 1; color: var(--text); font-size: 9px; }
+        .img-chip .chip-size { color: var(--muted); flex-shrink: 0; font-size: 9px; }
         select, textarea {
             width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
             color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
         .icon-btn {
             display: flex; align-items: center; gap: 5px;
             background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
+            border-radius: 5px; padding: 3px 8px; font-size: 9px; font-weight: 700;
+            font-family: 'JetBrains Mono', monospace; color: var(--accent); cursor: pointer;
+            letter-spacing: 0.05em; transition: background 0.18s, border-color 0.18s, transform 0.1s;
+            flex-shrink: 0; text-decoration: none;
         }
         .icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
         .icon-btn:active { transform: scale(0.95); }
+        .icon-btn.teal { background: rgba(78,205,196,0.10); border-color: rgba(78,205,196,0.25); color: var(--accent2); }
+        .icon-btn.teal:hover { background: rgba(78,205,196,0.22); border-color: var(--accent2); }
+        .icon-btn.copied { background: rgba(78,205,196,0.15); border-color: var(--accent2); color: var(--accent2); }
+        .icon-btn svg { pointer-events: none; flex-shrink: 0; }
         .output-box {
             background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
             border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
             border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
             display: flex; align-items: center; justify-content: center;
         }
+        /* annotated image displayed via <img> tag — no canvas needed */
+        .ground-img-wrap img.overlay-img {
+            max-width: 100%; max-height: 100%;
+            object-fit: contain; display: block;
         }
         .ground-placeholder {
             position: absolute; inset: 0; display: flex; align-items: center;
             animation: spin 0.7s linear infinite; display: none;
         }
         @keyframes spin { to { transform: rotate(360deg); } }
+        .status-dot { width: 6px; height: 6px; border-radius: 50%; background: var(--muted); display: inline-block; margin-right: 6px; }
+        .status-dot.active { background: var(--accent2); box-shadow: 0 0 5px var(--accent2); }
         /* ── Model badges ── */
         .model-badge {
+            display: inline-block; padding: 2px 7px; border-radius: 4px;
+            font-size: 9px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase;
         }
+        .model-badge.qvl2b    { background: rgba(255,150,50,0.15);  color: #ff9632; border: 1px solid rgba(255,150,50,0.35); }
+        .model-badge.qvl4b    { background: rgba(255,100,80,0.15);  color: #ff6450; border: 1px solid rgba(255,100,80,0.35); }
+        .model-badge.q4bunred { background: rgba(255,80,80,0.18);   color: #ff5050; border: 1px solid rgba(255,80,80,0.40); }
+        .model-badge.q4b      { background: rgba(255,200,80,0.15);  color: #ffc850; border: 1px solid rgba(255,200,80,0.35); }
+        .model-badge.q2b      { background: rgba(124,106,247,0.2);  color: var(--accent); border: 1px solid rgba(124,106,247,0.3); }
+        .model-badge.lfm450   { background: rgba(78,205,196,0.15);  color: var(--accent2); border: 1px solid rgba(78,205,196,0.3); }
+        .model-badge.g4e2b    { background: rgba(66,197,107,0.15);  color: #42c56b; border: 1px solid rgba(66,197,107,0.35); }
+        .model-badge.lfm16    { background: rgba(107,203,119,0.15); color: #6bcb77; border: 1px solid rgba(107,203,119,0.35); }
+        .model-badge.qunred   { background: rgba(255,80,160,0.15);  color: #ff50a0; border: 1px solid rgba(255,80,160,0.35); }
+        .model-badge.q25vl3b  { background: rgba(80,180,255,0.15);  color: #50b4ff; border: 1px solid rgba(80,180,255,0.35); }
+        .model-info-box { border-radius: 6px; padding: 9px; font-size: 10px; color: var(--muted); line-height: 1.55; flex-shrink: 0; }
         .canvas-footer { height: 36px; }
     </style>
 </head>
                     SAVE
                 </a>
             </div>
+            <div class="ground-img-wrap" id="groundWrap">
+                <img class="overlay-img" id="overlayImg" src="" style="display:none;" />
                 <div class="ground-placeholder" id="groundPlaceholder">
                     Active for Point / Detect tasks.<br>Run inference to visualise.
                 </div>
     });
     document.addEventListener('mousemove', e => {
         if (!drag) return;
+        node.style.left=`${il+e.clientX-sx}px`; node.style.top=`${it+e.clientY-sy}px`;
         updateWires();
     });
     document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
     return (b/1048576).toFixed(1)+' MB';
 }
 function handleFile(file) {
+    if (!file||!file.type.startsWith('image/')) return;
+    currentFile=file;
+    imgPreview.src=URL.createObjectURL(file);
     previewWrap.classList.add('visible');
+    dropZone.style.display='none';
+    chipName.textContent=file.name;
+    chipSize.textContent=formatBytes(file.size);
     imgChip.classList.add('visible');
     dotImg.classList.add('active');
     requestAnimationFrame(updateWires);
 }
 function clearImage() {
+    currentFile=null; imgPreview.src='';
     previewWrap.classList.remove('visible');
+    dropZone.style.display='';
     imgChip.classList.remove('visible');
+    chipName.textContent='—'; chipSize.textContent='';
+    fileInput.value=''; dotImg.classList.remove('active');
     requestAnimationFrame(updateWires);
 }
 dropZone.onclick     = () => fileInput.click();
     qwen_2b: {
         html: `<span class="model-badge q2b">QWEN 3.5 · 2B</span><br><br>
                Qwen3.5 2B multimodal model by Alibaba Cloud.
+               Lightweight &amp; fast — ideal for quick Query, Caption, Point &amp; Detect tasks.`,
         bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
     },
     lfm_450: {
     Point:   'e.g., The gun held by the person.',
     Detect:  'e.g., The headlight of the car.',
 };
+categorySelect.onchange = e => { promptInput.placeholder = PLACEHOLDERS[e.target.value] || ''; };
 // ══════════════════════════════════════════════
 //  COPY BUTTON
     }).catch(() => {
         const ta = document.createElement('textarea');
         ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0';
+        document.body.appendChild(ta); ta.select(); document.execCommand('copy');
+        document.body.removeChild(ta);
     });
 };
 // ══════════════════════════════════════════════
+//  GROUNDING DISPLAY  (server-side annotated image)
 // ══════════════════════════════════════════════
+const overlayImg        = document.getElementById('overlayImg');
 const groundPlaceholder = document.getElementById('groundPlaceholder');
 const downloadBtn       = document.getElementById('downloadBtn');
 const dotGnd            = document.getElementById('dot-gnd');
+function showOverlay(b64png) {
+    const src = 'data:image/png;base64,' + b64png;
+    overlayImg.src = src;
+    overlayImg.style.display = 'block';
     groundPlaceholder.style.display = 'none';
+    dotGnd.classList.add('active');
+    // Update download button
     const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
+    downloadBtn.href     = src;
     downloadBtn.download = `grounding_${ts}.png`;
     downloadBtn.style.display = 'flex';
 }
+function resetOverlay(msg) {
+    overlayImg.src = '';
+    overlayImg.style.display = 'none';
+    groundPlaceholder.textContent = msg || 'Active for Point / Detect tasks.\nRun inference to visualise.';
     groundPlaceholder.style.display = 'flex';
     downloadBtn.style.display = 'none';
     dotGnd.classList.remove('active');
     const promptStr = promptInput.value.trim();
     if (!promptStr)  { alert('Please enter a prompt directive.'); return; }
+    // ── Reset UI ──────────────────────────────
     runBtn.disabled = true;
     btnLoader.style.display = 'inline-block';
     outputBox.innerText = '';
     dotOut.classList.remove('active');
     allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
     resetCopyBtn();
+    resetOverlay('Running inference…');
+    const category = categorySelect.value;
+    const modelId  = modelSelect.value;
+    // ── Step 1: stream text from /api/run ─────
     const formData = new FormData();
     formData.append('image',    currentFile);
+    formData.append('category', category);
     formData.append('prompt',   promptStr);
+    formData.append('model_id', modelId);
     let fullText = '';
     try {
         const response = await fetch('/api/run', { method: 'POST', body: formData });
         if (!response.ok) {
             const err = await response.json();
         const reader  = response.body.getReader();
         const decoder = new TextDecoder('utf-8');
+        let buffer = '';
         while (true) {
             const { value, done } = await reader.read();
             if (done) break;
             buffer += decoder.decode(value, { stream: true });
+            const lines = buffer.split('\n\n');
+            buffer = lines.pop();
             for (const line of lines) {
                 if (!line.startsWith('data: ')) continue;
                 const payload = line.slice(6);
         dotOut.classList.add('active');
+        // ── Step 2: if Point or Detect → call /api/annotate ──
+        if ((category === 'Point' || category === 'Detect') && fullText.trim()) {
+            groundPlaceholder.textContent = 'Annotating image…';
+            groundPlaceholder.style.display = 'flex';
             try {
+                const annotForm = new FormData();
+                annotForm.append('image',    currentFile);
+                annotForm.append('text',     fullText);
+                annotForm.append('category', category);
+                const annotResp = await fetch('/api/annotate', {
+                    method: 'POST', body: annotForm,
                 });
+                if (!annotResp.ok) throw new Error('Annotation request failed');
+                const annotData = await annotResp.json();
+                if (annotData.error) {
+                    resetOverlay('Annotation error: ' + annotData.error);
+                } else if (annotData.image_b64) {
+                    showOverlay(annotData.image_b64);
                 } else {
+                    resetOverlay('No coordinates found in model output.');
                 }
+            } catch (annotErr) {
+                resetOverlay('Annotation failed: ' + annotErr.message);
+                console.error('Annotation error:', annotErr);
             }
+        } else if (category !== 'Point' && category !== 'Detect') {
+            resetOverlay('Active for Point / Detect tasks.\nRun inference to visualise.');
         }
     } catch (err) {
         outputBox.innerText = `[Error] ${err.message}`;
         outputBox.style.color = '#ff6b6b';
+        resetOverlay('Inference error — see Output Stream node.');
     } finally {
         runBtn.disabled = false;
         btnLoader.style.display = 'none';