prithivMLmods committed on
Commit
385889c
·
verified ·
1 Parent(s): fd39af9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +416 -424
app.py CHANGED
@@ -4,14 +4,13 @@ import json
4
  import ast
5
  import re
6
  import uuid
7
- import base64
8
  import threading
9
- import numpy as np
10
  from pathlib import Path
11
  from typing import Optional
12
 
13
  import spaces
14
  import torch
 
15
  from PIL import Image, ImageDraw, ImageFont
16
 
17
  from gradio import Server
@@ -59,7 +58,8 @@ try:
59
  print("Qwen3-VL-2B model loaded successfully.")
60
  except Exception as e:
61
  print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
62
- qwen_vl_2b_model = None; qwen_vl_2b_processor = None
 
63
 
64
  # ── Qwen3-VL-4B-Instruct ────────────────────────────────
65
  print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
@@ -71,7 +71,8 @@ try:
71
  print("Qwen3-VL-4B model loaded successfully.")
72
  except Exception as e:
73
  print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
74
- qwen_vl_4b_model = None; qwen_vl_4b_processor = None
 
75
 
76
  # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
77
  print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
@@ -83,7 +84,8 @@ try:
83
  print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
84
  except Exception as e:
85
  print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
86
- qwen_4b_unredacted_model = None; qwen_4b_unredacted_processor = None
 
87
 
88
  # ── Qwen3.5-4B ──────────────────────────────────────────
89
  print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
@@ -95,7 +97,8 @@ try:
95
  print("Qwen3.5-4B model loaded successfully.")
96
  except Exception as e:
97
  print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
98
- qwen_4b_model = None; qwen_4b_processor = None
 
99
 
100
  # ── Qwen3.5-2B ──────────────────────────────────────────
101
  print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
@@ -107,7 +110,8 @@ try:
107
  print("Qwen3.5-2B model loaded successfully.")
108
  except Exception as e:
109
  print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
110
- qwen_2b_model = None; qwen_2b_processor = None
 
111
 
112
  # ── LFM2.5-VL-450M ──────────────────────────────────────
113
  print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
@@ -119,7 +123,8 @@ try:
119
  print("LFM-450M model loaded successfully.")
120
  except Exception as e:
121
  print(f"Warning: LFM-450M model loading failed. Error: {e}")
122
- lfm_450_model = None; lfm_450_processor = None
 
123
 
124
  # ── Gemma4-E2B-it ───────────────────────────────────────
125
  print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
@@ -134,7 +139,8 @@ try:
134
  print("Gemma4-E2B-it model loaded successfully.")
135
  except Exception as e:
136
  print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
137
- gemma4_e2b_model = None; gemma4_e2b_processor = None
 
138
 
139
  # ── LFM2.5-VL-1.6B ──────────────────────────────────────
140
  print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
@@ -146,7 +152,8 @@ try:
146
  print("LFM-1.6B model loaded successfully.")
147
  except Exception as e:
148
  print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
149
- lfm_16_model = None; lfm_16_processor = None
 
150
 
151
  # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
152
  print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
@@ -158,7 +165,8 @@ try:
158
  print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
159
  except Exception as e:
160
  print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
161
- qwen_unredacted_model = None; qwen_unredacted_processor = None
 
162
 
163
  # ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
164
  print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
@@ -170,42 +178,17 @@ try:
170
  print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
171
  except Exception as e:
172
  print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
173
- qwen25_vl_3b_model = None; qwen25_vl_3b_processor = None
174
-
175
-
176
- # ─────────────────────────────────────────────────────────────────────────────
177
- # SERVER-SIDE ANNOTATION (mirrors the reference app exactly)
178
- # ─────────────────────────────────────────────────────────────────────────────
179
-
180
- PALETTE_RGB = [
181
- (78, 205, 196), # teal
182
- (124, 106, 247), # purple
183
- (255, 107, 107), # red
184
- (255, 217, 61), # yellow
185
- (107, 203, 119), # green
186
- (255, 146, 43), # orange
187
- (204, 93, 232), # violet
188
- (51, 154, 240), # blue
189
- ]
190
-
191
-
192
- def _get_font(size: int = 14):
193
- """Try to load a TrueType font; fall back to PIL default."""
194
- for name in ["DejaVuSans-Bold.ttf", "arial.ttf", "Arial.ttf",
195
- "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"]:
196
- try:
197
- return ImageFont.truetype(name, size)
198
- except (IOError, OSError):
199
- pass
200
- return ImageFont.load_default()
201
 
202
 
 
 
 
203
  def safe_parse_json(text: str):
204
- """Strip markdown fences + <think> blocks, then parse JSON."""
205
- # Remove <think>…</think>
206
- text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
207
  text = text.strip()
208
- # Strip markdown fences
 
209
  text = re.sub(r"^```(json)?", "", text)
210
  text = re.sub(r"```$", "", text)
211
  text = text.strip()
@@ -213,204 +196,210 @@ def safe_parse_json(text: str):
213
  return json.loads(text)
214
  except json.JSONDecodeError:
215
  pass
216
- # Try to find the first [...] or {...} block
217
- for ch_open, ch_close in [('[', ']'), ('{', '}')]:
218
- idx = text.find(ch_open)
219
- if idx != -1:
220
- depth, in_str, esc = 0, False, False
221
- for i in range(idx, len(text)):
222
- c = text[i]
223
- if esc: esc = False; continue
224
- if c == '\\': esc = True; continue
225
- if c == '"': in_str = not in_str; continue
226
- if in_str: continue
227
- if c == ch_open: depth += 1
228
- if c == ch_close:
229
- depth -= 1
230
- if depth == 0:
231
- try:
232
- return json.loads(text[idx:i+1])
233
- except Exception:
234
- break
235
  try:
236
  return ast.literal_eval(text)
237
  except Exception:
238
- return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
 
241
- def annotate_detections(image: Image.Image, parsed) -> Image.Image:
242
  """
243
- Draw bounding boxes on image.
244
- parsed: list of dicts with 'bbox_2d' ([x1,y1,x2,y2] in 0-1000 scale)
245
- and optional 'label'.
246
- Mirrors reference _run_detection_on_frame output → annotate_image.
247
  """
248
- image = image.convert("RGB")
249
- ow, oh = image.size
250
  draw = ImageDraw.Draw(image, "RGBA")
251
- font_lbl = _get_font(max(12, min(ow // 35, 22)))
252
-
253
- items = parsed if isinstance(parsed, list) else [parsed]
254
- drawn = 0
255
- for i, item in enumerate(items):
256
- if not isinstance(item, dict):
257
- continue
258
- bbox = (item.get("bbox_2d") or item.get("bbox") or item.get("box"))
259
- if not bbox or len(bbox) != 4:
 
 
 
 
 
 
260
  continue
261
- col = PALETTE_RGB[i % len(PALETTE_RGB)]
262
-
263
- # ── Normalise coordinates (0-1000 → pixels) ──────────────────────
264
- x1, y1, x2, y2 = [float(v) for v in bbox]
265
- max_v = max(x1, y1, x2, y2)
266
- if max_v <= 1.0: # 0-1 fraction
267
- x1, y1, x2, y2 = x1*ow, y1*oh, x2*ow, y2*oh
268
- elif max_v <= 1000.0: # 0-1000 Qwen scale
269
- x1, y1, x2, y2 = x1/1000*ow, y1/1000*oh, x2/1000*ow, y2/1000*oh
270
- # else already in pixels
271
-
272
- if x2 < x1: x1, x2 = x2, x1
273
- if y2 < y1: y1, y2 = y2, y1
274
- x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
275
-
276
- # ── Fill (semi-transparent) ───────────────────────────────────────
277
- draw.rectangle([x1, y1, x2, y2], fill=(*col, 46))
278
-
279
- # ── Border ───────────────────────────────────────────────────────
280
- lw = max(2, ow // 200)
281
  for t in range(lw):
282
- draw.rectangle([x1+t, y1+t, x2-t, y2-t], outline=(*col, 255))
283
 
284
- # ── Corner accent marks ───────────────────────────────────────────
285
- clen = max(10, min(int((x2-x1)*0.18), int((y2-y1)*0.18), 24))
286
- corners = [(x1,y1,1,1),(x2,y1,-1,1),(x2,y2,-1,-1),(x1,y2,1,-1)]
287
- for cx, cy, sx, sy in corners:
288
- draw.line([(cx, cy),(cx+sx*clen, cy)], fill=col, width=lw+1)
289
- draw.line([(cx, cy),(cx, cy+sy*clen)], fill=col, width=lw+1)
290
 
291
- # ── Label ─────────────────────────────────────────────────────────
292
- label = str(item.get("label") or item.get("class_name") or item.get("name") or f"obj {i+1}")
293
  try:
294
  bb = font_lbl.getbbox(label)
295
  tw, th = bb[2]-bb[0], bb[3]-bb[1]
296
- except AttributeError:
297
- tw, th = font_lbl.getsize(label)
298
  pad = 5
299
- lx = max(0, min(x1, ow - tw - pad*2))
300
- ly = max(0, y1 - th - pad*2) if y1 - th - pad*2 >= 0 else y1 + 2
301
- draw.rectangle([lx, ly, lx+tw+pad*2, ly+th+pad*2], fill=(*col, 230))
 
302
  draw.text((lx+pad, ly+pad), label, fill=(255,255,255,255), font=font_lbl)
303
- drawn += 1
304
 
305
  return image
306
 
307
 
308
- def annotate_points(image: Image.Image, parsed) -> Image.Image:
309
  """
310
- Draw point markers on image.
311
- parsed: list of dicts with 'point_2d' ([x,y] in 0-1000 scale)
312
- and optional 'label'.
313
- Mirrors reference _run_point_detection_on_frame → annotate_image_red_points.
314
  """
315
- image = image.convert("RGB")
316
- ow, oh = image.size
317
  draw = ImageDraw.Draw(image, "RGBA")
318
- font_lbl = _get_font(max(12, min(ow // 35, 22)))
319
-
320
- items = parsed if isinstance(parsed, list) else [parsed]
321
- drawn = 0
322
- for i, item in enumerate(items):
323
- if not isinstance(item, dict):
324
- continue
325
- pt = (item.get("point_2d") or item.get("point") or item.get("coord"))
326
- if not pt or len(pt) != 2:
327
- continue
328
- col = PALETTE_RGB[i % len(PALETTE_RGB)]
329
-
330
- # ── Normalise coordinates ─────────────────────────────────────────
331
- x, y = float(pt[0]), float(pt[1])
332
- max_v = max(x, y)
333
- if max_v <= 1.0:
334
- x, y = x*ow, y*oh
335
- elif max_v <= 1000.0:
336
- x, y = x/1000*ow, y/1000*oh
337
-
338
- cx, cy = int(x), int(y)
339
- r = max(7, min(ow // 55, 18))
340
-
341
- # ── Glow rings ───────────────────────────────────────────────────
342
- draw.ellipse([cx-r*2, cy-r*2, cx+r*2, cy+r*2], fill=(*col, 38))
343
- draw.ellipse([cx-int(r*1.4), cy-int(r*1.4), cx+int(r*1.4), cy+int(r*1.4)],
344
- fill=(*col, 64))
345
-
346
- # ── Core dot ─────────────────────────────────────────────────────
347
- draw.ellipse([cx-r, cy-r, cx+r, cy+r], fill=(*col, 255),
348
- outline=(255,255,255,255), width=max(2, r//4))
349
-
350
- # ── Centre pip ───────────────────────────────────────────────────
351
- rp = max(2, r//4)
352
- draw.ellipse([cx-rp, cy-rp, cx+rp, cy+rp], fill=(255,255,255,255))
353
-
354
- # ── Label ─────────────────────────────────────────────────────────
355
- label = str(item.get("label") or item.get("name") or f"pt {i+1}")
356
- try:
357
- bb = font_lbl.getbbox(label)
358
- tw, th = bb[2]-bb[0], bb[3]-bb[1]
359
- except AttributeError:
360
- tw, th = font_lbl.getsize(label)
361
- pad = 5
362
- lx = min(cx + r + 8, ow - tw - pad*2)
363
- ly = max(0, cy - th//2 - pad)
364
- draw.rectangle([lx, ly, lx+tw+pad*2, ly+th+pad*2], fill=(*col, 220))
365
- draw.text((lx+pad, ly+pad), label, fill=(255,255,255,255), font=font_lbl)
366
- drawn += 1
367
 
368
  return image
369
 
370
 
371
- def image_to_b64(img: Image.Image, fmt: str = "PNG") -> str:
372
- """Convert PIL image → base64 data-URI."""
373
- buf = io.BytesIO()
374
- img.save(buf, format=fmt)
375
- buf.seek(0)
376
- return "data:image/png;base64," + base64.b64encode(buf.read()).decode()
377
-
378
-
379
- # ─────────────────────────────────────────────────────────────────────────────
380
- # NEW ENDPOINT: /api/annotate
381
- # Receives the image + raw model output text + category,
382
- # runs server-side annotation, returns base64 PNG.
383
- # ─────────────────────────────────────────────────────────────────────────────
384
- @app.post("/api/annotate")
385
- async def annotate_endpoint(
386
- image: UploadFile = File(...),
387
- text: str = Form(...),
388
- category: str = Form(...),
389
- ):
390
- try:
391
- img_bytes = await image.read()
392
- img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
393
- img.thumbnail((512, 512))
394
-
395
- parsed = safe_parse_json(text)
396
- if not parsed:
397
- return JSONResponse({"error": "no_json", "b64": None})
398
-
399
- if category == "Detect":
400
- annotated = annotate_detections(img, parsed)
401
- elif category == "Point":
402
- annotated = annotate_points(img, parsed)
403
- else:
404
- return JSONResponse({"error": "unsupported_category", "b64": None})
405
 
406
- return JSONResponse({"b64": image_to_b64(annotated)})
407
- except Exception as e:
408
- return JSONResponse({"error": str(e), "b64": None}, status_code=500)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
 
411
- # ─────────────────────────────────────────────────────────────────────────────
412
- # STREAMING INFERENCE
413
- # ─────────────────────────────────────────────────────────────────────────────
414
  @spaces.GPU(duration=120)
415
  def generate_inference_stream(
416
  image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
@@ -432,19 +421,17 @@ def generate_inference_stream(
432
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
433
  yield "data: [DONE]\n\n"; return
434
  messages = [{"role": "user", "content": [
435
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
436
- text_input = qwen_vl_2b_processor.apply_chat_template(
437
- messages, tokenize=False, add_generation_prompt=True)
438
- inputs = qwen_vl_2b_processor(
439
- text=[text_input], images=[image], return_tensors="pt", padding=True
440
- ).to(qwen_vl_2b_model.device)
441
- streamer = TextIteratorStreamer(qwen_vl_2b_processor.tokenizer,
442
- skip_prompt=True, skip_special_tokens=True, timeout=120)
443
- threading.Thread(target=qwen_vl_2b_model.generate,
444
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
445
- use_cache=True, temperature=1.0, do_sample=True)).start()
446
  for tok in streamer:
447
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
448
 
449
  # ── Qwen3-VL-4B ─────────────────────────────────────
450
  elif model_id == "qwen_vl_4b":
@@ -452,19 +439,17 @@ def generate_inference_stream(
452
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
453
  yield "data: [DONE]\n\n"; return
454
  messages = [{"role": "user", "content": [
455
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
456
- text_input = qwen_vl_4b_processor.apply_chat_template(
457
- messages, tokenize=False, add_generation_prompt=True)
458
- inputs = qwen_vl_4b_processor(
459
- text=[text_input], images=[image], return_tensors="pt", padding=True
460
- ).to(qwen_vl_4b_model.device)
461
- streamer = TextIteratorStreamer(qwen_vl_4b_processor.tokenizer,
462
- skip_prompt=True, skip_special_tokens=True, timeout=120)
463
- threading.Thread(target=qwen_vl_4b_model.generate,
464
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
465
- use_cache=True, temperature=1.0, do_sample=True)).start()
466
  for tok in streamer:
467
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
468
 
469
  # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
470
  elif model_id == "qwen_4b_unredacted":
@@ -472,19 +457,17 @@ def generate_inference_stream(
472
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
473
  yield "data: [DONE]\n\n"; return
474
  messages = [{"role": "user", "content": [
475
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
476
- text_input = qwen_4b_unredacted_processor.apply_chat_template(
477
- messages, tokenize=False, add_generation_prompt=True)
478
- inputs = qwen_4b_unredacted_processor(
479
- text=[text_input], images=[image], return_tensors="pt", padding=True
480
- ).to(qwen_4b_unredacted_model.device)
481
- streamer = TextIteratorStreamer(qwen_4b_unredacted_processor.tokenizer,
482
- skip_prompt=True, skip_special_tokens=True, timeout=120)
483
- threading.Thread(target=qwen_4b_unredacted_model.generate,
484
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
485
- use_cache=True, temperature=1.5, min_p=0.1)).start()
486
  for tok in streamer:
487
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
488
 
489
  # ── Qwen3.5-4B ──────────────────────────────────────
490
  elif model_id == "qwen_4b":
@@ -492,19 +475,17 @@ def generate_inference_stream(
492
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
493
  yield "data: [DONE]\n\n"; return
494
  messages = [{"role": "user", "content": [
495
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
496
- text_input = qwen_4b_processor.apply_chat_template(
497
- messages, tokenize=False, add_generation_prompt=True)
498
- inputs = qwen_4b_processor(
499
- text=[text_input], images=[image], return_tensors="pt", padding=True
500
- ).to(qwen_4b_model.device)
501
- streamer = TextIteratorStreamer(qwen_4b_processor.tokenizer,
502
- skip_prompt=True, skip_special_tokens=True, timeout=120)
503
- threading.Thread(target=qwen_4b_model.generate,
504
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
505
- use_cache=True, temperature=1.5, min_p=0.1)).start()
506
  for tok in streamer:
507
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
508
 
509
  # ── Qwen3.5-2B ──────────────────────────────────────
510
  elif model_id == "qwen_2b":
@@ -512,19 +493,17 @@ def generate_inference_stream(
512
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
513
  yield "data: [DONE]\n\n"; return
514
  messages = [{"role": "user", "content": [
515
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
516
- text_input = qwen_2b_processor.apply_chat_template(
517
- messages, tokenize=False, add_generation_prompt=True)
518
- inputs = qwen_2b_processor(
519
- text=[text_input], images=[image], return_tensors="pt", padding=True
520
- ).to(qwen_2b_model.device)
521
- streamer = TextIteratorStreamer(qwen_2b_processor.tokenizer,
522
- skip_prompt=True, skip_special_tokens=True, timeout=120)
523
- threading.Thread(target=qwen_2b_model.generate,
524
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
525
- use_cache=True, temperature=1.5, min_p=0.1)).start()
526
  for tok in streamer:
527
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
528
 
529
  # ── LFM-450M ────────────────────────────────────────
530
  elif model_id == "lfm_450":
@@ -532,18 +511,19 @@ def generate_inference_stream(
532
  yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
533
  yield "data: [DONE]\n\n"; return
534
  conversation = [{"role": "user", "content": [
535
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
536
  inputs = lfm_450_processor.apply_chat_template(
537
  conversation, add_generation_prompt=True,
538
  return_tensors="pt", return_dict=True, tokenize=True,
539
  ).to(lfm_450_model.device)
540
- streamer = TextIteratorStreamer(lfm_450_processor.tokenizer,
541
- skip_prompt=True, skip_special_tokens=True, timeout=120)
542
- threading.Thread(target=lfm_450_model.generate,
543
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
544
- use_cache=True)).start()
545
  for tok in streamer:
546
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
547
 
548
  # ── Gemma4-E2B-it ───────────────────────────────────
549
  elif model_id == "gemma4_e2b":
@@ -551,19 +531,17 @@ def generate_inference_stream(
551
  yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
552
  yield "data: [DONE]\n\n"; return
553
  messages = [{"role": "user", "content": [
554
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
555
- text_input = gemma4_e2b_processor.apply_chat_template(
556
- messages, tokenize=False, add_generation_prompt=True)
557
- inputs = gemma4_e2b_processor(
558
- text=[text_input], images=[image], return_tensors="pt", padding=True,
559
- ).to(gemma4_e2b_model.device)
560
- streamer = TextIteratorStreamer(gemma4_e2b_processor.tokenizer,
561
- skip_prompt=True, skip_special_tokens=True, timeout=120)
562
- threading.Thread(target=gemma4_e2b_model.generate,
563
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
564
- use_cache=True, temperature=1.0, do_sample=True)).start()
565
  for tok in streamer:
566
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
567
 
568
  # ── LFM-1.6B ────────────────────────────────────────
569
  elif model_id == "lfm_16":
@@ -571,18 +549,19 @@ def generate_inference_stream(
571
  yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
572
  yield "data: [DONE]\n\n"; return
573
  conversation = [{"role": "user", "content": [
574
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
575
  inputs = lfm_16_processor.apply_chat_template(
576
  conversation, add_generation_prompt=True,
577
  return_tensors="pt", return_dict=True, tokenize=True,
578
  ).to(lfm_16_model.device)
579
- streamer = TextIteratorStreamer(lfm_16_processor.tokenizer,
580
- skip_prompt=True, skip_special_tokens=True, timeout=120)
581
- threading.Thread(target=lfm_16_model.generate,
582
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
583
- use_cache=True)).start()
584
  for tok in streamer:
585
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
586
 
587
  # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
588
  elif model_id == "qwen_unredacted":
@@ -590,19 +569,17 @@ def generate_inference_stream(
590
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
591
  yield "data: [DONE]\n\n"; return
592
  messages = [{"role": "user", "content": [
593
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
594
- text_input = qwen_unredacted_processor.apply_chat_template(
595
- messages, tokenize=False, add_generation_prompt=True)
596
- inputs = qwen_unredacted_processor(
597
- text=[text_input], images=[image], return_tensors="pt", padding=True
598
- ).to(qwen_unredacted_model.device)
599
- streamer = TextIteratorStreamer(qwen_unredacted_processor.tokenizer,
600
- skip_prompt=True, skip_special_tokens=True, timeout=120)
601
- threading.Thread(target=qwen_unredacted_model.generate,
602
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
603
- use_cache=True, temperature=1.5, min_p=0.1)).start()
604
  for tok in streamer:
605
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
606
 
607
  # ── Qwen2.5-VL-3B-Instruct ──────────────────────────
608
  elif model_id == "qwen25_vl_3b":
@@ -610,28 +587,51 @@ def generate_inference_stream(
610
  yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
611
  yield "data: [DONE]\n\n"; return
612
  messages = [{"role": "user", "content": [
613
- {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
614
- text_input = qwen25_vl_3b_processor.apply_chat_template(
615
- messages, tokenize=False, add_generation_prompt=True)
616
  image_inputs, video_inputs = process_vision_info(messages)
617
  inputs = qwen25_vl_3b_processor(
618
  text=[text_input], images=image_inputs, videos=video_inputs,
619
  return_tensors="pt", padding=True,
620
  ).to(qwen25_vl_3b_model.device)
621
- streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer,
622
- skip_prompt=True, skip_special_tokens=True, timeout=120)
623
- threading.Thread(target=qwen25_vl_3b_model.generate,
624
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
625
- use_cache=True, temperature=1.0, do_sample=True)).start()
626
  for tok in streamer:
627
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
628
 
629
  yield "data: [DONE]\n\n"
630
 
631
 
632
- # ─────────────────────────────────────────────────────────────────────────────
633
- # FastAPI Endpoints
634
- # ─────────────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  @app.post("/api/run")
636
  async def run_inference(
637
  image: UploadFile = File(...),
@@ -651,12 +651,12 @@ async def run_inference(
651
  return JSONResponse({"error": str(e)}, status_code=500)
652
 
653
 
654
- # ─────────────────────────────────────────────────────────────────────────────
655
- # Frontend UI
656
- # ─────────────────────────────────────────────────────────────────────────────
657
  @app.get("/", response_class=HTMLResponse)
658
  async def homepage(request: Request):
659
- return """
660
  <!DOCTYPE html>
661
  <html lang="en">
662
  <head>
@@ -697,18 +697,17 @@ async def homepage(request: Request):
697
  /* ── Top Bar ── */
698
  .top-bar {
699
  position: sticky; top: 0; left: 0; right: 0; height: 42px;
700
- background: rgba(13,13,15,0.95);
701
- border-bottom: 1px solid var(--node-border);
702
  display: flex; align-items: center; padding: 0 20px;
703
  gap: 12px; z-index: 1000; backdrop-filter: blur(12px);
704
  }
705
- .top-bar .logo { font-size: 13px; font-weight: 700; color: var(--accent); letter-spacing: 0.05em; }
706
- .top-bar .sep { color: var(--node-border); }
707
- .top-bar .sub { font-size: 11px; color: var(--muted); }
708
  .top-bar .badge {
709
- margin-left: auto;
710
- background: rgba(124,106,247,0.15); border: 1px solid rgba(124,106,247,0.3);
711
- padding: 3px 10px; border-radius: 20px; font-size: 10px; color: var(--accent);
712
  }
713
  /* ── Canvas ── */
714
  #canvas {
@@ -797,9 +796,9 @@ async def homepage(request: Request):
797
  border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
798
  }
799
  .img-chip.visible { display: flex; }
800
- .img-chip .chip-dot { width:5px;height:5px;border-radius:50%;background:var(--accent2);flex-shrink:0;box-shadow:0 0 4px var(--accent2); }
801
- .img-chip .chip-name { overflow:hidden;text-overflow:ellipsis;white-space:nowrap;flex:1;color:var(--text);font-size:9px; }
802
- .img-chip .chip-size { color:var(--muted);flex-shrink:0;font-size:9px; }
803
  select, textarea {
804
  width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
805
  color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
@@ -826,18 +825,17 @@ async def homepage(request: Request):
826
  .icon-btn {
827
  display: flex; align-items: center; gap: 5px;
828
  background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
829
- border-radius: 5px; padding: 3px 8px;
830
- font-size: 9px; font-weight: 700; font-family: 'JetBrains Mono', monospace;
831
- color: var(--accent); cursor: pointer; letter-spacing: 0.05em;
832
- transition: background 0.18s, border-color 0.18s, transform 0.1s; flex-shrink: 0;
833
- text-decoration: none; border: 1px solid rgba(124,106,247,0.25);
834
  }
835
  .icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
836
  .icon-btn:active { transform: scale(0.95); }
837
- .icon-btn.teal { background:rgba(78,205,196,0.10);border-color:rgba(78,205,196,0.25);color:var(--accent2); }
838
- .icon-btn.teal:hover { background:rgba(78,205,196,0.22);border-color:var(--accent2); }
839
- .icon-btn.copied { background:rgba(78,205,196,0.15);border-color:var(--accent2);color:var(--accent2); }
840
- .icon-btn svg { pointer-events:none;flex-shrink:0; }
841
  .output-box {
842
  background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
843
  border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
@@ -852,8 +850,10 @@ async def homepage(request: Request):
852
  border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
853
  display: flex; align-items: center; justify-content: center;
854
  }
855
- .ground-img-wrap img {
856
- width: 100%; height: 100%; object-fit: contain; display: block;
 
 
857
  }
858
  .ground-placeholder {
859
  position: absolute; inset: 0; display: flex; align-items: center;
@@ -866,24 +866,24 @@ async def homepage(request: Request):
866
  animation: spin 0.7s linear infinite; display: none;
867
  }
868
  @keyframes spin { to { transform: rotate(360deg); } }
869
- .status-dot { width:6px;height:6px;border-radius:50%;background:var(--muted);display:inline-block;margin-right:6px; }
870
- .status-dot.active { background:var(--accent2);box-shadow:0 0 5px var(--accent2); }
871
  /* ── Model badges ── */
872
  .model-badge {
873
- display:inline-block;padding:2px 7px;border-radius:4px;
874
- font-size:9px;font-weight:700;letter-spacing:0.06em;text-transform:uppercase;
875
  }
876
- .model-badge.qvl2b { background:rgba(255,150,50,0.15); color:#ff9632; border:1px solid rgba(255,150,50,0.35); }
877
- .model-badge.qvl4b { background:rgba(255,100,80,0.15); color:#ff6450; border:1px solid rgba(255,100,80,0.35); }
878
- .model-badge.q4bunred { background:rgba(255,80,80,0.18); color:#ff5050; border:1px solid rgba(255,80,80,0.40); }
879
- .model-badge.q4b { background:rgba(255,200,80,0.15); color:#ffc850; border:1px solid rgba(255,200,80,0.35); }
880
- .model-badge.q2b { background:rgba(124,106,247,0.2); color:var(--accent); border:1px solid rgba(124,106,247,0.3); }
881
- .model-badge.lfm450 { background:rgba(78,205,196,0.15); color:var(--accent2); border:1px solid rgba(78,205,196,0.3); }
882
- .model-badge.g4e2b { background:rgba(66,197,107,0.15); color:#42c56b; border:1px solid rgba(66,197,107,0.35); }
883
- .model-badge.lfm16 { background:rgba(107,203,119,0.15);color:#6bcb77; border:1px solid rgba(107,203,119,0.35); }
884
- .model-badge.qunred { background:rgba(255,80,160,0.15); color:#ff50a0; border:1px solid rgba(255,80,160,0.35); }
885
- .model-badge.q25vl3b { background:rgba(80,180,255,0.15); color:#50b4ff; border:1px solid rgba(80,180,255,0.35); }
886
- .model-info-box { border-radius:6px;padding:9px;font-size:10px;color:var(--muted);line-height:1.55;flex-shrink:0; }
887
  .canvas-footer { height: 36px; }
888
  </style>
889
  </head>
@@ -1049,9 +1049,8 @@ async def homepage(request: Request):
1049
  SAVE
1050
  </a>
1051
  </div>
1052
- <div class="ground-img-wrap">
1053
- <!-- Server-rendered annotated image displayed here -->
1054
- <img id="groundImg" src="" alt="" style="display:none;" />
1055
  <div class="ground-placeholder" id="groundPlaceholder">
1056
  Active for Point / Detect tasks.<br>Run inference to visualise.
1057
  </div>
@@ -1103,8 +1102,7 @@ document.querySelectorAll('.node').forEach(node => {
1103
  });
1104
  document.addEventListener('mousemove', e => {
1105
  if (!drag) return;
1106
- node.style.left=`${il+e.clientX-sx}px`;
1107
- node.style.top=`${it+e.clientY-sy}px`;
1108
  updateWires();
1109
  });
1110
  document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
@@ -1134,27 +1132,24 @@ function formatBytes(b) {
1134
  return (b/1048576).toFixed(1)+' MB';
1135
  }
1136
  function handleFile(file) {
1137
- if (!file || !file.type.startsWith('image/')) return;
1138
- currentFile = file;
1139
- imgPreview.src = URL.createObjectURL(file);
1140
  previewWrap.classList.add('visible');
1141
- dropZone.style.display = 'none';
1142
- chipName.textContent = file.name;
1143
- chipSize.textContent = formatBytes(file.size);
1144
  imgChip.classList.add('visible');
1145
  dotImg.classList.add('active');
1146
  requestAnimationFrame(updateWires);
1147
  }
1148
  function clearImage() {
1149
- currentFile = null;
1150
- imgPreview.src = '';
1151
  previewWrap.classList.remove('visible');
1152
- dropZone.style.display = '';
1153
  imgChip.classList.remove('visible');
1154
- chipName.textContent = 'β€”';
1155
- chipSize.textContent = '';
1156
- fileInput.value = '';
1157
- dotImg.classList.remove('active');
1158
  requestAnimationFrame(updateWires);
1159
  }
1160
  dropZone.onclick = () => fileInput.click();
@@ -1203,7 +1198,7 @@ const MODEL_INFO = {
1203
  qwen_2b: {
1204
  html: `<span class="model-badge q2b">QWEN 3.5 Β· 2B</span><br><br>
1205
  Qwen3.5 2B multimodal model by Alibaba Cloud.
1206
- Lightweight &amp; fast β€” ideal for quick tasks.`,
1207
  bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
1208
  },
1209
  lfm_450: {
@@ -1256,9 +1251,7 @@ const PLACEHOLDERS = {
1256
  Point: 'e.g., The gun held by the person.',
1257
  Detect: 'e.g., The headlight of the car.',
1258
  };
1259
- categorySelect.onchange = e => {
1260
- promptInput.placeholder = PLACEHOLDERS[e.target.value] || '';
1261
- };
1262
 
1263
  // ══════════════════════════════════════════════
1264
  // COPY BUTTON
@@ -1291,35 +1284,37 @@ copyBtn.onclick = () => {
1291
  }).catch(() => {
1292
  const ta = document.createElement('textarea');
1293
  ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0';
1294
- document.body.appendChild(ta); ta.select();
1295
- document.execCommand('copy'); document.body.removeChild(ta);
1296
  });
1297
  };
1298
 
1299
  // ══════════════════════════════════════════════
1300
- // GROUNDING IMAGE (server-rendered, base64)
1301
  // ══════════════════════════════════════════════
1302
- const groundImg = document.getElementById('groundImg');
1303
  const groundPlaceholder = document.getElementById('groundPlaceholder');
1304
  const downloadBtn = document.getElementById('downloadBtn');
1305
  const dotGnd = document.getElementById('dot-gnd');
1306
 
1307
- function showGroundingImage(b64DataUri) {
1308
- groundImg.src = b64DataUri;
1309
- groundImg.style.display = 'block';
 
1310
  groundPlaceholder.style.display = 'none';
1311
- // Wire up download button
 
 
1312
  const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
1313
- downloadBtn.href = b64DataUri;
1314
  downloadBtn.download = `grounding_${ts}.png`;
1315
  downloadBtn.style.display = 'flex';
1316
- dotGnd.classList.add('active');
1317
  }
1318
 
1319
- function resetGrounding(msg) {
1320
- groundImg.src = '';
1321
- groundImg.style.display = 'none';
1322
- groundPlaceholder.textContent = msg || 'Active for Point / Detect tasks. Run inference to visualise.';
1323
  groundPlaceholder.style.display = 'flex';
1324
  downloadBtn.style.display = 'none';
1325
  dotGnd.classList.remove('active');
@@ -1339,7 +1334,7 @@ runBtn.onclick = async () => {
1339
  const promptStr = promptInput.value.trim();
1340
  if (!promptStr) { alert('Please enter a prompt directive.'); return; }
1341
 
1342
- // ── Reset UI ─────────────────────────────────────────
1343
  runBtn.disabled = true;
1344
  btnLoader.style.display = 'inline-block';
1345
  outputBox.innerText = '';
@@ -1348,23 +1343,21 @@ runBtn.onclick = async () => {
1348
  dotOut.classList.remove('active');
1349
  allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
1350
  resetCopyBtn();
 
1351
 
1352
- const cat = categorySelect.value;
1353
- if (cat === 'Point' || cat === 'Detect') {
1354
- resetGrounding('Running inference…');
1355
- }
1356
 
1357
- // ── Build FormData ────────────────────────────────────
1358
  const formData = new FormData();
1359
  formData.append('image', currentFile);
1360
- formData.append('category', cat);
1361
  formData.append('prompt', promptStr);
1362
- formData.append('model_id', modelSelect.value);
1363
 
1364
  let fullText = '';
1365
 
1366
  try {
1367
- // ── 1. Stream inference ───────────────────────────
1368
  const response = await fetch('/api/run', { method: 'POST', body: formData });
1369
  if (!response.ok) {
1370
  const err = await response.json();
@@ -1373,15 +1366,14 @@ runBtn.onclick = async () => {
1373
 
1374
  const reader = response.body.getReader();
1375
  const decoder = new TextDecoder('utf-8');
1376
- let buffer = '';
1377
 
1378
  while (true) {
1379
  const { value, done } = await reader.read();
1380
  if (done) break;
1381
  buffer += decoder.decode(value, { stream: true });
1382
- const lines = buffer.split('\\n\\n');
1383
- buffer = lines.pop(); // keep incomplete chunk
1384
-
1385
  for (const line of lines) {
1386
  if (!line.startsWith('data: ')) continue;
1387
  const payload = line.slice(6);
@@ -1399,42 +1391,42 @@ runBtn.onclick = async () => {
1399
 
1400
  dotOut.classList.add('active');
1401
 
1402
- // ── 2. Server-side annotation for Point / Detect ──
1403
- if ((cat === 'Point' || cat === 'Detect') && fullText.trim()) {
1404
- resetGrounding('Annotating image…');
 
 
1405
  try {
1406
- const annForm = new FormData();
1407
- annForm.append('image', currentFile);
1408
- annForm.append('text', fullText);
1409
- annForm.append('category', cat);
1410
 
1411
- const annResp = await fetch('/api/annotate', {
1412
- method: 'POST', body: annForm,
1413
  });
1414
- const annData = await annResp.json();
1415
 
1416
- if (annData.b64) {
1417
- showGroundingImage(annData.b64);
 
 
 
1418
  } else {
1419
- resetGrounding(
1420
- annData.error === 'no_json'
1421
- ? 'No grounding coordinates found in model output.'
1422
- : `Annotation error: ${annData.error || 'unknown'}`
1423
- );
1424
  }
1425
- } catch (annErr) {
1426
- resetGrounding(`Annotation failed: ${annErr.message}`);
 
1427
  }
1428
- } else if (cat !== 'Point' && cat !== 'Detect') {
1429
- resetGrounding('Active for Point / Detect tasks. Run inference to visualise.');
1430
  }
1431
 
1432
  } catch (err) {
1433
  outputBox.innerText = `[Error] ${err.message}`;
1434
  outputBox.style.color = '#ff6b6b';
1435
- if (cat === 'Point' || cat === 'Detect') {
1436
- resetGrounding('Inference error β€” see Output Stream node.');
1437
- }
1438
  } finally {
1439
  runBtn.disabled = false;
1440
  btnLoader.style.display = 'none';
 
4
  import ast
5
  import re
6
  import uuid
 
7
  import threading
 
8
  from pathlib import Path
9
  from typing import Optional
10
 
11
  import spaces
12
  import torch
13
+ import numpy as np
14
  from PIL import Image, ImageDraw, ImageFont
15
 
16
  from gradio import Server
 
58
  print("Qwen3-VL-2B model loaded successfully.")
59
  except Exception as e:
60
  print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
61
+ qwen_vl_2b_model = None
62
+ qwen_vl_2b_processor = None
63
 
64
  # ── Qwen3-VL-4B-Instruct ────────────────────────────────
65
  print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
 
71
  print("Qwen3-VL-4B model loaded successfully.")
72
  except Exception as e:
73
  print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
74
+ qwen_vl_4b_model = None
75
+ qwen_vl_4b_processor = None
76
 
77
  # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
78
  print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
 
84
  print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
85
  except Exception as e:
86
  print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
87
+ qwen_4b_unredacted_model = None
88
+ qwen_4b_unredacted_processor = None
89
 
90
  # ── Qwen3.5-4B ──────────────────────────────────────────
91
  print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
 
97
  print("Qwen3.5-4B model loaded successfully.")
98
  except Exception as e:
99
  print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
100
+ qwen_4b_model = None
101
+ qwen_4b_processor = None
102
 
103
  # ── Qwen3.5-2B ──────────────────────────────────────────
104
  print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
 
110
  print("Qwen3.5-2B model loaded successfully.")
111
  except Exception as e:
112
  print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
113
+ qwen_2b_model = None
114
+ qwen_2b_processor = None
115
 
116
  # ── LFM2.5-VL-450M ──────────────────────────────────────
117
  print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
 
123
  print("LFM-450M model loaded successfully.")
124
  except Exception as e:
125
  print(f"Warning: LFM-450M model loading failed. Error: {e}")
126
+ lfm_450_model = None
127
+ lfm_450_processor = None
128
 
129
  # ── Gemma4-E2B-it ───────────────────────────────────────
130
  print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
 
139
  print("Gemma4-E2B-it model loaded successfully.")
140
  except Exception as e:
141
  print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
142
+ gemma4_e2b_model = None
143
+ gemma4_e2b_processor = None
144
 
145
  # ── LFM2.5-VL-1.6B ──────────────────────────────────────
146
  print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
 
152
  print("LFM-1.6B model loaded successfully.")
153
  except Exception as e:
154
  print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
155
+ lfm_16_model = None
156
+ lfm_16_processor = None
157
 
158
  # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
159
  print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
 
165
  print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
166
  except Exception as e:
167
  print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
168
+ qwen_unredacted_model = None
169
+ qwen_unredacted_processor = None
170
 
171
  # ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
172
  print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
 
178
  print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
179
  except Exception as e:
180
  print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
181
+ qwen25_vl_3b_model = None
182
+ qwen25_vl_3b_processor = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
 
185
# ---------------------------------------------------------------------------
# Utility: safe JSON parser (strips <think> blocks and markdown fences, falls
# back to Python-literal parsing, then to the first embedded JSON value)
# ---------------------------------------------------------------------------
def safe_parse_json(text: str):
    """Best-effort extraction of a JSON value from raw model output.

    Parsing strategy, in order:

    1. Remove ``<think>...</think>`` blocks and a leading/trailing markdown
       code fence, then parse the whole remaining text as JSON.
    2. Parse the whole text as a Python literal (covers single-quoted dicts).
    3. Scan the text left to right and return the first array or object that
       decodes as JSON, ignoring any prose around it.

    Args:
        text: Raw text produced by the model.

    Returns:
        The decoded value, or ``None`` when nothing parseable is found.
    """
    text = text.strip()
    # Drop any <think>...</think> block before looking for the payload.
    text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE).strip()
    text = re.sub(r"^```(json)?", "", text)
    text = re.sub(r"```$", "", text)
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # Try every '[' / '{' in positional order and let the decoder find the
    # matching end. A greedy first-bracket-to-last-bracket regex breaks as
    # soon as trailing prose contains another bracket, and trying arrays
    # before objects would return an inner array of a wrapped object.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[\[{]", text):
        try:
            value, _end = decoder.raw_decode(text, opener.start())
        except json.JSONDecodeError:
            continue
        return value
    return None
212
+
213
+
214
# ---------------------------------------------------------------------------
# Server-side annotation (mirrors reference annotate_image exactly)
# ---------------------------------------------------------------------------
# RGB triples cycled through by annotation index; the drawing helpers append
# an alpha component to build RGBA fills and outlines.
PALETTE_COLORS = [
    (78, 205, 196),    # teal
    (124, 106, 247),   # purple
    (255, 107, 107),   # red
    (255, 217, 61),    # yellow
    (107, 203, 119),   # green
    (255, 146, 43),    # orange
    (204, 93, 232),    # magenta
    (51, 154, 240),    # blue
]
227
+
228
+
229
def _get_font(size: int = 14):
    """Return the first loadable TrueType font at *size*, else PIL's default.

    Candidates are tried in order: bare file names first, then absolute
    DejaVu / Liberation paths. The requested size is not passed to the
    default-font fallback.
    """
    candidates = (
        "arial.ttf",
        "Arial.ttf",
        "DejaVuSans.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
    )
    for candidate in candidates:
        try:
            loaded = ImageFont.truetype(candidate, size)
        except (IOError, OSError):
            # Font not available here; move on to the next candidate.
            continue
        else:
            return loaded
    return ImageFont.load_default()
239
 
240
 
241
def annotate_detections(image: Image.Image, objects: list) -> Image.Image:
    """Draw bounding boxes and label pills on a copy of *image*.

    Args:
        image: Source image; it is converted to RGB and copied, so the
            caller's image is not drawn on.
        objects: Iterable of dicts with keys ``x_min``, ``y_min``, ``x_max``,
            ``y_max`` (fractions of the image width/height) and an optional
            ``label`` (defaults to ``"object"``).

    Returns:
        The annotated RGB image. Boxes that are empty after clamping to the
        image bounds are skipped.
    """
    image = image.convert("RGB").copy()
    W, H = image.size
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, W // 40))
    # Border thickness depends only on the image width, so compute it once.
    lw = max(2, W // 200)
    cw = max(2, lw + 1)

    for i, obj in enumerate(objects):
        col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
        col_rgba_fill = col + (46,)  # ~18% opacity fill
        col_rgba_solid = col + (255,)

        # Fractions -> pixels, clamped to the image bounds.
        x1 = max(0, int(obj["x_min"] * W))
        y1 = max(0, int(obj["y_min"] * H))
        x2 = min(W, int(obj["x_max"] * W))
        y2 = min(H, int(obj["y_max"] * H))
        if x2 <= x1 or y2 <= y1:
            continue

        # Translucent fill.
        draw.rectangle([x1, y1, x2, y2], fill=col_rgba_fill)

        # Thick border drawn as nested 1px outlines. Stop as soon as the
        # inset rectangle would invert: Pillow requires x1 >= x0 and
        # y1 >= y0, so boxes narrower than 2*lw must not be inset further.
        for t in range(lw):
            if x2 - t < x1 + t or y2 - t < y1 + t:
                break
            draw.rectangle([x1 + t, y1 + t, x2 - t, y2 - t], outline=col_rgba_solid)

        # Corner accents.
        ca = min(18, (x2 - x1) // 4, (y2 - y1) // 4)
        for (cx, cy, dx, dy) in [(x1, y1, 1, 1), (x2, y1, -1, 1), (x2, y2, -1, -1), (x1, y2, 1, -1)]:
            draw.line([cx, cy, cx + dx * ca, cy], fill=col_rgba_solid, width=cw)
            draw.line([cx, cy, cx, cy + dy * ca], fill=col_rgba_solid, width=cw)

        # Label pill. Labels come from model-generated JSON and may be
        # numbers or null, so coerce to text before measuring/drawing.
        raw_label = obj.get("label", "object")
        label = "object" if raw_label is None else str(raw_label)
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2] - bb[0], bb[3] - bb[1]
        except Exception:
            # Font object without usable metrics: fall back to a rough estimate.
            tw, th = len(label) * 7, 12
        pad = 5
        pw, ph = tw + pad * 2, th + pad * 2
        lx = max(0, min(x1, W - pw))
        # Prefer the pill above the box; drop it just inside when there is
        # no room above.
        ly = y1 - ph if y1 - ph >= 0 else y1 + 2
        draw.rounded_rectangle([lx, ly, lx + pw, ly + ph], radius=4, fill=col_rgba_solid)
        draw.text((lx + pad, ly + pad), label, fill=(255, 255, 255, 255), font=font_lbl)

    return image
295
 
296
 
297
def annotate_points(image: Image.Image, points: list) -> Image.Image:
    """Draw point markers and optional label pills on a copy of *image*.

    Args:
        image: Source image; it is converted to RGB and copied, so the
            caller's image is not drawn on.
        points: Iterable of dicts with keys ``x`` and ``y`` (fractions of the
            image width/height) and an optional ``label``.

    Returns:
        The annotated RGB image.
    """
    image = image.convert("RGB").copy()
    W, H = image.size
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, W // 40))

    # Marker geometry depends only on the image width; compute it once.
    r = max(7, W // 55)
    r_mid = int(r * 1.4)
    cr = max(2, r // 3)

    for i, pt in enumerate(points):
        col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
        col_rgba = col + (255,)
        glow_rgba = col + (40,)
        mid_rgba = col + (64,)

        # Fractions -> pixels, kept at least one radius away from the edges.
        cx = max(r, min(W - r, int(pt["x"] * W)))
        cy = max(r, min(H - r, int(pt["y"] * H)))

        # Outer glow
        draw.ellipse([cx - r * 2, cy - r * 2, cx + r * 2, cy + r * 2], fill=glow_rgba)
        # Mid ring
        draw.ellipse([cx - r_mid, cy - r_mid, cx + r_mid, cy + r_mid], fill=mid_rgba)
        # Core dot
        draw.ellipse([cx - r, cy - r, cx + r, cy + r], fill=col_rgba,
                     outline=(255, 255, 255, 255), width=cr)
        # Centre white dot
        draw.ellipse([cx - cr, cy - cr, cx + cr, cy + cr], fill=(255, 255, 255, 255))

        # Label pill. Labels come from model-generated JSON and may be
        # numbers or null, so coerce to text before measuring/drawing.
        raw_label = pt.get("label", "")
        label = "" if raw_label is None else str(raw_label)
        if not label:
            continue
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2] - bb[0], bb[3] - bb[1]
        except Exception:
            # Font object without usable metrics: fall back to a rough estimate.
            tw, th = len(label) * 7, 12
        pad = 5
        pw, ph = tw + pad * 2, th + pad * 2
        # Keep the pill on-canvas on both sides, as annotate_detections does.
        lx = max(0, min(cx + r + 6, W - pw))
        ly = max(0, cy - ph // 2)
        draw.rounded_rectangle([lx, ly, lx + pw, ly + ph], radius=4, fill=col_rgba)
        draw.text((lx + pad, ly + pad), label, fill=(255, 255, 255, 255), font=font_lbl)

    return image
345
 
346
 
347
def _finite_floats(coords, expected_len):
    """Return *coords* as a list of finite floats, or None when malformed."""
    import math  # local import keeps this block self-contained

    if not isinstance(coords, (list, tuple)) or len(coords) != expected_len:
        return None
    try:
        values = [float(v) for v in coords]
    except (TypeError, ValueError, OverflowError):
        return None
    # json.loads accepts NaN/Infinity; int() on those would raise later.
    return values if all(math.isfinite(v) for v in values) else None


def parse_and_annotate(image: Image.Image, full_text: str, category: str):
    """Parse model output and return ``(annotated_image, result_dict)``.

    Args:
        image: Image the model was run on.
        full_text: Complete text emitted by the model.
        category: ``"Point"`` or ``"Detect"``; any other value returns the
            image unchanged with an empty result.

    Returns:
        A tuple ``(image, result)``:

        * no parseable JSON: the original image and
          ``{"error": ..., "raw": <first 500 chars>}``;
        * ``"Point"``: annotated image and ``{"points": [{label, x, y}]}``;
        * ``"Detect"``: annotated image and
          ``{"objects": [{label, x_min, y_min, x_max, y_max}]}``.

        Coordinates in the result are the model values divided by 1000.
        Items whose coordinates are missing, of the wrong length,
        non-numeric or non-finite are skipped rather than failing the
        whole annotation.
    """
    parsed = safe_parse_json(full_text)
    if parsed is None:
        return image, {"error": "No JSON found in model output", "raw": full_text[:500]}

    # A single object is treated like a one-element list.
    items = parsed if isinstance(parsed, list) else [parsed]

    if category == "Point":
        points = []
        for item in items:
            if not isinstance(item, dict) or "point_2d" not in item:
                continue
            coords = _finite_floats(item["point_2d"], 2)
            if coords is None:
                continue
            x, y = coords
            # Model coordinates are divided by 1000 to obtain 0-1 fractions.
            points.append({
                "label": item.get("label", ""),
                "x": x / 1000.0,
                "y": y / 1000.0,
            })
        # annotate_points draws on its own copy, so no extra copy is needed.
        return annotate_points(image, points), {"points": points}

    if category == "Detect":
        objects = []
        for item in items:
            if not isinstance(item, dict) or "bbox_2d" not in item:
                continue
            coords = _finite_floats(item["bbox_2d"], 4)
            if coords is None:
                continue
            xmin, ymin, xmax, ymax = coords
            objects.append({
                "label": item.get("label", "object"),
                "x_min": xmin / 1000.0,
                "y_min": ymin / 1000.0,
                "x_max": xmax / 1000.0,
                "y_max": ymax / 1000.0,
            })
        # annotate_detections draws on its own copy as well.
        return annotate_detections(image, objects), {"objects": objects}

    return image, {}
392
+
393
+
394
def pil_to_png_bytes(image: Image.Image) -> bytes:
    """Encode *image* as PNG into an in-memory buffer and return the bytes."""
    with io.BytesIO() as stream:
        image.save(stream, format="PNG")
        png_data = stream.getvalue()
    return png_data
398
 
399
 
400
+ # ---------------------------------------------------------------------------
401
+ # Inference Generator (Streaming)
402
+ # ---------------------------------------------------------------------------
403
  @spaces.GPU(duration=120)
404
  def generate_inference_stream(
405
  image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
 
421
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
422
  yield "data: [DONE]\n\n"; return
423
  messages = [{"role": "user", "content": [
424
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
425
+ ]}]
426
+ text_input = qwen_vl_2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
427
+ inputs = qwen_vl_2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_vl_2b_model.device)
428
+ streamer = TextIteratorStreamer(qwen_vl_2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
429
+ thread = threading.Thread(target=qwen_vl_2b_model.generate,
430
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
431
+ thread.start()
 
 
 
432
  for tok in streamer:
433
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
434
+ thread.join()
435
 
436
  # ── Qwen3-VL-4B ─────────────────────────────────────
437
  elif model_id == "qwen_vl_4b":
 
439
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
440
  yield "data: [DONE]\n\n"; return
441
  messages = [{"role": "user", "content": [
442
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
443
+ ]}]
444
+ text_input = qwen_vl_4b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
445
+ inputs = qwen_vl_4b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_vl_4b_model.device)
446
+ streamer = TextIteratorStreamer(qwen_vl_4b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
447
+ thread = threading.Thread(target=qwen_vl_4b_model.generate,
448
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
449
+ thread.start()
 
 
 
450
  for tok in streamer:
451
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
452
+ thread.join()
453
 
454
  # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
455
  elif model_id == "qwen_4b_unredacted":
 
457
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
458
  yield "data: [DONE]\n\n"; return
459
  messages = [{"role": "user", "content": [
460
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
461
+ ]}]
462
+ text_input = qwen_4b_unredacted_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
463
+ inputs = qwen_4b_unredacted_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_4b_unredacted_model.device)
464
+ streamer = TextIteratorStreamer(qwen_4b_unredacted_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
465
+ thread = threading.Thread(target=qwen_4b_unredacted_model.generate,
466
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
467
+ thread.start()
 
 
 
468
  for tok in streamer:
469
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
470
+ thread.join()
471
 
472
  # ── Qwen3.5-4B ──────────────────────────────────────
473
  elif model_id == "qwen_4b":
 
475
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
476
  yield "data: [DONE]\n\n"; return
477
  messages = [{"role": "user", "content": [
478
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
479
+ ]}]
480
+ text_input = qwen_4b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
481
+ inputs = qwen_4b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_4b_model.device)
482
+ streamer = TextIteratorStreamer(qwen_4b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
483
+ thread = threading.Thread(target=qwen_4b_model.generate,
484
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
485
+ thread.start()
 
 
 
486
  for tok in streamer:
487
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
488
+ thread.join()
489
 
490
  # ── Qwen3.5-2B ──────────────────────────────────────
491
  elif model_id == "qwen_2b":
 
493
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
494
  yield "data: [DONE]\n\n"; return
495
  messages = [{"role": "user", "content": [
496
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
497
+ ]}]
498
+ text_input = qwen_2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
499
+ inputs = qwen_2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_2b_model.device)
500
+ streamer = TextIteratorStreamer(qwen_2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
501
+ thread = threading.Thread(target=qwen_2b_model.generate,
502
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
503
+ thread.start()
 
 
 
504
  for tok in streamer:
505
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
506
+ thread.join()
507
 
508
  # ── LFM-450M ────────────────────────────────────────
509
  elif model_id == "lfm_450":
 
511
  yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
512
  yield "data: [DONE]\n\n"; return
513
  conversation = [{"role": "user", "content": [
514
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
515
+ ]}]
516
  inputs = lfm_450_processor.apply_chat_template(
517
  conversation, add_generation_prompt=True,
518
  return_tensors="pt", return_dict=True, tokenize=True,
519
  ).to(lfm_450_model.device)
520
+ streamer = TextIteratorStreamer(lfm_450_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
521
+ thread = threading.Thread(target=lfm_450_model.generate,
522
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True))
523
+ thread.start()
 
524
  for tok in streamer:
525
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
526
+ thread.join()
527
 
528
  # ── Gemma4-E2B-it ──────────────��────────────────────
529
  elif model_id == "gemma4_e2b":
 
531
  yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
532
  yield "data: [DONE]\n\n"; return
533
  messages = [{"role": "user", "content": [
534
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
535
+ ]}]
536
+ text_input = gemma4_e2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
537
+ inputs = gemma4_e2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(gemma4_e2b_model.device)
538
+ streamer = TextIteratorStreamer(gemma4_e2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
539
+ thread = threading.Thread(target=gemma4_e2b_model.generate,
540
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
541
+ thread.start()
 
 
 
542
  for tok in streamer:
543
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
544
+ thread.join()
545
 
546
  # ── LFM-1.6B ────────────────────────────────────────
547
  elif model_id == "lfm_16":
 
549
  yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
550
  yield "data: [DONE]\n\n"; return
551
  conversation = [{"role": "user", "content": [
552
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
553
+ ]}]
554
  inputs = lfm_16_processor.apply_chat_template(
555
  conversation, add_generation_prompt=True,
556
  return_tensors="pt", return_dict=True, tokenize=True,
557
  ).to(lfm_16_model.device)
558
+ streamer = TextIteratorStreamer(lfm_16_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
559
+ thread = threading.Thread(target=lfm_16_model.generate,
560
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True))
561
+ thread.start()
 
562
  for tok in streamer:
563
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
564
+ thread.join()
565
 
566
  # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
567
  elif model_id == "qwen_unredacted":
 
569
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
570
  yield "data: [DONE]\n\n"; return
571
  messages = [{"role": "user", "content": [
572
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
573
+ ]}]
574
+ text_input = qwen_unredacted_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
575
+ inputs = qwen_unredacted_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_unredacted_model.device)
576
+ streamer = TextIteratorStreamer(qwen_unredacted_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
577
+ thread = threading.Thread(target=qwen_unredacted_model.generate,
578
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
579
+ thread.start()
 
 
 
580
  for tok in streamer:
581
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
582
+ thread.join()
583
 
584
  # ── Qwen2.5-VL-3B-Instruct ──────────────────────────
585
  elif model_id == "qwen25_vl_3b":
 
587
  yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
588
  yield "data: [DONE]\n\n"; return
589
  messages = [{"role": "user", "content": [
590
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt},
591
+ ]}]
592
+ text_input = qwen25_vl_3b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
593
  image_inputs, video_inputs = process_vision_info(messages)
594
  inputs = qwen25_vl_3b_processor(
595
  text=[text_input], images=image_inputs, videos=video_inputs,
596
  return_tensors="pt", padding=True,
597
  ).to(qwen25_vl_3b_model.device)
598
+ streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
599
+ thread = threading.Thread(target=qwen25_vl_3b_model.generate,
600
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
601
+ thread.start()
 
602
  for tok in streamer:
603
  if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
604
+ thread.join()
605
 
606
  yield "data: [DONE]\n\n"
607
 
608
 
609
# ---------------------------------------------------------------------------
# Endpoint: /api/annotate - receives image + model output text + category.
# Returns the annotated PNG (base64-encoded) + structured JSON.
# ---------------------------------------------------------------------------
@app.post("/api/annotate")
async def annotate_endpoint(
    image: UploadFile = File(...),
    text: str = Form(...),
    category: str = Form(...),
):
    """Annotate an uploaded image from the model's grounding output.

    Form fields:
        image: The uploaded image file.
        text: Full model output text to parse for coordinates.
        category: Task category forwarded to ``parse_and_annotate``.

    Returns:
        ``JSONResponse`` with ``image_b64`` (base64 of the annotated PNG) and
        ``result`` (the structured dict from ``parse_and_annotate``). Any
        exception is reported as ``{"error": <message>}`` with status 500.
    """
    # Imported locally: base64 is not among the module-level imports
    # visible for this file, and this is its only use here.
    import base64

    try:
        img_bytes = await image.read()
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        annotated_img, result_dict = parse_and_annotate(img, text, category)
        png_bytes = pil_to_png_bytes(annotated_img)
        return JSONResponse({
            "image_b64": base64.b64encode(png_bytes).decode(),
            "result": result_dict,
        })
    except Exception as e:
        # Endpoint boundary: turn any failure into a JSON error payload.
        return JSONResponse({"error": str(e)}, status_code=500)
630
+
631
+
632
+ # ---------------------------------------------------------------------------
633
+ # Main inference endpoint
634
+ # ---------------------------------------------------------------------------
635
  @app.post("/api/run")
636
  async def run_inference(
637
  image: UploadFile = File(...),
 
651
  return JSONResponse({"error": str(e)}, status_code=500)
652
 
653
 
654
+ # ---------------------------------------------------------------------------
655
+ # Frontend
656
+ # ---------------------------------------------------------------------------
657
  @app.get("/", response_class=HTMLResponse)
658
  async def homepage(request: Request):
659
+ return r"""
660
  <!DOCTYPE html>
661
  <html lang="en">
662
  <head>
 
697
  /* ── Top Bar ── */
698
  .top-bar {
699
  position: sticky; top: 0; left: 0; right: 0; height: 42px;
700
+ background: rgba(13,13,15,0.95); border-bottom: 1px solid var(--node-border);
 
701
  display: flex; align-items: center; padding: 0 20px;
702
  gap: 12px; z-index: 1000; backdrop-filter: blur(12px);
703
  }
704
+ .top-bar .logo { font-size: 13px; font-weight: 700; color: var(--accent); letter-spacing: 0.05em; }
705
+ .top-bar .sep { color: var(--node-border); }
706
+ .top-bar .sub { font-size: 11px; color: var(--muted); }
707
  .top-bar .badge {
708
+ margin-left: auto; background: rgba(124,106,247,0.15);
709
+ border: 1px solid rgba(124,106,247,0.3); padding: 3px 10px;
710
+ border-radius: 20px; font-size: 10px; color: var(--accent);
711
  }
712
  /* ── Canvas ── */
713
  #canvas {
 
796
  border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
797
  }
798
  .img-chip.visible { display: flex; }
799
+ .img-chip .chip-dot { width: 5px; height: 5px; border-radius: 50%; background: var(--accent2); flex-shrink: 0; box-shadow: 0 0 4px var(--accent2); }
800
+ .img-chip .chip-name { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; flex: 1; color: var(--text); font-size: 9px; }
801
+ .img-chip .chip-size { color: var(--muted); flex-shrink: 0; font-size: 9px; }
802
  select, textarea {
803
  width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
804
  color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
 
825
  .icon-btn {
826
  display: flex; align-items: center; gap: 5px;
827
  background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
828
+ border-radius: 5px; padding: 3px 8px; font-size: 9px; font-weight: 700;
829
+ font-family: 'JetBrains Mono', monospace; color: var(--accent); cursor: pointer;
830
+ letter-spacing: 0.05em; transition: background 0.18s, border-color 0.18s, transform 0.1s;
831
+ flex-shrink: 0; text-decoration: none;
 
832
  }
833
  .icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
834
  .icon-btn:active { transform: scale(0.95); }
835
+ .icon-btn.teal { background: rgba(78,205,196,0.10); border-color: rgba(78,205,196,0.25); color: var(--accent2); }
836
+ .icon-btn.teal:hover { background: rgba(78,205,196,0.22); border-color: var(--accent2); }
837
+ .icon-btn.copied { background: rgba(78,205,196,0.15); border-color: var(--accent2); color: var(--accent2); }
838
+ .icon-btn svg { pointer-events: none; flex-shrink: 0; }
839
  .output-box {
840
  background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
841
  border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
 
850
  border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
851
  display: flex; align-items: center; justify-content: center;
852
  }
853
+ /* annotated image displayed via <img> tag — no canvas needed */
854
+ .ground-img-wrap img.overlay-img {
855
+ max-width: 100%; max-height: 100%;
856
+ object-fit: contain; display: block;
857
  }
858
  .ground-placeholder {
859
  position: absolute; inset: 0; display: flex; align-items: center;
 
866
  animation: spin 0.7s linear infinite; display: none;
867
  }
868
  @keyframes spin { to { transform: rotate(360deg); } }
869
+ .status-dot { width: 6px; height: 6px; border-radius: 50%; background: var(--muted); display: inline-block; margin-right: 6px; }
870
+ .status-dot.active { background: var(--accent2); box-shadow: 0 0 5px var(--accent2); }
871
  /* ── Model badges ── */
872
  .model-badge {
873
+ display: inline-block; padding: 2px 7px; border-radius: 4px;
874
+ font-size: 9px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase;
875
  }
876
+ .model-badge.qvl2b { background: rgba(255,150,50,0.15); color: #ff9632; border: 1px solid rgba(255,150,50,0.35); }
877
+ .model-badge.qvl4b { background: rgba(255,100,80,0.15); color: #ff6450; border: 1px solid rgba(255,100,80,0.35); }
878
+ .model-badge.q4bunred { background: rgba(255,80,80,0.18); color: #ff5050; border: 1px solid rgba(255,80,80,0.40); }
879
+ .model-badge.q4b { background: rgba(255,200,80,0.15); color: #ffc850; border: 1px solid rgba(255,200,80,0.35); }
880
+ .model-badge.q2b { background: rgba(124,106,247,0.2); color: var(--accent); border: 1px solid rgba(124,106,247,0.3); }
881
+ .model-badge.lfm450 { background: rgba(78,205,196,0.15); color: var(--accent2); border: 1px solid rgba(78,205,196,0.3); }
882
+ .model-badge.g4e2b { background: rgba(66,197,107,0.15); color: #42c56b; border: 1px solid rgba(66,197,107,0.35); }
883
+ .model-badge.lfm16 { background: rgba(107,203,119,0.15); color: #6bcb77; border: 1px solid rgba(107,203,119,0.35); }
884
+ .model-badge.qunred { background: rgba(255,80,160,0.15); color: #ff50a0; border: 1px solid rgba(255,80,160,0.35); }
885
+ .model-badge.q25vl3b { background: rgba(80,180,255,0.15); color: #50b4ff; border: 1px solid rgba(80,180,255,0.35); }
886
+ .model-info-box { border-radius: 6px; padding: 9px; font-size: 10px; color: var(--muted); line-height: 1.55; flex-shrink: 0; }
887
  .canvas-footer { height: 36px; }
888
  </style>
889
  </head>
 
1049
  SAVE
1050
  </a>
1051
  </div>
1052
+ <div class="ground-img-wrap" id="groundWrap">
1053
+ <img class="overlay-img" id="overlayImg" src="" style="display:none;" />
 
1054
  <div class="ground-placeholder" id="groundPlaceholder">
1055
  Active for Point / Detect tasks.<br>Run inference to visualise.
1056
  </div>
 
1102
  });
1103
  document.addEventListener('mousemove', e => {
1104
  if (!drag) return;
1105
+ node.style.left=`${il+e.clientX-sx}px`; node.style.top=`${it+e.clientY-sy}px`;
 
1106
  updateWires();
1107
  });
1108
  document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
 
1132
  return (b/1048576).toFixed(1)+' MB';
1133
  }
1134
  function handleFile(file) {
1135
+ if (!file||!file.type.startsWith('image/')) return;
1136
+ currentFile=file;
1137
+ imgPreview.src=URL.createObjectURL(file);
1138
  previewWrap.classList.add('visible');
1139
+ dropZone.style.display='none';
1140
+ chipName.textContent=file.name;
1141
+ chipSize.textContent=formatBytes(file.size);
1142
  imgChip.classList.add('visible');
1143
  dotImg.classList.add('active');
1144
  requestAnimationFrame(updateWires);
1145
  }
1146
  function clearImage() {
1147
+ currentFile=null; imgPreview.src='';
 
1148
  previewWrap.classList.remove('visible');
1149
+ dropZone.style.display='';
1150
  imgChip.classList.remove('visible');
1151
+ chipName.textContent='β€”'; chipSize.textContent='';
1152
+ fileInput.value=''; dotImg.classList.remove('active');
 
 
1153
  requestAnimationFrame(updateWires);
1154
  }
1155
  dropZone.onclick = () => fileInput.click();
 
1198
  qwen_2b: {
1199
  html: `<span class="model-badge q2b">QWEN 3.5 · 2B</span><br><br>
1200
  Qwen3.5 2B multimodal model by Alibaba Cloud.
1201
+ Lightweight &amp; fast — ideal for quick Query, Caption, Point &amp; Detect tasks.`,
1202
  bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
1203
  },
1204
  lfm_450: {
 
1251
  Point: 'e.g., The gun held by the person.',
1252
  Detect: 'e.g., The headlight of the car.',
1253
  };
1254
+ categorySelect.onchange = e => { promptInput.placeholder = PLACEHOLDERS[e.target.value] || ''; };
 
 
1255
 
1256
  // ══════════════════════════════════════════════
1257
  // COPY BUTTON
 
1284
  }).catch(() => {
1285
  const ta = document.createElement('textarea');
1286
  ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0';
1287
+ document.body.appendChild(ta); ta.select(); document.execCommand('copy');
1288
+ document.body.removeChild(ta);
1289
  });
1290
  };
1291
 
1292
  // ══════════════════════════════════════════════
1293
+ // GROUNDING DISPLAY (server-side annotated image)
1294
  // ══════════════════════════════════════════════
1295
+ const overlayImg = document.getElementById('overlayImg');
1296
  const groundPlaceholder = document.getElementById('groundPlaceholder');
1297
  const downloadBtn = document.getElementById('downloadBtn');
1298
  const dotGnd = document.getElementById('dot-gnd');
1299
 
1300
+ function showOverlay(b64png) {
1301
+ const src = 'data:image/png;base64,' + b64png;
1302
+ overlayImg.src = src;
1303
+ overlayImg.style.display = 'block';
1304
  groundPlaceholder.style.display = 'none';
1305
+ dotGnd.classList.add('active');
1306
+
1307
+ // Update download button
1308
  const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
1309
+ downloadBtn.href = src;
1310
  downloadBtn.download = `grounding_${ts}.png`;
1311
  downloadBtn.style.display = 'flex';
 
1312
  }
1313
 
1314
+ function resetOverlay(msg) {
1315
+ overlayImg.src = '';
1316
+ overlayImg.style.display = 'none';
1317
+ groundPlaceholder.textContent = msg || 'Active for Point / Detect tasks.\nRun inference to visualise.';
1318
  groundPlaceholder.style.display = 'flex';
1319
  downloadBtn.style.display = 'none';
1320
  dotGnd.classList.remove('active');
 
1334
  const promptStr = promptInput.value.trim();
1335
  if (!promptStr) { alert('Please enter a prompt directive.'); return; }
1336
 
1337
+ // ── Reset UI ──────────────────────────────
1338
  runBtn.disabled = true;
1339
  btnLoader.style.display = 'inline-block';
1340
  outputBox.innerText = '';
 
1343
  dotOut.classList.remove('active');
1344
  allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
1345
  resetCopyBtn();
1346
+ resetOverlay('Running inference…');
1347
 
1348
+ const category = categorySelect.value;
1349
+ const modelId = modelSelect.value;
 
 
1350
 
1351
+ // ── Step 1: stream text from /api/run ─────
1352
  const formData = new FormData();
1353
  formData.append('image', currentFile);
1354
+ formData.append('category', category);
1355
  formData.append('prompt', promptStr);
1356
+ formData.append('model_id', modelId);
1357
 
1358
  let fullText = '';
1359
 
1360
  try {
 
1361
  const response = await fetch('/api/run', { method: 'POST', body: formData });
1362
  if (!response.ok) {
1363
  const err = await response.json();
 
1366
 
1367
  const reader = response.body.getReader();
1368
  const decoder = new TextDecoder('utf-8');
1369
+ let buffer = '';
1370
 
1371
  while (true) {
1372
  const { value, done } = await reader.read();
1373
  if (done) break;
1374
  buffer += decoder.decode(value, { stream: true });
1375
+ const lines = buffer.split('\n\n');
1376
+ buffer = lines.pop();
 
1377
  for (const line of lines) {
1378
  if (!line.startsWith('data: ')) continue;
1379
  const payload = line.slice(6);
 
1391
 
1392
  dotOut.classList.add('active');
1393
 
1394
+ // ── Step 2: if Point or Detect β†’ call /api/annotate ──
1395
+ if ((category === 'Point' || category === 'Detect') && fullText.trim()) {
1396
+ groundPlaceholder.textContent = 'Annotating image…';
1397
+ groundPlaceholder.style.display = 'flex';
1398
+
1399
  try {
1400
+ const annotForm = new FormData();
1401
+ annotForm.append('image', currentFile);
1402
+ annotForm.append('text', fullText);
1403
+ annotForm.append('category', category);
1404
 
1405
+ const annotResp = await fetch('/api/annotate', {
1406
+ method: 'POST', body: annotForm,
1407
  });
1408
+ if (!annotResp.ok) throw new Error('Annotation request failed');
1409
 
1410
+ const annotData = await annotResp.json();
1411
+ if (annotData.error) {
1412
+ resetOverlay('Annotation error: ' + annotData.error);
1413
+ } else if (annotData.image_b64) {
1414
+ showOverlay(annotData.image_b64);
1415
  } else {
1416
+ resetOverlay('No coordinates found in model output.');
 
 
 
 
1417
  }
1418
+ } catch (annotErr) {
1419
+ resetOverlay('Annotation failed: ' + annotErr.message);
1420
+ console.error('Annotation error:', annotErr);
1421
  }
1422
+ } else if (category !== 'Point' && category !== 'Detect') {
1423
+ resetOverlay('Active for Point / Detect tasks.\nRun inference to visualise.');
1424
  }
1425
 
1426
  } catch (err) {
1427
  outputBox.innerText = `[Error] ${err.message}`;
1428
  outputBox.style.color = '#ff6b6b';
1429
+ resetOverlay('Inference error — see Output Stream node.');
 
 
1430
  } finally {
1431
  runBtn.disabled = false;
1432
  btnLoader.style.display = 'none';