prithivMLmods commited on
Commit
fd39af9
Β·
verified Β·
1 Parent(s): 2969977

update app

Browse files
Files changed (1) hide show
  1. app.py +508 -692
app.py CHANGED
@@ -4,13 +4,15 @@ import json
4
  import ast
5
  import re
6
  import uuid
 
7
  import threading
 
8
  from pathlib import Path
9
  from typing import Optional
10
 
11
  import spaces
12
  import torch
13
- from PIL import Image
14
 
15
  from gradio import Server
16
  from fastapi import Request, UploadFile, File, Form
@@ -51,35 +53,25 @@ QWEN25_VL_3B_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
51
  print(f"Loading Qwen3-VL-2B model: {QWEN_VL_2B_MODEL_NAME} on {DEVICE}...")
52
  try:
53
  qwen_vl_2b_model = Qwen3VLForConditionalGeneration.from_pretrained(
54
- QWEN_VL_2B_MODEL_NAME,
55
- trust_remote_code=True,
56
- torch_dtype=torch.bfloat16,
57
  ).to(DEVICE).eval()
58
- qwen_vl_2b_processor = AutoProcessor.from_pretrained(
59
- QWEN_VL_2B_MODEL_NAME, trust_remote_code=True
60
- )
61
  print("Qwen3-VL-2B model loaded successfully.")
62
  except Exception as e:
63
  print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
64
- qwen_vl_2b_model = None
65
- qwen_vl_2b_processor = None
66
 
67
  # ── Qwen3-VL-4B-Instruct ────────────────────────────────
68
  print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
69
  try:
70
  qwen_vl_4b_model = Qwen3VLForConditionalGeneration.from_pretrained(
71
- QWEN_VL_4B_MODEL_NAME,
72
- trust_remote_code=True,
73
- torch_dtype=torch.bfloat16,
74
  ).to(DEVICE).eval()
75
- qwen_vl_4b_processor = AutoProcessor.from_pretrained(
76
- QWEN_VL_4B_MODEL_NAME, trust_remote_code=True
77
- )
78
  print("Qwen3-VL-4B model loaded successfully.")
79
  except Exception as e:
80
  print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
81
- qwen_vl_4b_model = None
82
- qwen_vl_4b_processor = None
83
 
84
  # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
85
  print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
@@ -91,8 +83,7 @@ try:
91
  print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
92
  except Exception as e:
93
  print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
94
- qwen_4b_unredacted_model = None
95
- qwen_4b_unredacted_processor = None
96
 
97
  # ── Qwen3.5-4B ──────────────────────────────────────────
98
  print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
@@ -104,8 +95,7 @@ try:
104
  print("Qwen3.5-4B model loaded successfully.")
105
  except Exception as e:
106
  print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
107
- qwen_4b_model = None
108
- qwen_4b_processor = None
109
 
110
  # ── Qwen3.5-2B ──────────────────────────────────────────
111
  print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
@@ -117,30 +107,25 @@ try:
117
  print("Qwen3.5-2B model loaded successfully.")
118
  except Exception as e:
119
  print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
120
- qwen_2b_model = None
121
- qwen_2b_processor = None
122
 
123
  # ── LFM2.5-VL-450M ──────────────────────────────────────
124
  print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
125
  try:
126
  lfm_450_model = AutoModelForImageTextToText.from_pretrained(
127
- LFM_450_MODEL_NAME,
128
- device_map="auto",
129
- torch_dtype=torch.bfloat16,
130
  ).eval()
131
  lfm_450_processor = AutoProcessor.from_pretrained(LFM_450_MODEL_NAME)
132
  print("LFM-450M model loaded successfully.")
133
  except Exception as e:
134
  print(f"Warning: LFM-450M model loading failed. Error: {e}")
135
- lfm_450_model = None
136
- lfm_450_processor = None
137
 
138
  # ── Gemma4-E2B-it ───────────────────────────────────────
139
  print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
140
  try:
141
  gemma4_e2b_model = Gemma4ForConditionalGeneration.from_pretrained(
142
- GEMMA4_E2B_NAME,
143
- torch_dtype=torch.bfloat16,
144
  device_map="auto" if torch.cuda.is_available() else None,
145
  ).eval()
146
  if not torch.cuda.is_available():
@@ -149,23 +134,19 @@ try:
149
  print("Gemma4-E2B-it model loaded successfully.")
150
  except Exception as e:
151
  print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
152
- gemma4_e2b_model = None
153
- gemma4_e2b_processor = None
154
 
155
  # ── LFM2.5-VL-1.6B ──────────────────────────────────────
156
  print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
157
  try:
158
  lfm_16_model = AutoModelForImageTextToText.from_pretrained(
159
- LFM_16_MODEL_NAME,
160
- device_map="auto",
161
- torch_dtype=torch.bfloat16,
162
  ).eval()
163
  lfm_16_processor = AutoProcessor.from_pretrained(LFM_16_MODEL_NAME)
164
  print("LFM-1.6B model loaded successfully.")
165
  except Exception as e:
166
  print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
167
- lfm_16_model = None
168
- lfm_16_processor = None
169
 
170
  # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
171
  print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
@@ -177,28 +158,54 @@ try:
177
  print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
178
  except Exception as e:
179
  print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
180
- qwen_unredacted_model = None
181
- qwen_unredacted_processor = None
182
 
183
  # ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
184
  print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
185
  try:
186
  qwen25_vl_3b_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
187
- QWEN25_VL_3B_NAME,
188
- torch_dtype="auto",
189
- device_map="auto",
190
  ).eval()
191
  qwen25_vl_3b_processor = AutoProcessor.from_pretrained(QWEN25_VL_3B_NAME)
192
  print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
193
  except Exception as e:
194
  print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
195
- qwen25_vl_3b_model = None
196
- qwen25_vl_3b_processor = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
 
199
- # --- Utility Functions ---
200
  def safe_parse_json(text: str):
 
 
 
201
  text = text.strip()
 
202
  text = re.sub(r"^```(json)?", "", text)
203
  text = re.sub(r"```$", "", text)
204
  text = text.strip()
@@ -206,13 +213,204 @@ def safe_parse_json(text: str):
206
  return json.loads(text)
207
  except json.JSONDecodeError:
208
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  try:
210
  return ast.literal_eval(text)
211
  except Exception:
212
  return {}
213
 
214
 
215
- # --- Inference Generator (Streaming) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  @spaces.GPU(duration=120)
217
  def generate_inference_stream(
218
  image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
@@ -232,315 +430,208 @@ def generate_inference_stream(
232
  if model_id == "qwen_vl_2b":
233
  if qwen_vl_2b_model is None or qwen_vl_2b_processor is None:
234
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
235
- yield "data: [DONE]\n\n"
236
- return
237
  messages = [{"role": "user", "content": [
238
- {"type": "image", "image": image},
239
- {"type": "text", "text": full_prompt},
240
- ]}]
241
  text_input = qwen_vl_2b_processor.apply_chat_template(
242
- messages, tokenize=False, add_generation_prompt=True
243
- )
244
  inputs = qwen_vl_2b_processor(
245
  text=[text_input], images=[image], return_tensors="pt", padding=True
246
  ).to(qwen_vl_2b_model.device)
247
- streamer = TextIteratorStreamer(
248
- qwen_vl_2b_processor.tokenizer,
249
- skip_prompt=True, skip_special_tokens=True, timeout=120,
250
- )
251
- thread = threading.Thread(
252
- target=qwen_vl_2b_model.generate,
253
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
254
- use_cache=True, temperature=1.0, do_sample=True),
255
- )
256
- thread.start()
257
  for tok in streamer:
258
- if tok:
259
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
260
- thread.join()
261
 
262
  # ── Qwen3-VL-4B ─────────────────────────────────────
263
  elif model_id == "qwen_vl_4b":
264
  if qwen_vl_4b_model is None or qwen_vl_4b_processor is None:
265
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
266
- yield "data: [DONE]\n\n"
267
- return
268
  messages = [{"role": "user", "content": [
269
- {"type": "image", "image": image},
270
- {"type": "text", "text": full_prompt},
271
- ]}]
272
  text_input = qwen_vl_4b_processor.apply_chat_template(
273
- messages, tokenize=False, add_generation_prompt=True
274
- )
275
  inputs = qwen_vl_4b_processor(
276
  text=[text_input], images=[image], return_tensors="pt", padding=True
277
  ).to(qwen_vl_4b_model.device)
278
- streamer = TextIteratorStreamer(
279
- qwen_vl_4b_processor.tokenizer,
280
- skip_prompt=True, skip_special_tokens=True, timeout=120,
281
- )
282
- thread = threading.Thread(
283
- target=qwen_vl_4b_model.generate,
284
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
285
- use_cache=True, temperature=1.0, do_sample=True),
286
- )
287
- thread.start()
288
  for tok in streamer:
289
- if tok:
290
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
291
- thread.join()
292
 
293
  # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
294
  elif model_id == "qwen_4b_unredacted":
295
  if qwen_4b_unredacted_model is None or qwen_4b_unredacted_processor is None:
296
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
297
- yield "data: [DONE]\n\n"
298
- return
299
  messages = [{"role": "user", "content": [
300
- {"type": "image", "image": image},
301
- {"type": "text", "text": full_prompt},
302
- ]}]
303
  text_input = qwen_4b_unredacted_processor.apply_chat_template(
304
- messages, tokenize=False, add_generation_prompt=True
305
- )
306
  inputs = qwen_4b_unredacted_processor(
307
  text=[text_input], images=[image], return_tensors="pt", padding=True
308
  ).to(qwen_4b_unredacted_model.device)
309
- streamer = TextIteratorStreamer(
310
- qwen_4b_unredacted_processor.tokenizer,
311
- skip_prompt=True, skip_special_tokens=True, timeout=120,
312
- )
313
- thread = threading.Thread(
314
- target=qwen_4b_unredacted_model.generate,
315
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
316
- use_cache=True, temperature=1.5, min_p=0.1),
317
- )
318
- thread.start()
319
  for tok in streamer:
320
- if tok:
321
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
322
- thread.join()
323
 
324
  # ── Qwen3.5-4B ───────────────────────────���──────────
325
  elif model_id == "qwen_4b":
326
  if qwen_4b_model is None or qwen_4b_processor is None:
327
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
328
- yield "data: [DONE]\n\n"
329
- return
330
  messages = [{"role": "user", "content": [
331
- {"type": "image", "image": image},
332
- {"type": "text", "text": full_prompt},
333
- ]}]
334
  text_input = qwen_4b_processor.apply_chat_template(
335
- messages, tokenize=False, add_generation_prompt=True
336
- )
337
  inputs = qwen_4b_processor(
338
  text=[text_input], images=[image], return_tensors="pt", padding=True
339
  ).to(qwen_4b_model.device)
340
- streamer = TextIteratorStreamer(
341
- qwen_4b_processor.tokenizer,
342
- skip_prompt=True, skip_special_tokens=True, timeout=120,
343
- )
344
- thread = threading.Thread(
345
- target=qwen_4b_model.generate,
346
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
347
- use_cache=True, temperature=1.5, min_p=0.1),
348
- )
349
- thread.start()
350
  for tok in streamer:
351
- if tok:
352
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
353
- thread.join()
354
 
355
  # ── Qwen3.5-2B ──────────────────────────────────────
356
  elif model_id == "qwen_2b":
357
  if qwen_2b_model is None or qwen_2b_processor is None:
358
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
359
- yield "data: [DONE]\n\n"
360
- return
361
  messages = [{"role": "user", "content": [
362
- {"type": "image", "image": image},
363
- {"type": "text", "text": full_prompt},
364
- ]}]
365
  text_input = qwen_2b_processor.apply_chat_template(
366
- messages, tokenize=False, add_generation_prompt=True
367
- )
368
  inputs = qwen_2b_processor(
369
  text=[text_input], images=[image], return_tensors="pt", padding=True
370
  ).to(qwen_2b_model.device)
371
- streamer = TextIteratorStreamer(
372
- qwen_2b_processor.tokenizer,
373
- skip_prompt=True, skip_special_tokens=True, timeout=120,
374
- )
375
- thread = threading.Thread(
376
- target=qwen_2b_model.generate,
377
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
378
- use_cache=True, temperature=1.5, min_p=0.1),
379
- )
380
- thread.start()
381
  for tok in streamer:
382
- if tok:
383
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
384
- thread.join()
385
 
386
  # ── LFM-450M ────────────────────────────────────────
387
  elif model_id == "lfm_450":
388
  if lfm_450_model is None or lfm_450_processor is None:
389
  yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
390
- yield "data: [DONE]\n\n"
391
- return
392
  conversation = [{"role": "user", "content": [
393
- {"type": "image", "image": image},
394
- {"type": "text", "text": full_prompt},
395
- ]}]
396
  inputs = lfm_450_processor.apply_chat_template(
397
  conversation, add_generation_prompt=True,
398
  return_tensors="pt", return_dict=True, tokenize=True,
399
  ).to(lfm_450_model.device)
400
- streamer = TextIteratorStreamer(
401
- lfm_450_processor.tokenizer,
402
- skip_prompt=True, skip_special_tokens=True, timeout=120,
403
- )
404
- thread = threading.Thread(
405
- target=lfm_450_model.generate,
406
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True),
407
- )
408
- thread.start()
409
  for tok in streamer:
410
- if tok:
411
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
412
- thread.join()
413
 
414
  # ── Gemma4-E2B-it ───────────────────────────────────
415
  elif model_id == "gemma4_e2b":
416
  if gemma4_e2b_model is None or gemma4_e2b_processor is None:
417
  yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
418
- yield "data: [DONE]\n\n"
419
- return
420
  messages = [{"role": "user", "content": [
421
- {"type": "image", "image": image},
422
- {"type": "text", "text": full_prompt},
423
- ]}]
424
  text_input = gemma4_e2b_processor.apply_chat_template(
425
- messages, tokenize=False, add_generation_prompt=True
426
- )
427
  inputs = gemma4_e2b_processor(
428
  text=[text_input], images=[image], return_tensors="pt", padding=True,
429
  ).to(gemma4_e2b_model.device)
430
- streamer = TextIteratorStreamer(
431
- gemma4_e2b_processor.tokenizer,
432
- skip_prompt=True, skip_special_tokens=True, timeout=120,
433
- )
434
- thread = threading.Thread(
435
- target=gemma4_e2b_model.generate,
436
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
437
- use_cache=True, temperature=1.0, do_sample=True),
438
- )
439
- thread.start()
440
  for tok in streamer:
441
- if tok:
442
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
443
- thread.join()
444
 
445
  # ── LFM-1.6B ────────────────────────────────────────
446
  elif model_id == "lfm_16":
447
  if lfm_16_model is None or lfm_16_processor is None:
448
  yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
449
- yield "data: [DONE]\n\n"
450
- return
451
  conversation = [{"role": "user", "content": [
452
- {"type": "image", "image": image},
453
- {"type": "text", "text": full_prompt},
454
- ]}]
455
  inputs = lfm_16_processor.apply_chat_template(
456
  conversation, add_generation_prompt=True,
457
  return_tensors="pt", return_dict=True, tokenize=True,
458
  ).to(lfm_16_model.device)
459
- streamer = TextIteratorStreamer(
460
- lfm_16_processor.tokenizer,
461
- skip_prompt=True, skip_special_tokens=True, timeout=120,
462
- )
463
- thread = threading.Thread(
464
- target=lfm_16_model.generate,
465
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True),
466
- )
467
- thread.start()
468
  for tok in streamer:
469
- if tok:
470
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
471
- thread.join()
472
 
473
  # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
474
  elif model_id == "qwen_unredacted":
475
  if qwen_unredacted_model is None or qwen_unredacted_processor is None:
476
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
477
- yield "data: [DONE]\n\n"
478
- return
479
  messages = [{"role": "user", "content": [
480
- {"type": "image", "image": image},
481
- {"type": "text", "text": full_prompt},
482
- ]}]
483
  text_input = qwen_unredacted_processor.apply_chat_template(
484
- messages, tokenize=False, add_generation_prompt=True
485
- )
486
  inputs = qwen_unredacted_processor(
487
  text=[text_input], images=[image], return_tensors="pt", padding=True
488
  ).to(qwen_unredacted_model.device)
489
- streamer = TextIteratorStreamer(
490
- qwen_unredacted_processor.tokenizer,
491
- skip_prompt=True, skip_special_tokens=True, timeout=120,
492
- )
493
- thread = threading.Thread(
494
- target=qwen_unredacted_model.generate,
495
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
496
- use_cache=True, temperature=1.5, min_p=0.1),
497
- )
498
- thread.start()
499
  for tok in streamer:
500
- if tok:
501
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
502
- thread.join()
503
 
504
  # ── Qwen2.5-VL-3B-Instruct ──────────────────────────
505
  elif model_id == "qwen25_vl_3b":
506
  if qwen25_vl_3b_model is None or qwen25_vl_3b_processor is None:
507
  yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
508
- yield "data: [DONE]\n\n"
509
- return
510
  messages = [{"role": "user", "content": [
511
- {"type": "image", "image": image},
512
- {"type": "text", "text": full_prompt},
513
- ]}]
514
  text_input = qwen25_vl_3b_processor.apply_chat_template(
515
- messages, tokenize=False, add_generation_prompt=True
516
- )
517
  image_inputs, video_inputs = process_vision_info(messages)
518
  inputs = qwen25_vl_3b_processor(
519
- text=[text_input],
520
- images=image_inputs,
521
- videos=video_inputs,
522
- return_tensors="pt",
523
- padding=True,
524
  ).to(qwen25_vl_3b_model.device)
525
- streamer = TextIteratorStreamer(
526
- qwen25_vl_3b_processor.tokenizer,
527
- skip_prompt=True, skip_special_tokens=True, timeout=120,
528
- )
529
- thread = threading.Thread(
530
- target=qwen25_vl_3b_model.generate,
531
- kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
532
- use_cache=True, temperature=1.0, do_sample=True),
533
- )
534
- thread.start()
535
  for tok in streamer:
536
- if tok:
537
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
538
- thread.join()
539
 
540
  yield "data: [DONE]\n\n"
541
 
542
 
543
- # --- FastAPI Endpoints ---
 
 
544
  @app.post("/api/run")
545
  async def run_inference(
546
  image: UploadFile = File(...),
@@ -560,7 +651,9 @@ async def run_inference(
560
  return JSONResponse({"error": str(e)}, status_code=500)
561
 
562
 
563
- # --- Frontend UI ---
 
 
564
  @app.get("/", response_class=HTMLResponse)
565
  async def homepage(request: Request):
566
  return """
@@ -614,10 +707,8 @@ async def homepage(request: Request):
614
  .top-bar .sub { font-size: 11px; color: var(--muted); }
615
  .top-bar .badge {
616
  margin-left: auto;
617
- background: rgba(124,106,247,0.15);
618
- border: 1px solid rgba(124,106,247,0.3);
619
- padding: 3px 10px; border-radius: 20px;
620
- font-size: 10px; color: var(--accent);
621
  }
622
  /* ── Canvas ── */
623
  #canvas {
@@ -625,8 +716,7 @@ async def homepage(request: Request):
625
  min-height: calc(100vh - 42px); height: 900px; margin: 0 auto;
626
  }
627
  svg.wires {
628
- position: absolute; top: 0; left: 0;
629
- width: 100%; height: 100%;
630
  pointer-events: none; z-index: 2; overflow: visible;
631
  }
632
  path.wire { fill: none; stroke: var(--wire); stroke-width: 2.5; stroke-linecap: round; }
@@ -707,9 +797,9 @@ async def homepage(request: Request):
707
  border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
708
  }
709
  .img-chip.visible { display: flex; }
710
- .img-chip .chip-dot { width: 5px; height: 5px; border-radius: 50%; background: var(--accent2); flex-shrink: 0; box-shadow: 0 0 4px var(--accent2); }
711
- .img-chip .chip-name { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; flex: 1; color: var(--text); font-size: 9px; }
712
- .img-chip .chip-size { color: var(--muted); flex-shrink: 0; font-size: 9px; }
713
  select, textarea {
714
  width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
715
  color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
@@ -732,7 +822,7 @@ async def homepage(request: Request):
732
  /* ── Output node ── */
733
  .output-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
734
  .output-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
735
- /* ── Icon buttons (copy / download) ── */
736
  .icon-btn {
737
  display: flex; align-items: center; gap: 5px;
738
  background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
@@ -740,16 +830,14 @@ async def homepage(request: Request):
740
  font-size: 9px; font-weight: 700; font-family: 'JetBrains Mono', monospace;
741
  color: var(--accent); cursor: pointer; letter-spacing: 0.05em;
742
  transition: background 0.18s, border-color 0.18s, transform 0.1s; flex-shrink: 0;
743
- text-decoration: none;
744
  }
745
  .icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
746
  .icon-btn:active { transform: scale(0.95); }
747
- .icon-btn.teal {
748
- background: rgba(78,205,196,0.10); border-color: rgba(78,205,196,0.25); color: var(--accent2);
749
- }
750
- .icon-btn.teal:hover { background: rgba(78,205,196,0.22); border-color: var(--accent2); }
751
- .icon-btn.copied { background: rgba(78,205,196,0.15); border-color: var(--accent2); color: var(--accent2); }
752
- .icon-btn svg { pointer-events: none; flex-shrink: 0; }
753
  .output-box {
754
  background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
755
  border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
@@ -759,20 +847,18 @@ async def homepage(request: Request):
759
  /* ── Grounding node ─�� */
760
  .ground-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
761
  .ground-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
762
- .ground-canvas-wrap {
763
  position: relative; flex: 1; border: 1px solid var(--node-border);
764
  border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
 
765
  }
766
- .ground-canvas-wrap canvas {
767
- position: absolute; top: 0; left: 0;
768
- width: 100%; height: 100%;
769
- object-fit: contain; display: block;
770
- image-rendering: auto;
771
  }
772
  .ground-placeholder {
773
  position: absolute; inset: 0; display: flex; align-items: center;
774
- justify-content: center; font-size: 11px; color: var(--muted); text-align: center; padding: 10px;
775
- pointer-events: none; z-index: 5;
776
  }
777
  .loader {
778
  width: 11px; height: 11px; border: 2px solid rgba(255,255,255,0.3);
@@ -780,36 +866,25 @@ async def homepage(request: Request):
780
  animation: spin 0.7s linear infinite; display: none;
781
  }
782
  @keyframes spin { to { transform: rotate(360deg); } }
783
- .status-dot { width: 6px; height: 6px; border-radius: 50%; background: var(--muted); display: inline-block; margin-right: 6px; }
784
- .status-dot.active { background: var(--accent2); box-shadow: 0 0 5px var(--accent2); }
785
  /* ── Model badges ── */
786
  .model-badge {
787
- display: inline-block; padding: 2px 7px; border-radius: 4px;
788
- font-size: 9px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase;
789
  }
790
- .model-badge.qvl2b { background: rgba(255,150,50,0.15); color: #ff9632; border: 1px solid rgba(255,150,50,0.35); }
791
- .model-badge.qvl4b { background: rgba(255,100,80,0.15); color: #ff6450; border: 1px solid rgba(255,100,80,0.35); }
792
- .model-badge.q4bunred { background: rgba(255,80,80,0.18); color: #ff5050; border: 1px solid rgba(255,80,80,0.40); }
793
- .model-badge.q4b { background: rgba(255,200,80,0.15); color: #ffc850; border: 1px solid rgba(255,200,80,0.35); }
794
- .model-badge.q2b { background: rgba(124,106,247,0.2); color: var(--accent); border: 1px solid rgba(124,106,247,0.3); }
795
- .model-badge.lfm450 { background: rgba(78,205,196,0.15); color: var(--accent2); border: 1px solid rgba(78,205,196,0.3); }
796
- .model-badge.g4e2b { background: rgba(66,197,107,0.15); color: #42c56b; border: 1px solid rgba(66,197,107,0.35); }
797
- .model-badge.lfm16 { background: rgba(107,203,119,0.15); color: #6bcb77; border: 1px solid rgba(107,203,119,0.35); }
798
- .model-badge.qunred { background: rgba(255,80,160,0.15); color: #ff50a0; border: 1px solid rgba(255,80,160,0.35); }
799
- .model-badge.q25vl3b { background: rgba(80,180,255,0.15); color: #50b4ff; border: 1px solid rgba(80,180,255,0.35); }
800
- .model-info-box { border-radius: 6px; padding: 9px; font-size: 10px; color: var(--muted); line-height: 1.55; flex-shrink: 0; }
801
  .canvas-footer { height: 36px; }
802
-
803
- /* ── Debug panel ── */
804
- #debugPanel {
805
- position: fixed; bottom: 12px; right: 12px; z-index: 9999;
806
- background: rgba(13,13,15,0.95); border: 1px solid var(--node-border);
807
- border-radius: 7px; padding: 8px 12px; font-size: 10px; color: var(--muted);
808
- max-width: 340px; display: none; backdrop-filter: blur(8px);
809
- }
810
- #debugPanel.visible { display: block; }
811
- #debugPanel .dbg-title { color: var(--accent2); font-weight: 700; margin-bottom: 4px; }
812
- #debugPanel pre { white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow-y: auto; color: #a0a0c0; }
813
  </style>
814
  </head>
815
  <body>
@@ -821,12 +896,6 @@ async def homepage(request: Request):
821
  <span class="badge">10x Vision Models</span>
822
  </div>
823
 
824
- <!-- Debug panel (toggle with D key) -->
825
- <div id="debugPanel">
826
- <div class="dbg-title">⬑ GROUNDING DEBUG</div>
827
- <pre id="debugPre"></pre>
828
- </div>
829
-
830
  <div id="canvas">
831
  <svg class="wires">
832
  <path id="wire-img-task" class="wire" />
@@ -980,8 +1049,9 @@ async def homepage(request: Request):
980
  SAVE
981
  </a>
982
  </div>
983
- <div class="ground-canvas-wrap" id="groundWrap">
984
- <canvas id="groundCanvas"></canvas>
 
985
  <div class="ground-placeholder" id="groundPlaceholder">
986
  Active for Point / Detect tasks.<br>Run inference to visualise.
987
  </div>
@@ -1033,7 +1103,8 @@ document.querySelectorAll('.node').forEach(node => {
1033
  });
1034
  document.addEventListener('mousemove', e => {
1035
  if (!drag) return;
1036
- node.style.left=`${il+e.clientX-sx}px`; node.style.top=`${it+e.clientY-sy}px`;
 
1037
  updateWires();
1038
  });
1039
  document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
@@ -1058,22 +1129,33 @@ const chipSize = document.getElementById('chipSize');
1058
  const dotImg = document.getElementById('dot-img');
1059
 
1060
  function formatBytes(b) {
1061
- if (b<1024) return b+' B'; if (b<1048576) return (b/1024).toFixed(1)+' KB';
 
1062
  return (b/1048576).toFixed(1)+' MB';
1063
  }
1064
  function handleFile(file) {
1065
- if (!file||!file.type.startsWith('image/')) return;
1066
- currentFile=file; imgPreview.src=URL.createObjectURL(file);
1067
- previewWrap.classList.add('visible'); dropZone.style.display='none';
1068
- chipName.textContent=file.name; chipSize.textContent=formatBytes(file.size);
1069
- imgChip.classList.add('visible'); dotImg.classList.add('active');
 
 
 
 
1070
  requestAnimationFrame(updateWires);
1071
  }
1072
  function clearImage() {
1073
- currentFile=null; imgPreview.src=''; previewWrap.classList.remove('visible');
1074
- dropZone.style.display=''; imgChip.classList.remove('visible');
1075
- chipName.textContent='β€”'; chipSize.textContent=''; fileInput.value='';
1076
- dotImg.classList.remove('active'); requestAnimationFrame(updateWires);
 
 
 
 
 
 
1077
  }
1078
  dropZone.onclick = () => fileInput.click();
1079
  fileInput.onchange = e => handleFile(e.target.files[0]);
@@ -1095,63 +1177,63 @@ dotModel.classList.add('active');
1095
 
1096
  const MODEL_INFO = {
1097
  qwen_vl_2b: {
1098
- html: `<span class="model-badge qvl2b">QWEN3-VL Β· 2B</span><br><br>
1099
- Qwen3-VL-2B-Instruct β€” dedicated vision-language model by Alibaba Cloud.
1100
- Strong spatial grounding, OCR &amp; instruction-following.`,
1101
  bg: 'rgba(255,150,50,0.07)', border: 'rgba(255,150,50,0.30)',
1102
  },
1103
  qwen_vl_4b: {
1104
- html: `<span class="model-badge qvl4b">QWEN3-VL Β· 4B</span><br><br>
1105
- Qwen3-VL-4B-Instruct β€” enhanced vision-language model by Alibaba Cloud.
1106
- Superior spatial grounding, richer OCR &amp; stronger multi-step reasoning.`,
1107
  bg: 'rgba(255,100,80,0.07)', border: 'rgba(255,100,80,0.25)',
1108
  },
1109
  qwen_4b_unredacted: {
1110
- html: `<span class="model-badge q4bunred">QWEN 3.5 Β· 4B UNREDACTED MAX</span><br><br>
1111
- Qwen3.5-4B-Unredacted-MAX by prithivMLmods. Uncensored fine-tune of Qwen3.5-4B
1112
- with extended instruction-following &amp; unrestricted reasoning.`,
1113
  bg: 'rgba(255,80,80,0.07)', border: 'rgba(255,80,80,0.30)',
1114
  },
1115
  qwen_4b: {
1116
- html: `<span class="model-badge q4b">QWEN 3.5 Β· 4B</span><br><br>
1117
- Qwen3.5 4B multimodal model by Alibaba Cloud.
1118
- Enhanced capacity β€” richer reasoning &amp; better instruction following.`,
1119
  bg: 'rgba(255,200,80,0.07)', border: 'rgba(255,200,80,0.30)',
1120
  },
1121
  qwen_2b: {
1122
- html: `<span class="model-badge q2b">QWEN 3.5 Β· 2B</span><br><br>
1123
- Qwen3.5 2B multimodal model by Alibaba Cloud.
1124
- Lightweight &amp; fast β€” ideal for quick Query, Caption, Point &amp; Detect tasks.`,
1125
  bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
1126
  },
1127
  lfm_450: {
1128
- html: `<span class="model-badge lfm450">LFM Β· 450M</span><br><br>
1129
- LFM2.5-VL 450M by LiquidAI. Ultra-lightweight edge model
1130
- with solid grounding capabilities.`,
1131
  bg: 'rgba(78,205,196,0.07)', border: 'rgba(78,205,196,0.25)',
1132
  },
1133
  gemma4_e2b: {
1134
- html: `<span class="model-badge g4e2b">GEMMA 4 Β· E2B</span><br><br>
1135
- Gemma4-E2B-it by Google DeepMind. Efficient 2B multimodal model
1136
- with strong vision-language understanding &amp; instruction-following.`,
1137
  bg: 'rgba(66,197,107,0.07)', border: 'rgba(66,197,107,0.25)',
1138
  },
1139
  lfm_16: {
1140
- html: `<span class="model-badge lfm16">LFM Β· 1.6B</span><br><br>
1141
- LFM2.5-VL 1.6B by LiquidAI. Larger liquid-state model offering
1142
- enhanced reasoning &amp; richer visual understanding.`,
1143
  bg: 'rgba(107,203,119,0.07)', border: 'rgba(107,203,119,0.25)',
1144
  },
1145
  qwen_unredacted: {
1146
- html: `<span class="model-badge qunred">QWEN 3.5 Β· 2B UNREDACTED MAX</span><br><br>
1147
- Qwen3.5-2B-Unredacted-MAX by prithivMLmods. Fine-tuned variant of Qwen3.5-2B
1148
- with uncensored &amp; extended instruction-following capabilities.`,
1149
  bg: 'rgba(255,80,160,0.07)', border: 'rgba(255,80,160,0.25)',
1150
  },
1151
  qwen25_vl_3b: {
1152
- html: `<span class="model-badge q25vl3b">QWEN 2.5-VL Β· 3B</span><br><br>
1153
- Qwen2.5-VL-3B-Instruct by Alibaba Cloud. Powerful 3B vision-language model
1154
- with strong grounding, OCR &amp; multi-task visual reasoning.`,
1155
  bg: 'rgba(80,180,255,0.07)', border: 'rgba(80,180,255,0.25)',
1156
  },
1157
  };
@@ -1174,307 +1256,9 @@ const PLACEHOLDERS = {
1174
  Point: 'e.g., The gun held by the person.',
1175
  Detect: 'e.g., The headlight of the car.',
1176
  };
1177
- categorySelect.onchange = e => { promptInput.placeholder = PLACEHOLDERS[e.target.value]||''; };
1178
-
1179
- // ══════════════════════════════════════════════
1180
- // DEBUG PANEL (press D to toggle)
1181
- // ══════════════════════════════════════════════
1182
- const debugPanel = document.getElementById('debugPanel');
1183
- const debugPre = document.getElementById('debugPre');
1184
- let debugVisible = false;
1185
- document.addEventListener('keydown', e => {
1186
- if (e.key === 'd' || e.key === 'D') {
1187
- debugVisible = !debugVisible;
1188
- debugPanel.classList.toggle('visible', debugVisible);
1189
- }
1190
- });
1191
- function dbg(msg) {
1192
- debugPre.textContent = msg;
1193
- console.log('[GROUNDING]', msg);
1194
- }
1195
-
1196
- // ══════════════════════════════════════════════
1197
- // ROBUST JSON EXTRACTOR (handles all model output styles)
1198
- // ══════════════════════════════════════════════
1199
- function extractGroundingJSON(raw) {
1200
- // Step 1: strip <think>…</think> blocks completely
1201
- let text = raw;
1202
- for (let i = 0; i < 10; i++) {
1203
- const next = text.replace(/<think>[\s\S]*?<\/think>/gi, '');
1204
- if (next === text) break;
1205
- text = next;
1206
- }
1207
-
1208
- // Step 2: strip markdown fences
1209
- text = text.replace(/```(?:json)?\\s*/gi, '').replace(/```/g, '').trim();
1210
-
1211
- dbg('Cleaned text (first 400):' + text.slice(0, 400));
1212
-
1213
- // Step 3: Balanced bracket extractor
1214
- function extractBalanced(str, startIdx, openCh, closeCh) {
1215
- let depth = 0, inStr = false, esc = false;
1216
- for (let i = startIdx; i < str.length; i++) {
1217
- const c = str[i];
1218
- if (esc) { esc = false; continue; }
1219
- if (c === '\\\\') { esc = true; continue; }
1220
- if (c === '"') { inStr = !inStr; continue; }
1221
- if (inStr) continue;
1222
- if (c === openCh) depth++;
1223
- if (c === closeCh) {
1224
- depth--;
1225
- if (depth === 0) {
1226
- try { return JSON.parse(str.slice(startIdx, i + 1)); }
1227
- catch (_) { return null; }
1228
- }
1229
- }
1230
- }
1231
- return null;
1232
- }
1233
-
1234
- // Step 4: scan for ALL '[' positions, try each from last to first
1235
- const bracketPositions = [];
1236
- const bracePositions = [];
1237
- for (let i = 0; i < text.length; i++) {
1238
- if (text[i] === '[') bracketPositions.push(i);
1239
- if (text[i] === '{') bracePositions.push(i);
1240
- }
1241
-
1242
- // Prefer arrays (most models return [{...}, {...}])
1243
- for (let i = bracketPositions.length - 1; i >= 0; i--) {
1244
- const r = extractBalanced(text, bracketPositions[i], '[', ']');
1245
- if (r !== null && Array.isArray(r) && r.length > 0) {
1246
- dbg('Found array at pos ' + bracketPositions[i] + ': ' + JSON.stringify(r).slice(0, 200));
1247
- return r;
1248
- }
1249
- }
1250
- // Try objects
1251
- for (let i = bracePositions.length - 1; i >= 0; i--) {
1252
- const r = extractBalanced(text, bracePositions[i], '{', '}');
1253
- if (r !== null) {
1254
- dbg('Found object at pos ' + bracePositions[i] + ': ' + JSON.stringify(r).slice(0, 200));
1255
- return r;
1256
- }
1257
- }
1258
-
1259
- // Step 5: try whole-text parse
1260
- try { return JSON.parse(text); } catch (_) {}
1261
-
1262
- dbg('No JSON found. Raw tail: ' + text.slice(-300));
1263
- return null;
1264
- }
1265
-
1266
- // ══════════════════════════════════════════════
1267
- // COORDINATE NORMALISER
1268
- // Handles: absolute pixels, 0-1 fractions, 0-1000 Qwen scale
1269
- // ══════════════════════════════════════════════
1270
- function normaliseCoords(arr, W, H) {
1271
- // arr is [x1,y1,x2,y2] or [x,y]
1272
- const nums = arr.map(Number);
1273
-
1274
- if (arr.length === 4) {
1275
- let [x1,y1,x2,y2] = nums;
1276
- // Qwen VL often uses 0-1000 normalised coords
1277
- const maxVal = Math.max(x1, y1, x2, y2);
1278
- if (maxVal <= 1.0 && maxVal > 0) {
1279
- // 0-1 fraction
1280
- return [x1*W, y1*H, x2*W, y2*H];
1281
- } else if (maxVal <= 1000 && maxVal > 1) {
1282
- // 0-1000 scale (Qwen VL convention)
1283
- return [x1/1000*W, y1/1000*H, x2/1000*W, y2/1000*H];
1284
- }
1285
- // Already in pixels
1286
- return [x1, y1, x2, y2];
1287
- }
1288
-
1289
- if (arr.length === 2) {
1290
- let [x, y] = nums;
1291
- const maxVal = Math.max(x, y);
1292
- if (maxVal <= 1.0 && maxVal > 0) return [x*W, y*H];
1293
- if (maxVal <= 1000 && maxVal > 1) return [x/1000*W, y/1000*H];
1294
- return [x, y];
1295
- }
1296
-
1297
- return nums;
1298
- }
1299
-
1300
- // ══════════════════════════════════════════════
1301
- // GROUNDING VISUALIZER
1302
- // ══════════════════════════════════════════════
1303
- const groundCanvas = document.getElementById('groundCanvas');
1304
- const groundWrap = document.getElementById('groundWrap');
1305
- const groundPlaceholder = document.getElementById('groundPlaceholder');
1306
- const gCtx = groundCanvas.getContext('2d');
1307
- const downloadBtn = document.getElementById('downloadBtn');
1308
-
1309
- const PALETTE = ['#4ecdc4','#7c6af7','#ff6b6b','#ffd93d','#6bcb77','#ff922b','#cc5de8','#339af0'];
1310
-
1311
- function hexToRgba(hex, alpha) {
1312
- const r=parseInt(hex.slice(1,3),16), g=parseInt(hex.slice(3,5),16), b=parseInt(hex.slice(5,7),16);
1313
- return `rgba(${r},${g},${b},${alpha})`;
1314
- }
1315
- function drawRoundRect(ctx, x, y, w, h, r) {
1316
- r = Math.min(r, w/2, h/2);
1317
- ctx.beginPath();
1318
- ctx.moveTo(x+r, y);
1319
- ctx.lineTo(x+w-r, y); ctx.quadraticCurveTo(x+w, y, x+w, y+r);
1320
- ctx.lineTo(x+w, y+h-r); ctx.quadraticCurveTo(x+w, y+h, x+w-r, y+h);
1321
- ctx.lineTo(x+r, y+h); ctx.quadraticCurveTo(x, y+h, x, y+h-r);
1322
- ctx.lineTo(x, y+r); ctx.quadraticCurveTo(x, y, x+r, y);
1323
- ctx.closePath();
1324
- }
1325
-
1326
- function updateDownloadBtn() {
1327
- const dataURL = groundCanvas.toDataURL('image/png');
1328
- const ts = new Date().toISOString().replace(/[:.]/g,'-').slice(0,19);
1329
- downloadBtn.href = dataURL;
1330
- downloadBtn.download = `grounding_${ts}.png`;
1331
- downloadBtn.style.display = 'flex';
1332
- }
1333
-
1334
- function drawGrounding(imgSrc, rawText) {
1335
- const parsed = extractGroundingJSON(rawText);
1336
-
1337
- if (!parsed) {
1338
- dbg('drawGrounding: no JSON parsed from output.');
1339
- groundPlaceholder.textContent = 'No grounding coordinates found in model output.';
1340
- groundPlaceholder.style.display = 'flex';
1341
- return;
1342
- }
1343
-
1344
- const img = new Image();
1345
- img.crossOrigin = 'anonymous';
1346
-
1347
- img.onload = () => {
1348
- const W = img.naturalWidth || img.width || 512;
1349
- const H = img.naturalHeight || img.height || 512;
1350
-
1351
- // Set canvas to image natural size for crisp drawing
1352
- groundCanvas.width = W;
1353
- groundCanvas.height = H;
1354
-
1355
- // Draw base image
1356
- gCtx.drawImage(img, 0, 0, W, H);
1357
-
1358
- // Hide placeholder β€” canvas is now populated
1359
- groundPlaceholder.style.display = 'none';
1360
-
1361
- const lw = Math.max(2, W / 180);
1362
- const fs = Math.max(11, Math.min(W / 35, 22));
1363
- gCtx.lineWidth = lw;
1364
-
1365
- const items = Array.isArray(parsed) ? parsed : [parsed];
1366
- dbg('Drawing ' + items.length + ' item(s) on ' + W + 'x' + H);
1367
-
1368
- items.forEach((item, i) => {
1369
- const col = PALETTE[i % PALETTE.length];
1370
-
1371
- // ── Try to extract bbox ───────────────────────
1372
- let rawBbox = null;
1373
- if (Array.isArray(item?.bbox_2d) && item.bbox_2d.length === 4) rawBbox = item.bbox_2d;
1374
- else if (Array.isArray(item?.bbox) && item.bbox.length === 4) rawBbox = item.bbox;
1375
- else if (Array.isArray(item?.box) && item.box.length === 4) rawBbox = item.box;
1376
- // flat array of 4 numbers
1377
- else if (Array.isArray(item) && item.length === 4 && item.every(v => typeof v === 'number'))
1378
- rawBbox = item;
1379
-
1380
- if (rawBbox) {
1381
- let [x1, y1, x2, y2] = normaliseCoords(rawBbox, W, H);
1382
- // Ensure x1<x2, y1<y2
1383
- if (x2 < x1) [x1, x2] = [x2, x1];
1384
- if (y2 < y1) [y1, y2] = [y2, y1];
1385
- const bw = x2 - x1, bh = y2 - y1;
1386
-
1387
- // Fill
1388
- gCtx.fillStyle = hexToRgba(col, 0.18);
1389
- gCtx.fillRect(x1, y1, bw, bh);
1390
-
1391
- // Border
1392
- gCtx.strokeStyle = col;
1393
- gCtx.lineWidth = lw;
1394
- gCtx.strokeRect(x1, y1, bw, bh);
1395
-
1396
- // Corner accent marks
1397
- const cLen = Math.min(bw, bh, 18);
1398
- gCtx.lineWidth = lw * 1.8;
1399
- [[x1,y1],[x2,y1],[x2,y2],[x1,y2]].forEach(([cx,cy]) => {
1400
- const sx = cx === x1 ? 1 : -1, sy = cy === y1 ? 1 : -1;
1401
- gCtx.beginPath();
1402
- gCtx.moveTo(cx + sx*cLen, cy);
1403
- gCtx.lineTo(cx, cy);
1404
- gCtx.lineTo(cx, cy + sy*cLen);
1405
- gCtx.strokeStyle = col;
1406
- gCtx.stroke();
1407
- });
1408
- gCtx.lineWidth = lw;
1409
-
1410
- // Label
1411
- const lbl = (item?.label ?? item?.class_name ?? item?.name ?? `obj ${i+1}`).toString();
1412
- gCtx.font = `bold ${fs}px JetBrains Mono, monospace`;
1413
- const tw = gCtx.measureText(lbl).width;
1414
- const ph = fs * 1.5, pw = tw + 14;
1415
- const lx = Math.max(0, Math.min(x1, W - pw));
1416
- const ly = y1 - ph > 0 ? y1 - ph : y1 + 2;
1417
- drawRoundRect(gCtx, lx, ly, pw, ph, 4);
1418
- gCtx.fillStyle = col; gCtx.fill();
1419
- gCtx.fillStyle = '#fff';
1420
- gCtx.fillText(lbl, lx + 7, ly + ph * 0.74);
1421
- return;
1422
- }
1423
-
1424
- // ── Try to extract point ──────────────────────
1425
- let rawPt = null;
1426
- if (Array.isArray(item?.point_2d) && item.point_2d.length === 2) rawPt = item.point_2d;
1427
- else if (Array.isArray(item?.point) && item.point.length === 2) rawPt = item.point;
1428
- else if (Array.isArray(item?.coord) && item.coord.length === 2) rawPt = item.coord;
1429
- else if (Array.isArray(item) && item.length === 2 && item.every(v => typeof v === 'number'))
1430
- rawPt = item;
1431
-
1432
- if (rawPt) {
1433
- let [x, y] = normaliseCoords(rawPt, W, H);
1434
- const r = Math.max(7, Math.min(W / 55, 18));
1435
- const lbl = (item?.label ?? item?.name ?? `pt ${i+1}`).toString();
1436
-
1437
- // Outer glow ring
1438
- gCtx.beginPath(); gCtx.arc(x, y, r * 2.2, 0, Math.PI*2);
1439
- gCtx.fillStyle = hexToRgba(col, 0.15); gCtx.fill();
1440
-
1441
- // Middle ring
1442
- gCtx.beginPath(); gCtx.arc(x, y, r * 1.4, 0, Math.PI*2);
1443
- gCtx.fillStyle = hexToRgba(col, 0.25); gCtx.fill();
1444
-
1445
- // Core dot
1446
- gCtx.beginPath(); gCtx.arc(x, y, r, 0, Math.PI*2);
1447
- gCtx.fillStyle = col; gCtx.fill();
1448
- gCtx.strokeStyle = '#fff'; gCtx.lineWidth = Math.max(1.5, lw); gCtx.stroke();
1449
-
1450
- // Centre dot
1451
- gCtx.beginPath(); gCtx.arc(x, y, r * 0.3, 0, Math.PI*2);
1452
- gCtx.fillStyle = '#fff'; gCtx.fill();
1453
-
1454
- // Label
1455
- gCtx.font = `bold ${fs}px JetBrains Mono, monospace`;
1456
- const tw = gCtx.measureText(lbl).width;
1457
- const ph = fs * 1.45, pw = tw + 12;
1458
- const lx = Math.min(x + r + 6, W - pw);
1459
- const ly = Math.max(0, y - ph/2);
1460
- drawRoundRect(gCtx, lx, ly, pw, ph, 4);
1461
- gCtx.fillStyle = col; gCtx.fill();
1462
- gCtx.fillStyle = '#fff';
1463
- gCtx.fillText(lbl, lx + 6, ly + ph * 0.74);
1464
- }
1465
- });
1466
-
1467
- updateDownloadBtn();
1468
- };
1469
-
1470
- img.onerror = (e) => {
1471
- dbg('Image load error: ' + e);
1472
- groundPlaceholder.textContent = 'Failed to load image for overlay.';
1473
- groundPlaceholder.style.display = 'flex';
1474
- };
1475
-
1476
- img.src = imgSrc;
1477
- }
1478
 
1479
  // ══════════════════════════════════════════════
1480
  // COPY BUTTON
@@ -1493,8 +1277,8 @@ function resetCopyBtn() {
1493
  </svg> COPY`;
1494
  }
1495
  copyBtn.onclick = () => {
1496
- const txt = outputBox.innerText||'';
1497
- if (!txt||txt==='Results will stream here...') return;
1498
  navigator.clipboard.writeText(txt).then(() => {
1499
  copyBtn.classList.add('copied');
1500
  copyBtn.innerHTML = `
@@ -1502,14 +1286,45 @@ copyBtn.onclick = () => {
1502
  stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round">
1503
  <polyline points="20 6 9 17 4 12"/>
1504
  </svg> COPIED`;
1505
- clearTimeout(copyTimer); copyTimer=setTimeout(resetCopyBtn,2000);
 
1506
  }).catch(() => {
1507
- const ta=document.createElement('textarea'); ta.value=txt;
1508
- ta.style.position='fixed'; ta.style.opacity='0';
1509
- document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);
 
1510
  });
1511
  };
1512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1513
  // ══════════════════════════════════════════════
1514
  // RUN INFERENCE
1515
  // ══════════════════════════════════════════════
@@ -1518,41 +1333,38 @@ const btnLoader = document.getElementById('btnLoader');
1518
  const allWires = ['wire-img-task','wire-model-task','wire-task-out','wire-task-gnd'];
1519
  const dotTask = document.getElementById('dot-task');
1520
  const dotOut = document.getElementById('dot-out');
1521
- const dotGnd = document.getElementById('dot-gnd');
1522
 
1523
  runBtn.onclick = async () => {
1524
  if (!currentFile) { alert('Please upload an image into the Input Node.'); return; }
1525
  const promptStr = promptInput.value.trim();
1526
  if (!promptStr) { alert('Please enter a prompt directive.'); return; }
1527
 
1528
- // Reset UI
1529
  runBtn.disabled = true;
1530
  btnLoader.style.display = 'inline-block';
1531
  outputBox.innerText = '';
1532
  outputBox.style.color = '';
1533
- groundPlaceholder.style.display = 'flex';
1534
- groundPlaceholder.textContent = 'Running inference…';
1535
- gCtx.clearRect(0, 0, groundCanvas.width, groundCanvas.height);
1536
- groundCanvas.width = 1; // reset canvas
1537
- groundCanvas.height = 1;
1538
- downloadBtn.style.display = 'none';
1539
  dotTask.classList.add('active');
1540
  dotOut.classList.remove('active');
1541
- dotGnd.classList.remove('active');
1542
  allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
1543
  resetCopyBtn();
1544
 
 
 
 
 
 
 
1545
  const formData = new FormData();
1546
  formData.append('image', currentFile);
1547
- formData.append('category', categorySelect.value);
1548
  formData.append('prompt', promptStr);
1549
  formData.append('model_id', modelSelect.value);
1550
 
1551
  let fullText = '';
1552
- // Create a stable object URL for this run
1553
- const imgObjectURL = URL.createObjectURL(currentFile);
1554
 
1555
  try {
 
1556
  const response = await fetch('/api/run', { method: 'POST', body: formData });
1557
  if (!response.ok) {
1558
  const err = await response.json();
@@ -1561,18 +1373,18 @@ runBtn.onclick = async () => {
1561
 
1562
  const reader = response.body.getReader();
1563
  const decoder = new TextDecoder('utf-8');
1564
- let buffer = '';
1565
 
1566
  while (true) {
1567
  const { value, done } = await reader.read();
1568
  if (done) break;
1569
  buffer += decoder.decode(value, { stream: true });
1570
  const lines = buffer.split('\\n\\n');
1571
- buffer = lines.pop(); // keep incomplete chunk
1572
 
1573
  for (const line of lines) {
1574
  if (!line.startsWith('data: ')) continue;
1575
- const payload = line.slice(6); // remove 'data: '
1576
  if (payload === '[DONE]') break;
1577
  try {
1578
  const data = JSON.parse(payload);
@@ -1587,43 +1399,47 @@ runBtn.onclick = async () => {
1587
 
1588
  dotOut.classList.add('active');
1589
 
1590
- // ── Grounding overlay ─────────────────────────
1591
- const cat = categorySelect.value;
1592
  if ((cat === 'Point' || cat === 'Detect') && fullText.trim()) {
1593
- groundPlaceholder.textContent = 'Parsing coordinates…';
1594
- groundPlaceholder.style.display = 'flex';
1595
-
1596
- // Small delay so the UI updates before heavy canvas work
1597
- setTimeout(() => {
1598
- const parsed = extractGroundingJSON(fullText);
1599
- if (parsed !== null) {
1600
- dotGnd.classList.add('active');
1601
- drawGrounding(imgObjectURL, fullText);
 
 
 
 
 
1602
  } else {
1603
- groundPlaceholder.textContent =
1604
- 'No grounding JSON detected in model output. ' +
1605
- 'Try rephrasing your prompt or use a VL model.';
1606
- groundPlaceholder.style.display = 'flex';
1607
- dbg('No JSON found. Full output: ' + fullText.slice(0, 500));
1608
  }
1609
- }, 50);
 
 
1610
  } else if (cat !== 'Point' && cat !== 'Detect') {
1611
- groundPlaceholder.textContent = 'Active for Point / Detect tasks. Run inference to visualise.';
1612
- groundPlaceholder.style.display = 'flex';
1613
  }
1614
 
1615
  } catch (err) {
1616
  outputBox.innerText = `[Error] ${err.message}`;
1617
  outputBox.style.color = '#ff6b6b';
1618
- groundPlaceholder.textContent = 'Inference error β€” see Output Stream node.';
1619
- groundPlaceholder.style.display = 'flex';
 
1620
  } finally {
1621
  runBtn.disabled = false;
1622
  btnLoader.style.display = 'none';
1623
  dotTask.classList.remove('active');
1624
  allWires.forEach(id => document.getElementById(id)?.classList.remove('active'));
1625
- // Revoke object URL after a delay to allow canvas drawing
1626
- setTimeout(() => URL.revokeObjectURL(imgObjectURL), 10000);
1627
  }
1628
  };
1629
  </script>
 
4
  import ast
5
  import re
6
  import uuid
7
+ import base64
8
  import threading
9
+ import numpy as np
10
  from pathlib import Path
11
  from typing import Optional
12
 
13
  import spaces
14
  import torch
15
+ from PIL import Image, ImageDraw, ImageFont
16
 
17
  from gradio import Server
18
  from fastapi import Request, UploadFile, File, Form
 
53
  print(f"Loading Qwen3-VL-2B model: {QWEN_VL_2B_MODEL_NAME} on {DEVICE}...")
54
  try:
55
  qwen_vl_2b_model = Qwen3VLForConditionalGeneration.from_pretrained(
56
+ QWEN_VL_2B_MODEL_NAME, trust_remote_code=True, torch_dtype=torch.bfloat16,
 
 
57
  ).to(DEVICE).eval()
58
+ qwen_vl_2b_processor = AutoProcessor.from_pretrained(QWEN_VL_2B_MODEL_NAME, trust_remote_code=True)
 
 
59
  print("Qwen3-VL-2B model loaded successfully.")
60
  except Exception as e:
61
  print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
62
+ qwen_vl_2b_model = None; qwen_vl_2b_processor = None
 
63
 
64
  # ── Qwen3-VL-4B-Instruct ────────────────────────────────
65
  print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
66
  try:
67
  qwen_vl_4b_model = Qwen3VLForConditionalGeneration.from_pretrained(
68
+ QWEN_VL_4B_MODEL_NAME, trust_remote_code=True, torch_dtype=torch.bfloat16,
 
 
69
  ).to(DEVICE).eval()
70
+ qwen_vl_4b_processor = AutoProcessor.from_pretrained(QWEN_VL_4B_MODEL_NAME, trust_remote_code=True)
 
 
71
  print("Qwen3-VL-4B model loaded successfully.")
72
  except Exception as e:
73
  print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
74
+ qwen_vl_4b_model = None; qwen_vl_4b_processor = None
 
75
 
76
  # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
77
  print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
 
83
  print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
84
  except Exception as e:
85
  print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
86
+ qwen_4b_unredacted_model = None; qwen_4b_unredacted_processor = None
 
87
 
88
  # ── Qwen3.5-4B ──────────────────────────────────────────
89
  print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
 
95
  print("Qwen3.5-4B model loaded successfully.")
96
  except Exception as e:
97
  print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
98
+ qwen_4b_model = None; qwen_4b_processor = None
 
99
 
100
  # ── Qwen3.5-2B ──────────────────────────────────────────
101
  print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
 
107
  print("Qwen3.5-2B model loaded successfully.")
108
  except Exception as e:
109
  print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
110
+ qwen_2b_model = None; qwen_2b_processor = None
 
111
 
112
  # ── LFM2.5-VL-450M ──────────────────────────────────────
113
  print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
114
  try:
115
  lfm_450_model = AutoModelForImageTextToText.from_pretrained(
116
+ LFM_450_MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16,
 
 
117
  ).eval()
118
  lfm_450_processor = AutoProcessor.from_pretrained(LFM_450_MODEL_NAME)
119
  print("LFM-450M model loaded successfully.")
120
  except Exception as e:
121
  print(f"Warning: LFM-450M model loading failed. Error: {e}")
122
+ lfm_450_model = None; lfm_450_processor = None
 
123
 
124
  # ── Gemma4-E2B-it ───────────────────────────────────────
125
  print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
126
  try:
127
  gemma4_e2b_model = Gemma4ForConditionalGeneration.from_pretrained(
128
+ GEMMA4_E2B_NAME, torch_dtype=torch.bfloat16,
 
129
  device_map="auto" if torch.cuda.is_available() else None,
130
  ).eval()
131
  if not torch.cuda.is_available():
 
134
  print("Gemma4-E2B-it model loaded successfully.")
135
  except Exception as e:
136
  print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
137
+ gemma4_e2b_model = None; gemma4_e2b_processor = None
 
138
 
139
  # ── LFM2.5-VL-1.6B ──────────────────────────────────────
140
  print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
141
  try:
142
  lfm_16_model = AutoModelForImageTextToText.from_pretrained(
143
+ LFM_16_MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16,
 
 
144
  ).eval()
145
  lfm_16_processor = AutoProcessor.from_pretrained(LFM_16_MODEL_NAME)
146
  print("LFM-1.6B model loaded successfully.")
147
  except Exception as e:
148
  print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
149
+ lfm_16_model = None; lfm_16_processor = None
 
150
 
151
  # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
152
  print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
 
158
  print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
159
  except Exception as e:
160
  print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
161
+ qwen_unredacted_model = None; qwen_unredacted_processor = None
 
162
 
163
  # ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
164
  print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
165
  try:
166
  qwen25_vl_3b_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
167
+ QWEN25_VL_3B_NAME, torch_dtype="auto", device_map="auto",
 
 
168
  ).eval()
169
  qwen25_vl_3b_processor = AutoProcessor.from_pretrained(QWEN25_VL_3B_NAME)
170
  print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
171
  except Exception as e:
172
  print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
173
+ qwen25_vl_3b_model = None; qwen25_vl_3b_processor = None
174
+
175
+
176
+ # ─────────────────────────────────────────────────────────────────────────────
177
+ # SERVER-SIDE ANNOTATION (mirrors the reference app exactly)
178
+ # ─────────────────────────────────────────────────────────────────────────────
179
+
180
+ PALETTE_RGB = [
181
+ (78, 205, 196), # teal
182
+ (124, 106, 247), # purple
183
+ (255, 107, 107), # red
184
+ (255, 217, 61), # yellow
185
+ (107, 203, 119), # green
186
+ (255, 146, 43), # orange
187
+ (204, 93, 232), # violet
188
+ (51, 154, 240), # blue
189
+ ]
190
+
191
+
192
def _get_font(size: int = 14):
    """Return a bold font at ``size`` points, falling back to PIL's default.

    Candidate font names/paths are tried in order and the first one that
    loads is returned. When none can be opened, PIL's built-in default font
    is returned instead.

    Args:
        size: Requested font size in points.

    Returns:
        A PIL font object usable with ``ImageDraw.text``.
    """
    candidates = (
        "DejaVuSans-Bold.ttf",
        "arial.ttf",
        "Arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    )
    for name in candidates:
        try:
            return ImageFont.truetype(name, size)
        except OSError:  # IOError is an alias of OSError on Python 3
            continue
    try:
        # Newer Pillow releases accept a size here, so the fallback font is
        # not stuck at the tiny fixed bitmap size.
        return ImageFont.load_default(size)
    except (TypeError, OSError):
        # Older Pillow: load_default() takes no arguments.
        return ImageFont.load_default()
201
 
202
 
 
203
  def safe_parse_json(text: str):
204
+ """Strip markdown fences + <think> blocks, then parse JSON."""
205
+ # Remove <think>…</think>
206
+ text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
207
  text = text.strip()
208
+ # Strip markdown fences
209
  text = re.sub(r"^```(json)?", "", text)
210
  text = re.sub(r"```$", "", text)
211
  text = text.strip()
 
213
  return json.loads(text)
214
  except json.JSONDecodeError:
215
  pass
216
+ # Try to find the first [...] or {...} block
217
+ for ch_open, ch_close in [('[', ']'), ('{', '}')]:
218
+ idx = text.find(ch_open)
219
+ if idx != -1:
220
+ depth, in_str, esc = 0, False, False
221
+ for i in range(idx, len(text)):
222
+ c = text[i]
223
+ if esc: esc = False; continue
224
+ if c == '\\': esc = True; continue
225
+ if c == '"': in_str = not in_str; continue
226
+ if in_str: continue
227
+ if c == ch_open: depth += 1
228
+ if c == ch_close:
229
+ depth -= 1
230
+ if depth == 0:
231
+ try:
232
+ return json.loads(text[idx:i+1])
233
+ except Exception:
234
+ break
235
  try:
236
  return ast.literal_eval(text)
237
  except Exception:
238
  return {}
239
 
240
 
241
def annotate_detections(image: Image.Image, parsed) -> Image.Image:
    """Draw labelled bounding boxes on a copy of ``image``.

    Args:
        image: Source picture; it is converted to RGB before drawing.
        parsed: A dict or list of dicts. Each dict should carry a 4-element
            box under ``bbox_2d``, ``bbox`` or ``box`` as ``[x1, y1, x2, y2]``
            and may carry a ``label``, ``class_name`` or ``name``.
            Coordinates are interpreted by magnitude: all values <= 1 are
            treated as fractions of the image size, values <= 1000 as the
            0-1000 normalised scale, anything larger as pixels.

    Returns:
        The annotated RGB image. Entries that are not dicts, lack a usable
        box, or contain non-numeric / non-finite coordinates are skipped so
        that one bad entry does not prevent the others from being drawn.
    """
    image = image.convert("RGB")
    ow, oh = image.size
    draw = ImageDraw.Draw(image, "RGBA")  # RGBA mode enables alpha-blended fills
    font_lbl = _get_font(max(12, min(ow // 35, 22)))
    lw = max(2, ow // 200)  # border thickness depends only on the image width
    pad = 5

    items = parsed if isinstance(parsed, list) else [parsed]
    for i, item in enumerate(items):
        if not isinstance(item, dict):
            continue
        bbox = item.get("bbox_2d") or item.get("bbox") or item.get("box")
        if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
            continue
        try:
            x1, y1, x2, y2 = (float(v) for v in bbox)
        except (TypeError, ValueError):
            continue  # model emitted something that is not a number
        col = PALETTE_RGB[i % len(PALETTE_RGB)]

        # -- Normalise coordinates to pixels ---------------------------------
        max_v = max(x1, y1, x2, y2)
        if max_v <= 1.0:  # 0-1 fraction
            x1, y1, x2, y2 = x1 * ow, y1 * oh, x2 * ow, y2 * oh
        elif max_v <= 1000.0:  # 0-1000 normalised scale
            x1, y1, x2, y2 = (x1 / 1000 * ow, y1 / 1000 * oh,
                              x2 / 1000 * ow, y2 / 1000 * oh)
        # else: already in pixels

        if x2 < x1:
            x1, x2 = x2, x1
        if y2 < y1:
            y1, y2 = y2, y1
        try:
            x1, y1, x2, y2 = (int(v) for v in (x1, y1, x2, y2))
        except (ValueError, OverflowError):
            continue  # NaN / infinity cannot be mapped to a pixel

        # -- Fill (semi-transparent) -----------------------------------------
        draw.rectangle([x1, y1, x2, y2], fill=(*col, 46))

        # -- Border ----------------------------------------------------------
        # Use Pillow's own width handling instead of drawing inset rectangles:
        # manually inset corners cross over on boxes narrower than 2*lw and
        # recent Pillow versions reject such inverted rectangles.
        draw.rectangle([x1, y1, x2, y2], outline=(*col, 255), width=lw)

        # -- Corner accent marks ---------------------------------------------
        clen = max(10, min(int((x2 - x1) * 0.18), int((y2 - y1) * 0.18), 24))
        corners = [(x1, y1, 1, 1), (x2, y1, -1, 1), (x2, y2, -1, -1), (x1, y2, 1, -1)]
        for cx, cy, sx, sy in corners:
            draw.line([(cx, cy), (cx + sx * clen, cy)], fill=col, width=lw + 1)
            draw.line([(cx, cy), (cx, cy + sy * clen)], fill=col, width=lw + 1)

        # -- Label -----------------------------------------------------------
        label = str(item.get("label") or item.get("class_name") or item.get("name") or f"obj {i+1}")
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2] - bb[0], bb[3] - bb[1]
        except AttributeError:
            # Old Pillow fonts without getbbox()
            tw, th = font_lbl.getsize(label)
        lx = max(0, min(x1, ow - tw - pad * 2))
        # Place the label above the box when it fits, otherwise just inside it.
        above = y1 - th - pad * 2
        ly = above if above >= 0 else y1 + 2
        draw.rectangle([lx, ly, lx + tw + pad * 2, ly + th + pad * 2], fill=(*col, 230))
        draw.text((lx + pad, ly + pad), label, fill=(255, 255, 255, 255), font=font_lbl)

    return image
306
+
307
+
308
def annotate_points(image: Image.Image, parsed) -> Image.Image:
    """Draw labelled point markers on a copy of ``image``.

    Args:
        image: Source picture; it is converted to RGB before drawing.
        parsed: A dict or list of dicts. Each dict should carry a 2-element
            point under ``point_2d``, ``point`` or ``coord`` as ``[x, y]`` and
            may carry a ``label`` or ``name``. Coordinates are interpreted by
            magnitude: both values <= 1 are treated as fractions of the image
            size, values <= 1000 as the 0-1000 normalised scale, anything
            larger as pixels.

    Returns:
        The annotated RGB image. Entries that are not dicts, lack a usable
        point, or contain non-numeric / non-finite coordinates are skipped so
        that one bad entry does not prevent the others from being drawn.
    """
    image = image.convert("RGB")
    ow, oh = image.size
    draw = ImageDraw.Draw(image, "RGBA")  # RGBA mode enables alpha-blended fills
    font_lbl = _get_font(max(12, min(ow // 35, 22)))
    r = max(7, min(ow // 55, 18))  # marker radius depends only on the image width
    r_mid = int(r * 1.4)
    rp = max(2, r // 4)
    pad = 5

    items = parsed if isinstance(parsed, list) else [parsed]
    for i, item in enumerate(items):
        if not isinstance(item, dict):
            continue
        pt = item.get("point_2d") or item.get("point") or item.get("coord")
        if not isinstance(pt, (list, tuple)) or len(pt) != 2:
            continue
        try:
            x, y = float(pt[0]), float(pt[1])
        except (TypeError, ValueError):
            continue  # model emitted something that is not a number
        col = PALETTE_RGB[i % len(PALETTE_RGB)]

        # -- Normalise coordinates to pixels ---------------------------------
        max_v = max(x, y)
        if max_v <= 1.0:
            x, y = x * ow, y * oh
        elif max_v <= 1000.0:
            x, y = x / 1000 * ow, y / 1000 * oh

        try:
            cx, cy = int(x), int(y)
        except (ValueError, OverflowError):
            continue  # NaN / infinity cannot be mapped to a pixel

        # -- Glow rings ------------------------------------------------------
        draw.ellipse([cx - r * 2, cy - r * 2, cx + r * 2, cy + r * 2], fill=(*col, 38))
        draw.ellipse([cx - r_mid, cy - r_mid, cx + r_mid, cy + r_mid], fill=(*col, 64))

        # -- Core dot --------------------------------------------------------
        draw.ellipse([cx - r, cy - r, cx + r, cy + r], fill=(*col, 255),
                     outline=(255, 255, 255, 255), width=max(2, r // 4))

        # -- Centre pip ------------------------------------------------------
        draw.ellipse([cx - rp, cy - rp, cx + rp, cy + rp], fill=(255, 255, 255, 255))

        # -- Label -----------------------------------------------------------
        label = str(item.get("label") or item.get("name") or f"pt {i+1}")
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2] - bb[0], bb[3] - bb[1]
        except AttributeError:
            # Old Pillow fonts without getbbox()
            tw, th = font_lbl.getsize(label)
        # Keep the label inside the image on both sides; without the lower
        # clamp a label wider than the image started at a negative x.
        lx = max(0, min(cx + r + 8, ow - tw - pad * 2))
        ly = max(0, cy - th // 2 - pad)
        draw.rectangle([lx, ly, lx + tw + pad * 2, ly + th + pad * 2], fill=(*col, 220))
        draw.text((lx + pad, ly + pad), label, fill=(255, 255, 255, 255), font=font_lbl)

    return image
369
+
370
+
371
def image_to_b64(img: Image.Image, fmt: str = "PNG") -> str:
    """Encode ``img`` as a base64 ``data:`` URI.

    Args:
        img: Image to serialise; its ``save`` method is called with ``fmt``.
        fmt: Image format name passed to ``img.save`` (default ``"PNG"``).

    Returns:
        A string of the form ``data:image/<type>;base64,<payload>`` whose
        media type matches ``fmt`` (``JPG``/``JPEG`` map to ``image/jpeg``).
    """
    buf = io.BytesIO()
    img.save(buf, format=fmt)
    # Derive the media type from the format actually written; a fixed
    # "image/png" prefix would mislabel every non-PNG payload.
    subtype = fmt.lower()
    if subtype == "jpg":
        subtype = "jpeg"
    payload = base64.b64encode(buf.getvalue()).decode("ascii")
    return f"data:image/{subtype};base64,{payload}"
377
+
378
+
379
+ # ─────────────────────────────────────────────────────────────────────────────
380
+ # NEW ENDPOINT: /api/annotate
381
+ # Receives the image + raw model output text + category,
382
+ # runs server-side annotation, returns base64 PNG.
383
+ # ─────────────────────────────────────────────────────────────────────────────
384
@app.post("/api/annotate")
async def annotate_endpoint(
    image: UploadFile = File(...),
    text: str = Form(...),
    category: str = Form(...),
):
    """Annotate an uploaded image from raw model output and return it as base64.

    The upload is decoded, converted to RGB and shrunk in place to fit within
    512x512. ``text`` is parsed with ``safe_parse_json``; an empty result
    yields ``{"error": "no_json"}``. ``category`` selects the renderer:
    ``"Detect"`` draws boxes, ``"Point"`` draws point markers, anything else
    yields ``{"error": "unsupported_category"}``. On success the response is
    ``{"b64": <data-URI>}``; any exception becomes a 500 response carrying
    the exception text.
    """
    try:
        payload = await image.read()
        picture = Image.open(io.BytesIO(payload)).convert("RGB")
        picture.thumbnail((512, 512))

        coords = safe_parse_json(text)
        if not coords:
            return JSONResponse({"error": "no_json", "b64": None})

        # Dispatch on the task category; unknown categories are rejected.
        renderers = {"Detect": annotate_detections, "Point": annotate_points}
        render = renderers.get(category)
        if render is None:
            return JSONResponse({"error": "unsupported_category", "b64": None})

        result = render(picture, coords)
        return JSONResponse({"b64": image_to_b64(result)})
    except Exception as e:
        return JSONResponse({"error": str(e), "b64": None}, status_code=500)
409
+
410
+
411
+ # ─────────────────────────────────────────────────────────────────────────────
412
+ # STREAMING INFERENCE
413
+ # ─────────────────────────────────────────────────────────────────────────────
414
  @spaces.GPU(duration=120)
415
  def generate_inference_stream(
416
  image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
 
430
  if model_id == "qwen_vl_2b":
431
  if qwen_vl_2b_model is None or qwen_vl_2b_processor is None:
432
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
433
+ yield "data: [DONE]\n\n"; return
 
434
  messages = [{"role": "user", "content": [
435
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
436
  text_input = qwen_vl_2b_processor.apply_chat_template(
437
+ messages, tokenize=False, add_generation_prompt=True)
 
438
  inputs = qwen_vl_2b_processor(
439
  text=[text_input], images=[image], return_tensors="pt", padding=True
440
  ).to(qwen_vl_2b_model.device)
441
+ streamer = TextIteratorStreamer(qwen_vl_2b_processor.tokenizer,
442
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
443
+ threading.Thread(target=qwen_vl_2b_model.generate,
444
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
445
+ use_cache=True, temperature=1.0, do_sample=True)).start()
 
 
 
 
 
446
  for tok in streamer:
447
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
448
 
449
  # ── Qwen3-VL-4B ─────────────────────────────────────
450
  elif model_id == "qwen_vl_4b":
451
  if qwen_vl_4b_model is None or qwen_vl_4b_processor is None:
452
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
453
+ yield "data: [DONE]\n\n"; return
 
454
  messages = [{"role": "user", "content": [
455
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
456
  text_input = qwen_vl_4b_processor.apply_chat_template(
457
+ messages, tokenize=False, add_generation_prompt=True)
 
458
  inputs = qwen_vl_4b_processor(
459
  text=[text_input], images=[image], return_tensors="pt", padding=True
460
  ).to(qwen_vl_4b_model.device)
461
+ streamer = TextIteratorStreamer(qwen_vl_4b_processor.tokenizer,
462
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
463
+ threading.Thread(target=qwen_vl_4b_model.generate,
464
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
465
+ use_cache=True, temperature=1.0, do_sample=True)).start()
 
 
 
 
 
466
  for tok in streamer:
467
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
468
 
469
  # ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
470
  elif model_id == "qwen_4b_unredacted":
471
  if qwen_4b_unredacted_model is None or qwen_4b_unredacted_processor is None:
472
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
473
+ yield "data: [DONE]\n\n"; return
 
474
  messages = [{"role": "user", "content": [
475
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
476
  text_input = qwen_4b_unredacted_processor.apply_chat_template(
477
+ messages, tokenize=False, add_generation_prompt=True)
 
478
  inputs = qwen_4b_unredacted_processor(
479
  text=[text_input], images=[image], return_tensors="pt", padding=True
480
  ).to(qwen_4b_unredacted_model.device)
481
+ streamer = TextIteratorStreamer(qwen_4b_unredacted_processor.tokenizer,
482
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
483
+ threading.Thread(target=qwen_4b_unredacted_model.generate,
484
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
485
+ use_cache=True, temperature=1.5, min_p=0.1)).start()
 
 
 
 
 
486
  for tok in streamer:
487
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
488
 
489
  # ── Qwen3.5-4B ───────────────────────────���──────────
490
  elif model_id == "qwen_4b":
491
  if qwen_4b_model is None or qwen_4b_processor is None:
492
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
493
+ yield "data: [DONE]\n\n"; return
 
494
  messages = [{"role": "user", "content": [
495
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
496
  text_input = qwen_4b_processor.apply_chat_template(
497
+ messages, tokenize=False, add_generation_prompt=True)
 
498
  inputs = qwen_4b_processor(
499
  text=[text_input], images=[image], return_tensors="pt", padding=True
500
  ).to(qwen_4b_model.device)
501
+ streamer = TextIteratorStreamer(qwen_4b_processor.tokenizer,
502
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
503
+ threading.Thread(target=qwen_4b_model.generate,
504
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
505
+ use_cache=True, temperature=1.5, min_p=0.1)).start()
 
 
 
 
 
506
  for tok in streamer:
507
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
508
 
509
  # ── Qwen3.5-2B ──────────────────────────────────────
510
  elif model_id == "qwen_2b":
511
  if qwen_2b_model is None or qwen_2b_processor is None:
512
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
513
+ yield "data: [DONE]\n\n"; return
 
514
  messages = [{"role": "user", "content": [
515
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
516
  text_input = qwen_2b_processor.apply_chat_template(
517
+ messages, tokenize=False, add_generation_prompt=True)
 
518
  inputs = qwen_2b_processor(
519
  text=[text_input], images=[image], return_tensors="pt", padding=True
520
  ).to(qwen_2b_model.device)
521
+ streamer = TextIteratorStreamer(qwen_2b_processor.tokenizer,
522
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
523
+ threading.Thread(target=qwen_2b_model.generate,
524
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
525
+ use_cache=True, temperature=1.5, min_p=0.1)).start()
 
 
 
 
 
526
  for tok in streamer:
527
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
528
 
529
  # ── LFM-450M ────────────────────────────────────────
530
  elif model_id == "lfm_450":
531
  if lfm_450_model is None or lfm_450_processor is None:
532
  yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
533
+ yield "data: [DONE]\n\n"; return
 
534
  conversation = [{"role": "user", "content": [
535
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
536
  inputs = lfm_450_processor.apply_chat_template(
537
  conversation, add_generation_prompt=True,
538
  return_tensors="pt", return_dict=True, tokenize=True,
539
  ).to(lfm_450_model.device)
540
+ streamer = TextIteratorStreamer(lfm_450_processor.tokenizer,
541
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
542
+ threading.Thread(target=lfm_450_model.generate,
543
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
544
+ use_cache=True)).start()
 
 
 
 
545
  for tok in streamer:
546
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
547
 
548
  # ── Gemma4-E2B-it ───────────────────────────────────
549
  elif model_id == "gemma4_e2b":
550
  if gemma4_e2b_model is None or gemma4_e2b_processor is None:
551
  yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
552
+ yield "data: [DONE]\n\n"; return
 
553
  messages = [{"role": "user", "content": [
554
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
555
  text_input = gemma4_e2b_processor.apply_chat_template(
556
+ messages, tokenize=False, add_generation_prompt=True)
 
557
  inputs = gemma4_e2b_processor(
558
  text=[text_input], images=[image], return_tensors="pt", padding=True,
559
  ).to(gemma4_e2b_model.device)
560
+ streamer = TextIteratorStreamer(gemma4_e2b_processor.tokenizer,
561
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
562
+ threading.Thread(target=gemma4_e2b_model.generate,
563
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
564
+ use_cache=True, temperature=1.0, do_sample=True)).start()
 
 
 
 
 
565
  for tok in streamer:
566
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
567
 
568
  # ── LFM-1.6B ────────────────────────────────────────
569
  elif model_id == "lfm_16":
570
  if lfm_16_model is None or lfm_16_processor is None:
571
  yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
572
+ yield "data: [DONE]\n\n"; return
 
573
  conversation = [{"role": "user", "content": [
574
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
575
  inputs = lfm_16_processor.apply_chat_template(
576
  conversation, add_generation_prompt=True,
577
  return_tensors="pt", return_dict=True, tokenize=True,
578
  ).to(lfm_16_model.device)
579
+ streamer = TextIteratorStreamer(lfm_16_processor.tokenizer,
580
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
581
+ threading.Thread(target=lfm_16_model.generate,
582
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
583
+ use_cache=True)).start()
 
 
 
 
584
  for tok in streamer:
585
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
586
 
587
  # ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
588
  elif model_id == "qwen_unredacted":
589
  if qwen_unredacted_model is None or qwen_unredacted_processor is None:
590
  yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
591
+ yield "data: [DONE]\n\n"; return
 
592
  messages = [{"role": "user", "content": [
593
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
594
  text_input = qwen_unredacted_processor.apply_chat_template(
595
+ messages, tokenize=False, add_generation_prompt=True)
 
596
  inputs = qwen_unredacted_processor(
597
  text=[text_input], images=[image], return_tensors="pt", padding=True
598
  ).to(qwen_unredacted_model.device)
599
+ streamer = TextIteratorStreamer(qwen_unredacted_processor.tokenizer,
600
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
601
+ threading.Thread(target=qwen_unredacted_model.generate,
602
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
603
+ use_cache=True, temperature=1.5, min_p=0.1)).start()
 
 
 
 
 
604
  for tok in streamer:
605
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
606
 
607
  # ── Qwen2.5-VL-3B-Instruct ──────────────────────────
608
  elif model_id == "qwen25_vl_3b":
609
  if qwen25_vl_3b_model is None or qwen25_vl_3b_processor is None:
610
  yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
611
+ yield "data: [DONE]\n\n"; return
 
612
  messages = [{"role": "user", "content": [
613
+ {"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
 
 
614
  text_input = qwen25_vl_3b_processor.apply_chat_template(
615
+ messages, tokenize=False, add_generation_prompt=True)
 
616
  image_inputs, video_inputs = process_vision_info(messages)
617
  inputs = qwen25_vl_3b_processor(
618
+ text=[text_input], images=image_inputs, videos=video_inputs,
619
+ return_tensors="pt", padding=True,
 
 
 
620
  ).to(qwen25_vl_3b_model.device)
621
+ streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer,
622
+ skip_prompt=True, skip_special_tokens=True, timeout=120)
623
+ threading.Thread(target=qwen25_vl_3b_model.generate,
624
+ kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
625
+ use_cache=True, temperature=1.0, do_sample=True)).start()
 
 
 
 
 
626
  for tok in streamer:
627
+ if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
 
 
628
 
629
  yield "data: [DONE]\n\n"
630
 
631
 
632
+ # ─────────────────────────────────────────────────────────────────────────────
633
+ # FastAPI Endpoints
634
+ # ─────────────────────────────────────────────────────────────────────────────
635
  @app.post("/api/run")
636
  async def run_inference(
637
  image: UploadFile = File(...),
 
651
  return JSONResponse({"error": str(e)}, status_code=500)
652
 
653
 
654
+ # ───────────────────────────��─────────────────────────────────────────────────
655
+ # Frontend UI
656
+ # ─────────────────────────────────────────────────────────────────────────────
657
  @app.get("/", response_class=HTMLResponse)
658
  async def homepage(request: Request):
659
  return """
 
707
  .top-bar .sub { font-size: 11px; color: var(--muted); }
708
  .top-bar .badge {
709
  margin-left: auto;
710
+ background: rgba(124,106,247,0.15); border: 1px solid rgba(124,106,247,0.3);
711
+ padding: 3px 10px; border-radius: 20px; font-size: 10px; color: var(--accent);
 
 
712
  }
713
  /* ── Canvas ── */
714
  #canvas {
 
716
  min-height: calc(100vh - 42px); height: 900px; margin: 0 auto;
717
  }
718
  svg.wires {
719
+ position: absolute; top: 0; left: 0; width: 100%; height: 100%;
 
720
  pointer-events: none; z-index: 2; overflow: visible;
721
  }
722
  path.wire { fill: none; stroke: var(--wire); stroke-width: 2.5; stroke-linecap: round; }
 
797
  border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
798
  }
799
  .img-chip.visible { display: flex; }
800
+ .img-chip .chip-dot { width:5px;height:5px;border-radius:50%;background:var(--accent2);flex-shrink:0;box-shadow:0 0 4px var(--accent2); }
801
+ .img-chip .chip-name { overflow:hidden;text-overflow:ellipsis;white-space:nowrap;flex:1;color:var(--text);font-size:9px; }
802
+ .img-chip .chip-size { color:var(--muted);flex-shrink:0;font-size:9px; }
803
  select, textarea {
804
  width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
805
  color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
 
822
  /* ── Output node ── */
823
  .output-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
824
  .output-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
825
+ /* ── Icon buttons ── */
826
  .icon-btn {
827
  display: flex; align-items: center; gap: 5px;
828
  background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
 
830
  font-size: 9px; font-weight: 700; font-family: 'JetBrains Mono', monospace;
831
  color: var(--accent); cursor: pointer; letter-spacing: 0.05em;
832
  transition: background 0.18s, border-color 0.18s, transform 0.1s; flex-shrink: 0;
833
+ text-decoration: none; border: 1px solid rgba(124,106,247,0.25);
834
  }
835
  .icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
836
  .icon-btn:active { transform: scale(0.95); }
837
+ .icon-btn.teal { background:rgba(78,205,196,0.10);border-color:rgba(78,205,196,0.25);color:var(--accent2); }
838
+ .icon-btn.teal:hover { background:rgba(78,205,196,0.22);border-color:var(--accent2); }
839
+ .icon-btn.copied { background:rgba(78,205,196,0.15);border-color:var(--accent2);color:var(--accent2); }
840
+ .icon-btn svg { pointer-events:none;flex-shrink:0; }
 
 
841
  .output-box {
842
  background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
843
  border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
 
847
  /* ── Grounding node ─�� */
848
  .ground-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
849
  .ground-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
850
+ .ground-img-wrap {
851
  position: relative; flex: 1; border: 1px solid var(--node-border);
852
  border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
853
+ display: flex; align-items: center; justify-content: center;
854
  }
855
+ .ground-img-wrap img {
856
+ width: 100%; height: 100%; object-fit: contain; display: block;
 
 
 
857
  }
858
  .ground-placeholder {
859
  position: absolute; inset: 0; display: flex; align-items: center;
860
+ justify-content: center; font-size: 11px; color: var(--muted);
861
+ text-align: center; padding: 10px; pointer-events: none; z-index: 5;
862
  }
863
  .loader {
864
  width: 11px; height: 11px; border: 2px solid rgba(255,255,255,0.3);
 
866
  animation: spin 0.7s linear infinite; display: none;
867
  }
868
  @keyframes spin { to { transform: rotate(360deg); } }
869
+ .status-dot { width:6px;height:6px;border-radius:50%;background:var(--muted);display:inline-block;margin-right:6px; }
870
+ .status-dot.active { background:var(--accent2);box-shadow:0 0 5px var(--accent2); }
871
  /* ── Model badges ── */
872
  .model-badge {
873
+ display:inline-block;padding:2px 7px;border-radius:4px;
874
+ font-size:9px;font-weight:700;letter-spacing:0.06em;text-transform:uppercase;
875
  }
876
+ .model-badge.qvl2b { background:rgba(255,150,50,0.15); color:#ff9632; border:1px solid rgba(255,150,50,0.35); }
877
+ .model-badge.qvl4b { background:rgba(255,100,80,0.15); color:#ff6450; border:1px solid rgba(255,100,80,0.35); }
878
+ .model-badge.q4bunred { background:rgba(255,80,80,0.18); color:#ff5050; border:1px solid rgba(255,80,80,0.40); }
879
+ .model-badge.q4b { background:rgba(255,200,80,0.15); color:#ffc850; border:1px solid rgba(255,200,80,0.35); }
880
+ .model-badge.q2b { background:rgba(124,106,247,0.2); color:var(--accent); border:1px solid rgba(124,106,247,0.3); }
881
+ .model-badge.lfm450 { background:rgba(78,205,196,0.15); color:var(--accent2); border:1px solid rgba(78,205,196,0.3); }
882
+ .model-badge.g4e2b { background:rgba(66,197,107,0.15); color:#42c56b; border:1px solid rgba(66,197,107,0.35); }
883
+ .model-badge.lfm16 { background:rgba(107,203,119,0.15);color:#6bcb77; border:1px solid rgba(107,203,119,0.35); }
884
+ .model-badge.qunred { background:rgba(255,80,160,0.15); color:#ff50a0; border:1px solid rgba(255,80,160,0.35); }
885
+ .model-badge.q25vl3b { background:rgba(80,180,255,0.15); color:#50b4ff; border:1px solid rgba(80,180,255,0.35); }
886
+ .model-info-box { border-radius:6px;padding:9px;font-size:10px;color:var(--muted);line-height:1.55;flex-shrink:0; }
887
  .canvas-footer { height: 36px; }
 
 
 
 
 
 
 
 
 
 
 
888
  </style>
889
  </head>
890
  <body>
 
896
  <span class="badge">10x Vision Models</span>
897
  </div>
898
 
 
 
 
 
 
 
899
  <div id="canvas">
900
  <svg class="wires">
901
  <path id="wire-img-task" class="wire" />
 
1049
  SAVE
1050
  </a>
1051
  </div>
1052
+ <div class="ground-img-wrap">
1053
+ <!-- Server-rendered annotated image displayed here -->
1054
+ <img id="groundImg" src="" alt="" style="display:none;" />
1055
  <div class="ground-placeholder" id="groundPlaceholder">
1056
  Active for Point / Detect tasks.<br>Run inference to visualise.
1057
  </div>
 
1103
  });
1104
  document.addEventListener('mousemove', e => {
1105
  if (!drag) return;
1106
+ node.style.left=`${il+e.clientX-sx}px`;
1107
+ node.style.top=`${it+e.clientY-sy}px`;
1108
  updateWires();
1109
  });
1110
  document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
 
1129
  const dotImg = document.getElementById('dot-img');
1130
 
1131
/** Format a byte count as a short label in B, KB or MB (one decimal for KB/MB). */
function formatBytes(b) {
  const KIB = 1024;
  const MIB = KIB * KIB;
  if (b < KIB) return `${b} B`;
  // Everything at or above 1 MiB (and any non-comparable value) lands in MB.
  const [scaled, unit] = b < MIB ? [b / KIB, 'KB'] : [b / MIB, 'MB'];
  return `${scaled.toFixed(1)} ${unit}`;
}
1136
/**
 * Accept a user-selected file as the current input image and update the
 * Input node UI (preview, file chip, status dot, wires).
 * Files whose MIME type does not start with "image/" are ignored.
 */
function handleFile(file) {
  if (!file || !file.type.startsWith('image/')) return;
  currentFile = file;

  // Swap the preview to a fresh object URL, then release the previous one.
  // Object URLs keep their blob alive until revoked, so picking several
  // images in a row used to leak every earlier blob.
  const previousUrl = imgPreview.src;
  imgPreview.src = URL.createObjectURL(file);
  if (previousUrl.startsWith('blob:')) URL.revokeObjectURL(previousUrl);

  previewWrap.classList.add('visible');
  dropZone.style.display = 'none';
  chipName.textContent = file.name;
  chipSize.textContent = formatBytes(file.size);
  imgChip.classList.add('visible');
  dotImg.classList.add('active');
  // Node height changes once the preview is shown; redraw wires next frame.
  requestAnimationFrame(updateWires);
}
1148
/** Drop the current input image and return the Input node to its empty state. */
function clearImage() {
  currentFile = null;

  // Release the object URL created by handleFile(); without this the blob
  // stays in memory for the lifetime of the page.
  const previousUrl = imgPreview.src;
  imgPreview.src = '';
  if (previousUrl.startsWith('blob:')) URL.revokeObjectURL(previousUrl);

  previewWrap.classList.remove('visible');
  dropZone.style.display = '';
  imgChip.classList.remove('visible');
  // Placeholder is an em dash (the literal had been mis-encoded).
  chipName.textContent = '—';
  chipSize.textContent = '';
  // Reset the input so re-selecting the same file fires onchange again.
  fileInput.value = '';
  dotImg.classList.remove('active');
  requestAnimationFrame(updateWires);
}
1160
  dropZone.onclick = () => fileInput.click();
1161
  fileInput.onchange = e => handleFile(e.target.files[0]);
 
1177
 
1178
  const MODEL_INFO = {
1179
  qwen_vl_2b: {
1180
+ html: `<span class="model-badge qvl2b">QWEN3-VL Β· 2B</span><br><br>
1181
+ Qwen3-VL-2B-Instruct β€” dedicated vision-language model by Alibaba Cloud.
1182
+ Strong spatial grounding, OCR &amp; instruction-following.`,
1183
  bg: 'rgba(255,150,50,0.07)', border: 'rgba(255,150,50,0.30)',
1184
  },
1185
  qwen_vl_4b: {
1186
+ html: `<span class="model-badge qvl4b">QWEN3-VL Β· 4B</span><br><br>
1187
+ Qwen3-VL-4B-Instruct β€” enhanced vision-language model by Alibaba Cloud.
1188
+ Superior spatial grounding, richer OCR &amp; stronger multi-step reasoning.`,
1189
  bg: 'rgba(255,100,80,0.07)', border: 'rgba(255,100,80,0.25)',
1190
  },
1191
  qwen_4b_unredacted: {
1192
+ html: `<span class="model-badge q4bunred">QWEN 3.5 Β· 4B UNREDACTED MAX</span><br><br>
1193
+ Qwen3.5-4B-Unredacted-MAX by prithivMLmods. Uncensored fine-tune of Qwen3.5-4B
1194
+ with extended instruction-following &amp; unrestricted reasoning.`,
1195
  bg: 'rgba(255,80,80,0.07)', border: 'rgba(255,80,80,0.30)',
1196
  },
1197
  qwen_4b: {
1198
+ html: `<span class="model-badge q4b">QWEN 3.5 Β· 4B</span><br><br>
1199
+ Qwen3.5 4B multimodal model by Alibaba Cloud.
1200
+ Enhanced capacity β€” richer reasoning &amp; better instruction following.`,
1201
  bg: 'rgba(255,200,80,0.07)', border: 'rgba(255,200,80,0.30)',
1202
  },
1203
  qwen_2b: {
1204
+ html: `<span class="model-badge q2b">QWEN 3.5 Β· 2B</span><br><br>
1205
+ Qwen3.5 2B multimodal model by Alibaba Cloud.
1206
+ Lightweight &amp; fast β€” ideal for quick tasks.`,
1207
  bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
1208
  },
1209
  lfm_450: {
1210
+ html: `<span class="model-badge lfm450">LFM Β· 450M</span><br><br>
1211
+ LFM2.5-VL 450M by LiquidAI. Ultra-lightweight edge model
1212
+ with solid grounding capabilities.`,
1213
  bg: 'rgba(78,205,196,0.07)', border: 'rgba(78,205,196,0.25)',
1214
  },
1215
  gemma4_e2b: {
1216
+ html: `<span class="model-badge g4e2b">GEMMA 4 Β· E2B</span><br><br>
1217
+ Gemma4-E2B-it by Google DeepMind. Efficient 2B multimodal model
1218
+ with strong vision-language understanding &amp; instruction-following.`,
1219
  bg: 'rgba(66,197,107,0.07)', border: 'rgba(66,197,107,0.25)',
1220
  },
1221
  lfm_16: {
1222
+ html: `<span class="model-badge lfm16">LFM Β· 1.6B</span><br><br>
1223
+ LFM2.5-VL 1.6B by LiquidAI. Larger liquid-state model offering
1224
+ enhanced reasoning &amp; richer visual understanding.`,
1225
  bg: 'rgba(107,203,119,0.07)', border: 'rgba(107,203,119,0.25)',
1226
  },
1227
  qwen_unredacted: {
1228
+ html: `<span class="model-badge qunred">QWEN 3.5 Β· 2B UNREDACTED MAX</span><br><br>
1229
+ Qwen3.5-2B-Unredacted-MAX by prithivMLmods. Fine-tuned variant of Qwen3.5-2B
1230
+ with uncensored &amp; extended instruction-following capabilities.`,
1231
  bg: 'rgba(255,80,160,0.07)', border: 'rgba(255,80,160,0.25)',
1232
  },
1233
  qwen25_vl_3b: {
1234
+ html: `<span class="model-badge q25vl3b">QWEN 2.5-VL Β· 3B</span><br><br>
1235
+ Qwen2.5-VL-3B-Instruct by Alibaba Cloud. Powerful 3B vision-language model
1236
+ with strong grounding, OCR &amp; multi-task visual reasoning.`,
1237
  bg: 'rgba(80,180,255,0.07)', border: 'rgba(80,180,255,0.25)',
1238
  },
1239
  };
 
1256
  Point: 'e.g., The gun held by the person.',
1257
  Detect: 'e.g., The headlight of the car.',
1258
  };
1259
+ categorySelect.onchange = e => {
1260
+ promptInput.placeholder = PLACEHOLDERS[e.target.value] || '';
1261
+ };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1262
 
1263
  // ══════════════════════════════════════════════
1264
  // COPY BUTTON
 
1277
  </svg> COPY`;
1278
  }
1279
  copyBtn.onclick = () => {
1280
+ const txt = outputBox.innerText || '';
1281
+ if (!txt || txt === 'Results will stream here...') return;
1282
  navigator.clipboard.writeText(txt).then(() => {
1283
  copyBtn.classList.add('copied');
1284
  copyBtn.innerHTML = `
 
1286
  stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round">
1287
  <polyline points="20 6 9 17 4 12"/>
1288
  </svg> COPIED`;
1289
+ clearTimeout(copyTimer);
1290
+ copyTimer = setTimeout(resetCopyBtn, 2000);
1291
  }).catch(() => {
1292
+ const ta = document.createElement('textarea');
1293
+ ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0';
1294
+ document.body.appendChild(ta); ta.select();
1295
+ document.execCommand('copy'); document.body.removeChild(ta);
1296
  });
1297
  };
1298
 
1299
+ // ══════════════════════════════════════════════
1300
+ // GROUNDING IMAGE (server-rendered, base64)
1301
+ // ══════════════════════════════════════════════
1302
+ const groundImg = document.getElementById('groundImg');
1303
+ const groundPlaceholder = document.getElementById('groundPlaceholder');
1304
+ const downloadBtn = document.getElementById('downloadBtn');
1305
+ const dotGnd = document.getElementById('dot-gnd');
1306
+
1307
/**
 * Display the server-rendered annotated image in the Grounding node and
 * point the download link at the same data URI.
 */
function showGroundingImage(b64DataUri) {
  // Timestamp such as 2024-01-02T03-04-05, used in the download filename.
  const stamp = new Date().toISOString().slice(0, 19).replace(/[:.]/g, '-');

  groundPlaceholder.style.display = 'none';
  groundImg.src = b64DataUri;
  groundImg.style.display = 'block';

  Object.assign(downloadBtn, {
    href: b64DataUri,
    download: `grounding_${stamp}.png`,
  });
  downloadBtn.style.display = 'flex';

  dotGnd.classList.add('active');
}
1318
+
1319
/**
 * Hide the annotated image and download link, and show the placeholder
 * with `msg` (or the default hint when `msg` is empty or omitted).
 */
function resetGrounding(msg) {
  const fallback = 'Active for Point / Detect tasks. Run inference to visualise.';

  downloadBtn.style.display = 'none';
  dotGnd.classList.remove('active');

  groundImg.style.display = 'none';
  groundImg.src = '';

  groundPlaceholder.style.display = 'flex';
  groundPlaceholder.textContent = msg ? msg : fallback;
}
1327
+
1328
  // ══════════════════════════════════════════════
1329
  // RUN INFERENCE
1330
  // ══════════════════════════════════════════════
 
1333
  const allWires = ['wire-img-task','wire-model-task','wire-task-out','wire-task-gnd'];
1334
  const dotTask = document.getElementById('dot-task');
1335
  const dotOut = document.getElementById('dot-out');
 
1336
 
1337
  runBtn.onclick = async () => {
1338
  if (!currentFile) { alert('Please upload an image into the Input Node.'); return; }
1339
  const promptStr = promptInput.value.trim();
1340
  if (!promptStr) { alert('Please enter a prompt directive.'); return; }
1341
 
1342
+ // ── Reset UI ─────────────────────────────────────────
1343
  runBtn.disabled = true;
1344
  btnLoader.style.display = 'inline-block';
1345
  outputBox.innerText = '';
1346
  outputBox.style.color = '';
 
 
 
 
 
 
1347
  dotTask.classList.add('active');
1348
  dotOut.classList.remove('active');
 
1349
  allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
1350
  resetCopyBtn();
1351
 
1352
+ const cat = categorySelect.value;
1353
+ if (cat === 'Point' || cat === 'Detect') {
1354
+ resetGrounding('Running inference…');
1355
+ }
1356
+
1357
+ // ── Build FormData ────────────────────────────────────
1358
  const formData = new FormData();
1359
  formData.append('image', currentFile);
1360
+ formData.append('category', cat);
1361
  formData.append('prompt', promptStr);
1362
  formData.append('model_id', modelSelect.value);
1363
 
1364
  let fullText = '';
 
 
1365
 
1366
  try {
1367
+ // ── 1. Stream inference ───────────────────────────
1368
  const response = await fetch('/api/run', { method: 'POST', body: formData });
1369
  if (!response.ok) {
1370
  const err = await response.json();
 
1373
 
1374
  const reader = response.body.getReader();
1375
  const decoder = new TextDecoder('utf-8');
1376
+ let buffer = '';
1377
 
1378
  while (true) {
1379
  const { value, done } = await reader.read();
1380
  if (done) break;
1381
  buffer += decoder.decode(value, { stream: true });
1382
  const lines = buffer.split('\\n\\n');
1383
+ buffer = lines.pop(); // keep incomplete chunk
1384
 
1385
  for (const line of lines) {
1386
  if (!line.startsWith('data: ')) continue;
1387
+ const payload = line.slice(6);
1388
  if (payload === '[DONE]') break;
1389
  try {
1390
  const data = JSON.parse(payload);
 
1399
 
1400
  dotOut.classList.add('active');
1401
 
1402
+ // ── 2. Server-side annotation for Point / Detect ──
 
1403
  if ((cat === 'Point' || cat === 'Detect') && fullText.trim()) {
1404
+ resetGrounding('Annotating image…');
1405
+ try {
1406
+ const annForm = new FormData();
1407
+ annForm.append('image', currentFile);
1408
+ annForm.append('text', fullText);
1409
+ annForm.append('category', cat);
1410
+
1411
+ const annResp = await fetch('/api/annotate', {
1412
+ method: 'POST', body: annForm,
1413
+ });
1414
+ const annData = await annResp.json();
1415
+
1416
+ if (annData.b64) {
1417
+ showGroundingImage(annData.b64);
1418
  } else {
1419
+ resetGrounding(
1420
+ annData.error === 'no_json'
1421
+ ? 'No grounding coordinates found in model output.'
1422
+ : `Annotation error: ${annData.error || 'unknown'}`
1423
+ );
1424
  }
1425
+ } catch (annErr) {
1426
+ resetGrounding(`Annotation failed: ${annErr.message}`);
1427
+ }
1428
  } else if (cat !== 'Point' && cat !== 'Detect') {
1429
+ resetGrounding('Active for Point / Detect tasks. Run inference to visualise.');
 
1430
  }
1431
 
1432
  } catch (err) {
1433
  outputBox.innerText = `[Error] ${err.message}`;
1434
  outputBox.style.color = '#ff6b6b';
1435
+ if (cat === 'Point' || cat === 'Detect') {
1436
+ resetGrounding('Inference error β€” see Output Stream node.');
1437
+ }
1438
  } finally {
1439
  runBtn.disabled = false;
1440
  btnLoader.style.display = 'none';
1441
  dotTask.classList.remove('active');
1442
  allWires.forEach(id => document.getElementById(id)?.classList.remove('active'));
 
 
1443
  }
1444
  };
1445
  </script>