Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,14 +4,13 @@ import json
|
|
| 4 |
import ast
|
| 5 |
import re
|
| 6 |
import uuid
|
| 7 |
-
import base64
|
| 8 |
import threading
|
| 9 |
-
import numpy as np
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import Optional
|
| 12 |
|
| 13 |
import spaces
|
| 14 |
import torch
|
|
|
|
| 15 |
from PIL import Image, ImageDraw, ImageFont
|
| 16 |
|
| 17 |
from gradio import Server
|
|
@@ -59,7 +58,8 @@ try:
|
|
| 59 |
print("Qwen3-VL-2B model loaded successfully.")
|
| 60 |
except Exception as e:
|
| 61 |
print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
|
| 62 |
-
qwen_vl_2b_model = None
|
|
|
|
| 63 |
|
| 64 |
# ── Qwen3-VL-4B-Instruct ────────────────────────────────
|
| 65 |
print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
|
|
@@ -71,7 +71,8 @@ try:
|
|
| 71 |
print("Qwen3-VL-4B model loaded successfully.")
|
| 72 |
except Exception as e:
|
| 73 |
print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
|
| 74 |
-
qwen_vl_4b_model = None
|
|
|
|
| 75 |
|
| 76 |
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
|
| 77 |
print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
|
|
@@ -83,7 +84,8 @@ try:
|
|
| 83 |
print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
|
| 84 |
except Exception as e:
|
| 85 |
print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
|
| 86 |
-
qwen_4b_unredacted_model = None
|
|
|
|
| 87 |
|
| 88 |
# ── Qwen3.5-4B ──────────────────────────────────────────
|
| 89 |
print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
|
|
@@ -95,7 +97,8 @@ try:
|
|
| 95 |
print("Qwen3.5-4B model loaded successfully.")
|
| 96 |
except Exception as e:
|
| 97 |
print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
|
| 98 |
-
qwen_4b_model = None
|
|
|
|
| 99 |
|
| 100 |
# ── Qwen3.5-2B ──────────────────────────────────────────
|
| 101 |
print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
|
|
@@ -107,7 +110,8 @@ try:
|
|
| 107 |
print("Qwen3.5-2B model loaded successfully.")
|
| 108 |
except Exception as e:
|
| 109 |
print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
|
| 110 |
-
qwen_2b_model = None
|
|
|
|
| 111 |
|
| 112 |
# ── LFM2.5-VL-450M ──────────────────────────────────────
|
| 113 |
print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
|
|
@@ -119,7 +123,8 @@ try:
|
|
| 119 |
print("LFM-450M model loaded successfully.")
|
| 120 |
except Exception as e:
|
| 121 |
print(f"Warning: LFM-450M model loading failed. Error: {e}")
|
| 122 |
-
lfm_450_model = None
|
|
|
|
| 123 |
|
| 124 |
# ── Gemma4-E2B-it ───────────────────────────────────────
|
| 125 |
print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
|
|
@@ -134,7 +139,8 @@ try:
|
|
| 134 |
print("Gemma4-E2B-it model loaded successfully.")
|
| 135 |
except Exception as e:
|
| 136 |
print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
|
| 137 |
-
gemma4_e2b_model = None
|
|
|
|
| 138 |
|
| 139 |
# ── LFM2.5-VL-1.6B ──────────────────────────────────────
|
| 140 |
print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
|
|
@@ -146,7 +152,8 @@ try:
|
|
| 146 |
print("LFM-1.6B model loaded successfully.")
|
| 147 |
except Exception as e:
|
| 148 |
print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
|
| 149 |
-
lfm_16_model = None
|
|
|
|
| 150 |
|
| 151 |
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
|
| 152 |
print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
|
|
@@ -158,7 +165,8 @@ try:
|
|
| 158 |
print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
|
| 159 |
except Exception as e:
|
| 160 |
print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
|
| 161 |
-
qwen_unredacted_model = None
|
|
|
|
| 162 |
|
| 163 |
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
|
| 164 |
print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
|
|
@@ -170,42 +178,17 @@ try:
|
|
| 170 |
print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
|
| 171 |
except Exception as e:
|
| 172 |
print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
|
| 173 |
-
qwen25_vl_3b_model = None
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 177 |
-
# SERVER-SIDE ANNOTATION (mirrors the reference app exactly)
|
| 178 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 179 |
-
|
| 180 |
-
PALETTE_RGB = [
|
| 181 |
-
(78, 205, 196), # teal
|
| 182 |
-
(124, 106, 247), # purple
|
| 183 |
-
(255, 107, 107), # red
|
| 184 |
-
(255, 217, 61), # yellow
|
| 185 |
-
(107, 203, 119), # green
|
| 186 |
-
(255, 146, 43), # orange
|
| 187 |
-
(204, 93, 232), # violet
|
| 188 |
-
(51, 154, 240), # blue
|
| 189 |
-
]
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
def _get_font(size: int = 14):
|
| 193 |
-
"""Try to load a TrueType font; fall back to PIL default."""
|
| 194 |
-
for name in ["DejaVuSans-Bold.ttf", "arial.ttf", "Arial.ttf",
|
| 195 |
-
"/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"]:
|
| 196 |
-
try:
|
| 197 |
-
return ImageFont.truetype(name, size)
|
| 198 |
-
except (IOError, OSError):
|
| 199 |
-
pass
|
| 200 |
-
return ImageFont.load_default()
|
| 201 |
|
| 202 |
|
|
|
|
|
|
|
|
|
|
| 203 |
def safe_parse_json(text: str):
|
| 204 |
-
"""Strip markdown fences + <think> blocks, then parse JSON."""
|
| 205 |
-
# Remove <think>…</think>
|
| 206 |
-
text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
|
| 207 |
text = text.strip()
|
| 208 |
-
#
|
|
|
|
| 209 |
text = re.sub(r"^```(json)?", "", text)
|
| 210 |
text = re.sub(r"```$", "", text)
|
| 211 |
text = text.strip()
|
|
@@ -213,204 +196,210 @@ def safe_parse_json(text: str):
|
|
| 213 |
return json.loads(text)
|
| 214 |
except json.JSONDecodeError:
|
| 215 |
pass
|
| 216 |
-
# Try to find the first [...] or {...} block
|
| 217 |
-
for ch_open, ch_close in [('[', ']'), ('{', '}')]:
|
| 218 |
-
idx = text.find(ch_open)
|
| 219 |
-
if idx != -1:
|
| 220 |
-
depth, in_str, esc = 0, False, False
|
| 221 |
-
for i in range(idx, len(text)):
|
| 222 |
-
c = text[i]
|
| 223 |
-
if esc: esc = False; continue
|
| 224 |
-
if c == '\\': esc = True; continue
|
| 225 |
-
if c == '"': in_str = not in_str; continue
|
| 226 |
-
if in_str: continue
|
| 227 |
-
if c == ch_open: depth += 1
|
| 228 |
-
if c == ch_close:
|
| 229 |
-
depth -= 1
|
| 230 |
-
if depth == 0:
|
| 231 |
-
try:
|
| 232 |
-
return json.loads(text[idx:i+1])
|
| 233 |
-
except Exception:
|
| 234 |
-
break
|
| 235 |
try:
|
| 236 |
return ast.literal_eval(text)
|
| 237 |
except Exception:
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
|
| 241 |
-
def annotate_detections(image: Image.Image,
|
| 242 |
"""
|
| 243 |
-
Draw bounding boxes on image.
|
| 244 |
-
|
| 245 |
-
and optional 'label'.
|
| 246 |
-
Mirrors reference _run_detection_on_frame output → annotate_image.
|
| 247 |
"""
|
| 248 |
-
image = image.convert("RGB")
|
| 249 |
-
|
| 250 |
draw = ImageDraw.Draw(image, "RGBA")
|
| 251 |
-
font_lbl = _get_font(max(12,
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
continue
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
if max_v <= 1.0: # 0-1 fraction
|
| 267 |
-
x1, y1, x2, y2 = x1*ow, y1*oh, x2*ow, y2*oh
|
| 268 |
-
elif max_v <= 1000.0: # 0-1000 Qwen scale
|
| 269 |
-
x1, y1, x2, y2 = x1/1000*ow, y1/1000*oh, x2/1000*ow, y2/1000*oh
|
| 270 |
-
# else already in pixels
|
| 271 |
-
|
| 272 |
-
if x2 < x1: x1, x2 = x2, x1
|
| 273 |
-
if y2 < y1: y1, y2 = y2, y1
|
| 274 |
-
x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
|
| 275 |
-
|
| 276 |
-
# ββ Fill (semi-transparent) βββββββββββββββββββββββββββββββββββββββ
|
| 277 |
-
draw.rectangle([x1, y1, x2, y2], fill=(*col, 46))
|
| 278 |
-
|
| 279 |
-
# ββ Border βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 280 |
-
lw = max(2, ow // 200)
|
| 281 |
for t in range(lw):
|
| 282 |
-
draw.rectangle([x1+t, y1+t, x2-t, y2-t], outline=
|
| 283 |
|
| 284 |
-
#
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
for cx, cy,
|
| 288 |
-
draw.line([
|
| 289 |
-
draw.line([
|
| 290 |
|
| 291 |
-
#
|
| 292 |
-
label =
|
| 293 |
try:
|
| 294 |
bb = font_lbl.getbbox(label)
|
| 295 |
tw, th = bb[2]-bb[0], bb[3]-bb[1]
|
| 296 |
-
except
|
| 297 |
-
tw, th =
|
| 298 |
pad = 5
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
|
|
|
| 302 |
draw.text((lx+pad, ly+pad), label, fill=(255,255,255,255), font=font_lbl)
|
| 303 |
-
drawn += 1
|
| 304 |
|
| 305 |
return image
|
| 306 |
|
| 307 |
|
| 308 |
-
def annotate_points(image: Image.Image,
|
| 309 |
"""
|
| 310 |
-
Draw point markers on image.
|
| 311 |
-
|
| 312 |
-
and optional 'label'.
|
| 313 |
-
Mirrors reference _run_point_detection_on_frame → annotate_image_red_points.
|
| 314 |
"""
|
| 315 |
-
image = image.convert("RGB")
|
| 316 |
-
|
| 317 |
draw = ImageDraw.Draw(image, "RGBA")
|
| 318 |
-
font_lbl = _get_font(max(12,
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
try:
|
| 357 |
-
bb = font_lbl.getbbox(label)
|
| 358 |
-
tw, th = bb[2]-bb[0], bb[3]-bb[1]
|
| 359 |
-
except AttributeError:
|
| 360 |
-
tw, th = font_lbl.getsize(label)
|
| 361 |
-
pad = 5
|
| 362 |
-
lx = min(cx + r + 8, ow - tw - pad*2)
|
| 363 |
-
ly = max(0, cy - th//2 - pad)
|
| 364 |
-
draw.rectangle([lx, ly, lx+tw+pad*2, ly+th+pad*2], fill=(*col, 220))
|
| 365 |
-
draw.text((lx+pad, ly+pad), label, fill=(255,255,255,255), font=font_lbl)
|
| 366 |
-
drawn += 1
|
| 367 |
|
| 368 |
return image
|
| 369 |
|
| 370 |
|
| 371 |
-
def
|
| 372 |
-
"""
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
if not parsed:
|
| 397 |
-
return JSONResponse({"error": "no_json", "b64": None})
|
| 398 |
-
|
| 399 |
-
if category == "Detect":
|
| 400 |
-
annotated = annotate_detections(img, parsed)
|
| 401 |
-
elif category == "Point":
|
| 402 |
-
annotated = annotate_points(img, parsed)
|
| 403 |
-
else:
|
| 404 |
-
return JSONResponse({"error": "unsupported_category", "b64": None})
|
| 405 |
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
|
| 410 |
|
| 411 |
-
#
|
| 412 |
-
#
|
| 413 |
-
#
|
| 414 |
@spaces.GPU(duration=120)
|
| 415 |
def generate_inference_stream(
|
| 416 |
image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
|
|
@@ -432,19 +421,17 @@ def generate_inference_stream(
|
|
| 432 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
|
| 433 |
yield "data: [DONE]\n\n"; return
|
| 434 |
messages = [{"role": "user", "content": [
|
| 435 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
inputs = qwen_vl_2b_processor(
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
threading.Thread(target=qwen_vl_2b_model.generate,
|
| 444 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 445 |
-
use_cache=True, temperature=1.0, do_sample=True)).start()
|
| 446 |
for tok in streamer:
|
| 447 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 448 |
|
| 449 |
# ── Qwen3-VL-4B ─────────────────────────────────────
|
| 450 |
elif model_id == "qwen_vl_4b":
|
|
@@ -452,19 +439,17 @@ def generate_inference_stream(
|
|
| 452 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
|
| 453 |
yield "data: [DONE]\n\n"; return
|
| 454 |
messages = [{"role": "user", "content": [
|
| 455 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
inputs = qwen_vl_4b_processor(
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
threading.Thread(target=qwen_vl_4b_model.generate,
|
| 464 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 465 |
-
use_cache=True, temperature=1.0, do_sample=True)).start()
|
| 466 |
for tok in streamer:
|
| 467 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 468 |
|
| 469 |
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
|
| 470 |
elif model_id == "qwen_4b_unredacted":
|
|
@@ -472,19 +457,17 @@ def generate_inference_stream(
|
|
| 472 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
|
| 473 |
yield "data: [DONE]\n\n"; return
|
| 474 |
messages = [{"role": "user", "content": [
|
| 475 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
inputs = qwen_4b_unredacted_processor(
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
threading.Thread(target=qwen_4b_unredacted_model.generate,
|
| 484 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 485 |
-
use_cache=True, temperature=1.5, min_p=0.1)).start()
|
| 486 |
for tok in streamer:
|
| 487 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 488 |
|
| 489 |
# ── Qwen3.5-4B ──────────────────────────────────────
|
| 490 |
elif model_id == "qwen_4b":
|
|
@@ -492,19 +475,17 @@ def generate_inference_stream(
|
|
| 492 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
|
| 493 |
yield "data: [DONE]\n\n"; return
|
| 494 |
messages = [{"role": "user", "content": [
|
| 495 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
inputs = qwen_4b_processor(
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
threading.Thread(target=qwen_4b_model.generate,
|
| 504 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 505 |
-
use_cache=True, temperature=1.5, min_p=0.1)).start()
|
| 506 |
for tok in streamer:
|
| 507 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 508 |
|
| 509 |
# ── Qwen3.5-2B ──────────────────────────────────────
|
| 510 |
elif model_id == "qwen_2b":
|
|
@@ -512,19 +493,17 @@ def generate_inference_stream(
|
|
| 512 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
|
| 513 |
yield "data: [DONE]\n\n"; return
|
| 514 |
messages = [{"role": "user", "content": [
|
| 515 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
inputs = qwen_2b_processor(
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
threading.Thread(target=qwen_2b_model.generate,
|
| 524 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 525 |
-
use_cache=True, temperature=1.5, min_p=0.1)).start()
|
| 526 |
for tok in streamer:
|
| 527 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 528 |
|
| 529 |
# ── LFM-450M ────────────────────────────────────────
|
| 530 |
elif model_id == "lfm_450":
|
|
@@ -532,18 +511,19 @@ def generate_inference_stream(
|
|
| 532 |
yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
|
| 533 |
yield "data: [DONE]\n\n"; return
|
| 534 |
conversation = [{"role": "user", "content": [
|
| 535 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
|
|
|
| 536 |
inputs = lfm_450_processor.apply_chat_template(
|
| 537 |
conversation, add_generation_prompt=True,
|
| 538 |
return_tensors="pt", return_dict=True, tokenize=True,
|
| 539 |
).to(lfm_450_model.device)
|
| 540 |
-
streamer = TextIteratorStreamer(lfm_450_processor.tokenizer,
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
use_cache=True)).start()
|
| 545 |
for tok in streamer:
|
| 546 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 547 |
|
| 548 |
# ── Gemma4-E2B-it ───────────────────────────────────
|
| 549 |
elif model_id == "gemma4_e2b":
|
|
@@ -551,19 +531,17 @@ def generate_inference_stream(
|
|
| 551 |
yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
|
| 552 |
yield "data: [DONE]\n\n"; return
|
| 553 |
messages = [{"role": "user", "content": [
|
| 554 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
inputs = gemma4_e2b_processor(
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
threading.Thread(target=gemma4_e2b_model.generate,
|
| 563 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 564 |
-
use_cache=True, temperature=1.0, do_sample=True)).start()
|
| 565 |
for tok in streamer:
|
| 566 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 567 |
|
| 568 |
# ── LFM-1.6B ────────────────────────────────────────
|
| 569 |
elif model_id == "lfm_16":
|
|
@@ -571,18 +549,19 @@ def generate_inference_stream(
|
|
| 571 |
yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
|
| 572 |
yield "data: [DONE]\n\n"; return
|
| 573 |
conversation = [{"role": "user", "content": [
|
| 574 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
|
|
|
| 575 |
inputs = lfm_16_processor.apply_chat_template(
|
| 576 |
conversation, add_generation_prompt=True,
|
| 577 |
return_tensors="pt", return_dict=True, tokenize=True,
|
| 578 |
).to(lfm_16_model.device)
|
| 579 |
-
streamer = TextIteratorStreamer(lfm_16_processor.tokenizer,
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
use_cache=True)).start()
|
| 584 |
for tok in streamer:
|
| 585 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 586 |
|
| 587 |
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
|
| 588 |
elif model_id == "qwen_unredacted":
|
|
@@ -590,19 +569,17 @@ def generate_inference_stream(
|
|
| 590 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
|
| 591 |
yield "data: [DONE]\n\n"; return
|
| 592 |
messages = [{"role": "user", "content": [
|
| 593 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
inputs = qwen_unredacted_processor(
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
threading.Thread(target=qwen_unredacted_model.generate,
|
| 602 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 603 |
-
use_cache=True, temperature=1.5, min_p=0.1)).start()
|
| 604 |
for tok in streamer:
|
| 605 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 606 |
|
| 607 |
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────
|
| 608 |
elif model_id == "qwen25_vl_3b":
|
|
@@ -610,28 +587,51 @@ def generate_inference_stream(
|
|
| 610 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
|
| 611 |
yield "data: [DONE]\n\n"; return
|
| 612 |
messages = [{"role": "user", "content": [
|
| 613 |
-
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}
|
| 614 |
-
|
| 615 |
-
|
| 616 |
image_inputs, video_inputs = process_vision_info(messages)
|
| 617 |
inputs = qwen25_vl_3b_processor(
|
| 618 |
text=[text_input], images=image_inputs, videos=video_inputs,
|
| 619 |
return_tensors="pt", padding=True,
|
| 620 |
).to(qwen25_vl_3b_model.device)
|
| 621 |
-
streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer,
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
use_cache=True, temperature=1.0, do_sample=True)).start()
|
| 626 |
for tok in streamer:
|
| 627 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
| 628 |
|
| 629 |
yield "data: [DONE]\n\n"
|
| 630 |
|
| 631 |
|
| 632 |
-
#
|
| 633 |
-
#
|
| 634 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
@app.post("/api/run")
|
| 636 |
async def run_inference(
|
| 637 |
image: UploadFile = File(...),
|
|
@@ -651,12 +651,12 @@ async def run_inference(
|
|
| 651 |
return JSONResponse({"error": str(e)}, status_code=500)
|
| 652 |
|
| 653 |
|
| 654 |
-
#
|
| 655 |
-
#
|
| 656 |
-
#
|
| 657 |
@app.get("/", response_class=HTMLResponse)
|
| 658 |
async def homepage(request: Request):
|
| 659 |
-
return """
|
| 660 |
<!DOCTYPE html>
|
| 661 |
<html lang="en">
|
| 662 |
<head>
|
|
@@ -697,18 +697,17 @@ async def homepage(request: Request):
|
|
| 697 |
/* ββ Top Bar ββ */
|
| 698 |
.top-bar {
|
| 699 |
position: sticky; top: 0; left: 0; right: 0; height: 42px;
|
| 700 |
-
background: rgba(13,13,15,0.95);
|
| 701 |
-
border-bottom: 1px solid var(--node-border);
|
| 702 |
display: flex; align-items: center; padding: 0 20px;
|
| 703 |
gap: 12px; z-index: 1000; backdrop-filter: blur(12px);
|
| 704 |
}
|
| 705 |
-
.top-bar .logo
|
| 706 |
-
.top-bar .sep
|
| 707 |
-
.top-bar .sub
|
| 708 |
.top-bar .badge {
|
| 709 |
-
margin-left: auto;
|
| 710 |
-
|
| 711 |
-
|
| 712 |
}
|
| 713 |
/* ββ Canvas ββ */
|
| 714 |
#canvas {
|
|
@@ -797,9 +796,9 @@ async def homepage(request: Request):
|
|
| 797 |
border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
|
| 798 |
}
|
| 799 |
.img-chip.visible { display: flex; }
|
| 800 |
-
.img-chip .chip-dot { width:5px;height:5px;border-radius:50%;background:var(--accent2);flex-shrink:0;box-shadow:0 0 4px var(--accent2); }
|
| 801 |
-
.img-chip .chip-name { overflow:hidden;text-overflow:ellipsis;white-space:nowrap;flex:1;color:var(--text);font-size:9px; }
|
| 802 |
-
.img-chip .chip-size { color:var(--muted);flex-shrink:0;font-size:9px; }
|
| 803 |
select, textarea {
|
| 804 |
width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
|
| 805 |
color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
|
|
@@ -826,18 +825,17 @@ async def homepage(request: Request):
|
|
| 826 |
.icon-btn {
|
| 827 |
display: flex; align-items: center; gap: 5px;
|
| 828 |
background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
|
| 829 |
-
border-radius: 5px; padding: 3px 8px;
|
| 830 |
-
font-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
text-decoration: none; border: 1px solid rgba(124,106,247,0.25);
|
| 834 |
}
|
| 835 |
.icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
|
| 836 |
.icon-btn:active { transform: scale(0.95); }
|
| 837 |
-
.icon-btn.teal { background:rgba(78,205,196,0.10);border-color:rgba(78,205,196,0.25);color:var(--accent2); }
|
| 838 |
-
.icon-btn.teal:hover { background:rgba(78,205,196,0.22);border-color:var(--accent2); }
|
| 839 |
-
.icon-btn.copied { background:rgba(78,205,196,0.15);border-color:var(--accent2);color:var(--accent2); }
|
| 840 |
-
.icon-btn svg { pointer-events:none;flex-shrink:0; }
|
| 841 |
.output-box {
|
| 842 |
background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
|
| 843 |
border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
|
|
@@ -852,8 +850,10 @@ async def homepage(request: Request):
|
|
| 852 |
border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
|
| 853 |
display: flex; align-items: center; justify-content: center;
|
| 854 |
}
|
| 855 |
-
|
| 856 |
-
|
|
|
|
|
|
|
| 857 |
}
|
| 858 |
.ground-placeholder {
|
| 859 |
position: absolute; inset: 0; display: flex; align-items: center;
|
|
@@ -866,24 +866,24 @@ async def homepage(request: Request):
|
|
| 866 |
animation: spin 0.7s linear infinite; display: none;
|
| 867 |
}
|
| 868 |
@keyframes spin { to { transform: rotate(360deg); } }
|
| 869 |
-
.status-dot { width:6px;height:6px;border-radius:50%;background:var(--muted);display:inline-block;margin-right:6px; }
|
| 870 |
-
.status-dot.active { background:var(--accent2);box-shadow:0 0 5px var(--accent2); }
|
| 871 |
/* ββ Model badges ββ */
|
| 872 |
.model-badge {
|
| 873 |
-
display:inline-block;padding:2px 7px;border-radius:4px;
|
| 874 |
-
font-size:9px;font-weight:700;letter-spacing:0.06em;text-transform:uppercase;
|
| 875 |
}
|
| 876 |
-
.model-badge.qvl2b { background:rgba(255,150,50,0.15);
|
| 877 |
-
.model-badge.qvl4b { background:rgba(255,100,80,0.15);
|
| 878 |
-
.model-badge.q4bunred { background:rgba(255,80,80,0.18);
|
| 879 |
-
.model-badge.q4b { background:rgba(255,200,80,0.15);
|
| 880 |
-
.model-badge.q2b { background:rgba(124,106,247,0.2);
|
| 881 |
-
.model-badge.lfm450 { background:rgba(78,205,196,0.15);
|
| 882 |
-
.model-badge.g4e2b { background:rgba(66,197,107,0.15);
|
| 883 |
-
.model-badge.lfm16 { background:rgba(107,203,119,0.15);color:#6bcb77;
|
| 884 |
-
.model-badge.qunred { background:rgba(255,80,160,0.15);
|
| 885 |
-
.model-badge.q25vl3b { background:rgba(80,180,255,0.15);
|
| 886 |
-
.model-info-box { border-radius:6px;padding:9px;font-size:10px;color:var(--muted);line-height:1.55;flex-shrink:0; }
|
| 887 |
.canvas-footer { height: 36px; }
|
| 888 |
</style>
|
| 889 |
</head>
|
|
@@ -1049,9 +1049,8 @@ async def homepage(request: Request):
|
|
| 1049 |
SAVE
|
| 1050 |
</a>
|
| 1051 |
</div>
|
| 1052 |
-
<div class="ground-img-wrap">
|
| 1053 |
-
<
|
| 1054 |
-
<img id="groundImg" src="" alt="" style="display:none;" />
|
| 1055 |
<div class="ground-placeholder" id="groundPlaceholder">
|
| 1056 |
Active for Point / Detect tasks.<br>Run inference to visualise.
|
| 1057 |
</div>
|
|
@@ -1103,8 +1102,7 @@ document.querySelectorAll('.node').forEach(node => {
|
|
| 1103 |
});
|
| 1104 |
document.addEventListener('mousemove', e => {
|
| 1105 |
if (!drag) return;
|
| 1106 |
-
node.style.left=`${il+e.clientX-sx}px`;
|
| 1107 |
-
node.style.top=`${it+e.clientY-sy}px`;
|
| 1108 |
updateWires();
|
| 1109 |
});
|
| 1110 |
document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
|
|
@@ -1134,27 +1132,24 @@ function formatBytes(b) {
|
|
| 1134 |
return (b/1048576).toFixed(1)+' MB';
|
| 1135 |
}
|
| 1136 |
function handleFile(file) {
|
| 1137 |
-
if (!file
|
| 1138 |
-
currentFile
|
| 1139 |
-
imgPreview.src
|
| 1140 |
previewWrap.classList.add('visible');
|
| 1141 |
-
dropZone.style.display
|
| 1142 |
-
chipName.textContent
|
| 1143 |
-
chipSize.textContent
|
| 1144 |
imgChip.classList.add('visible');
|
| 1145 |
dotImg.classList.add('active');
|
| 1146 |
requestAnimationFrame(updateWires);
|
| 1147 |
}
|
| 1148 |
function clearImage() {
|
| 1149 |
-
currentFile
|
| 1150 |
-
imgPreview.src = '';
|
| 1151 |
previewWrap.classList.remove('visible');
|
| 1152 |
-
dropZone.style.display
|
| 1153 |
imgChip.classList.remove('visible');
|
| 1154 |
-
chipName.textContent
|
| 1155 |
-
|
| 1156 |
-
fileInput.value = '';
|
| 1157 |
-
dotImg.classList.remove('active');
|
| 1158 |
requestAnimationFrame(updateWires);
|
| 1159 |
}
|
| 1160 |
dropZone.onclick = () => fileInput.click();
|
|
@@ -1203,7 +1198,7 @@ const MODEL_INFO = {
|
|
| 1203 |
qwen_2b: {
|
| 1204 |
html: `<span class="model-badge q2b">QWEN 3.5 Β· 2B</span><br><br>
|
| 1205 |
Qwen3.5 2B multimodal model by Alibaba Cloud.
|
| 1206 |
-
Lightweight & fast β ideal for quick tasks.`,
|
| 1207 |
bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
|
| 1208 |
},
|
| 1209 |
lfm_450: {
|
|
@@ -1256,9 +1251,7 @@ const PLACEHOLDERS = {
|
|
| 1256 |
Point: 'e.g., The gun held by the person.',
|
| 1257 |
Detect: 'e.g., The headlight of the car.',
|
| 1258 |
};
|
| 1259 |
-
categorySelect.onchange = e => {
|
| 1260 |
-
promptInput.placeholder = PLACEHOLDERS[e.target.value] || '';
|
| 1261 |
-
};
|
| 1262 |
|
| 1263 |
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1264 |
// COPY BUTTON
|
|
@@ -1291,35 +1284,37 @@ copyBtn.onclick = () => {
|
|
| 1291 |
}).catch(() => {
|
| 1292 |
const ta = document.createElement('textarea');
|
| 1293 |
ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0';
|
| 1294 |
-
document.body.appendChild(ta); ta.select();
|
| 1295 |
-
document.
|
| 1296 |
});
|
| 1297 |
};
|
| 1298 |
|
| 1299 |
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1300 |
-
// GROUNDING
|
| 1301 |
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1302 |
-
const
|
| 1303 |
const groundPlaceholder = document.getElementById('groundPlaceholder');
|
| 1304 |
const downloadBtn = document.getElementById('downloadBtn');
|
| 1305 |
const dotGnd = document.getElementById('dot-gnd');
|
| 1306 |
|
| 1307 |
-
function
|
| 1308 |
-
|
| 1309 |
-
|
|
|
|
| 1310 |
groundPlaceholder.style.display = 'none';
|
| 1311 |
-
|
|
|
|
|
|
|
| 1312 |
const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
|
| 1313 |
-
downloadBtn.href =
|
| 1314 |
downloadBtn.download = `grounding_${ts}.png`;
|
| 1315 |
downloadBtn.style.display = 'flex';
|
| 1316 |
-
dotGnd.classList.add('active');
|
| 1317 |
}
|
| 1318 |
|
| 1319 |
-
function
|
| 1320 |
-
|
| 1321 |
-
|
| 1322 |
-
groundPlaceholder.textContent = msg || 'Active for Point / Detect tasks.
|
| 1323 |
groundPlaceholder.style.display = 'flex';
|
| 1324 |
downloadBtn.style.display = 'none';
|
| 1325 |
dotGnd.classList.remove('active');
|
|
@@ -1339,7 +1334,7 @@ runBtn.onclick = async () => {
|
|
| 1339 |
const promptStr = promptInput.value.trim();
|
| 1340 |
if (!promptStr) { alert('Please enter a prompt directive.'); return; }
|
| 1341 |
|
| 1342 |
-
// ββ Reset UI ββββββββββββββββββββββββββββββ
|
| 1343 |
runBtn.disabled = true;
|
| 1344 |
btnLoader.style.display = 'inline-block';
|
| 1345 |
outputBox.innerText = '';
|
|
@@ -1348,23 +1343,21 @@ runBtn.onclick = async () => {
|
|
| 1348 |
dotOut.classList.remove('active');
|
| 1349 |
allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
|
| 1350 |
resetCopyBtn();
|
|
|
|
| 1351 |
|
| 1352 |
-
const
|
| 1353 |
-
|
| 1354 |
-
resetGrounding('Running inferenceβ¦');
|
| 1355 |
-
}
|
| 1356 |
|
| 1357 |
-
// ββ
|
| 1358 |
const formData = new FormData();
|
| 1359 |
formData.append('image', currentFile);
|
| 1360 |
-
formData.append('category',
|
| 1361 |
formData.append('prompt', promptStr);
|
| 1362 |
-
formData.append('model_id',
|
| 1363 |
|
| 1364 |
let fullText = '';
|
| 1365 |
|
| 1366 |
try {
|
| 1367 |
-
// ββ 1. Stream inference βββββββββββββββββββββββββββ
|
| 1368 |
const response = await fetch('/api/run', { method: 'POST', body: formData });
|
| 1369 |
if (!response.ok) {
|
| 1370 |
const err = await response.json();
|
|
@@ -1373,15 +1366,14 @@ runBtn.onclick = async () => {
|
|
| 1373 |
|
| 1374 |
const reader = response.body.getReader();
|
| 1375 |
const decoder = new TextDecoder('utf-8');
|
| 1376 |
-
let
|
| 1377 |
|
| 1378 |
while (true) {
|
| 1379 |
const { value, done } = await reader.read();
|
| 1380 |
if (done) break;
|
| 1381 |
buffer += decoder.decode(value, { stream: true });
|
| 1382 |
-
const lines = buffer.split('\
|
| 1383 |
-
buffer = lines.pop();
|
| 1384 |
-
|
| 1385 |
for (const line of lines) {
|
| 1386 |
if (!line.startsWith('data: ')) continue;
|
| 1387 |
const payload = line.slice(6);
|
|
@@ -1399,42 +1391,42 @@ runBtn.onclick = async () => {
|
|
| 1399 |
|
| 1400 |
dotOut.classList.add('active');
|
| 1401 |
|
| 1402 |
-
// ββ 2
|
| 1403 |
-
if ((
|
| 1404 |
-
|
|
|
|
|
|
|
| 1405 |
try {
|
| 1406 |
-
const
|
| 1407 |
-
|
| 1408 |
-
|
| 1409 |
-
|
| 1410 |
|
| 1411 |
-
const
|
| 1412 |
-
method: 'POST', body:
|
| 1413 |
});
|
| 1414 |
-
|
| 1415 |
|
| 1416 |
-
|
| 1417 |
-
|
|
|
|
|
|
|
|
|
|
| 1418 |
} else {
|
| 1419 |
-
|
| 1420 |
-
annData.error === 'no_json'
|
| 1421 |
-
? 'No grounding coordinates found in model output.'
|
| 1422 |
-
: `Annotation error: ${annData.error || 'unknown'}`
|
| 1423 |
-
);
|
| 1424 |
}
|
| 1425 |
-
} catch (
|
| 1426 |
-
|
|
|
|
| 1427 |
}
|
| 1428 |
-
} else if (
|
| 1429 |
-
|
| 1430 |
}
|
| 1431 |
|
| 1432 |
} catch (err) {
|
| 1433 |
outputBox.innerText = `[Error] ${err.message}`;
|
| 1434 |
outputBox.style.color = '#ff6b6b';
|
| 1435 |
-
|
| 1436 |
-
resetGrounding('Inference error β see Output Stream node.');
|
| 1437 |
-
}
|
| 1438 |
} finally {
|
| 1439 |
runBtn.disabled = false;
|
| 1440 |
btnLoader.style.display = 'none';
|
|
|
|
| 4 |
import ast
|
| 5 |
import re
|
| 6 |
import uuid
|
|
|
|
| 7 |
import threading
|
|
|
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Optional
|
| 10 |
|
| 11 |
import spaces
|
| 12 |
import torch
|
| 13 |
+
import numpy as np
|
| 14 |
from PIL import Image, ImageDraw, ImageFont
|
| 15 |
|
| 16 |
from gradio import Server
|
|
|
|
| 58 |
print("Qwen3-VL-2B model loaded successfully.")
|
| 59 |
except Exception as e:
|
| 60 |
print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
|
| 61 |
+
qwen_vl_2b_model = None
|
| 62 |
+
qwen_vl_2b_processor = None
|
| 63 |
|
| 64 |
# ── Qwen3-VL-4B-Instruct ────────────────────────────────
|
| 65 |
print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
|
|
|
|
| 71 |
print("Qwen3-VL-4B model loaded successfully.")
|
| 72 |
except Exception as e:
|
| 73 |
print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
|
| 74 |
+
qwen_vl_4b_model = None
|
| 75 |
+
qwen_vl_4b_processor = None
|
| 76 |
|
| 77 |
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
|
| 78 |
print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
|
|
|
|
| 84 |
print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
|
| 85 |
except Exception as e:
|
| 86 |
print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
|
| 87 |
+
qwen_4b_unredacted_model = None
|
| 88 |
+
qwen_4b_unredacted_processor = None
|
| 89 |
|
| 90 |
# ── Qwen3.5-4B ──────────────────────────────────────────
|
| 91 |
print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
|
|
|
|
| 97 |
print("Qwen3.5-4B model loaded successfully.")
|
| 98 |
except Exception as e:
|
| 99 |
print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
|
| 100 |
+
qwen_4b_model = None
|
| 101 |
+
qwen_4b_processor = None
|
| 102 |
|
| 103 |
# ── Qwen3.5-2B ──────────────────────────────────────────
|
| 104 |
print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
|
|
|
|
| 110 |
print("Qwen3.5-2B model loaded successfully.")
|
| 111 |
except Exception as e:
|
| 112 |
print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
|
| 113 |
+
qwen_2b_model = None
|
| 114 |
+
qwen_2b_processor = None
|
| 115 |
|
| 116 |
# ── LFM2.5-VL-450M ──────────────────────────────────────
|
| 117 |
print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
|
|
|
|
| 123 |
print("LFM-450M model loaded successfully.")
|
| 124 |
except Exception as e:
|
| 125 |
print(f"Warning: LFM-450M model loading failed. Error: {e}")
|
| 126 |
+
lfm_450_model = None
|
| 127 |
+
lfm_450_processor = None
|
| 128 |
|
| 129 |
# ── Gemma4-E2B-it ───────────────────────────────────────
|
| 130 |
print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
|
|
|
|
| 139 |
print("Gemma4-E2B-it model loaded successfully.")
|
| 140 |
except Exception as e:
|
| 141 |
print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
|
| 142 |
+
gemma4_e2b_model = None
|
| 143 |
+
gemma4_e2b_processor = None
|
| 144 |
|
| 145 |
# ── LFM2.5-VL-1.6B ──────────────────────────────────────
|
| 146 |
print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
|
|
|
|
| 152 |
print("LFM-1.6B model loaded successfully.")
|
| 153 |
except Exception as e:
|
| 154 |
print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
|
| 155 |
+
lfm_16_model = None
|
| 156 |
+
lfm_16_processor = None
|
| 157 |
|
| 158 |
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
|
| 159 |
print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
|
|
|
|
| 165 |
print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
|
| 166 |
except Exception as e:
|
| 167 |
print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
|
| 168 |
+
qwen_unredacted_model = None
|
| 169 |
+
qwen_unredacted_processor = None
|
| 170 |
|
| 171 |
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
|
| 172 |
print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
|
|
|
|
| 178 |
print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
|
| 179 |
except Exception as e:
|
| 180 |
print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
|
| 181 |
+
qwen25_vl_3b_model = None
|
| 182 |
+
qwen25_vl_3b_processor = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
|
| 185 |
+
# ---------------------------------------------------------------------------
|
| 186 |
+
# Utility: safe JSON parser (strips markdown fences, handles ast fallback)
|
| 187 |
+
# ---------------------------------------------------------------------------
|
| 188 |
def safe_parse_json(text: str):
    """Best-effort extraction of a JSON value from raw model output.

    The text is cleaned (``<think>…</think>`` blocks and a wrapping Markdown
    code fence are removed) and then parsed with progressively more lenient
    strategies:

    1. strict ``json.loads`` on the whole text;
    2. ``ast.literal_eval`` on the whole text (accepts Python-style literals
       such as single-quoted strings);
    3. the first JSON array or object embedded anywhere in the text, in
       document order.

    Args:
        text: Raw text produced by the model.

    Returns:
        The decoded value, or ``None`` when nothing parseable is found
        (including when *text* is not a string).
    """
    if not isinstance(text, str):
        return None

    text = text.strip()
    # Strip <think>…</think> reasoning blocks before looking for the payload.
    text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE).strip()
    # Strip a Markdown fence that wraps the whole answer.
    text = re.sub(r"^```(json)?", "", text)
    text = re.sub(r"```$", "", text)
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # Scan for the first embedded array/object. ``raw_decode`` stops at the end
    # of the value it parsed, so surrounding prose and later bracketed text do
    # not break the match (a greedy "first [ to last ]" span would).
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[\[{]", text):
        try:
            value, _end = decoder.raw_decode(text, opener.start())
        except json.JSONDecodeError:
            continue
        return value
    return None
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# ---------------------------------------------------------------------------
|
| 215 |
+
# Server-side annotation (mirrors reference annotate_image exactly)
|
| 216 |
+
# ---------------------------------------------------------------------------
|
| 217 |
+
# RGB triples used for boxes and point markers, picked by item index modulo
# the palette length. In order: teal, purple, red, yellow, green, orange,
# magenta, blue.
PALETTE_COLORS = [
    (78, 205, 196), (124, 106, 247), (255, 107, 107), (255, 217, 61),
    (107, 203, 119), (255, 146, 43), (204, 93, 232), (51, 154, 240),
]
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _get_font(size: int = 14):
    """Return the first TrueType font that loads at *size*, else Pillow's default.

    Candidates are tried in order: bare file names first (resolved by Pillow's
    own font lookup), then two absolute Linux font paths.
    """
    candidates = (
        "arial.ttf",
        "Arial.ttf",
        "DejaVuSans.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
    )
    for candidate in candidates:
        try:
            loaded = ImageFont.truetype(candidate, size)
        except (IOError, OSError):
            # Font not available here; move on to the next candidate.
            continue
        return loaded
    return ImageFont.load_default()
|
| 239 |
|
| 240 |
|
| 241 |
+
def annotate_detections(image: Image.Image, objects: list) -> Image.Image:
    """Draw translucent bounding boxes with label pills on a copy of *image*.

    Args:
        image: Source picture. It is converted to RGB and copied, so the
            caller's object is never modified.
        objects: Dicts with ``x_min``, ``y_min``, ``x_max``, ``y_max`` given as
            fractions (0-1) of the image size, plus an optional ``label``
            (defaults to ``"object"``).

    Returns:
        The annotated RGB copy. Entries that are malformed, non-finite, or
        empty after clamping to the image are skipped instead of raising.
    """
    image = image.convert("RGB").copy()
    W, H = image.size
    # "RGBA" draw mode blends semi-transparent fills into the RGB canvas.
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, W // 40))
    lw = max(2, W // 200)   # border thickness in pixels
    cw = max(2, lw + 1)     # corner-accent thickness
    pad = 5                 # label pill padding

    for i, obj in enumerate(objects):
        col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
        col_rgba_fill = col + (46,)  # ~18% opacity fill
        col_rgba_solid = col + (255,)

        try:
            x1 = int(obj["x_min"] * W)
            y1 = int(obj["y_min"] * H)
            x2 = int(obj["x_max"] * W)
            y2 = int(obj["y_max"] * H)
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing key, non-numeric value, NaN or infinity: skip this box
            # rather than failing the whole annotation.
            continue

        # Clamp to the last valid pixel (W-1 / H-1) so the right and bottom
        # borders stay on the canvas for boxes that touch the image edge.
        x1, x2 = max(0, x1), min(W - 1, x2)
        y1, y2 = max(0, y1), min(H - 1, y2)
        if x2 <= x1 or y2 <= y1:
            continue

        # Filled rectangle
        draw.rectangle([x1, y1, x2, y2], fill=col_rgba_fill)

        # Border: concentric 1px outlines. Stop once the inset rectangle would
        # invert, which happens on boxes thinner than twice the border width.
        for t in range(lw):
            if x1 + t > x2 - t or y1 + t > y2 - t:
                break
            draw.rectangle([x1 + t, y1 + t, x2 - t, y2 - t], outline=col_rgba_solid)

        # Corner accents
        ca = min(18, (x2 - x1) // 4, (y2 - y1) // 4)
        for (cx, cy, dx, dy) in [(x1, y1, 1, 1), (x2, y1, -1, 1), (x2, y2, -1, -1), (x1, y2, 1, -1)]:
            draw.line([cx, cy, cx + dx * ca, cy], fill=col_rgba_solid, width=cw)
            draw.line([cx, cy, cx, cy + dy * ca], fill=col_rgba_solid, width=cw)

        # Label pill. Model output may carry non-string labels, so coerce.
        raw_label = obj.get("label", "object")
        label = "object" if raw_label is None else str(raw_label)
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2] - bb[0], bb[3] - bb[1]
            ox, oy = bb[0], bb[1]
        except Exception:
            # Deliberate best-effort: fonts without getbbox still get a pill,
            # sized with a rough per-character estimate.
            tw, th = len(label) * 7, 12
            ox, oy = 0, 0
        pw, ph = tw + pad * 2, th + pad * 2
        lx = max(0, min(x1, W - pw))
        # Place the pill above the box when it fits, otherwise just inside it.
        ly = y1 - ph if y1 - ph >= 0 else y1 + 2
        draw.rounded_rectangle([lx, ly, lx + pw, ly + ph], radius=4, fill=col_rgba_solid)
        # Subtract the glyph bbox origin so the text sits centred in the pill.
        draw.text((lx + pad - ox, ly + pad - oy), label, fill=(255, 255, 255, 255), font=font_lbl)

    return image
|
| 295 |
|
| 296 |
|
| 297 |
+
def annotate_points(image: Image.Image, points: list) -> Image.Image:
    """Draw glowing point markers with optional label pills on a copy of *image*.

    Args:
        image: Source picture. It is converted to RGB and copied, so the
            caller's object is never modified.
        points: Dicts with ``x`` and ``y`` given as fractions (0-1) of the
            image size, plus an optional ``label`` (no pill when empty).

    Returns:
        The annotated RGB copy. Entries that are malformed or non-finite are
        skipped instead of raising.
    """
    image = image.convert("RGB").copy()
    W, H = image.size
    # "RGBA" draw mode blends semi-transparent fills into the RGB canvas.
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, W // 40))
    r = max(7, W // 55)     # core dot radius
    mid = int(r * 1.4)      # mid ring radius
    cr = max(2, r // 3)     # centre dot radius, also the core outline width
    pad = 5                 # label pill padding

    for i, pt in enumerate(points):
        col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
        col_rgba = col + (255,)
        glow_rgba = col + (40,)
        mid_rgba = col + (64,)

        try:
            cx = int(pt["x"] * W)
            cy = int(pt["y"] * H)
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing key, non-numeric value, NaN or infinity: skip this point
            # rather than failing the whole annotation.
            continue
        # Keep the core dot fully inside the image.
        cx = max(r, min(W - r, cx))
        cy = max(r, min(H - r, cy))

        # Outer glow
        draw.ellipse([cx - r * 2, cy - r * 2, cx + r * 2, cy + r * 2], fill=glow_rgba)
        # Mid ring
        draw.ellipse([cx - mid, cy - mid, cx + mid, cy + mid], fill=mid_rgba)
        # Core dot
        draw.ellipse([cx - r, cy - r, cx + r, cy + r], fill=col_rgba, outline=(255, 255, 255, 255), width=cr)
        # Centre white dot
        draw.ellipse([cx - cr, cy - cr, cx + cr, cy + cr], fill=(255, 255, 255, 255))

        # Label. Model output may carry non-string labels, so coerce.
        raw_label = pt.get("label", "")
        label = "" if raw_label is None else str(raw_label)
        if label:
            try:
                bb = font_lbl.getbbox(label)
                tw, th = bb[2] - bb[0], bb[3] - bb[1]
                ox, oy = bb[0], bb[1]
            except Exception:
                # Deliberate best-effort: fonts without getbbox still get a
                # pill, sized with a rough per-character estimate.
                tw, th = len(label) * 7, 12
                ox, oy = 0, 0
            pw, ph = tw + pad * 2, th + pad * 2
            # Prefer the right of the marker; clamp so the pill never starts
            # off-canvas or runs past the bottom edge.
            lx = max(0, min(cx + r + 6, W - pw))
            ly = max(0, min(cy - ph // 2, H - ph))
            draw.rounded_rectangle([lx, ly, lx + pw, ly + ph], radius=4, fill=col_rgba)
            # Subtract the glyph bbox origin so the text sits centred in the pill.
            draw.text((lx + pad - ox, ly + pad - oy), label, fill=(255, 255, 255, 255), font=font_lbl)

    return image
|
| 345 |
|
| 346 |
|
| 347 |
+
def parse_and_annotate(image: Image.Image, full_text: str, category: str):
    """Parse grounding output from the model and draw it on the image.

    Args:
        image: Picture the model was run on.
        full_text: Complete model output, expected to contain JSON items with
            ``point_2d`` (``[x, y]``) or ``bbox_2d`` (``[x1, y1, x2, y2]``).
        category: ``"Point"`` or ``"Detect"``. Any other value produces no
            annotation.

    Returns:
        ``(annotated_image, result)``. ``result`` is ``{"points": [...]}`` or
        ``{"objects": [...]}`` with coordinates as 0-1 fractions,
        ``{"error": ..., "raw": ...}`` when no JSON could be parsed, or ``{}``
        for an unsupported category. In the last two cases the input image is
        returned unchanged.

    Items whose coordinates are missing, of the wrong length, non-numeric or
    non-finite are skipped so one bad entry does not fail the whole request.
    """
    parsed = safe_parse_json(full_text)
    if parsed is None:
        return image, {"error": "No JSON found in model output", "raw": full_text[:500]}

    # A single object is treated as a one-item list. Tuples can come out of
    # the literal-eval fallback of the parser.
    items = parsed if isinstance(parsed, (list, tuple)) else [parsed]

    # Coordinates are divided by 1000.0, i.e. the model is assumed to emit a
    # 0-1000 normalised scale.
    # NOTE(review): some served models may emit absolute pixel coordinates
    # instead; confirm per model, since this function cannot tell them apart.
    if category == "Point":
        points = []
        for item in items:
            if not isinstance(item, dict):
                continue
            coords = _coerce_coords(item.get("point_2d"), 2)
            if coords is None:
                continue
            x, y = coords
            points.append({
                "label": _label_text(item, ""),
                "x": x / 1000.0,
                "y": y / 1000.0,
            })
        # annotate_points draws on its own copy, so no extra copy is needed.
        return annotate_points(image, points), {"points": points}

    if category == "Detect":
        objects = []
        for item in items:
            if not isinstance(item, dict):
                continue
            coords = _coerce_coords(item.get("bbox_2d"), 4)
            if coords is None:
                continue
            xmin, ymin, xmax, ymax = coords
            objects.append({
                "label": _label_text(item, "object"),
                "x_min": xmin / 1000.0,
                "y_min": ymin / 1000.0,
                "x_max": xmax / 1000.0,
                "y_max": ymax / 1000.0,
            })
        # annotate_detections draws on its own copy, so no extra copy is needed.
        return annotate_detections(image, objects), {"objects": objects}

    return image, {}


def _coerce_coords(value, expected_len: int):
    """Return *value* as a list of ``expected_len`` finite floats, or None if malformed."""
    import math

    if not isinstance(value, (list, tuple)) or len(value) != expected_len:
        return None
    try:
        numbers = [float(v) for v in value]
    except (TypeError, ValueError):
        return None
    if not all(math.isfinite(n) for n in numbers):
        return None
    return numbers


def _label_text(item: dict, default: str) -> str:
    """Return the item's ``label`` as a string, or *default* when absent or null."""
    raw = item.get("label", default)
    return default if raw is None else str(raw)
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
def pil_to_png_bytes(image: Image.Image) -> bytes:
    """Encode *image* as PNG in memory and return the resulting bytes."""
    with io.BytesIO() as sink:
        image.save(sink, format="PNG")
        return sink.getvalue()
|
| 398 |
|
| 399 |
|
| 400 |
+
# ---------------------------------------------------------------------------
|
| 401 |
+
# Inference Generator (Streaming)
|
| 402 |
+
# ---------------------------------------------------------------------------
|
| 403 |
@spaces.GPU(duration=120)
|
| 404 |
def generate_inference_stream(
|
| 405 |
image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
|
|
|
|
| 421 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
|
| 422 |
yield "data: [DONE]\n\n"; return
|
| 423 |
messages = [{"role": "user", "content": [
|
| 424 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 425 |
+
]}]
|
| 426 |
+
text_input = qwen_vl_2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 427 |
+
inputs = qwen_vl_2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_vl_2b_model.device)
|
| 428 |
+
streamer = TextIteratorStreamer(qwen_vl_2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 429 |
+
thread = threading.Thread(target=qwen_vl_2b_model.generate,
|
| 430 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
|
| 431 |
+
thread.start()
|
|
|
|
|
|
|
|
|
|
| 432 |
for tok in streamer:
|
| 433 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 434 |
+
thread.join()
|
| 435 |
|
| 436 |
# ── Qwen3-VL-4B ─────────────────────────────────────
|
| 437 |
elif model_id == "qwen_vl_4b":
|
|
|
|
| 439 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
|
| 440 |
yield "data: [DONE]\n\n"; return
|
| 441 |
messages = [{"role": "user", "content": [
|
| 442 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 443 |
+
]}]
|
| 444 |
+
text_input = qwen_vl_4b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 445 |
+
inputs = qwen_vl_4b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_vl_4b_model.device)
|
| 446 |
+
streamer = TextIteratorStreamer(qwen_vl_4b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 447 |
+
thread = threading.Thread(target=qwen_vl_4b_model.generate,
|
| 448 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
|
| 449 |
+
thread.start()
|
|
|
|
|
|
|
|
|
|
| 450 |
for tok in streamer:
|
| 451 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 452 |
+
thread.join()
|
| 453 |
|
| 454 |
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
|
| 455 |
elif model_id == "qwen_4b_unredacted":
|
|
|
|
| 457 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
|
| 458 |
yield "data: [DONE]\n\n"; return
|
| 459 |
messages = [{"role": "user", "content": [
|
| 460 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 461 |
+
]}]
|
| 462 |
+
text_input = qwen_4b_unredacted_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 463 |
+
inputs = qwen_4b_unredacted_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_4b_unredacted_model.device)
|
| 464 |
+
streamer = TextIteratorStreamer(qwen_4b_unredacted_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 465 |
+
thread = threading.Thread(target=qwen_4b_unredacted_model.generate,
|
| 466 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
|
| 467 |
+
thread.start()
|
|
|
|
|
|
|
|
|
|
| 468 |
for tok in streamer:
|
| 469 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 470 |
+
thread.join()
|
| 471 |
|
| 472 |
# ── Qwen3.5-4B ──────────────────────────────────────
|
| 473 |
elif model_id == "qwen_4b":
|
|
|
|
| 475 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
|
| 476 |
yield "data: [DONE]\n\n"; return
|
| 477 |
messages = [{"role": "user", "content": [
|
| 478 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 479 |
+
]}]
|
| 480 |
+
text_input = qwen_4b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 481 |
+
inputs = qwen_4b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_4b_model.device)
|
| 482 |
+
streamer = TextIteratorStreamer(qwen_4b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 483 |
+
thread = threading.Thread(target=qwen_4b_model.generate,
|
| 484 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
|
| 485 |
+
thread.start()
|
|
|
|
|
|
|
|
|
|
| 486 |
for tok in streamer:
|
| 487 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 488 |
+
thread.join()
|
| 489 |
|
| 490 |
# ── Qwen3.5-2B ──────────────────────────────────────
|
| 491 |
elif model_id == "qwen_2b":
|
|
|
|
| 493 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
|
| 494 |
yield "data: [DONE]\n\n"; return
|
| 495 |
messages = [{"role": "user", "content": [
|
| 496 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 497 |
+
]}]
|
| 498 |
+
text_input = qwen_2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 499 |
+
inputs = qwen_2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_2b_model.device)
|
| 500 |
+
streamer = TextIteratorStreamer(qwen_2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 501 |
+
thread = threading.Thread(target=qwen_2b_model.generate,
|
| 502 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
|
| 503 |
+
thread.start()
|
|
|
|
|
|
|
|
|
|
| 504 |
for tok in streamer:
|
| 505 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 506 |
+
thread.join()
|
| 507 |
|
| 508 |
# ── LFM-450M ────────────────────────────────────────
|
| 509 |
elif model_id == "lfm_450":
|
|
|
|
| 511 |
yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
|
| 512 |
yield "data: [DONE]\n\n"; return
|
| 513 |
conversation = [{"role": "user", "content": [
|
| 514 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 515 |
+
]}]
|
| 516 |
inputs = lfm_450_processor.apply_chat_template(
|
| 517 |
conversation, add_generation_prompt=True,
|
| 518 |
return_tensors="pt", return_dict=True, tokenize=True,
|
| 519 |
).to(lfm_450_model.device)
|
| 520 |
+
streamer = TextIteratorStreamer(lfm_450_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 521 |
+
thread = threading.Thread(target=lfm_450_model.generate,
|
| 522 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True))
|
| 523 |
+
thread.start()
|
|
|
|
| 524 |
for tok in streamer:
|
| 525 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 526 |
+
thread.join()
|
| 527 |
|
| 528 |
# ── Gemma4-E2B-it ───────────────────────────────────
|
| 529 |
elif model_id == "gemma4_e2b":
|
|
|
|
| 531 |
yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
|
| 532 |
yield "data: [DONE]\n\n"; return
|
| 533 |
messages = [{"role": "user", "content": [
|
| 534 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 535 |
+
]}]
|
| 536 |
+
text_input = gemma4_e2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 537 |
+
inputs = gemma4_e2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(gemma4_e2b_model.device)
|
| 538 |
+
streamer = TextIteratorStreamer(gemma4_e2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 539 |
+
thread = threading.Thread(target=gemma4_e2b_model.generate,
|
| 540 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
|
| 541 |
+
thread.start()
|
|
|
|
|
|
|
|
|
|
| 542 |
for tok in streamer:
|
| 543 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 544 |
+
thread.join()
|
| 545 |
|
| 546 |
# ── LFM-1.6B ────────────────────────────────────────
|
| 547 |
elif model_id == "lfm_16":
|
|
|
|
| 549 |
yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
|
| 550 |
yield "data: [DONE]\n\n"; return
|
| 551 |
conversation = [{"role": "user", "content": [
|
| 552 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 553 |
+
]}]
|
| 554 |
inputs = lfm_16_processor.apply_chat_template(
|
| 555 |
conversation, add_generation_prompt=True,
|
| 556 |
return_tensors="pt", return_dict=True, tokenize=True,
|
| 557 |
).to(lfm_16_model.device)
|
| 558 |
+
streamer = TextIteratorStreamer(lfm_16_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 559 |
+
thread = threading.Thread(target=lfm_16_model.generate,
|
| 560 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True))
|
| 561 |
+
thread.start()
|
|
|
|
| 562 |
for tok in streamer:
|
| 563 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 564 |
+
thread.join()
|
| 565 |
|
| 566 |
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
|
| 567 |
elif model_id == "qwen_unredacted":
|
|
|
|
| 569 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
|
| 570 |
yield "data: [DONE]\n\n"; return
|
| 571 |
messages = [{"role": "user", "content": [
|
| 572 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 573 |
+
]}]
|
| 574 |
+
text_input = qwen_unredacted_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 575 |
+
inputs = qwen_unredacted_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_unredacted_model.device)
|
| 576 |
+
streamer = TextIteratorStreamer(qwen_unredacted_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 577 |
+
thread = threading.Thread(target=qwen_unredacted_model.generate,
|
| 578 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.5, min_p=0.1))
|
| 579 |
+
thread.start()
|
|
|
|
|
|
|
|
|
|
| 580 |
for tok in streamer:
|
| 581 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 582 |
+
thread.join()
|
| 583 |
|
| 584 |
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────
|
| 585 |
elif model_id == "qwen25_vl_3b":
|
|
|
|
| 587 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
|
| 588 |
yield "data: [DONE]\n\n"; return
|
| 589 |
messages = [{"role": "user", "content": [
|
| 590 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
|
| 591 |
+
]}]
|
| 592 |
+
text_input = qwen25_vl_3b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 593 |
image_inputs, video_inputs = process_vision_info(messages)
|
| 594 |
inputs = qwen25_vl_3b_processor(
|
| 595 |
text=[text_input], images=image_inputs, videos=video_inputs,
|
| 596 |
return_tensors="pt", padding=True,
|
| 597 |
).to(qwen25_vl_3b_model.device)
|
| 598 |
+
streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 599 |
+
thread = threading.Thread(target=qwen25_vl_3b_model.generate,
|
| 600 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
|
| 601 |
+
thread.start()
|
|
|
|
| 602 |
for tok in streamer:
|
| 603 |
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 604 |
+
thread.join()
|
| 605 |
|
| 606 |
yield "data: [DONE]\n\n"
|
| 607 |
|
| 608 |
|
| 609 |
+
# ---------------------------------------------------------------------------
|
| 610 |
+
# New endpoint: /api/annotate — receives image + model output text + category
|
| 611 |
+
# Returns annotated PNG + structured JSON
|
| 612 |
+
# ---------------------------------------------------------------------------
|
| 613 |
+
@app.post("/api/annotate")
async def annotate_endpoint(
    image: UploadFile = File(...),
    text: str = Form(...),
    category: str = Form(...),
):
    """Annotate an uploaded image from model output and return it as JSON.

    Form fields:
        image: The picture the model was run on.
        text: The complete model output to parse for coordinates.
        category: Grounding mode, forwarded to ``parse_and_annotate``.

    Returns:
        A JSON response with ``image_b64`` (base64-encoded PNG of the annotated
        image) and ``result`` (the structured parse result). Any exception is
        reported as ``{"error": <message>}`` with HTTP status 500.
    """
    # Function-scope import: the module-level imports visible in this file do
    # not include base64, and a plain import is clearer than __import__().
    import base64

    try:
        img_bytes = await image.read()
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        annotated_img, result_dict = parse_and_annotate(img, text, category)
        png_bytes = pil_to_png_bytes(annotated_img)
        return JSONResponse({
            "image_b64": base64.b64encode(png_bytes).decode(),
            "result": result_dict,
        })
    except Exception as e:
        # Request boundary: surface the failure to the client instead of
        # letting the framework produce an opaque error page.
        return JSONResponse({"error": str(e)}, status_code=500)
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
# ---------------------------------------------------------------------------
|
| 633 |
+
# Main inference endpoint
|
| 634 |
+
# ---------------------------------------------------------------------------
|
| 635 |
@app.post("/api/run")
|
| 636 |
async def run_inference(
|
| 637 |
image: UploadFile = File(...),
|
|
|
|
| 651 |
return JSONResponse({"error": str(e)}, status_code=500)
|
| 652 |
|
| 653 |
|
| 654 |
+
# ---------------------------------------------------------------------------
|
| 655 |
+
# Frontend
|
| 656 |
+
# ---------------------------------------------------------------------------
|
| 657 |
@app.get("/", response_class=HTMLResponse)
|
| 658 |
async def homepage(request: Request):
|
| 659 |
+
return r"""
|
| 660 |
<!DOCTYPE html>
|
| 661 |
<html lang="en">
|
| 662 |
<head>
|
|
|
|
| 697 |
/* ββ Top Bar ββ */
|
| 698 |
.top-bar {
|
| 699 |
position: sticky; top: 0; left: 0; right: 0; height: 42px;
|
| 700 |
+
background: rgba(13,13,15,0.95); border-bottom: 1px solid var(--node-border);
|
|
|
|
| 701 |
display: flex; align-items: center; padding: 0 20px;
|
| 702 |
gap: 12px; z-index: 1000; backdrop-filter: blur(12px);
|
| 703 |
}
|
| 704 |
+
.top-bar .logo { font-size: 13px; font-weight: 700; color: var(--accent); letter-spacing: 0.05em; }
|
| 705 |
+
.top-bar .sep { color: var(--node-border); }
|
| 706 |
+
.top-bar .sub { font-size: 11px; color: var(--muted); }
|
| 707 |
.top-bar .badge {
|
| 708 |
+
margin-left: auto; background: rgba(124,106,247,0.15);
|
| 709 |
+
border: 1px solid rgba(124,106,247,0.3); padding: 3px 10px;
|
| 710 |
+
border-radius: 20px; font-size: 10px; color: var(--accent);
|
| 711 |
}
|
| 712 |
/* ββ Canvas ββ */
|
| 713 |
#canvas {
|
|
|
|
| 796 |
border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
|
| 797 |
}
|
| 798 |
.img-chip.visible { display: flex; }
|
| 799 |
+
.img-chip .chip-dot { width: 5px; height: 5px; border-radius: 50%; background: var(--accent2); flex-shrink: 0; box-shadow: 0 0 4px var(--accent2); }
|
| 800 |
+
.img-chip .chip-name { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; flex: 1; color: var(--text); font-size: 9px; }
|
| 801 |
+
.img-chip .chip-size { color: var(--muted); flex-shrink: 0; font-size: 9px; }
|
| 802 |
select, textarea {
|
| 803 |
width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
|
| 804 |
color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
|
|
|
|
| 825 |
.icon-btn {
|
| 826 |
display: flex; align-items: center; gap: 5px;
|
| 827 |
background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
|
| 828 |
+
border-radius: 5px; padding: 3px 8px; font-size: 9px; font-weight: 700;
|
| 829 |
+
font-family: 'JetBrains Mono', monospace; color: var(--accent); cursor: pointer;
|
| 830 |
+
letter-spacing: 0.05em; transition: background 0.18s, border-color 0.18s, transform 0.1s;
|
| 831 |
+
flex-shrink: 0; text-decoration: none;
|
|
|
|
| 832 |
}
|
| 833 |
.icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
|
| 834 |
.icon-btn:active { transform: scale(0.95); }
|
| 835 |
+
.icon-btn.teal { background: rgba(78,205,196,0.10); border-color: rgba(78,205,196,0.25); color: var(--accent2); }
|
| 836 |
+
.icon-btn.teal:hover { background: rgba(78,205,196,0.22); border-color: var(--accent2); }
|
| 837 |
+
.icon-btn.copied { background: rgba(78,205,196,0.15); border-color: var(--accent2); color: var(--accent2); }
|
| 838 |
+
.icon-btn svg { pointer-events: none; flex-shrink: 0; }
|
| 839 |
.output-box {
|
| 840 |
background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
|
| 841 |
border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
|
|
|
|
| 850 |
border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
|
| 851 |
display: flex; align-items: center; justify-content: center;
|
| 852 |
}
|
| 853 |
+
/* annotated image displayed via <img> tag β no canvas needed */
|
| 854 |
+
.ground-img-wrap img.overlay-img {
|
| 855 |
+
max-width: 100%; max-height: 100%;
|
| 856 |
+
object-fit: contain; display: block;
|
| 857 |
}
|
| 858 |
.ground-placeholder {
|
| 859 |
position: absolute; inset: 0; display: flex; align-items: center;
|
|
|
|
| 866 |
animation: spin 0.7s linear infinite; display: none;
|
| 867 |
}
|
| 868 |
@keyframes spin { to { transform: rotate(360deg); } }
|
| 869 |
+
.status-dot { width: 6px; height: 6px; border-radius: 50%; background: var(--muted); display: inline-block; margin-right: 6px; }
|
| 870 |
+
.status-dot.active { background: var(--accent2); box-shadow: 0 0 5px var(--accent2); }
|
| 871 |
/* ── Model badges ── */
|
| 872 |
.model-badge {
|
| 873 |
+
display: inline-block; padding: 2px 7px; border-radius: 4px;
|
| 874 |
+
font-size: 9px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase;
|
| 875 |
}
|
| 876 |
+
.model-badge.qvl2b { background: rgba(255,150,50,0.15); color: #ff9632; border: 1px solid rgba(255,150,50,0.35); }
|
| 877 |
+
.model-badge.qvl4b { background: rgba(255,100,80,0.15); color: #ff6450; border: 1px solid rgba(255,100,80,0.35); }
|
| 878 |
+
.model-badge.q4bunred { background: rgba(255,80,80,0.18); color: #ff5050; border: 1px solid rgba(255,80,80,0.40); }
|
| 879 |
+
.model-badge.q4b { background: rgba(255,200,80,0.15); color: #ffc850; border: 1px solid rgba(255,200,80,0.35); }
|
| 880 |
+
.model-badge.q2b { background: rgba(124,106,247,0.2); color: var(--accent); border: 1px solid rgba(124,106,247,0.3); }
|
| 881 |
+
.model-badge.lfm450 { background: rgba(78,205,196,0.15); color: var(--accent2); border: 1px solid rgba(78,205,196,0.3); }
|
| 882 |
+
.model-badge.g4e2b { background: rgba(66,197,107,0.15); color: #42c56b; border: 1px solid rgba(66,197,107,0.35); }
|
| 883 |
+
.model-badge.lfm16 { background: rgba(107,203,119,0.15); color: #6bcb77; border: 1px solid rgba(107,203,119,0.35); }
|
| 884 |
+
.model-badge.qunred { background: rgba(255,80,160,0.15); color: #ff50a0; border: 1px solid rgba(255,80,160,0.35); }
|
| 885 |
+
.model-badge.q25vl3b { background: rgba(80,180,255,0.15); color: #50b4ff; border: 1px solid rgba(80,180,255,0.35); }
|
| 886 |
+
.model-info-box { border-radius: 6px; padding: 9px; font-size: 10px; color: var(--muted); line-height: 1.55; flex-shrink: 0; }
|
| 887 |
.canvas-footer { height: 36px; }
|
| 888 |
</style>
|
| 889 |
</head>
|
|
|
|
| 1049 |
SAVE
|
| 1050 |
</a>
|
| 1051 |
</div>
|
| 1052 |
+
<div class="ground-img-wrap" id="groundWrap">
|
| 1053 |
+
<img class="overlay-img" id="overlayImg" src="" style="display:none;" />
|
|
|
|
| 1054 |
<div class="ground-placeholder" id="groundPlaceholder">
|
| 1055 |
Active for Point / Detect tasks.<br>Run inference to visualise.
|
| 1056 |
</div>
|
|
|
|
| 1102 |
});
|
| 1103 |
document.addEventListener('mousemove', e => {
|
| 1104 |
if (!drag) return;
|
| 1105 |
+
node.style.left=`${il+e.clientX-sx}px`; node.style.top=`${it+e.clientY-sy}px`;
|
|
|
|
| 1106 |
updateWires();
|
| 1107 |
});
|
| 1108 |
document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
|
|
|
|
| 1132 |
return (b/1048576).toFixed(1)+' MB';
|
| 1133 |
}
|
| 1134 |
/**
 * Accept a user-supplied image file and show it in the image input node.
 *
 * Falsy values and files whose MIME type does not start with "image/" are
 * ignored. On success the file is remembered in `currentFile`, the preview
 * and the file chip (name + formatted size) are shown, the drop zone is
 * hidden, the node's status dot is lit, and the wires are redrawn on the
 * next animation frame.
 *
 * @param {File} file - File picked or dropped by the user.
 */
function handleFile(file) {
  if (!file || !file.type.startsWith('image/')) return;

  // Each createObjectURL() pins the underlying blob until it is revoked, so
  // release the URL of the preview being replaced before creating a new one.
  const previousSrc = imgPreview.src;
  if (previousSrc && previousSrc.startsWith('blob:')) {
    URL.revokeObjectURL(previousSrc);
  }

  currentFile = file;
  imgPreview.src = URL.createObjectURL(file);
  previewWrap.classList.add('visible');
  dropZone.style.display = 'none';
  chipName.textContent = file.name;
  chipSize.textContent = formatBytes(file.size);
  imgChip.classList.add('visible');
  dotImg.classList.add('active');

  // Layout changed (drop zone hidden, preview shown): redraw wires after paint.
  requestAnimationFrame(updateWires);
}
|
| 1146 |
/**
 * Remove the currently loaded image and restore the empty upload state.
 *
 * Forgets `currentFile`, clears the preview, shows the drop zone again,
 * hides and resets the file chip, empties the file input (so picking the
 * same file again still fires a change event), turns the status dot off
 * and redraws the wires on the next animation frame.
 */
function clearImage() {
  // The preview is backed by a blob URL from URL.createObjectURL(); revoke it
  // so the browser can free the image data instead of holding it until unload.
  const previousSrc = imgPreview.src;
  if (previousSrc && previousSrc.startsWith('blob:')) {
    URL.revokeObjectURL(previousSrc);
  }

  currentFile = null;
  imgPreview.src = '';
  previewWrap.classList.remove('visible');
  dropZone.style.display = '';
  imgChip.classList.remove('visible');
  chipName.textContent = '—';  // em-dash placeholder for "no file"
  chipSize.textContent = '';
  fileInput.value = '';
  dotImg.classList.remove('active');

  // Layout changed (preview hidden, drop zone shown): redraw wires after paint.
  requestAnimationFrame(updateWires);
}
|
| 1155 |
dropZone.onclick = () => fileInput.click();
|
|
|
|
| 1198 |
qwen_2b: {
|
| 1199 |
html: `<span class="model-badge q2b">QWEN 3.5 Β· 2B</span><br><br>
|
| 1200 |
Qwen3.5 2B multimodal model by Alibaba Cloud.
|
| 1201 |
+
Lightweight & fast β ideal for quick Query, Caption, Point & Detect tasks.`,
|
| 1202 |
bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
|
| 1203 |
},
|
| 1204 |
lfm_450: {
|
|
|
|
| 1251 |
Point: 'e.g., The gun held by the person.',
|
| 1252 |
Detect: 'e.g., The headlight of the car.',
|
| 1253 |
};
|
| 1254 |
+
// Keep the prompt's placeholder hint in sync with the selected task category;
// categories without an entry in PLACEHOLDERS get an empty placeholder.
categorySelect.onchange = (event) => {
  const hint = PLACEHOLDERS[event.target.value];
  promptInput.placeholder = hint || '';
};
|
|
|
|
|
|
|
| 1255 |
|
| 1256 |
// ──────────────────────────────────────────────
|
| 1257 |
// COPY BUTTON
|
|
|
|
| 1284 |
}).catch(() => {
|
| 1285 |
const ta = document.createElement('textarea');
|
| 1286 |
ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0';
|
| 1287 |
+
document.body.appendChild(ta); ta.select(); document.execCommand('copy');
|
| 1288 |
+
document.body.removeChild(ta);
|
| 1289 |
});
|
| 1290 |
};
|
| 1291 |
|
| 1292 |
// ──────────────────────────────────────────────
|
| 1293 |
+
// GROUNDING DISPLAY (server-side annotated image)
|
| 1294 |
// ──────────────────────────────────────────────
|
| 1295 |
+
const overlayImg = document.getElementById('overlayImg');
|
| 1296 |
const groundPlaceholder = document.getElementById('groundPlaceholder');
|
| 1297 |
const downloadBtn = document.getElementById('downloadBtn');
|
| 1298 |
const dotGnd = document.getElementById('dot-gnd');
|
| 1299 |
|
| 1300 |
+
/**
 * Display the server-annotated image in the grounding node and arm the
 * download button with the same picture.
 *
 * @param {string} b64png - Base64-encoded PNG payload (no data-URL prefix).
 */
function showOverlay(b64png) {
  const dataUrl = `data:image/png;base64,${b64png}`;

  // Swap the placeholder for the annotated image and light the status dot.
  groundPlaceholder.style.display = 'none';
  overlayImg.src = dataUrl;
  overlayImg.style.display = 'block';
  dotGnd.classList.add('active');

  // Filename-safe timestamp: first 19 chars of the ISO string
  // (YYYY-MM-DDTHH:MM:SS) with ':' / '.' swapped for '-'.
  const stamp = new Date().toISOString().slice(0, 19).replace(/[:.]/g, '-');

  // Point the download link at the same data URL and reveal it.
  downloadBtn.href = dataUrl;
  downloadBtn.download = `grounding_${stamp}.png`;
  downloadBtn.style.display = 'flex';
}
|
| 1313 |
|
| 1314 |
+
function resetOverlay(msg) {
|
| 1315 |
+
overlayImg.src = '';
|
| 1316 |
+
overlayImg.style.display = 'none';
|
| 1317 |
+
groundPlaceholder.textContent = msg || 'Active for Point / Detect tasks.\nRun inference to visualise.';
|
| 1318 |
groundPlaceholder.style.display = 'flex';
|
| 1319 |
downloadBtn.style.display = 'none';
|
| 1320 |
dotGnd.classList.remove('active');
|
|
|
|
| 1334 |
const promptStr = promptInput.value.trim();
|
| 1335 |
if (!promptStr) { alert('Please enter a prompt directive.'); return; }
|
| 1336 |
|
| 1337 |
+
// ── Reset UI ──────────────────────────────
|
| 1338 |
runBtn.disabled = true;
|
| 1339 |
btnLoader.style.display = 'inline-block';
|
| 1340 |
outputBox.innerText = '';
|
|
|
|
| 1343 |
dotOut.classList.remove('active');
|
| 1344 |
allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
|
| 1345 |
resetCopyBtn();
|
| 1346 |
+
resetOverlay('Running inferenceβ¦');
|
| 1347 |
|
| 1348 |
+
const category = categorySelect.value;
|
| 1349 |
+
const modelId = modelSelect.value;
|
|
|
|
|
|
|
| 1350 |
|
| 1351 |
+
// ── Step 1: stream text from /api/run ─────
|
| 1352 |
const formData = new FormData();
|
| 1353 |
formData.append('image', currentFile);
|
| 1354 |
+
formData.append('category', category);
|
| 1355 |
formData.append('prompt', promptStr);
|
| 1356 |
+
formData.append('model_id', modelId);
|
| 1357 |
|
| 1358 |
let fullText = '';
|
| 1359 |
|
| 1360 |
try {
|
|
|
|
| 1361 |
const response = await fetch('/api/run', { method: 'POST', body: formData });
|
| 1362 |
if (!response.ok) {
|
| 1363 |
const err = await response.json();
|
|
|
|
| 1366 |
|
| 1367 |
const reader = response.body.getReader();
|
| 1368 |
const decoder = new TextDecoder('utf-8');
|
| 1369 |
+
let buffer = '';
|
| 1370 |
|
| 1371 |
while (true) {
|
| 1372 |
const { value, done } = await reader.read();
|
| 1373 |
if (done) break;
|
| 1374 |
buffer += decoder.decode(value, { stream: true });
|
| 1375 |
+
const lines = buffer.split('\n\n');
|
| 1376 |
+
buffer = lines.pop();
|
|
|
|
| 1377 |
for (const line of lines) {
|
| 1378 |
if (!line.startsWith('data: ')) continue;
|
| 1379 |
const payload = line.slice(6);
|
|
|
|
| 1391 |
|
| 1392 |
dotOut.classList.add('active');
|
| 1393 |
|
| 1394 |
+
// ── Step 2: if Point or Detect → call /api/annotate ──
|
| 1395 |
+
if ((category === 'Point' || category === 'Detect') && fullText.trim()) {
|
| 1396 |
+
groundPlaceholder.textContent = 'Annotating imageβ¦';
|
| 1397 |
+
groundPlaceholder.style.display = 'flex';
|
| 1398 |
+
|
| 1399 |
try {
|
| 1400 |
+
const annotForm = new FormData();
|
| 1401 |
+
annotForm.append('image', currentFile);
|
| 1402 |
+
annotForm.append('text', fullText);
|
| 1403 |
+
annotForm.append('category', category);
|
| 1404 |
|
| 1405 |
+
const annotResp = await fetch('/api/annotate', {
|
| 1406 |
+
method: 'POST', body: annotForm,
|
| 1407 |
});
|
| 1408 |
+
if (!annotResp.ok) throw new Error('Annotation request failed');
|
| 1409 |
|
| 1410 |
+
const annotData = await annotResp.json();
|
| 1411 |
+
if (annotData.error) {
|
| 1412 |
+
resetOverlay('Annotation error: ' + annotData.error);
|
| 1413 |
+
} else if (annotData.image_b64) {
|
| 1414 |
+
showOverlay(annotData.image_b64);
|
| 1415 |
} else {
|
| 1416 |
+
resetOverlay('No coordinates found in model output.');
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1417 |
}
|
| 1418 |
+
} catch (annotErr) {
|
| 1419 |
+
resetOverlay('Annotation failed: ' + annotErr.message);
|
| 1420 |
+
console.error('Annotation error:', annotErr);
|
| 1421 |
}
|
| 1422 |
+
} else if (category !== 'Point' && category !== 'Detect') {
|
| 1423 |
+
resetOverlay('Active for Point / Detect tasks.\nRun inference to visualise.');
|
| 1424 |
}
|
| 1425 |
|
| 1426 |
} catch (err) {
|
| 1427 |
outputBox.innerText = `[Error] ${err.message}`;
|
| 1428 |
outputBox.style.color = '#ff6b6b';
|
| 1429 |
+
resetOverlay('Inference error β see Output Stream node.');
|
|
|
|
|
|
|
| 1430 |
} finally {
|
| 1431 |
runBtn.disabled = false;
|
| 1432 |
btnLoader.style.display = 'none';
|