Spaces:
Running on Zero
Running on Zero
update app
Browse files
app.py
CHANGED
|
@@ -4,13 +4,15 @@ import json
|
|
| 4 |
import ast
|
| 5 |
import re
|
| 6 |
import uuid
|
|
|
|
| 7 |
import threading
|
|
|
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Optional
|
| 10 |
|
| 11 |
import spaces
|
| 12 |
import torch
|
| 13 |
-
from PIL import Image
|
| 14 |
|
| 15 |
from gradio import Server
|
| 16 |
from fastapi import Request, UploadFile, File, Form
|
|
@@ -51,35 +53,25 @@ QWEN25_VL_3B_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
|
|
| 51 |
print(f"Loading Qwen3-VL-2B model: {QWEN_VL_2B_MODEL_NAME} on {DEVICE}...")
|
| 52 |
try:
|
| 53 |
qwen_vl_2b_model = Qwen3VLForConditionalGeneration.from_pretrained(
|
| 54 |
-
QWEN_VL_2B_MODEL_NAME,
|
| 55 |
-
trust_remote_code=True,
|
| 56 |
-
torch_dtype=torch.bfloat16,
|
| 57 |
).to(DEVICE).eval()
|
| 58 |
-
qwen_vl_2b_processor = AutoProcessor.from_pretrained(
|
| 59 |
-
QWEN_VL_2B_MODEL_NAME, trust_remote_code=True
|
| 60 |
-
)
|
| 61 |
print("Qwen3-VL-2B model loaded successfully.")
|
| 62 |
except Exception as e:
|
| 63 |
print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
|
| 64 |
-
qwen_vl_2b_model = None
|
| 65 |
-
qwen_vl_2b_processor = None
|
| 66 |
|
| 67 |
# ── Qwen3-VL-4B-Instruct ────────────────────────────────
|
| 68 |
print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
|
| 69 |
try:
|
| 70 |
qwen_vl_4b_model = Qwen3VLForConditionalGeneration.from_pretrained(
|
| 71 |
-
QWEN_VL_4B_MODEL_NAME,
|
| 72 |
-
trust_remote_code=True,
|
| 73 |
-
torch_dtype=torch.bfloat16,
|
| 74 |
).to(DEVICE).eval()
|
| 75 |
-
qwen_vl_4b_processor = AutoProcessor.from_pretrained(
|
| 76 |
-
QWEN_VL_4B_MODEL_NAME, trust_remote_code=True
|
| 77 |
-
)
|
| 78 |
print("Qwen3-VL-4B model loaded successfully.")
|
| 79 |
except Exception as e:
|
| 80 |
print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
|
| 81 |
-
qwen_vl_4b_model = None
|
| 82 |
-
qwen_vl_4b_processor = None
|
| 83 |
|
| 84 |
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
|
| 85 |
print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
|
|
@@ -91,8 +83,7 @@ try:
|
|
| 91 |
print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
|
| 92 |
except Exception as e:
|
| 93 |
print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
|
| 94 |
-
qwen_4b_unredacted_model = None
|
| 95 |
-
qwen_4b_unredacted_processor = None
|
| 96 |
|
| 97 |
# ── Qwen3.5-4B ──────────────────────────────────────────
|
| 98 |
print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
|
|
@@ -104,8 +95,7 @@ try:
|
|
| 104 |
print("Qwen3.5-4B model loaded successfully.")
|
| 105 |
except Exception as e:
|
| 106 |
print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
|
| 107 |
-
qwen_4b_model = None
|
| 108 |
-
qwen_4b_processor = None
|
| 109 |
|
| 110 |
# ── Qwen3.5-2B ──────────────────────────────────────────
|
| 111 |
print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
|
|
@@ -117,30 +107,25 @@ try:
|
|
| 117 |
print("Qwen3.5-2B model loaded successfully.")
|
| 118 |
except Exception as e:
|
| 119 |
print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
|
| 120 |
-
qwen_2b_model = None
|
| 121 |
-
qwen_2b_processor = None
|
| 122 |
|
| 123 |
# ── LFM2.5-VL-450M ──────────────────────────────────────
|
| 124 |
print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
|
| 125 |
try:
|
| 126 |
lfm_450_model = AutoModelForImageTextToText.from_pretrained(
|
| 127 |
-
LFM_450_MODEL_NAME,
|
| 128 |
-
device_map="auto",
|
| 129 |
-
torch_dtype=torch.bfloat16,
|
| 130 |
).eval()
|
| 131 |
lfm_450_processor = AutoProcessor.from_pretrained(LFM_450_MODEL_NAME)
|
| 132 |
print("LFM-450M model loaded successfully.")
|
| 133 |
except Exception as e:
|
| 134 |
print(f"Warning: LFM-450M model loading failed. Error: {e}")
|
| 135 |
-
lfm_450_model = None
|
| 136 |
-
lfm_450_processor = None
|
| 137 |
|
| 138 |
# ── Gemma4-E2B-it ───────────────────────────────────────
|
| 139 |
print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
|
| 140 |
try:
|
| 141 |
gemma4_e2b_model = Gemma4ForConditionalGeneration.from_pretrained(
|
| 142 |
-
GEMMA4_E2B_NAME,
|
| 143 |
-
torch_dtype=torch.bfloat16,
|
| 144 |
device_map="auto" if torch.cuda.is_available() else None,
|
| 145 |
).eval()
|
| 146 |
if not torch.cuda.is_available():
|
|
@@ -149,23 +134,19 @@ try:
|
|
| 149 |
print("Gemma4-E2B-it model loaded successfully.")
|
| 150 |
except Exception as e:
|
| 151 |
print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
|
| 152 |
-
gemma4_e2b_model = None
|
| 153 |
-
gemma4_e2b_processor = None
|
| 154 |
|
| 155 |
# ── LFM2.5-VL-1.6B ──────────────────────────────────────
|
| 156 |
print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
|
| 157 |
try:
|
| 158 |
lfm_16_model = AutoModelForImageTextToText.from_pretrained(
|
| 159 |
-
LFM_16_MODEL_NAME,
|
| 160 |
-
device_map="auto",
|
| 161 |
-
torch_dtype=torch.bfloat16,
|
| 162 |
).eval()
|
| 163 |
lfm_16_processor = AutoProcessor.from_pretrained(LFM_16_MODEL_NAME)
|
| 164 |
print("LFM-1.6B model loaded successfully.")
|
| 165 |
except Exception as e:
|
| 166 |
print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
|
| 167 |
-
lfm_16_model = None
|
| 168 |
-
lfm_16_processor = None
|
| 169 |
|
| 170 |
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
|
| 171 |
print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
|
|
@@ -177,28 +158,54 @@ try:
|
|
| 177 |
print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
|
| 178 |
except Exception as e:
|
| 179 |
print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
|
| 180 |
-
qwen_unredacted_model = None
|
| 181 |
-
qwen_unredacted_processor = None
|
| 182 |
|
| 183 |
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
|
| 184 |
print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
|
| 185 |
try:
|
| 186 |
qwen25_vl_3b_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 187 |
-
QWEN25_VL_3B_NAME,
|
| 188 |
-
torch_dtype="auto",
|
| 189 |
-
device_map="auto",
|
| 190 |
).eval()
|
| 191 |
qwen25_vl_3b_processor = AutoProcessor.from_pretrained(QWEN25_VL_3B_NAME)
|
| 192 |
print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
|
| 193 |
except Exception as e:
|
| 194 |
print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
|
| 195 |
-
qwen25_vl_3b_model = None
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
|
| 199 |
-
# --- Utility Functions ---
|
| 200 |
def safe_parse_json(text: str):
|
|
|
|
|
|
|
|
|
|
| 201 |
text = text.strip()
|
|
|
|
| 202 |
text = re.sub(r"^```(json)?", "", text)
|
| 203 |
text = re.sub(r"```$", "", text)
|
| 204 |
text = text.strip()
|
|
@@ -206,13 +213,204 @@ def safe_parse_json(text: str):
|
|
| 206 |
return json.loads(text)
|
| 207 |
except json.JSONDecodeError:
|
| 208 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
try:
|
| 210 |
return ast.literal_eval(text)
|
| 211 |
except Exception:
|
| 212 |
return {}
|
| 213 |
|
| 214 |
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
@spaces.GPU(duration=120)
|
| 217 |
def generate_inference_stream(
|
| 218 |
image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
|
|
@@ -232,315 +430,208 @@ def generate_inference_stream(
|
|
| 232 |
if model_id == "qwen_vl_2b":
|
| 233 |
if qwen_vl_2b_model is None or qwen_vl_2b_processor is None:
|
| 234 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
|
| 235 |
-
yield "data: [DONE]\n\n"
|
| 236 |
-
return
|
| 237 |
messages = [{"role": "user", "content": [
|
| 238 |
-
{"type": "image", "image": image},
|
| 239 |
-
{"type": "text", "text": full_prompt},
|
| 240 |
-
]}]
|
| 241 |
text_input = qwen_vl_2b_processor.apply_chat_template(
|
| 242 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 243 |
-
)
|
| 244 |
inputs = qwen_vl_2b_processor(
|
| 245 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 246 |
).to(qwen_vl_2b_model.device)
|
| 247 |
-
streamer = TextIteratorStreamer(
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
target=qwen_vl_2b_model.generate,
|
| 253 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 254 |
-
use_cache=True, temperature=1.0, do_sample=True),
|
| 255 |
-
)
|
| 256 |
-
thread.start()
|
| 257 |
for tok in streamer:
|
| 258 |
-
if tok:
|
| 259 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 260 |
-
thread.join()
|
| 261 |
|
| 262 |
# ── Qwen3-VL-4B ─────────────────────────────────────
|
| 263 |
elif model_id == "qwen_vl_4b":
|
| 264 |
if qwen_vl_4b_model is None or qwen_vl_4b_processor is None:
|
| 265 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
|
| 266 |
-
yield "data: [DONE]\n\n"
|
| 267 |
-
return
|
| 268 |
messages = [{"role": "user", "content": [
|
| 269 |
-
{"type": "image", "image": image},
|
| 270 |
-
{"type": "text", "text": full_prompt},
|
| 271 |
-
]}]
|
| 272 |
text_input = qwen_vl_4b_processor.apply_chat_template(
|
| 273 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 274 |
-
)
|
| 275 |
inputs = qwen_vl_4b_processor(
|
| 276 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 277 |
).to(qwen_vl_4b_model.device)
|
| 278 |
-
streamer = TextIteratorStreamer(
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
target=qwen_vl_4b_model.generate,
|
| 284 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 285 |
-
use_cache=True, temperature=1.0, do_sample=True),
|
| 286 |
-
)
|
| 287 |
-
thread.start()
|
| 288 |
for tok in streamer:
|
| 289 |
-
if tok:
|
| 290 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 291 |
-
thread.join()
|
| 292 |
|
| 293 |
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
|
| 294 |
elif model_id == "qwen_4b_unredacted":
|
| 295 |
if qwen_4b_unredacted_model is None or qwen_4b_unredacted_processor is None:
|
| 296 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
|
| 297 |
-
yield "data: [DONE]\n\n"
|
| 298 |
-
return
|
| 299 |
messages = [{"role": "user", "content": [
|
| 300 |
-
{"type": "image", "image": image},
|
| 301 |
-
{"type": "text", "text": full_prompt},
|
| 302 |
-
]}]
|
| 303 |
text_input = qwen_4b_unredacted_processor.apply_chat_template(
|
| 304 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 305 |
-
)
|
| 306 |
inputs = qwen_4b_unredacted_processor(
|
| 307 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 308 |
).to(qwen_4b_unredacted_model.device)
|
| 309 |
-
streamer = TextIteratorStreamer(
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
target=qwen_4b_unredacted_model.generate,
|
| 315 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 316 |
-
use_cache=True, temperature=1.5, min_p=0.1),
|
| 317 |
-
)
|
| 318 |
-
thread.start()
|
| 319 |
for tok in streamer:
|
| 320 |
-
if tok:
|
| 321 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 322 |
-
thread.join()
|
| 323 |
|
| 324 |
# ── Qwen3.5-4B ──────────────────────────────────────
|
| 325 |
elif model_id == "qwen_4b":
|
| 326 |
if qwen_4b_model is None or qwen_4b_processor is None:
|
| 327 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
|
| 328 |
-
yield "data: [DONE]\n\n"
|
| 329 |
-
return
|
| 330 |
messages = [{"role": "user", "content": [
|
| 331 |
-
{"type": "image", "image": image},
|
| 332 |
-
{"type": "text", "text": full_prompt},
|
| 333 |
-
]}]
|
| 334 |
text_input = qwen_4b_processor.apply_chat_template(
|
| 335 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 336 |
-
)
|
| 337 |
inputs = qwen_4b_processor(
|
| 338 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 339 |
).to(qwen_4b_model.device)
|
| 340 |
-
streamer = TextIteratorStreamer(
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
target=qwen_4b_model.generate,
|
| 346 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 347 |
-
use_cache=True, temperature=1.5, min_p=0.1),
|
| 348 |
-
)
|
| 349 |
-
thread.start()
|
| 350 |
for tok in streamer:
|
| 351 |
-
if tok:
|
| 352 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 353 |
-
thread.join()
|
| 354 |
|
| 355 |
# ── Qwen3.5-2B ──────────────────────────────────────
|
| 356 |
elif model_id == "qwen_2b":
|
| 357 |
if qwen_2b_model is None or qwen_2b_processor is None:
|
| 358 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
|
| 359 |
-
yield "data: [DONE]\n\n"
|
| 360 |
-
return
|
| 361 |
messages = [{"role": "user", "content": [
|
| 362 |
-
{"type": "image", "image": image},
|
| 363 |
-
{"type": "text", "text": full_prompt},
|
| 364 |
-
]}]
|
| 365 |
text_input = qwen_2b_processor.apply_chat_template(
|
| 366 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 367 |
-
)
|
| 368 |
inputs = qwen_2b_processor(
|
| 369 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 370 |
).to(qwen_2b_model.device)
|
| 371 |
-
streamer = TextIteratorStreamer(
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
target=qwen_2b_model.generate,
|
| 377 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 378 |
-
use_cache=True, temperature=1.5, min_p=0.1),
|
| 379 |
-
)
|
| 380 |
-
thread.start()
|
| 381 |
for tok in streamer:
|
| 382 |
-
if tok:
|
| 383 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 384 |
-
thread.join()
|
| 385 |
|
| 386 |
# ── LFM-450M ────────────────────────────────────────
|
| 387 |
elif model_id == "lfm_450":
|
| 388 |
if lfm_450_model is None or lfm_450_processor is None:
|
| 389 |
yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
|
| 390 |
-
yield "data: [DONE]\n\n"
|
| 391 |
-
return
|
| 392 |
conversation = [{"role": "user", "content": [
|
| 393 |
-
{"type": "image", "image": image},
|
| 394 |
-
{"type": "text", "text": full_prompt},
|
| 395 |
-
]}]
|
| 396 |
inputs = lfm_450_processor.apply_chat_template(
|
| 397 |
conversation, add_generation_prompt=True,
|
| 398 |
return_tensors="pt", return_dict=True, tokenize=True,
|
| 399 |
).to(lfm_450_model.device)
|
| 400 |
-
streamer = TextIteratorStreamer(
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
target=lfm_450_model.generate,
|
| 406 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True),
|
| 407 |
-
)
|
| 408 |
-
thread.start()
|
| 409 |
for tok in streamer:
|
| 410 |
-
if tok:
|
| 411 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 412 |
-
thread.join()
|
| 413 |
|
| 414 |
# ── Gemma4-E2B-it ───────────────────────────────────
|
| 415 |
elif model_id == "gemma4_e2b":
|
| 416 |
if gemma4_e2b_model is None or gemma4_e2b_processor is None:
|
| 417 |
yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
|
| 418 |
-
yield "data: [DONE]\n\n"
|
| 419 |
-
return
|
| 420 |
messages = [{"role": "user", "content": [
|
| 421 |
-
{"type": "image", "image": image},
|
| 422 |
-
{"type": "text", "text": full_prompt},
|
| 423 |
-
]}]
|
| 424 |
text_input = gemma4_e2b_processor.apply_chat_template(
|
| 425 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 426 |
-
)
|
| 427 |
inputs = gemma4_e2b_processor(
|
| 428 |
text=[text_input], images=[image], return_tensors="pt", padding=True,
|
| 429 |
).to(gemma4_e2b_model.device)
|
| 430 |
-
streamer = TextIteratorStreamer(
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
target=gemma4_e2b_model.generate,
|
| 436 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 437 |
-
use_cache=True, temperature=1.0, do_sample=True),
|
| 438 |
-
)
|
| 439 |
-
thread.start()
|
| 440 |
for tok in streamer:
|
| 441 |
-
if tok:
|
| 442 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 443 |
-
thread.join()
|
| 444 |
|
| 445 |
# ── LFM-1.6B ────────────────────────────────────────
|
| 446 |
elif model_id == "lfm_16":
|
| 447 |
if lfm_16_model is None or lfm_16_processor is None:
|
| 448 |
yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
|
| 449 |
-
yield "data: [DONE]\n\n"
|
| 450 |
-
return
|
| 451 |
conversation = [{"role": "user", "content": [
|
| 452 |
-
{"type": "image", "image": image},
|
| 453 |
-
{"type": "text", "text": full_prompt},
|
| 454 |
-
]}]
|
| 455 |
inputs = lfm_16_processor.apply_chat_template(
|
| 456 |
conversation, add_generation_prompt=True,
|
| 457 |
return_tensors="pt", return_dict=True, tokenize=True,
|
| 458 |
).to(lfm_16_model.device)
|
| 459 |
-
streamer = TextIteratorStreamer(
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
target=lfm_16_model.generate,
|
| 465 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True),
|
| 466 |
-
)
|
| 467 |
-
thread.start()
|
| 468 |
for tok in streamer:
|
| 469 |
-
if tok:
|
| 470 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 471 |
-
thread.join()
|
| 472 |
|
| 473 |
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
|
| 474 |
elif model_id == "qwen_unredacted":
|
| 475 |
if qwen_unredacted_model is None or qwen_unredacted_processor is None:
|
| 476 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
|
| 477 |
-
yield "data: [DONE]\n\n"
|
| 478 |
-
return
|
| 479 |
messages = [{"role": "user", "content": [
|
| 480 |
-
{"type": "image", "image": image},
|
| 481 |
-
{"type": "text", "text": full_prompt},
|
| 482 |
-
]}]
|
| 483 |
text_input = qwen_unredacted_processor.apply_chat_template(
|
| 484 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 485 |
-
)
|
| 486 |
inputs = qwen_unredacted_processor(
|
| 487 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 488 |
).to(qwen_unredacted_model.device)
|
| 489 |
-
streamer = TextIteratorStreamer(
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
target=qwen_unredacted_model.generate,
|
| 495 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 496 |
-
use_cache=True, temperature=1.5, min_p=0.1),
|
| 497 |
-
)
|
| 498 |
-
thread.start()
|
| 499 |
for tok in streamer:
|
| 500 |
-
if tok:
|
| 501 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 502 |
-
thread.join()
|
| 503 |
|
| 504 |
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────
|
| 505 |
elif model_id == "qwen25_vl_3b":
|
| 506 |
if qwen25_vl_3b_model is None or qwen25_vl_3b_processor is None:
|
| 507 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
|
| 508 |
-
yield "data: [DONE]\n\n"
|
| 509 |
-
return
|
| 510 |
messages = [{"role": "user", "content": [
|
| 511 |
-
{"type": "image", "image": image},
|
| 512 |
-
{"type": "text", "text": full_prompt},
|
| 513 |
-
]}]
|
| 514 |
text_input = qwen25_vl_3b_processor.apply_chat_template(
|
| 515 |
-
messages, tokenize=False, add_generation_prompt=True
|
| 516 |
-
)
|
| 517 |
image_inputs, video_inputs = process_vision_info(messages)
|
| 518 |
inputs = qwen25_vl_3b_processor(
|
| 519 |
-
text=[text_input],
|
| 520 |
-
|
| 521 |
-
videos=video_inputs,
|
| 522 |
-
return_tensors="pt",
|
| 523 |
-
padding=True,
|
| 524 |
).to(qwen25_vl_3b_model.device)
|
| 525 |
-
streamer = TextIteratorStreamer(
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
target=qwen25_vl_3b_model.generate,
|
| 531 |
-
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 532 |
-
use_cache=True, temperature=1.0, do_sample=True),
|
| 533 |
-
)
|
| 534 |
-
thread.start()
|
| 535 |
for tok in streamer:
|
| 536 |
-
if tok:
|
| 537 |
-
yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
| 538 |
-
thread.join()
|
| 539 |
|
| 540 |
yield "data: [DONE]\n\n"
|
| 541 |
|
| 542 |
|
| 543 |
-
#
|
|
|
|
|
|
|
| 544 |
@app.post("/api/run")
|
| 545 |
async def run_inference(
|
| 546 |
image: UploadFile = File(...),
|
|
@@ -560,7 +651,9 @@ async def run_inference(
|
|
| 560 |
return JSONResponse({"error": str(e)}, status_code=500)
|
| 561 |
|
| 562 |
|
| 563 |
-
#
|
|
|
|
|
|
|
| 564 |
@app.get("/", response_class=HTMLResponse)
|
| 565 |
async def homepage(request: Request):
|
| 566 |
return """
|
|
@@ -614,10 +707,8 @@ async def homepage(request: Request):
|
|
| 614 |
.top-bar .sub { font-size: 11px; color: var(--muted); }
|
| 615 |
.top-bar .badge {
|
| 616 |
margin-left: auto;
|
| 617 |
-
background: rgba(124,106,247,0.15);
|
| 618 |
-
border:
|
| 619 |
-
padding: 3px 10px; border-radius: 20px;
|
| 620 |
-
font-size: 10px; color: var(--accent);
|
| 621 |
}
|
| 622 |
/* ββ Canvas ββ */
|
| 623 |
#canvas {
|
|
@@ -625,8 +716,7 @@ async def homepage(request: Request):
|
|
| 625 |
min-height: calc(100vh - 42px); height: 900px; margin: 0 auto;
|
| 626 |
}
|
| 627 |
svg.wires {
|
| 628 |
-
position: absolute; top: 0; left: 0;
|
| 629 |
-
width: 100%; height: 100%;
|
| 630 |
pointer-events: none; z-index: 2; overflow: visible;
|
| 631 |
}
|
| 632 |
path.wire { fill: none; stroke: var(--wire); stroke-width: 2.5; stroke-linecap: round; }
|
|
@@ -707,9 +797,9 @@ async def homepage(request: Request):
|
|
| 707 |
border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
|
| 708 |
}
|
| 709 |
.img-chip.visible { display: flex; }
|
| 710 |
-
.img-chip .chip-dot { width:
|
| 711 |
-
.img-chip .chip-name { overflow:
|
| 712 |
-
.img-chip .chip-size { color:
|
| 713 |
select, textarea {
|
| 714 |
width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
|
| 715 |
color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
|
|
@@ -732,7 +822,7 @@ async def homepage(request: Request):
|
|
| 732 |
/* ββ Output node ββ */
|
| 733 |
.output-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
|
| 734 |
.output-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
|
| 735 |
-
/* ββ Icon buttons
|
| 736 |
.icon-btn {
|
| 737 |
display: flex; align-items: center; gap: 5px;
|
| 738 |
background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
|
|
@@ -740,16 +830,14 @@ async def homepage(request: Request):
|
|
| 740 |
font-size: 9px; font-weight: 700; font-family: 'JetBrains Mono', monospace;
|
| 741 |
color: var(--accent); cursor: pointer; letter-spacing: 0.05em;
|
| 742 |
transition: background 0.18s, border-color 0.18s, transform 0.1s; flex-shrink: 0;
|
| 743 |
-
text-decoration: none;
|
| 744 |
}
|
| 745 |
.icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
|
| 746 |
.icon-btn:active { transform: scale(0.95); }
|
| 747 |
-
.icon-btn.teal {
|
| 748 |
-
|
| 749 |
-
}
|
| 750 |
-
.icon-btn
|
| 751 |
-
.icon-btn.copied { background: rgba(78,205,196,0.15); border-color: var(--accent2); color: var(--accent2); }
|
| 752 |
-
.icon-btn svg { pointer-events: none; flex-shrink: 0; }
|
| 753 |
.output-box {
|
| 754 |
background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
|
| 755 |
border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
|
|
@@ -759,20 +847,18 @@ async def homepage(request: Request):
|
|
| 759 |
/* ββ Grounding node βοΏ½οΏ½ */
|
| 760 |
.ground-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
|
| 761 |
.ground-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
|
| 762 |
-
.ground-
|
| 763 |
position: relative; flex: 1; border: 1px solid var(--node-border);
|
| 764 |
border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
|
|
|
|
| 765 |
}
|
| 766 |
-
.ground-
|
| 767 |
-
|
| 768 |
-
width: 100%; height: 100%;
|
| 769 |
-
object-fit: contain; display: block;
|
| 770 |
-
image-rendering: auto;
|
| 771 |
}
|
| 772 |
.ground-placeholder {
|
| 773 |
position: absolute; inset: 0; display: flex; align-items: center;
|
| 774 |
-
justify-content: center; font-size: 11px; color: var(--muted);
|
| 775 |
-
pointer-events: none; z-index: 5;
|
| 776 |
}
|
| 777 |
.loader {
|
| 778 |
width: 11px; height: 11px; border: 2px solid rgba(255,255,255,0.3);
|
|
@@ -780,36 +866,25 @@ async def homepage(request: Request):
|
|
| 780 |
animation: spin 0.7s linear infinite; display: none;
|
| 781 |
}
|
| 782 |
@keyframes spin { to { transform: rotate(360deg); } }
|
| 783 |
-
.status-dot { width:
|
| 784 |
-
.status-dot.active { background:
|
| 785 |
/* ββ Model badges ββ */
|
| 786 |
.model-badge {
|
| 787 |
-
display:
|
| 788 |
-
font-size:
|
| 789 |
}
|
| 790 |
-
.model-badge.qvl2b { background:
|
| 791 |
-
.model-badge.qvl4b { background:
|
| 792 |
-
.model-badge.q4bunred { background:
|
| 793 |
-
.model-badge.q4b { background:
|
| 794 |
-
.model-badge.q2b { background:
|
| 795 |
-
.model-badge.lfm450 { background:
|
| 796 |
-
.model-badge.g4e2b { background:
|
| 797 |
-
.model-badge.lfm16 { background:
|
| 798 |
-
.model-badge.qunred { background:
|
| 799 |
-
.model-badge.q25vl3b { background:
|
| 800 |
-
.model-info-box { border-radius:
|
| 801 |
.canvas-footer { height: 36px; }
|
| 802 |
-
|
| 803 |
-
/* ββ Debug panel ββ */
|
| 804 |
-
#debugPanel {
|
| 805 |
-
position: fixed; bottom: 12px; right: 12px; z-index: 9999;
|
| 806 |
-
background: rgba(13,13,15,0.95); border: 1px solid var(--node-border);
|
| 807 |
-
border-radius: 7px; padding: 8px 12px; font-size: 10px; color: var(--muted);
|
| 808 |
-
max-width: 340px; display: none; backdrop-filter: blur(8px);
|
| 809 |
-
}
|
| 810 |
-
#debugPanel.visible { display: block; }
|
| 811 |
-
#debugPanel .dbg-title { color: var(--accent2); font-weight: 700; margin-bottom: 4px; }
|
| 812 |
-
#debugPanel pre { white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow-y: auto; color: #a0a0c0; }
|
| 813 |
</style>
|
| 814 |
</head>
|
| 815 |
<body>
|
|
@@ -821,12 +896,6 @@ async def homepage(request: Request):
|
|
| 821 |
<span class="badge">10x Vision Models</span>
|
| 822 |
</div>
|
| 823 |
|
| 824 |
-
<!-- Debug panel (toggle with D key) -->
|
| 825 |
-
<div id="debugPanel">
|
| 826 |
-
<div class="dbg-title">⬑ GROUNDING DEBUG</div>
|
| 827 |
-
<pre id="debugPre"></pre>
|
| 828 |
-
</div>
|
| 829 |
-
|
| 830 |
<div id="canvas">
|
| 831 |
<svg class="wires">
|
| 832 |
<path id="wire-img-task" class="wire" />
|
|
@@ -980,8 +1049,9 @@ async def homepage(request: Request):
|
|
| 980 |
SAVE
|
| 981 |
</a>
|
| 982 |
</div>
|
| 983 |
-
<div class="ground-
|
| 984 |
-
<
|
|
|
|
| 985 |
<div class="ground-placeholder" id="groundPlaceholder">
|
| 986 |
Active for Point / Detect tasks.<br>Run inference to visualise.
|
| 987 |
</div>
|
|
@@ -1033,7 +1103,8 @@ document.querySelectorAll('.node').forEach(node => {
|
|
| 1033 |
});
|
| 1034 |
document.addEventListener('mousemove', e => {
|
| 1035 |
if (!drag) return;
|
| 1036 |
-
node.style.left=`${il+e.clientX-sx}px`;
|
|
|
|
| 1037 |
updateWires();
|
| 1038 |
});
|
| 1039 |
document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
|
|
@@ -1058,22 +1129,33 @@ const chipSize = document.getElementById('chipSize');
|
|
| 1058 |
const dotImg = document.getElementById('dot-img');
|
| 1059 |
|
| 1060 |
function formatBytes(b) {
|
| 1061 |
-
if (b<1024) return b+' B';
|
|
|
|
| 1062 |
return (b/1048576).toFixed(1)+' MB';
|
| 1063 |
}
|
| 1064 |
function handleFile(file) {
|
| 1065 |
-
if (!file||!file.type.startsWith('image/')) return;
|
| 1066 |
-
currentFile
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1070 |
requestAnimationFrame(updateWires);
|
| 1071 |
}
|
| 1072 |
function clearImage() {
|
| 1073 |
-
currentFile
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1077 |
}
|
| 1078 |
dropZone.onclick = () => fileInput.click();
|
| 1079 |
fileInput.onchange = e => handleFile(e.target.files[0]);
|
|
@@ -1095,63 +1177,63 @@ dotModel.classList.add('active');
|
|
| 1095 |
|
| 1096 |
const MODEL_INFO = {
|
| 1097 |
qwen_vl_2b: {
|
| 1098 |
-
html:
|
| 1099 |
-
|
| 1100 |
-
|
| 1101 |
bg: 'rgba(255,150,50,0.07)', border: 'rgba(255,150,50,0.30)',
|
| 1102 |
},
|
| 1103 |
qwen_vl_4b: {
|
| 1104 |
-
html:
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
bg: 'rgba(255,100,80,0.07)', border: 'rgba(255,100,80,0.25)',
|
| 1108 |
},
|
| 1109 |
qwen_4b_unredacted: {
|
| 1110 |
-
html:
|
| 1111 |
-
|
| 1112 |
-
|
| 1113 |
bg: 'rgba(255,80,80,0.07)', border: 'rgba(255,80,80,0.30)',
|
| 1114 |
},
|
| 1115 |
qwen_4b: {
|
| 1116 |
-
html:
|
| 1117 |
-
|
| 1118 |
-
|
| 1119 |
bg: 'rgba(255,200,80,0.07)', border: 'rgba(255,200,80,0.30)',
|
| 1120 |
},
|
| 1121 |
qwen_2b: {
|
| 1122 |
-
html:
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
|
| 1126 |
},
|
| 1127 |
lfm_450: {
|
| 1128 |
-
html:
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
bg: 'rgba(78,205,196,0.07)', border: 'rgba(78,205,196,0.25)',
|
| 1132 |
},
|
| 1133 |
gemma4_e2b: {
|
| 1134 |
-
html:
|
| 1135 |
-
|
| 1136 |
-
|
| 1137 |
bg: 'rgba(66,197,107,0.07)', border: 'rgba(66,197,107,0.25)',
|
| 1138 |
},
|
| 1139 |
lfm_16: {
|
| 1140 |
-
html:
|
| 1141 |
-
|
| 1142 |
-
|
| 1143 |
bg: 'rgba(107,203,119,0.07)', border: 'rgba(107,203,119,0.25)',
|
| 1144 |
},
|
| 1145 |
qwen_unredacted: {
|
| 1146 |
-
html:
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
bg: 'rgba(255,80,160,0.07)', border: 'rgba(255,80,160,0.25)',
|
| 1150 |
},
|
| 1151 |
qwen25_vl_3b: {
|
| 1152 |
-
html:
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
bg: 'rgba(80,180,255,0.07)', border: 'rgba(80,180,255,0.25)',
|
| 1156 |
},
|
| 1157 |
};
|
|
@@ -1174,307 +1256,9 @@ const PLACEHOLDERS = {
|
|
| 1174 |
Point: 'e.g., The gun held by the person.',
|
| 1175 |
Detect: 'e.g., The headlight of the car.',
|
| 1176 |
};
|
| 1177 |
-
categorySelect.onchange = e => {
|
| 1178 |
-
|
| 1179 |
-
|
| 1180 |
-
// DEBUG PANEL (press D to toggle)
|
| 1181 |
-
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1182 |
-
const debugPanel = document.getElementById('debugPanel');
|
| 1183 |
-
const debugPre = document.getElementById('debugPre');
|
| 1184 |
-
let debugVisible = false;
|
| 1185 |
-
document.addEventListener('keydown', e => {
|
| 1186 |
-
if (e.key === 'd' || e.key === 'D') {
|
| 1187 |
-
debugVisible = !debugVisible;
|
| 1188 |
-
debugPanel.classList.toggle('visible', debugVisible);
|
| 1189 |
-
}
|
| 1190 |
-
});
|
| 1191 |
-
function dbg(msg) {
|
| 1192 |
-
debugPre.textContent = msg;
|
| 1193 |
-
console.log('[GROUNDING]', msg);
|
| 1194 |
-
}
|
| 1195 |
-
|
| 1196 |
-
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1197 |
-
// ROBUST JSON EXTRACTOR (handles all model output styles)
|
| 1198 |
-
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1199 |
-
function extractGroundingJSON(raw) {
|
| 1200 |
-
// Step 1: strip <think>β¦</think> blocks completely
|
| 1201 |
-
let text = raw;
|
| 1202 |
-
for (let i = 0; i < 10; i++) {
|
| 1203 |
-
const next = text.replace(/<think>[\s\S]*?<\/think>/gi, '');
|
| 1204 |
-
if (next === text) break;
|
| 1205 |
-
text = next;
|
| 1206 |
-
}
|
| 1207 |
-
|
| 1208 |
-
// Step 2: strip markdown fences
|
| 1209 |
-
text = text.replace(/```(?:json)?\\s*/gi, '').replace(/```/g, '').trim();
|
| 1210 |
-
|
| 1211 |
-
dbg('Cleaned text (first 400):' + text.slice(0, 400));
|
| 1212 |
-
|
| 1213 |
-
// Step 3: Balanced bracket extractor
|
| 1214 |
-
function extractBalanced(str, startIdx, openCh, closeCh) {
|
| 1215 |
-
let depth = 0, inStr = false, esc = false;
|
| 1216 |
-
for (let i = startIdx; i < str.length; i++) {
|
| 1217 |
-
const c = str[i];
|
| 1218 |
-
if (esc) { esc = false; continue; }
|
| 1219 |
-
if (c === '\\\\') { esc = true; continue; }
|
| 1220 |
-
if (c === '"') { inStr = !inStr; continue; }
|
| 1221 |
-
if (inStr) continue;
|
| 1222 |
-
if (c === openCh) depth++;
|
| 1223 |
-
if (c === closeCh) {
|
| 1224 |
-
depth--;
|
| 1225 |
-
if (depth === 0) {
|
| 1226 |
-
try { return JSON.parse(str.slice(startIdx, i + 1)); }
|
| 1227 |
-
catch (_) { return null; }
|
| 1228 |
-
}
|
| 1229 |
-
}
|
| 1230 |
-
}
|
| 1231 |
-
return null;
|
| 1232 |
-
}
|
| 1233 |
-
|
| 1234 |
-
// Step 4: scan for ALL '[' positions, try each from last to first
|
| 1235 |
-
const bracketPositions = [];
|
| 1236 |
-
const bracePositions = [];
|
| 1237 |
-
for (let i = 0; i < text.length; i++) {
|
| 1238 |
-
if (text[i] === '[') bracketPositions.push(i);
|
| 1239 |
-
if (text[i] === '{') bracePositions.push(i);
|
| 1240 |
-
}
|
| 1241 |
-
|
| 1242 |
-
// Prefer arrays (most models return [{...}, {...}])
|
| 1243 |
-
for (let i = bracketPositions.length - 1; i >= 0; i--) {
|
| 1244 |
-
const r = extractBalanced(text, bracketPositions[i], '[', ']');
|
| 1245 |
-
if (r !== null && Array.isArray(r) && r.length > 0) {
|
| 1246 |
-
dbg('Found array at pos ' + bracketPositions[i] + ': ' + JSON.stringify(r).slice(0, 200));
|
| 1247 |
-
return r;
|
| 1248 |
-
}
|
| 1249 |
-
}
|
| 1250 |
-
// Try objects
|
| 1251 |
-
for (let i = bracePositions.length - 1; i >= 0; i--) {
|
| 1252 |
-
const r = extractBalanced(text, bracePositions[i], '{', '}');
|
| 1253 |
-
if (r !== null) {
|
| 1254 |
-
dbg('Found object at pos ' + bracePositions[i] + ': ' + JSON.stringify(r).slice(0, 200));
|
| 1255 |
-
return r;
|
| 1256 |
-
}
|
| 1257 |
-
}
|
| 1258 |
-
|
| 1259 |
-
// Step 5: try whole-text parse
|
| 1260 |
-
try { return JSON.parse(text); } catch (_) {}
|
| 1261 |
-
|
| 1262 |
-
dbg('No JSON found. Raw tail: ' + text.slice(-300));
|
| 1263 |
-
return null;
|
| 1264 |
-
}
|
| 1265 |
-
|
| 1266 |
-
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1267 |
-
// COORDINATE NORMALISER
|
| 1268 |
-
// Handles: absolute pixels, 0-1 fractions, 0-1000 Qwen scale
|
| 1269 |
-
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1270 |
-
function normaliseCoords(arr, W, H) {
|
| 1271 |
-
// arr is [x1,y1,x2,y2] or [x,y]
|
| 1272 |
-
const nums = arr.map(Number);
|
| 1273 |
-
|
| 1274 |
-
if (arr.length === 4) {
|
| 1275 |
-
let [x1,y1,x2,y2] = nums;
|
| 1276 |
-
// Qwen VL often uses 0-1000 normalised coords
|
| 1277 |
-
const maxVal = Math.max(x1, y1, x2, y2);
|
| 1278 |
-
if (maxVal <= 1.0 && maxVal > 0) {
|
| 1279 |
-
// 0-1 fraction
|
| 1280 |
-
return [x1*W, y1*H, x2*W, y2*H];
|
| 1281 |
-
} else if (maxVal <= 1000 && maxVal > 1) {
|
| 1282 |
-
// 0-1000 scale (Qwen VL convention)
|
| 1283 |
-
return [x1/1000*W, y1/1000*H, x2/1000*W, y2/1000*H];
|
| 1284 |
-
}
|
| 1285 |
-
// Already in pixels
|
| 1286 |
-
return [x1, y1, x2, y2];
|
| 1287 |
-
}
|
| 1288 |
-
|
| 1289 |
-
if (arr.length === 2) {
|
| 1290 |
-
let [x, y] = nums;
|
| 1291 |
-
const maxVal = Math.max(x, y);
|
| 1292 |
-
if (maxVal <= 1.0 && maxVal > 0) return [x*W, y*H];
|
| 1293 |
-
if (maxVal <= 1000 && maxVal > 1) return [x/1000*W, y/1000*H];
|
| 1294 |
-
return [x, y];
|
| 1295 |
-
}
|
| 1296 |
-
|
| 1297 |
-
return nums;
|
| 1298 |
-
}
|
| 1299 |
-
|
| 1300 |
-
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1301 |
-
// GROUNDING VISUALIZER
|
| 1302 |
-
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1303 |
-
const groundCanvas = document.getElementById('groundCanvas');
|
| 1304 |
-
const groundWrap = document.getElementById('groundWrap');
|
| 1305 |
-
const groundPlaceholder = document.getElementById('groundPlaceholder');
|
| 1306 |
-
const gCtx = groundCanvas.getContext('2d');
|
| 1307 |
-
const downloadBtn = document.getElementById('downloadBtn');
|
| 1308 |
-
|
| 1309 |
-
const PALETTE = ['#4ecdc4','#7c6af7','#ff6b6b','#ffd93d','#6bcb77','#ff922b','#cc5de8','#339af0'];
|
| 1310 |
-
|
| 1311 |
-
function hexToRgba(hex, alpha) {
|
| 1312 |
-
const r=parseInt(hex.slice(1,3),16), g=parseInt(hex.slice(3,5),16), b=parseInt(hex.slice(5,7),16);
|
| 1313 |
-
return `rgba(${r},${g},${b},${alpha})`;
|
| 1314 |
-
}
|
| 1315 |
-
function drawRoundRect(ctx, x, y, w, h, r) {
|
| 1316 |
-
r = Math.min(r, w/2, h/2);
|
| 1317 |
-
ctx.beginPath();
|
| 1318 |
-
ctx.moveTo(x+r, y);
|
| 1319 |
-
ctx.lineTo(x+w-r, y); ctx.quadraticCurveTo(x+w, y, x+w, y+r);
|
| 1320 |
-
ctx.lineTo(x+w, y+h-r); ctx.quadraticCurveTo(x+w, y+h, x+w-r, y+h);
|
| 1321 |
-
ctx.lineTo(x+r, y+h); ctx.quadraticCurveTo(x, y+h, x, y+h-r);
|
| 1322 |
-
ctx.lineTo(x, y+r); ctx.quadraticCurveTo(x, y, x+r, y);
|
| 1323 |
-
ctx.closePath();
|
| 1324 |
-
}
|
| 1325 |
-
|
| 1326 |
-
function updateDownloadBtn() {
|
| 1327 |
-
const dataURL = groundCanvas.toDataURL('image/png');
|
| 1328 |
-
const ts = new Date().toISOString().replace(/[:.]/g,'-').slice(0,19);
|
| 1329 |
-
downloadBtn.href = dataURL;
|
| 1330 |
-
downloadBtn.download = `grounding_${ts}.png`;
|
| 1331 |
-
downloadBtn.style.display = 'flex';
|
| 1332 |
-
}
|
| 1333 |
-
|
| 1334 |
-
function drawGrounding(imgSrc, rawText) {
|
| 1335 |
-
const parsed = extractGroundingJSON(rawText);
|
| 1336 |
-
|
| 1337 |
-
if (!parsed) {
|
| 1338 |
-
dbg('drawGrounding: no JSON parsed from output.');
|
| 1339 |
-
groundPlaceholder.textContent = 'No grounding coordinates found in model output.';
|
| 1340 |
-
groundPlaceholder.style.display = 'flex';
|
| 1341 |
-
return;
|
| 1342 |
-
}
|
| 1343 |
-
|
| 1344 |
-
const img = new Image();
|
| 1345 |
-
img.crossOrigin = 'anonymous';
|
| 1346 |
-
|
| 1347 |
-
img.onload = () => {
|
| 1348 |
-
const W = img.naturalWidth || img.width || 512;
|
| 1349 |
-
const H = img.naturalHeight || img.height || 512;
|
| 1350 |
-
|
| 1351 |
-
// Set canvas to image natural size for crisp drawing
|
| 1352 |
-
groundCanvas.width = W;
|
| 1353 |
-
groundCanvas.height = H;
|
| 1354 |
-
|
| 1355 |
-
// Draw base image
|
| 1356 |
-
gCtx.drawImage(img, 0, 0, W, H);
|
| 1357 |
-
|
| 1358 |
-
// Hide placeholder β canvas is now populated
|
| 1359 |
-
groundPlaceholder.style.display = 'none';
|
| 1360 |
-
|
| 1361 |
-
const lw = Math.max(2, W / 180);
|
| 1362 |
-
const fs = Math.max(11, Math.min(W / 35, 22));
|
| 1363 |
-
gCtx.lineWidth = lw;
|
| 1364 |
-
|
| 1365 |
-
const items = Array.isArray(parsed) ? parsed : [parsed];
|
| 1366 |
-
dbg('Drawing ' + items.length + ' item(s) on ' + W + 'x' + H);
|
| 1367 |
-
|
| 1368 |
-
items.forEach((item, i) => {
|
| 1369 |
-
const col = PALETTE[i % PALETTE.length];
|
| 1370 |
-
|
| 1371 |
-
// ββ Try to extract bbox βββββββββββββββββββββββ
|
| 1372 |
-
let rawBbox = null;
|
| 1373 |
-
if (Array.isArray(item?.bbox_2d) && item.bbox_2d.length === 4) rawBbox = item.bbox_2d;
|
| 1374 |
-
else if (Array.isArray(item?.bbox) && item.bbox.length === 4) rawBbox = item.bbox;
|
| 1375 |
-
else if (Array.isArray(item?.box) && item.box.length === 4) rawBbox = item.box;
|
| 1376 |
-
// flat array of 4 numbers
|
| 1377 |
-
else if (Array.isArray(item) && item.length === 4 && item.every(v => typeof v === 'number'))
|
| 1378 |
-
rawBbox = item;
|
| 1379 |
-
|
| 1380 |
-
if (rawBbox) {
|
| 1381 |
-
let [x1, y1, x2, y2] = normaliseCoords(rawBbox, W, H);
|
| 1382 |
-
// Ensure x1<x2, y1<y2
|
| 1383 |
-
if (x2 < x1) [x1, x2] = [x2, x1];
|
| 1384 |
-
if (y2 < y1) [y1, y2] = [y2, y1];
|
| 1385 |
-
const bw = x2 - x1, bh = y2 - y1;
|
| 1386 |
-
|
| 1387 |
-
// Fill
|
| 1388 |
-
gCtx.fillStyle = hexToRgba(col, 0.18);
|
| 1389 |
-
gCtx.fillRect(x1, y1, bw, bh);
|
| 1390 |
-
|
| 1391 |
-
// Border
|
| 1392 |
-
gCtx.strokeStyle = col;
|
| 1393 |
-
gCtx.lineWidth = lw;
|
| 1394 |
-
gCtx.strokeRect(x1, y1, bw, bh);
|
| 1395 |
-
|
| 1396 |
-
// Corner accent marks
|
| 1397 |
-
const cLen = Math.min(bw, bh, 18);
|
| 1398 |
-
gCtx.lineWidth = lw * 1.8;
|
| 1399 |
-
[[x1,y1],[x2,y1],[x2,y2],[x1,y2]].forEach(([cx,cy]) => {
|
| 1400 |
-
const sx = cx === x1 ? 1 : -1, sy = cy === y1 ? 1 : -1;
|
| 1401 |
-
gCtx.beginPath();
|
| 1402 |
-
gCtx.moveTo(cx + sx*cLen, cy);
|
| 1403 |
-
gCtx.lineTo(cx, cy);
|
| 1404 |
-
gCtx.lineTo(cx, cy + sy*cLen);
|
| 1405 |
-
gCtx.strokeStyle = col;
|
| 1406 |
-
gCtx.stroke();
|
| 1407 |
-
});
|
| 1408 |
-
gCtx.lineWidth = lw;
|
| 1409 |
-
|
| 1410 |
-
// Label
|
| 1411 |
-
const lbl = (item?.label ?? item?.class_name ?? item?.name ?? `obj ${i+1}`).toString();
|
| 1412 |
-
gCtx.font = `bold ${fs}px JetBrains Mono, monospace`;
|
| 1413 |
-
const tw = gCtx.measureText(lbl).width;
|
| 1414 |
-
const ph = fs * 1.5, pw = tw + 14;
|
| 1415 |
-
const lx = Math.max(0, Math.min(x1, W - pw));
|
| 1416 |
-
const ly = y1 - ph > 0 ? y1 - ph : y1 + 2;
|
| 1417 |
-
drawRoundRect(gCtx, lx, ly, pw, ph, 4);
|
| 1418 |
-
gCtx.fillStyle = col; gCtx.fill();
|
| 1419 |
-
gCtx.fillStyle = '#fff';
|
| 1420 |
-
gCtx.fillText(lbl, lx + 7, ly + ph * 0.74);
|
| 1421 |
-
return;
|
| 1422 |
-
}
|
| 1423 |
-
|
| 1424 |
-
// ββ Try to extract point ββββββββββββββββββββββ
|
| 1425 |
-
let rawPt = null;
|
| 1426 |
-
if (Array.isArray(item?.point_2d) && item.point_2d.length === 2) rawPt = item.point_2d;
|
| 1427 |
-
else if (Array.isArray(item?.point) && item.point.length === 2) rawPt = item.point;
|
| 1428 |
-
else if (Array.isArray(item?.coord) && item.coord.length === 2) rawPt = item.coord;
|
| 1429 |
-
else if (Array.isArray(item) && item.length === 2 && item.every(v => typeof v === 'number'))
|
| 1430 |
-
rawPt = item;
|
| 1431 |
-
|
| 1432 |
-
if (rawPt) {
|
| 1433 |
-
let [x, y] = normaliseCoords(rawPt, W, H);
|
| 1434 |
-
const r = Math.max(7, Math.min(W / 55, 18));
|
| 1435 |
-
const lbl = (item?.label ?? item?.name ?? `pt ${i+1}`).toString();
|
| 1436 |
-
|
| 1437 |
-
// Outer glow ring
|
| 1438 |
-
gCtx.beginPath(); gCtx.arc(x, y, r * 2.2, 0, Math.PI*2);
|
| 1439 |
-
gCtx.fillStyle = hexToRgba(col, 0.15); gCtx.fill();
|
| 1440 |
-
|
| 1441 |
-
// Middle ring
|
| 1442 |
-
gCtx.beginPath(); gCtx.arc(x, y, r * 1.4, 0, Math.PI*2);
|
| 1443 |
-
gCtx.fillStyle = hexToRgba(col, 0.25); gCtx.fill();
|
| 1444 |
-
|
| 1445 |
-
// Core dot
|
| 1446 |
-
gCtx.beginPath(); gCtx.arc(x, y, r, 0, Math.PI*2);
|
| 1447 |
-
gCtx.fillStyle = col; gCtx.fill();
|
| 1448 |
-
gCtx.strokeStyle = '#fff'; gCtx.lineWidth = Math.max(1.5, lw); gCtx.stroke();
|
| 1449 |
-
|
| 1450 |
-
// Centre dot
|
| 1451 |
-
gCtx.beginPath(); gCtx.arc(x, y, r * 0.3, 0, Math.PI*2);
|
| 1452 |
-
gCtx.fillStyle = '#fff'; gCtx.fill();
|
| 1453 |
-
|
| 1454 |
-
// Label
|
| 1455 |
-
gCtx.font = `bold ${fs}px JetBrains Mono, monospace`;
|
| 1456 |
-
const tw = gCtx.measureText(lbl).width;
|
| 1457 |
-
const ph = fs * 1.45, pw = tw + 12;
|
| 1458 |
-
const lx = Math.min(x + r + 6, W - pw);
|
| 1459 |
-
const ly = Math.max(0, y - ph/2);
|
| 1460 |
-
drawRoundRect(gCtx, lx, ly, pw, ph, 4);
|
| 1461 |
-
gCtx.fillStyle = col; gCtx.fill();
|
| 1462 |
-
gCtx.fillStyle = '#fff';
|
| 1463 |
-
gCtx.fillText(lbl, lx + 6, ly + ph * 0.74);
|
| 1464 |
-
}
|
| 1465 |
-
});
|
| 1466 |
-
|
| 1467 |
-
updateDownloadBtn();
|
| 1468 |
-
};
|
| 1469 |
-
|
| 1470 |
-
img.onerror = (e) => {
|
| 1471 |
-
dbg('Image load error: ' + e);
|
| 1472 |
-
groundPlaceholder.textContent = 'Failed to load image for overlay.';
|
| 1473 |
-
groundPlaceholder.style.display = 'flex';
|
| 1474 |
-
};
|
| 1475 |
-
|
| 1476 |
-
img.src = imgSrc;
|
| 1477 |
-
}
|
| 1478 |
|
| 1479 |
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1480 |
// COPY BUTTON
|
|
@@ -1493,8 +1277,8 @@ function resetCopyBtn() {
|
|
| 1493 |
</svg> COPY`;
|
| 1494 |
}
|
| 1495 |
copyBtn.onclick = () => {
|
| 1496 |
-
const txt = outputBox.innerText||'';
|
| 1497 |
-
if (!txt||txt==='Results will stream here...') return;
|
| 1498 |
navigator.clipboard.writeText(txt).then(() => {
|
| 1499 |
copyBtn.classList.add('copied');
|
| 1500 |
copyBtn.innerHTML = `
|
|
@@ -1502,14 +1286,45 @@ copyBtn.onclick = () => {
|
|
| 1502 |
stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round">
|
| 1503 |
<polyline points="20 6 9 17 4 12"/>
|
| 1504 |
</svg> COPIED`;
|
| 1505 |
-
clearTimeout(copyTimer);
|
|
|
|
| 1506 |
}).catch(() => {
|
| 1507 |
-
const ta=document.createElement('textarea');
|
| 1508 |
-
ta.style.position='fixed'; ta.style.opacity='0';
|
| 1509 |
-
document.body.appendChild(ta); ta.select();
|
|
|
|
| 1510 |
});
|
| 1511 |
};
|
| 1512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1513 |
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1514 |
// RUN INFERENCE
|
| 1515 |
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1518,41 +1333,38 @@ const btnLoader = document.getElementById('btnLoader');
|
|
| 1518 |
const allWires = ['wire-img-task','wire-model-task','wire-task-out','wire-task-gnd'];
|
| 1519 |
const dotTask = document.getElementById('dot-task');
|
| 1520 |
const dotOut = document.getElementById('dot-out');
|
| 1521 |
-
const dotGnd = document.getElementById('dot-gnd');
|
| 1522 |
|
| 1523 |
runBtn.onclick = async () => {
|
| 1524 |
if (!currentFile) { alert('Please upload an image into the Input Node.'); return; }
|
| 1525 |
const promptStr = promptInput.value.trim();
|
| 1526 |
if (!promptStr) { alert('Please enter a prompt directive.'); return; }
|
| 1527 |
|
| 1528 |
-
// Reset UI
|
| 1529 |
runBtn.disabled = true;
|
| 1530 |
btnLoader.style.display = 'inline-block';
|
| 1531 |
outputBox.innerText = '';
|
| 1532 |
outputBox.style.color = '';
|
| 1533 |
-
groundPlaceholder.style.display = 'flex';
|
| 1534 |
-
groundPlaceholder.textContent = 'Running inferenceβ¦';
|
| 1535 |
-
gCtx.clearRect(0, 0, groundCanvas.width, groundCanvas.height);
|
| 1536 |
-
groundCanvas.width = 1; // reset canvas
|
| 1537 |
-
groundCanvas.height = 1;
|
| 1538 |
-
downloadBtn.style.display = 'none';
|
| 1539 |
dotTask.classList.add('active');
|
| 1540 |
dotOut.classList.remove('active');
|
| 1541 |
-
dotGnd.classList.remove('active');
|
| 1542 |
allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
|
| 1543 |
resetCopyBtn();
|
| 1544 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1545 |
const formData = new FormData();
|
| 1546 |
formData.append('image', currentFile);
|
| 1547 |
-
formData.append('category',
|
| 1548 |
formData.append('prompt', promptStr);
|
| 1549 |
formData.append('model_id', modelSelect.value);
|
| 1550 |
|
| 1551 |
let fullText = '';
|
| 1552 |
-
// Create a stable object URL for this run
|
| 1553 |
-
const imgObjectURL = URL.createObjectURL(currentFile);
|
| 1554 |
|
| 1555 |
try {
|
|
|
|
| 1556 |
const response = await fetch('/api/run', { method: 'POST', body: formData });
|
| 1557 |
if (!response.ok) {
|
| 1558 |
const err = await response.json();
|
|
@@ -1561,18 +1373,18 @@ runBtn.onclick = async () => {
|
|
| 1561 |
|
| 1562 |
const reader = response.body.getReader();
|
| 1563 |
const decoder = new TextDecoder('utf-8');
|
| 1564 |
-
let
|
| 1565 |
|
| 1566 |
while (true) {
|
| 1567 |
const { value, done } = await reader.read();
|
| 1568 |
if (done) break;
|
| 1569 |
buffer += decoder.decode(value, { stream: true });
|
| 1570 |
const lines = buffer.split('\\n\\n');
|
| 1571 |
-
buffer = lines.pop();
|
| 1572 |
|
| 1573 |
for (const line of lines) {
|
| 1574 |
if (!line.startsWith('data: ')) continue;
|
| 1575 |
-
const payload = line.slice(6);
|
| 1576 |
if (payload === '[DONE]') break;
|
| 1577 |
try {
|
| 1578 |
const data = JSON.parse(payload);
|
|
@@ -1587,43 +1399,47 @@ runBtn.onclick = async () => {
|
|
| 1587 |
|
| 1588 |
dotOut.classList.add('active');
|
| 1589 |
|
| 1590 |
-
// ββ
|
| 1591 |
-
const cat = categorySelect.value;
|
| 1592 |
if ((cat === 'Point' || cat === 'Detect') && fullText.trim()) {
|
| 1593 |
-
|
| 1594 |
-
|
| 1595 |
-
|
| 1596 |
-
|
| 1597 |
-
|
| 1598 |
-
|
| 1599 |
-
|
| 1600 |
-
|
| 1601 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1602 |
} else {
|
| 1603 |
-
|
| 1604 |
-
|
| 1605 |
-
|
| 1606 |
-
|
| 1607 |
-
|
| 1608 |
}
|
| 1609 |
-
}
|
|
|
|
|
|
|
| 1610 |
} else if (cat !== 'Point' && cat !== 'Detect') {
|
| 1611 |
-
|
| 1612 |
-
groundPlaceholder.style.display = 'flex';
|
| 1613 |
}
|
| 1614 |
|
| 1615 |
} catch (err) {
|
| 1616 |
outputBox.innerText = `[Error] ${err.message}`;
|
| 1617 |
outputBox.style.color = '#ff6b6b';
|
| 1618 |
-
|
| 1619 |
-
|
|
|
|
| 1620 |
} finally {
|
| 1621 |
runBtn.disabled = false;
|
| 1622 |
btnLoader.style.display = 'none';
|
| 1623 |
dotTask.classList.remove('active');
|
| 1624 |
allWires.forEach(id => document.getElementById(id)?.classList.remove('active'));
|
| 1625 |
-
// Revoke object URL after a delay to allow canvas drawing
|
| 1626 |
-
setTimeout(() => URL.revokeObjectURL(imgObjectURL), 10000);
|
| 1627 |
}
|
| 1628 |
};
|
| 1629 |
</script>
|
|
|
|
| 4 |
import ast
|
| 5 |
import re
|
| 6 |
import uuid
|
| 7 |
+
import base64
|
| 8 |
import threading
|
| 9 |
+
import numpy as np
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import Optional
|
| 12 |
|
| 13 |
import spaces
|
| 14 |
import torch
|
| 15 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 16 |
|
| 17 |
from gradio import Server
|
| 18 |
from fastapi import Request, UploadFile, File, Form
|
|
|
|
| 53 |
print(f"Loading Qwen3-VL-2B model: {QWEN_VL_2B_MODEL_NAME} on {DEVICE}...")
|
| 54 |
try:
|
| 55 |
qwen_vl_2b_model = Qwen3VLForConditionalGeneration.from_pretrained(
|
| 56 |
+
QWEN_VL_2B_MODEL_NAME, trust_remote_code=True, torch_dtype=torch.bfloat16,
|
|
|
|
|
|
|
| 57 |
).to(DEVICE).eval()
|
| 58 |
+
qwen_vl_2b_processor = AutoProcessor.from_pretrained(QWEN_VL_2B_MODEL_NAME, trust_remote_code=True)
|
|
|
|
|
|
|
| 59 |
print("Qwen3-VL-2B model loaded successfully.")
|
| 60 |
except Exception as e:
|
| 61 |
print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
|
| 62 |
+
qwen_vl_2b_model = None; qwen_vl_2b_processor = None
|
|
|
|
| 63 |
|
| 64 |
# ── Qwen3-VL-4B-Instruct ────────────────────────────────
|
| 65 |
print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
|
| 66 |
try:
|
| 67 |
qwen_vl_4b_model = Qwen3VLForConditionalGeneration.from_pretrained(
|
| 68 |
+
QWEN_VL_4B_MODEL_NAME, trust_remote_code=True, torch_dtype=torch.bfloat16,
|
|
|
|
|
|
|
| 69 |
).to(DEVICE).eval()
|
| 70 |
+
qwen_vl_4b_processor = AutoProcessor.from_pretrained(QWEN_VL_4B_MODEL_NAME, trust_remote_code=True)
|
|
|
|
|
|
|
| 71 |
print("Qwen3-VL-4B model loaded successfully.")
|
| 72 |
except Exception as e:
|
| 73 |
print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
|
| 74 |
+
qwen_vl_4b_model = None; qwen_vl_4b_processor = None
|
|
|
|
| 75 |
|
| 76 |
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
|
| 77 |
print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
|
|
|
|
| 83 |
print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
|
| 84 |
except Exception as e:
|
| 85 |
print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
|
| 86 |
+
qwen_4b_unredacted_model = None; qwen_4b_unredacted_processor = None
|
|
|
|
| 87 |
|
| 88 |
# ── Qwen3.5-4B ──────────────────────────────────────────
|
| 89 |
print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
|
|
|
|
| 95 |
print("Qwen3.5-4B model loaded successfully.")
|
| 96 |
except Exception as e:
|
| 97 |
print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
|
| 98 |
+
qwen_4b_model = None; qwen_4b_processor = None
|
|
|
|
| 99 |
|
| 100 |
# ── Qwen3.5-2B ──────────────────────────────────────────
|
| 101 |
print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
|
|
|
|
| 107 |
print("Qwen3.5-2B model loaded successfully.")
|
| 108 |
except Exception as e:
|
| 109 |
print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
|
| 110 |
+
qwen_2b_model = None; qwen_2b_processor = None
|
|
|
|
| 111 |
|
| 112 |
# ── LFM2.5-VL-450M ──────────────────────────────────────
|
| 113 |
print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
|
| 114 |
try:
|
| 115 |
lfm_450_model = AutoModelForImageTextToText.from_pretrained(
|
| 116 |
+
LFM_450_MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16,
|
|
|
|
|
|
|
| 117 |
).eval()
|
| 118 |
lfm_450_processor = AutoProcessor.from_pretrained(LFM_450_MODEL_NAME)
|
| 119 |
print("LFM-450M model loaded successfully.")
|
| 120 |
except Exception as e:
|
| 121 |
print(f"Warning: LFM-450M model loading failed. Error: {e}")
|
| 122 |
+
lfm_450_model = None; lfm_450_processor = None
|
|
|
|
| 123 |
|
| 124 |
# ── Gemma4-E2B-it ───────────────────────────────────────
|
| 125 |
print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
|
| 126 |
try:
|
| 127 |
gemma4_e2b_model = Gemma4ForConditionalGeneration.from_pretrained(
|
| 128 |
+
GEMMA4_E2B_NAME, torch_dtype=torch.bfloat16,
|
|
|
|
| 129 |
device_map="auto" if torch.cuda.is_available() else None,
|
| 130 |
).eval()
|
| 131 |
if not torch.cuda.is_available():
|
|
|
|
| 134 |
print("Gemma4-E2B-it model loaded successfully.")
|
| 135 |
except Exception as e:
|
| 136 |
print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
|
| 137 |
+
gemma4_e2b_model = None; gemma4_e2b_processor = None
|
|
|
|
| 138 |
|
| 139 |
# ── LFM2.5-VL-1.6B ──────────────────────────────────────
|
| 140 |
print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
|
| 141 |
try:
|
| 142 |
lfm_16_model = AutoModelForImageTextToText.from_pretrained(
|
| 143 |
+
LFM_16_MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16,
|
|
|
|
|
|
|
| 144 |
).eval()
|
| 145 |
lfm_16_processor = AutoProcessor.from_pretrained(LFM_16_MODEL_NAME)
|
| 146 |
print("LFM-1.6B model loaded successfully.")
|
| 147 |
except Exception as e:
|
| 148 |
print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
|
| 149 |
+
lfm_16_model = None; lfm_16_processor = None
|
|
|
|
| 150 |
|
| 151 |
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
|
| 152 |
print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
|
|
|
|
| 158 |
print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
|
| 159 |
except Exception as e:
|
| 160 |
print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
|
| 161 |
+
qwen_unredacted_model = None; qwen_unredacted_processor = None
|
|
|
|
| 162 |
|
| 163 |
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
|
| 164 |
print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
|
| 165 |
try:
|
| 166 |
qwen25_vl_3b_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 167 |
+
QWEN25_VL_3B_NAME, torch_dtype="auto", device_map="auto",
|
|
|
|
|
|
|
| 168 |
).eval()
|
| 169 |
qwen25_vl_3b_processor = AutoProcessor.from_pretrained(QWEN25_VL_3B_NAME)
|
| 170 |
print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
|
| 171 |
except Exception as e:
|
| 172 |
print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
|
| 173 |
+
qwen25_vl_3b_model = None; qwen25_vl_3b_processor = None
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 177 |
+
# SERVER-SIDE ANNOTATION (mirrors the reference app exactly)
|
| 178 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 179 |
+
|
| 180 |
+
# (R, G, B) colours used for annotation overlays. The annotators pick an
# entry with ``PALETTE_RGB[i % len(PALETTE_RGB)]``, so colours repeat once
# there are more items than entries.
PALETTE_RGB = [
    (78, 205, 196),   # teal
    (124, 106, 247),  # purple
    (255, 107, 107),  # red
    (255, 217, 61),   # yellow
    (107, 203, 119),  # green
    (255, 146, 43),   # orange
    (204, 93, 232),   # violet
    (51, 154, 240),   # blue
]
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def _get_font(size: int = 14):
    """Return the first loadable TrueType font at ``size``, else PIL's default.

    Candidate font names/paths are tried in order with
    ``ImageFont.truetype``; a candidate that cannot be opened is skipped.
    When none can be opened, ``ImageFont.load_default()`` is returned.
    """
    candidates = (
        "DejaVuSans-Bold.ttf",
        "arial.ttf",
        "Arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    )
    for candidate in candidates:
        try:
            font = ImageFont.truetype(candidate, size)
        except OSError:  # IOError is the same class as OSError on Python 3
            continue
        return font
    return ImageFont.load_default()
|
| 201 |
|
| 202 |
|
|
|
|
| 203 |
def safe_parse_json(text: str):
    """Extract a JSON value from raw model output.

    Steps, in order:

    1. Remove ``<think>...</think>`` blocks (case-insensitive).
    2. Strip a leading markdown fence (three backticks, optionally followed
       by ``json``) and a trailing fence.
    3. Parse the remaining text as JSON.
    4. Otherwise scan every ``[`` / ``{`` from left to right and return the
       first position from which a complete JSON value decodes. Scanning in
       text order returns the outermost structure, so an object such as
       ``{"bbox_2d": [1, 2, 3, 4]}`` preceded by prose is returned whole
       rather than as its inner list, and a stray ``[word]`` before the
       payload is skipped instead of ending the search.
    5. Otherwise fall back to ``ast.literal_eval`` (accepts Python-style
       literals such as single-quoted dicts; it does not execute code).

    Args:
        text: Raw text produced by the model.

    Returns:
        The decoded value (usually a list or dict), or ``{}`` when nothing
        could be parsed.
    """
    text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE)
    text = text.strip()
    text = re.sub(r"^```(json)?", "", text)
    text = re.sub(r"```$", "", text)
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # raw_decode parses one JSON value starting at the given index and
    # ignores whatever follows it, which is exactly what is needed to pull
    # a payload out of surrounding prose.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[\[{]", text):
        try:
            value, _end = decoder.raw_decode(text, opener.start())
        except json.JSONDecodeError:
            continue
        return value

    try:
        return ast.literal_eval(text)
    except Exception:
        # Best-effort parser: callers treat an empty result as "no JSON".
        return {}
|
| 239 |
|
| 240 |
|
| 241 |
+
def annotate_detections(image: Image.Image, parsed) -> Image.Image:
    """
    Draw labelled bounding boxes on an RGB copy of ``image``.

    Args:
        image: Source picture. It is converted to RGB first; the converted
            image is the one drawn on and returned.
        parsed: A dict or a list of dicts. Each dict supplies a box as
            ``[x1, y1, x2, y2]`` under ``bbox_2d``, ``bbox`` or ``box`` and
            may name it via ``label``, ``class_name`` or ``name``
            (default ``"obj <n>"``).

    Returns:
        The annotated RGB image.

    Coordinates are rescaled by a heuristic on the largest value of a box:
    ``<= 1`` is read as fractions of the image size, ``<= 1000`` as the
    0-1000 scale, anything larger as pixels. Items that are not dicts, or
    whose box is missing, not four values long, or not numeric, are skipped
    so one bad entry does not abort the whole annotation.
    """
    image = image.convert("RGB")
    ow, oh = image.size
    # "RGBA" ink mode lets the translucent fills blend into the RGB image.
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, min(ow // 35, 22)))
    lw = max(2, ow // 200)  # border thickness depends only on image width
    pad = 5

    items = parsed if isinstance(parsed, list) else [parsed]
    for i, item in enumerate(items):
        if not isinstance(item, dict):
            continue
        bbox = item.get("bbox_2d") or item.get("bbox") or item.get("box")
        try:
            # Fails for None, wrong length, or non-numeric entries.
            x1, y1, x2, y2 = (float(v) for v in bbox)
        except (TypeError, ValueError):
            continue
        col = PALETTE_RGB[i % len(PALETTE_RGB)]

        # -- Normalise coordinates to pixels --------------------------------
        max_v = max(x1, y1, x2, y2)
        if max_v <= 1.0:        # 0-1 fraction
            x1, y1, x2, y2 = x1*ow, y1*oh, x2*ow, y2*oh
        elif max_v <= 1000.0:   # 0-1000 scale
            x1, y1, x2, y2 = x1/1000*ow, y1/1000*oh, x2/1000*ow, y2/1000*oh
        # else: already in pixels

        if x2 < x1:
            x1, x2 = x2, x1
        if y2 < y1:
            y1, y2 = y2, y1
        try:
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
        except (ValueError, OverflowError):
            # NaN / infinity cannot be placed on the canvas.
            continue

        # -- Fill (semi-transparent) ----------------------------------------
        draw.rectangle([x1, y1, x2, y2], fill=(*col, 46))

        # -- Border ---------------------------------------------------------
        # A single call with ``width`` replaces drawing ``lw`` inset
        # rectangles: the inset coordinates invert for boxes narrower than
        # 2*lw, which Pillow rejects with ValueError.
        draw.rectangle([x1, y1, x2, y2], outline=(*col, 255), width=lw)

        # -- Corner accent marks --------------------------------------------
        clen = max(10, min(int((x2-x1)*0.18), int((y2-y1)*0.18), 24))
        corners = [(x1, y1, 1, 1), (x2, y1, -1, 1), (x2, y2, -1, -1), (x1, y2, 1, -1)]
        for cx, cy, sx, sy in corners:
            draw.line([(cx, cy), (cx+sx*clen, cy)], fill=col, width=lw+1)
            draw.line([(cx, cy), (cx, cy+sy*clen)], fill=col, width=lw+1)

        # -- Label ----------------------------------------------------------
        label = str(item.get("label") or item.get("class_name") or item.get("name") or f"obj {i+1}")
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2]-bb[0], bb[3]-bb[1]
        except AttributeError:
            # Font object without getbbox(): fall back to getsize().
            tw, th = font_lbl.getsize(label)
        lx = max(0, min(x1, ow - tw - pad*2))
        # Put the label above the box when it fits, otherwise just inside it.
        above = y1 - th - pad*2
        ly = above if above >= 0 else y1 + 2
        draw.rectangle([lx, ly, lx+tw+pad*2, ly+th+pad*2], fill=(*col, 230))
        draw.text((lx+pad, ly+pad), label, fill=(255, 255, 255, 255), font=font_lbl)

    return image
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def annotate_points(image: Image.Image, parsed) -> Image.Image:
    """
    Draw labelled point markers on an RGB copy of ``image``.

    Args:
        image: Source picture. It is converted to RGB first; the converted
            image is the one drawn on and returned.
        parsed: A dict or a list of dicts. Each dict supplies a point as
            ``[x, y]`` under ``point_2d``, ``point`` or ``coord`` and may
            name it via ``label`` or ``name`` (default ``"pt <n>"``).

    Returns:
        The annotated RGB image.

    Coordinates are rescaled by a heuristic on the larger of the two values:
    ``<= 1`` is read as fractions of the image size, ``<= 1000`` as the
    0-1000 scale, anything larger as pixels. Items that are not dicts, or
    whose point is missing, not two values long, or not numeric, are skipped
    so one bad entry does not abort the whole annotation.
    """
    image = image.convert("RGB")
    ow, oh = image.size
    # "RGBA" ink mode lets the translucent rings blend into the RGB image.
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, min(ow // 35, 22)))

    # Marker geometry depends only on the image width, so compute it once.
    r = max(7, min(ow // 55, 18))
    ring = int(r*1.4)
    rp = max(2, r//4)
    pad = 5

    items = parsed if isinstance(parsed, list) else [parsed]
    for i, item in enumerate(items):
        if not isinstance(item, dict):
            continue
        pt = item.get("point_2d") or item.get("point") or item.get("coord")
        try:
            # Fails for None, wrong length, or non-numeric entries.
            x, y = (float(v) for v in pt)
        except (TypeError, ValueError):
            continue
        col = PALETTE_RGB[i % len(PALETTE_RGB)]

        # -- Normalise coordinates to pixels --------------------------------
        max_v = max(x, y)
        if max_v <= 1.0:
            x, y = x*ow, y*oh
        elif max_v <= 1000.0:
            x, y = x/1000*ow, y/1000*oh

        try:
            cx, cy = int(x), int(y)
        except (ValueError, OverflowError):
            # NaN / infinity cannot be placed on the canvas.
            continue

        # -- Glow rings -----------------------------------------------------
        draw.ellipse([cx-r*2, cy-r*2, cx+r*2, cy+r*2], fill=(*col, 38))
        draw.ellipse([cx-ring, cy-ring, cx+ring, cy+ring], fill=(*col, 64))

        # -- Core dot -------------------------------------------------------
        draw.ellipse([cx-r, cy-r, cx+r, cy+r], fill=(*col, 255),
                     outline=(255, 255, 255, 255), width=max(2, r//4))

        # -- Centre pip -----------------------------------------------------
        draw.ellipse([cx-rp, cy-rp, cx+rp, cy+rp], fill=(255, 255, 255, 255))

        # -- Label ----------------------------------------------------------
        label = str(item.get("label") or item.get("name") or f"pt {i+1}")
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2]-bb[0], bb[3]-bb[1]
        except AttributeError:
            # Font object without getbbox(): fall back to getsize().
            tw, th = font_lbl.getsize(label)
        # Clamp on both sides: a label wider than the image would otherwise
        # start at a negative x (annotate_detections clamps the same way).
        lx = max(0, min(cx + r + 8, ow - tw - pad*2))
        ly = max(0, cy - th//2 - pad)
        draw.rectangle([lx, ly, lx+tw+pad*2, ly+th+pad*2], fill=(*col, 220))
        draw.text((lx+pad, ly+pad), label, fill=(255, 255, 255, 255), font=font_lbl)

    return image
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def image_to_b64(img: Image.Image, fmt: str = "PNG") -> str:
    """Convert a PIL image to a base64 data-URI.

    Args:
        img: Image to serialise; only its ``save(fp, format=...)`` method
            is used.
        fmt: Format name handed to ``img.save`` (e.g. ``"PNG"``, ``"JPEG"``).

    Returns:
        A ``data:image/<fmt>;base64,...`` string holding the encoded image.
    """
    buf = io.BytesIO()
    img.save(buf, format=fmt)
    # The MIME subtype must follow the format actually written; a fixed
    # "png" prefix would mislabel JPEG/WEBP payloads.
    mime_subtype = fmt.lower()
    payload = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/{mime_subtype};base64,{payload}"
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
# ─────────────────────────────────────────────────────────────────────────────
# NEW ENDPOINT: /api/annotate
# Receives the image + raw model output text + category,
# runs server-side annotation, returns base64 PNG.
# ─────────────────────────────────────────────────────────────────────────────
|
| 384 |
+
@app.post("/api/annotate")
async def annotate_endpoint(
    image: UploadFile = File(...),
    text: str = Form(...),
    category: str = Form(...),
):
    """Draw model output onto the uploaded image and return it as base64.

    The upload is decoded to RGB and shrunk in place to fit within
    512x512 before any drawing happens. ``text`` is passed through
    ``safe_parse_json``; the parsed result is handed to
    ``annotate_detections`` for ``"Detect"`` or ``annotate_points`` for
    ``"Point"``.

    Returns a ``JSONResponse`` with:
        * ``{"b64": <data-URI>}`` on success,
        * ``{"error": "no_json", "b64": None}`` when nothing could be parsed,
        * ``{"error": "unsupported_category", "b64": None}`` for any other
          category,
        * ``{"error": <message>, "b64": None}`` with status 500 if anything
          raises.
    """
    try:
        raw_bytes = await image.read()
        canvas = Image.open(io.BytesIO(raw_bytes)).convert("RGB")
        canvas.thumbnail((512, 512))

        parsed = safe_parse_json(text)
        if not parsed:
            return JSONResponse({"error": "no_json", "b64": None})

        # Only the two grounding categories produce an overlay.
        if category not in ("Detect", "Point"):
            return JSONResponse({"error": "unsupported_category", "b64": None})

        if category == "Detect":
            rendered = annotate_detections(canvas, parsed)
        else:
            rendered = annotate_points(canvas, parsed)

        return JSONResponse({"b64": image_to_b64(rendered)})
    except Exception as e:
        # Endpoint boundary: report the failure to the client instead of crashing.
        return JSONResponse({"error": str(e), "b64": None}, status_code=500)
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
# ─────────────────────────────────────────────────────────────────────────────
# STREAMING INFERENCE
# ─────────────────────────────────────────────────────────────────────────────
|
| 414 |
@spaces.GPU(duration=120)
|
| 415 |
def generate_inference_stream(
|
| 416 |
image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
|
|
|
|
| 430 |
if model_id == "qwen_vl_2b":
|
| 431 |
if qwen_vl_2b_model is None or qwen_vl_2b_processor is None:
|
| 432 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
|
| 433 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 434 |
messages = [{"role": "user", "content": [
|
| 435 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 436 |
text_input = qwen_vl_2b_processor.apply_chat_template(
|
| 437 |
+
messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 438 |
inputs = qwen_vl_2b_processor(
|
| 439 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 440 |
).to(qwen_vl_2b_model.device)
|
| 441 |
+
streamer = TextIteratorStreamer(qwen_vl_2b_processor.tokenizer,
|
| 442 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 443 |
+
threading.Thread(target=qwen_vl_2b_model.generate,
|
| 444 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 445 |
+
use_cache=True, temperature=1.0, do_sample=True)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
for tok in streamer:
|
| 447 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 448 |
|
| 449 |
# ββ Qwen3-VL-4B βββββββββββββββββββββββββββββββββββββ
|
| 450 |
elif model_id == "qwen_vl_4b":
|
| 451 |
if qwen_vl_4b_model is None or qwen_vl_4b_processor is None:
|
| 452 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
|
| 453 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 454 |
messages = [{"role": "user", "content": [
|
| 455 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 456 |
text_input = qwen_vl_4b_processor.apply_chat_template(
|
| 457 |
+
messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 458 |
inputs = qwen_vl_4b_processor(
|
| 459 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 460 |
).to(qwen_vl_4b_model.device)
|
| 461 |
+
streamer = TextIteratorStreamer(qwen_vl_4b_processor.tokenizer,
|
| 462 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 463 |
+
threading.Thread(target=qwen_vl_4b_model.generate,
|
| 464 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 465 |
+
use_cache=True, temperature=1.0, do_sample=True)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
for tok in streamer:
|
| 467 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 468 |
|
| 469 |
# ββ Qwen3.5-4B-Unredacted-MAX βββββββββββββββββββββββ
|
| 470 |
elif model_id == "qwen_4b_unredacted":
|
| 471 |
if qwen_4b_unredacted_model is None or qwen_4b_unredacted_processor is None:
|
| 472 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
|
| 473 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 474 |
messages = [{"role": "user", "content": [
|
| 475 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 476 |
text_input = qwen_4b_unredacted_processor.apply_chat_template(
|
| 477 |
+
messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 478 |
inputs = qwen_4b_unredacted_processor(
|
| 479 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 480 |
).to(qwen_4b_unredacted_model.device)
|
| 481 |
+
streamer = TextIteratorStreamer(qwen_4b_unredacted_processor.tokenizer,
|
| 482 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 483 |
+
threading.Thread(target=qwen_4b_unredacted_model.generate,
|
| 484 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 485 |
+
use_cache=True, temperature=1.5, min_p=0.1)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
for tok in streamer:
|
| 487 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 488 |
|
| 489 |
# ββ Qwen3.5-4B βββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββ
|
| 490 |
elif model_id == "qwen_4b":
|
| 491 |
if qwen_4b_model is None or qwen_4b_processor is None:
|
| 492 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
|
| 493 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 494 |
messages = [{"role": "user", "content": [
|
| 495 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 496 |
text_input = qwen_4b_processor.apply_chat_template(
|
| 497 |
+
messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 498 |
inputs = qwen_4b_processor(
|
| 499 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 500 |
).to(qwen_4b_model.device)
|
| 501 |
+
streamer = TextIteratorStreamer(qwen_4b_processor.tokenizer,
|
| 502 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 503 |
+
threading.Thread(target=qwen_4b_model.generate,
|
| 504 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 505 |
+
use_cache=True, temperature=1.5, min_p=0.1)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
for tok in streamer:
|
| 507 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 508 |
|
| 509 |
# ββ Qwen3.5-2B ββββββββββββββββββββββββββββββββββββββ
|
| 510 |
elif model_id == "qwen_2b":
|
| 511 |
if qwen_2b_model is None or qwen_2b_processor is None:
|
| 512 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
|
| 513 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 514 |
messages = [{"role": "user", "content": [
|
| 515 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 516 |
text_input = qwen_2b_processor.apply_chat_template(
|
| 517 |
+
messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 518 |
inputs = qwen_2b_processor(
|
| 519 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 520 |
).to(qwen_2b_model.device)
|
| 521 |
+
streamer = TextIteratorStreamer(qwen_2b_processor.tokenizer,
|
| 522 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 523 |
+
threading.Thread(target=qwen_2b_model.generate,
|
| 524 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 525 |
+
use_cache=True, temperature=1.5, min_p=0.1)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
for tok in streamer:
|
| 527 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 528 |
|
| 529 |
# ββ LFM-450M ββββββββββββββββββββββββββββββββββββββββ
|
| 530 |
elif model_id == "lfm_450":
|
| 531 |
if lfm_450_model is None or lfm_450_processor is None:
|
| 532 |
yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
|
| 533 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 534 |
conversation = [{"role": "user", "content": [
|
| 535 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 536 |
inputs = lfm_450_processor.apply_chat_template(
|
| 537 |
conversation, add_generation_prompt=True,
|
| 538 |
return_tensors="pt", return_dict=True, tokenize=True,
|
| 539 |
).to(lfm_450_model.device)
|
| 540 |
+
streamer = TextIteratorStreamer(lfm_450_processor.tokenizer,
|
| 541 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 542 |
+
threading.Thread(target=lfm_450_model.generate,
|
| 543 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 544 |
+
use_cache=True)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 545 |
for tok in streamer:
|
| 546 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 547 |
|
| 548 |
# ββ Gemma4-E2B-it βββββββββββββββββββββββββββββββββββ
|
| 549 |
elif model_id == "gemma4_e2b":
|
| 550 |
if gemma4_e2b_model is None or gemma4_e2b_processor is None:
|
| 551 |
yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
|
| 552 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 553 |
messages = [{"role": "user", "content": [
|
| 554 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 555 |
text_input = gemma4_e2b_processor.apply_chat_template(
|
| 556 |
+
messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 557 |
inputs = gemma4_e2b_processor(
|
| 558 |
text=[text_input], images=[image], return_tensors="pt", padding=True,
|
| 559 |
).to(gemma4_e2b_model.device)
|
| 560 |
+
streamer = TextIteratorStreamer(gemma4_e2b_processor.tokenizer,
|
| 561 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 562 |
+
threading.Thread(target=gemma4_e2b_model.generate,
|
| 563 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 564 |
+
use_cache=True, temperature=1.0, do_sample=True)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
for tok in streamer:
|
| 566 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 567 |
|
| 568 |
# ββ LFM-1.6B ββββββββββββββββββββββββββββββββββββββββ
|
| 569 |
elif model_id == "lfm_16":
|
| 570 |
if lfm_16_model is None or lfm_16_processor is None:
|
| 571 |
yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
|
| 572 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 573 |
conversation = [{"role": "user", "content": [
|
| 574 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 575 |
inputs = lfm_16_processor.apply_chat_template(
|
| 576 |
conversation, add_generation_prompt=True,
|
| 577 |
return_tensors="pt", return_dict=True, tokenize=True,
|
| 578 |
).to(lfm_16_model.device)
|
| 579 |
+
streamer = TextIteratorStreamer(lfm_16_processor.tokenizer,
|
| 580 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 581 |
+
threading.Thread(target=lfm_16_model.generate,
|
| 582 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 583 |
+
use_cache=True)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
for tok in streamer:
|
| 585 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 586 |
|
| 587 |
# ββ Qwen3.5-2B-Unredacted-MAX βββββββββββββββββββββββ
|
| 588 |
elif model_id == "qwen_unredacted":
|
| 589 |
if qwen_unredacted_model is None or qwen_unredacted_processor is None:
|
| 590 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
|
| 591 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 592 |
messages = [{"role": "user", "content": [
|
| 593 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 594 |
text_input = qwen_unredacted_processor.apply_chat_template(
|
| 595 |
+
messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 596 |
inputs = qwen_unredacted_processor(
|
| 597 |
text=[text_input], images=[image], return_tensors="pt", padding=True
|
| 598 |
).to(qwen_unredacted_model.device)
|
| 599 |
+
streamer = TextIteratorStreamer(qwen_unredacted_processor.tokenizer,
|
| 600 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 601 |
+
threading.Thread(target=qwen_unredacted_model.generate,
|
| 602 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 603 |
+
use_cache=True, temperature=1.5, min_p=0.1)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
for tok in streamer:
|
| 605 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 606 |
|
| 607 |
# ββ Qwen2.5-VL-3B-Instruct ββββββββββββββββββββββββββ
|
| 608 |
elif model_id == "qwen25_vl_3b":
|
| 609 |
if qwen25_vl_3b_model is None or qwen25_vl_3b_processor is None:
|
| 610 |
yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
|
| 611 |
+
yield "data: [DONE]\n\n"; return
|
|
|
|
| 612 |
messages = [{"role": "user", "content": [
|
| 613 |
+
{"type": "image", "image": image}, {"type": "text", "text": full_prompt}]}]
|
|
|
|
|
|
|
| 614 |
text_input = qwen25_vl_3b_processor.apply_chat_template(
|
| 615 |
+
messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 616 |
image_inputs, video_inputs = process_vision_info(messages)
|
| 617 |
inputs = qwen25_vl_3b_processor(
|
| 618 |
+
text=[text_input], images=image_inputs, videos=video_inputs,
|
| 619 |
+
return_tensors="pt", padding=True,
|
|
|
|
|
|
|
|
|
|
| 620 |
).to(qwen25_vl_3b_model.device)
|
| 621 |
+
streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer,
|
| 622 |
+
skip_prompt=True, skip_special_tokens=True, timeout=120)
|
| 623 |
+
threading.Thread(target=qwen25_vl_3b_model.generate,
|
| 624 |
+
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
|
| 625 |
+
use_cache=True, temperature=1.0, do_sample=True)).start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 626 |
for tok in streamer:
|
| 627 |
+
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
|
|
|
|
|
|
|
| 628 |
|
| 629 |
yield "data: [DONE]\n\n"
|
| 630 |
|
| 631 |
|
| 632 |
+
# ─────────────────────────────────────────────────────────────────────────────
# FastAPI Endpoints
# ─────────────────────────────────────────────────────────────────────────────
|
| 635 |
@app.post("/api/run")
|
| 636 |
async def run_inference(
|
| 637 |
image: UploadFile = File(...),
|
|
|
|
| 651 |
return JSONResponse({"error": str(e)}, status_code=500)
|
| 652 |
|
| 653 |
|
| 654 |
+
# ─────────────────────────────────────────────────────────────────────────────
# Frontend UI
# ─────────────────────────────────────────────────────────────────────────────
|
| 657 |
@app.get("/", response_class=HTMLResponse)
|
| 658 |
async def homepage(request: Request):
|
| 659 |
return """
|
|
|
|
| 707 |
.top-bar .sub { font-size: 11px; color: var(--muted); }
|
| 708 |
.top-bar .badge {
|
| 709 |
margin-left: auto;
|
| 710 |
+
background: rgba(124,106,247,0.15); border: 1px solid rgba(124,106,247,0.3);
|
| 711 |
+
padding: 3px 10px; border-radius: 20px; font-size: 10px; color: var(--accent);
|
|
|
|
|
|
|
| 712 |
}
|
| 713 |
/* ββ Canvas ββ */
|
| 714 |
#canvas {
|
|
|
|
| 716 |
min-height: calc(100vh - 42px); height: 900px; margin: 0 auto;
|
| 717 |
}
|
| 718 |
svg.wires {
|
| 719 |
+
position: absolute; top: 0; left: 0; width: 100%; height: 100%;
|
|
|
|
| 720 |
pointer-events: none; z-index: 2; overflow: visible;
|
| 721 |
}
|
| 722 |
path.wire { fill: none; stroke: var(--wire); stroke-width: 2.5; stroke-linecap: round; }
|
|
|
|
| 797 |
border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
|
| 798 |
}
|
| 799 |
.img-chip.visible { display: flex; }
|
| 800 |
+
.img-chip .chip-dot { width:5px;height:5px;border-radius:50%;background:var(--accent2);flex-shrink:0;box-shadow:0 0 4px var(--accent2); }
|
| 801 |
+
.img-chip .chip-name { overflow:hidden;text-overflow:ellipsis;white-space:nowrap;flex:1;color:var(--text);font-size:9px; }
|
| 802 |
+
.img-chip .chip-size { color:var(--muted);flex-shrink:0;font-size:9px; }
|
| 803 |
select, textarea {
|
| 804 |
width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
|
| 805 |
color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
|
|
|
|
| 822 |
/* ββ Output node ββ */
|
| 823 |
.output-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
|
| 824 |
.output-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
|
| 825 |
+
/* ββ Icon buttons ββ */
|
| 826 |
.icon-btn {
|
| 827 |
display: flex; align-items: center; gap: 5px;
|
| 828 |
background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
|
|
|
|
| 830 |
font-size: 9px; font-weight: 700; font-family: 'JetBrains Mono', monospace;
|
| 831 |
color: var(--accent); cursor: pointer; letter-spacing: 0.05em;
|
| 832 |
transition: background 0.18s, border-color 0.18s, transform 0.1s; flex-shrink: 0;
|
| 833 |
+
text-decoration: none; border: 1px solid rgba(124,106,247,0.25);
|
| 834 |
}
|
| 835 |
.icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
|
| 836 |
.icon-btn:active { transform: scale(0.95); }
|
| 837 |
+
.icon-btn.teal { background:rgba(78,205,196,0.10);border-color:rgba(78,205,196,0.25);color:var(--accent2); }
|
| 838 |
+
.icon-btn.teal:hover { background:rgba(78,205,196,0.22);border-color:var(--accent2); }
|
| 839 |
+
.icon-btn.copied { background:rgba(78,205,196,0.15);border-color:var(--accent2);color:var(--accent2); }
|
| 840 |
+
.icon-btn svg { pointer-events:none;flex-shrink:0; }
|
|
|
|
|
|
|
| 841 |
.output-box {
|
| 842 |
background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
|
| 843 |
border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
|
|
|
|
| 847 |
/* ββ Grounding node βοΏ½οΏ½ */
|
| 848 |
.ground-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
|
| 849 |
.ground-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
|
| 850 |
+
.ground-img-wrap {
|
| 851 |
position: relative; flex: 1; border: 1px solid var(--node-border);
|
| 852 |
border-radius: 5px; overflow: hidden; background: #111; min-height: 0;
|
| 853 |
+
display: flex; align-items: center; justify-content: center;
|
| 854 |
}
|
| 855 |
+
.ground-img-wrap img {
|
| 856 |
+
width: 100%; height: 100%; object-fit: contain; display: block;
|
|
|
|
|
|
|
|
|
|
| 857 |
}
|
| 858 |
.ground-placeholder {
|
| 859 |
position: absolute; inset: 0; display: flex; align-items: center;
|
| 860 |
+
justify-content: center; font-size: 11px; color: var(--muted);
|
| 861 |
+
text-align: center; padding: 10px; pointer-events: none; z-index: 5;
|
| 862 |
}
|
| 863 |
.loader {
|
| 864 |
width: 11px; height: 11px; border: 2px solid rgba(255,255,255,0.3);
|
|
|
|
| 866 |
animation: spin 0.7s linear infinite; display: none;
|
| 867 |
}
|
| 868 |
@keyframes spin { to { transform: rotate(360deg); } }
|
| 869 |
+
.status-dot { width:6px;height:6px;border-radius:50%;background:var(--muted);display:inline-block;margin-right:6px; }
|
| 870 |
+
.status-dot.active { background:var(--accent2);box-shadow:0 0 5px var(--accent2); }
|
| 871 |
/* ββ Model badges ββ */
|
| 872 |
.model-badge {
|
| 873 |
+
display:inline-block;padding:2px 7px;border-radius:4px;
|
| 874 |
+
font-size:9px;font-weight:700;letter-spacing:0.06em;text-transform:uppercase;
|
| 875 |
}
|
| 876 |
+
.model-badge.qvl2b { background:rgba(255,150,50,0.15); color:#ff9632; border:1px solid rgba(255,150,50,0.35); }
|
| 877 |
+
.model-badge.qvl4b { background:rgba(255,100,80,0.15); color:#ff6450; border:1px solid rgba(255,100,80,0.35); }
|
| 878 |
+
.model-badge.q4bunred { background:rgba(255,80,80,0.18); color:#ff5050; border:1px solid rgba(255,80,80,0.40); }
|
| 879 |
+
.model-badge.q4b { background:rgba(255,200,80,0.15); color:#ffc850; border:1px solid rgba(255,200,80,0.35); }
|
| 880 |
+
.model-badge.q2b { background:rgba(124,106,247,0.2); color:var(--accent); border:1px solid rgba(124,106,247,0.3); }
|
| 881 |
+
.model-badge.lfm450 { background:rgba(78,205,196,0.15); color:var(--accent2); border:1px solid rgba(78,205,196,0.3); }
|
| 882 |
+
.model-badge.g4e2b { background:rgba(66,197,107,0.15); color:#42c56b; border:1px solid rgba(66,197,107,0.35); }
|
| 883 |
+
.model-badge.lfm16 { background:rgba(107,203,119,0.15);color:#6bcb77; border:1px solid rgba(107,203,119,0.35); }
|
| 884 |
+
.model-badge.qunred { background:rgba(255,80,160,0.15); color:#ff50a0; border:1px solid rgba(255,80,160,0.35); }
|
| 885 |
+
.model-badge.q25vl3b { background:rgba(80,180,255,0.15); color:#50b4ff; border:1px solid rgba(80,180,255,0.35); }
|
| 886 |
+
.model-info-box { border-radius:6px;padding:9px;font-size:10px;color:var(--muted);line-height:1.55;flex-shrink:0; }
|
| 887 |
.canvas-footer { height: 36px; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 888 |
</style>
|
| 889 |
</head>
|
| 890 |
<body>
|
|
|
|
| 896 |
<span class="badge">10x Vision Models</span>
|
| 897 |
</div>
|
| 898 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 899 |
<div id="canvas">
|
| 900 |
<svg class="wires">
|
| 901 |
<path id="wire-img-task" class="wire" />
|
|
|
|
| 1049 |
SAVE
|
| 1050 |
</a>
|
| 1051 |
</div>
|
| 1052 |
+
<div class="ground-img-wrap">
|
| 1053 |
+
<!-- Server-rendered annotated image displayed here -->
|
| 1054 |
+
<img id="groundImg" src="" alt="" style="display:none;" />
|
| 1055 |
<div class="ground-placeholder" id="groundPlaceholder">
|
| 1056 |
Active for Point / Detect tasks.<br>Run inference to visualise.
|
| 1057 |
</div>
|
|
|
|
| 1103 |
});
|
| 1104 |
document.addEventListener('mousemove', e => {
|
| 1105 |
if (!drag) return;
|
| 1106 |
+
node.style.left=`${il+e.clientX-sx}px`;
|
| 1107 |
+
node.style.top=`${it+e.clientY-sy}px`;
|
| 1108 |
updateWires();
|
| 1109 |
});
|
| 1110 |
document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
|
|
|
|
| 1129 |
const dotImg = document.getElementById('dot-img');
|
| 1130 |
|
| 1131 |
function formatBytes(b) {
|
| 1132 |
+
if (b<1024) return b+' B';
|
| 1133 |
+
if (b<1048576) return (b/1024).toFixed(1)+' KB';
|
| 1134 |
return (b/1048576).toFixed(1)+' MB';
|
| 1135 |
}
|
| 1136 |
function handleFile(file) {
|
| 1137 |
+
if (!file || !file.type.startsWith('image/')) return;
|
| 1138 |
+
currentFile = file;
|
| 1139 |
+
imgPreview.src = URL.createObjectURL(file);
|
| 1140 |
+
previewWrap.classList.add('visible');
|
| 1141 |
+
dropZone.style.display = 'none';
|
| 1142 |
+
chipName.textContent = file.name;
|
| 1143 |
+
chipSize.textContent = formatBytes(file.size);
|
| 1144 |
+
imgChip.classList.add('visible');
|
| 1145 |
+
dotImg.classList.add('active');
|
| 1146 |
requestAnimationFrame(updateWires);
|
| 1147 |
}
|
| 1148 |
function clearImage() {
|
| 1149 |
+
currentFile = null;
|
| 1150 |
+
imgPreview.src = '';
|
| 1151 |
+
previewWrap.classList.remove('visible');
|
| 1152 |
+
dropZone.style.display = '';
|
| 1153 |
+
imgChip.classList.remove('visible');
|
| 1154 |
+
chipName.textContent = 'β';
|
| 1155 |
+
chipSize.textContent = '';
|
| 1156 |
+
fileInput.value = '';
|
| 1157 |
+
dotImg.classList.remove('active');
|
| 1158 |
+
requestAnimationFrame(updateWires);
|
| 1159 |
}
|
| 1160 |
dropZone.onclick = () => fileInput.click();
|
| 1161 |
fileInput.onchange = e => handleFile(e.target.files[0]);
|
|
|
|
| 1177 |
|
| 1178 |
const MODEL_INFO = {
|
| 1179 |
qwen_vl_2b: {
|
| 1180 |
+
html: `<span class="model-badge qvl2b">QWEN3-VL Β· 2B</span><br><br>
|
| 1181 |
+
Qwen3-VL-2B-Instruct β dedicated vision-language model by Alibaba Cloud.
|
| 1182 |
+
Strong spatial grounding, OCR & instruction-following.`,
|
| 1183 |
bg: 'rgba(255,150,50,0.07)', border: 'rgba(255,150,50,0.30)',
|
| 1184 |
},
|
| 1185 |
qwen_vl_4b: {
|
| 1186 |
+
html: `<span class="model-badge qvl4b">QWEN3-VL Β· 4B</span><br><br>
|
| 1187 |
+
Qwen3-VL-4B-Instruct β enhanced vision-language model by Alibaba Cloud.
|
| 1188 |
+
Superior spatial grounding, richer OCR & stronger multi-step reasoning.`,
|
| 1189 |
bg: 'rgba(255,100,80,0.07)', border: 'rgba(255,100,80,0.25)',
|
| 1190 |
},
|
| 1191 |
qwen_4b_unredacted: {
|
| 1192 |
+
html: `<span class="model-badge q4bunred">QWEN 3.5 Β· 4B UNREDACTED MAX</span><br><br>
|
| 1193 |
+
Qwen3.5-4B-Unredacted-MAX by prithivMLmods. Uncensored fine-tune of Qwen3.5-4B
|
| 1194 |
+
with extended instruction-following & unrestricted reasoning.`,
|
| 1195 |
bg: 'rgba(255,80,80,0.07)', border: 'rgba(255,80,80,0.30)',
|
| 1196 |
},
|
| 1197 |
qwen_4b: {
|
| 1198 |
+
html: `<span class="model-badge q4b">QWEN 3.5 Β· 4B</span><br><br>
|
| 1199 |
+
Qwen3.5 4B multimodal model by Alibaba Cloud.
|
| 1200 |
+
Enhanced capacity β richer reasoning & better instruction following.`,
|
| 1201 |
bg: 'rgba(255,200,80,0.07)', border: 'rgba(255,200,80,0.30)',
|
| 1202 |
},
|
| 1203 |
qwen_2b: {
|
| 1204 |
+
html: `<span class="model-badge q2b">QWEN 3.5 Β· 2B</span><br><br>
|
| 1205 |
+
Qwen3.5 2B multimodal model by Alibaba Cloud.
|
| 1206 |
+
Lightweight & fast β ideal for quick tasks.`,
|
| 1207 |
bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
|
| 1208 |
},
|
| 1209 |
lfm_450: {
|
| 1210 |
+
html: `<span class="model-badge lfm450">LFM Β· 450M</span><br><br>
|
| 1211 |
+
LFM2.5-VL 450M by LiquidAI. Ultra-lightweight edge model
|
| 1212 |
+
with solid grounding capabilities.`,
|
| 1213 |
bg: 'rgba(78,205,196,0.07)', border: 'rgba(78,205,196,0.25)',
|
| 1214 |
},
|
| 1215 |
gemma4_e2b: {
|
| 1216 |
+
html: `<span class="model-badge g4e2b">GEMMA 4 Β· E2B</span><br><br>
|
| 1217 |
+
Gemma4-E2B-it by Google DeepMind. Efficient 2B multimodal model
|
| 1218 |
+
with strong vision-language understanding & instruction-following.`,
|
| 1219 |
bg: 'rgba(66,197,107,0.07)', border: 'rgba(66,197,107,0.25)',
|
| 1220 |
},
|
| 1221 |
lfm_16: {
|
| 1222 |
+
html: `<span class="model-badge lfm16">LFM Β· 1.6B</span><br><br>
|
| 1223 |
+
LFM2.5-VL 1.6B by LiquidAI. Larger liquid-state model offering
|
| 1224 |
+
enhanced reasoning & richer visual understanding.`,
|
| 1225 |
bg: 'rgba(107,203,119,0.07)', border: 'rgba(107,203,119,0.25)',
|
| 1226 |
},
|
| 1227 |
qwen_unredacted: {
|
| 1228 |
+
html: `<span class="model-badge qunred">QWEN 3.5 Β· 2B UNREDACTED MAX</span><br><br>
|
| 1229 |
+
Qwen3.5-2B-Unredacted-MAX by prithivMLmods. Fine-tuned variant of Qwen3.5-2B
|
| 1230 |
+
with uncensored & extended instruction-following capabilities.`,
|
| 1231 |
bg: 'rgba(255,80,160,0.07)', border: 'rgba(255,80,160,0.25)',
|
| 1232 |
},
|
| 1233 |
qwen25_vl_3b: {
|
| 1234 |
+
html: `<span class="model-badge q25vl3b">QWEN 2.5-VL Β· 3B</span><br><br>
|
| 1235 |
+
Qwen2.5-VL-3B-Instruct by Alibaba Cloud. Powerful 3B vision-language model
|
| 1236 |
+
with strong grounding, OCR & multi-task visual reasoning.`,
|
| 1237 |
bg: 'rgba(80,180,255,0.07)', border: 'rgba(80,180,255,0.25)',
|
| 1238 |
},
|
| 1239 |
};
|
|
|
|
| 1256 |
Point: 'e.g., The gun held by the person.',
|
| 1257 |
Detect: 'e.g., The headlight of the car.',
|
| 1258 |
};
|
| 1259 |
+
categorySelect.onchange = e => {
|
| 1260 |
+
promptInput.placeholder = PLACEHOLDERS[e.target.value] || '';
|
| 1261 |
+
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1262 |
|
| 1263 |
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1264 |
// COPY BUTTON
|
|
|
|
| 1277 |
</svg> COPY`;
|
| 1278 |
}
|
| 1279 |
copyBtn.onclick = () => {
|
| 1280 |
+
const txt = outputBox.innerText || '';
|
| 1281 |
+
if (!txt || txt === 'Results will stream here...') return;
|
| 1282 |
navigator.clipboard.writeText(txt).then(() => {
|
| 1283 |
copyBtn.classList.add('copied');
|
| 1284 |
copyBtn.innerHTML = `
|
|
|
|
| 1286 |
stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round">
|
| 1287 |
<polyline points="20 6 9 17 4 12"/>
|
| 1288 |
</svg> COPIED`;
|
| 1289 |
+
clearTimeout(copyTimer);
|
| 1290 |
+
copyTimer = setTimeout(resetCopyBtn, 2000);
|
| 1291 |
}).catch(() => {
|
| 1292 |
+
const ta = document.createElement('textarea');
|
| 1293 |
+
ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0';
|
| 1294 |
+
document.body.appendChild(ta); ta.select();
|
| 1295 |
+
document.execCommand('copy'); document.body.removeChild(ta);
|
| 1296 |
});
|
| 1297 |
};
|
| 1298 |
|
| 1299 |
+
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1300 |
+
// GROUNDING IMAGE (server-rendered, base64)
|
| 1301 |
+
// ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1302 |
+
const groundImg = document.getElementById('groundImg');
|
| 1303 |
+
const groundPlaceholder = document.getElementById('groundPlaceholder');
|
| 1304 |
+
const downloadBtn = document.getElementById('downloadBtn');
|
| 1305 |
+
const dotGnd = document.getElementById('dot-gnd');
|
| 1306 |
+
|
| 1307 |
+
function showGroundingImage(b64DataUri) {
|
| 1308 |
+
groundImg.src = b64DataUri;
|
| 1309 |
+
groundImg.style.display = 'block';
|
| 1310 |
+
groundPlaceholder.style.display = 'none';
|
| 1311 |
+
// Wire up download button
|
| 1312 |
+
const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
|
| 1313 |
+
downloadBtn.href = b64DataUri;
|
| 1314 |
+
downloadBtn.download = `grounding_${ts}.png`;
|
| 1315 |
+
downloadBtn.style.display = 'flex';
|
| 1316 |
+
dotGnd.classList.add('active');
|
| 1317 |
+
}
|
| 1318 |
+
|
| 1319 |
+
/**
 * Return the grounding node to its placeholder state.
 *
 * Hides the image and the download link, shows the placeholder text and
 * clears the node's status dot.
 *
 * @param {string} [msg] Placeholder text to display. Any falsy value
 *   falls back to the default hint.
 */
function resetGrounding(msg) {
  // Remove the attribute instead of assigning '': an empty `src` is still
  // a present attribute, which makes the browser fire an `error` event
  // (and made some legacy browsers re-request the page itself).
  groundImg.removeAttribute('src');
  groundImg.style.display = 'none';

  groundPlaceholder.textContent = msg || 'Active for Point / Detect tasks. Run inference to visualise.';
  groundPlaceholder.style.display = 'flex';

  // Hide the download link and drop the previous image's data URI so a
  // stale result is neither retained nor downloadable.
  downloadBtn.style.display = 'none';
  downloadBtn.removeAttribute('href');
  downloadBtn.removeAttribute('download');

  dotGnd.classList.remove('active');
}
|
| 1327 |
+
|
| 1328 |
// ──────────────────────────────────────────────
|
| 1329 |
// RUN INFERENCE
|
| 1330 |
// ──────────────────────────────────────────────
|
|
|
|
| 1333 |
// Element ids of the connector wires; the run handler toggles the
// 'active' class on each of them while a request is in flight.
const allWires = [
  'wire-img-task',
  'wire-model-task',
  'wire-task-out',
  'wire-task-gnd',
];

// Status dots for the task node and the output node.
const [dotTask, dotOut] = ['dot-task', 'dot-out'].map((id) => document.getElementById(id));
|
|
|
|
| 1336 |
|
| 1337 |
runBtn.onclick = async () => {
|
| 1338 |
if (!currentFile) { alert('Please upload an image into the Input Node.'); return; }
|
| 1339 |
const promptStr = promptInput.value.trim();
|
| 1340 |
if (!promptStr) { alert('Please enter a prompt directive.'); return; }
|
| 1341 |
|
| 1342 |
+
// ── Reset UI ─────────────────────────────────────────
|
| 1343 |
runBtn.disabled = true;
|
| 1344 |
btnLoader.style.display = 'inline-block';
|
| 1345 |
outputBox.innerText = '';
|
| 1346 |
outputBox.style.color = '';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1347 |
dotTask.classList.add('active');
|
| 1348 |
dotOut.classList.remove('active');
|
|
|
|
| 1349 |
allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
|
| 1350 |
resetCopyBtn();
|
| 1351 |
|
| 1352 |
+
const cat = categorySelect.value;
|
| 1353 |
+
if (cat === 'Point' || cat === 'Detect') {
|
| 1354 |
+
resetGrounding('Running inferenceβ¦');
|
| 1355 |
+
}
|
| 1356 |
+
|
| 1357 |
+
// ── Build FormData ────────────────────────────────────
|
| 1358 |
const formData = new FormData();
|
| 1359 |
formData.append('image', currentFile);
|
| 1360 |
+
formData.append('category', cat);
|
| 1361 |
formData.append('prompt', promptStr);
|
| 1362 |
formData.append('model_id', modelSelect.value);
|
| 1363 |
|
| 1364 |
let fullText = '';
|
|
|
|
|
|
|
| 1365 |
|
| 1366 |
try {
|
| 1367 |
+
// ── 1. Stream inference ───────────────────────────
|
| 1368 |
const response = await fetch('/api/run', { method: 'POST', body: formData });
|
| 1369 |
if (!response.ok) {
|
| 1370 |
const err = await response.json();
|
|
|
|
| 1373 |
|
| 1374 |
const reader = response.body.getReader();
|
| 1375 |
const decoder = new TextDecoder('utf-8');
|
| 1376 |
+
let buffer = '';
|
| 1377 |
|
| 1378 |
while (true) {
|
| 1379 |
const { value, done } = await reader.read();
|
| 1380 |
if (done) break;
|
| 1381 |
buffer += decoder.decode(value, { stream: true });
|
| 1382 |
const lines = buffer.split('\\n\\n');
|
| 1383 |
+
buffer = lines.pop(); // keep incomplete chunk
|
| 1384 |
|
| 1385 |
for (const line of lines) {
|
| 1386 |
if (!line.startsWith('data: ')) continue;
|
| 1387 |
+
const payload = line.slice(6);
|
| 1388 |
if (payload === '[DONE]') break;
|
| 1389 |
try {
|
| 1390 |
const data = JSON.parse(payload);
|
|
|
|
| 1399 |
|
| 1400 |
dotOut.classList.add('active');
|
| 1401 |
|
| 1402 |
+
// ── 2. Server-side annotation for Point / Detect ──
|
|
|
|
| 1403 |
if ((cat === 'Point' || cat === 'Detect') && fullText.trim()) {
|
| 1404 |
+
resetGrounding('Annotating imageβ¦');
|
| 1405 |
+
try {
|
| 1406 |
+
const annForm = new FormData();
|
| 1407 |
+
annForm.append('image', currentFile);
|
| 1408 |
+
annForm.append('text', fullText);
|
| 1409 |
+
annForm.append('category', cat);
|
| 1410 |
+
|
| 1411 |
+
const annResp = await fetch('/api/annotate', {
|
| 1412 |
+
method: 'POST', body: annForm,
|
| 1413 |
+
});
|
| 1414 |
+
const annData = await annResp.json();
|
| 1415 |
+
|
| 1416 |
+
if (annData.b64) {
|
| 1417 |
+
showGroundingImage(annData.b64);
|
| 1418 |
} else {
|
| 1419 |
+
resetGrounding(
|
| 1420 |
+
annData.error === 'no_json'
|
| 1421 |
+
? 'No grounding coordinates found in model output.'
|
| 1422 |
+
: `Annotation error: ${annData.error || 'unknown'}`
|
| 1423 |
+
);
|
| 1424 |
}
|
| 1425 |
+
} catch (annErr) {
|
| 1426 |
+
resetGrounding(`Annotation failed: ${annErr.message}`);
|
| 1427 |
+
}
|
| 1428 |
} else if (cat !== 'Point' && cat !== 'Detect') {
|
| 1429 |
+
resetGrounding('Active for Point / Detect tasks. Run inference to visualise.');
|
|
|
|
| 1430 |
}
|
| 1431 |
|
| 1432 |
} catch (err) {
|
| 1433 |
outputBox.innerText = `[Error] ${err.message}`;
|
| 1434 |
outputBox.style.color = '#ff6b6b';
|
| 1435 |
+
if (cat === 'Point' || cat === 'Detect') {
|
| 1436 |
+
resetGrounding('Inference error β see Output Stream node.');
|
| 1437 |
+
}
|
| 1438 |
} finally {
|
| 1439 |
runBtn.disabled = false;
|
| 1440 |
btnLoader.style.display = 'none';
|
| 1441 |
dotTask.classList.remove('active');
|
| 1442 |
allWires.forEach(id => document.getElementById(id)?.classList.remove('active'));
|
|
|
|
|
|
|
| 1443 |
}
|
| 1444 |
};
|
| 1445 |
</script>
|