prithivMLmods commited on
Commit
4592a59
·
verified ·
1 Parent(s): 3267287

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +738 -395
app.py CHANGED
@@ -18,6 +18,7 @@ from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
18
  from transformers import (
19
  Qwen3_5ForConditionalGeneration,
20
  AutoProcessor,
 
21
  TextIteratorStreamer,
22
  )
23
 
@@ -31,18 +32,35 @@ DTYPE = (
31
  else torch.float16
32
  )
33
 
34
- MODEL_NAME = "Qwen/Qwen3.5-2B"
 
35
  CATEGORIES = ["Query", "Caption", "Point", "Detect"]
36
 
37
- print(f"Loading model: {MODEL_NAME} on {DEVICE}...")
38
  try:
39
  qwen_model = Qwen3_5ForConditionalGeneration.from_pretrained(
40
- MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
41
  ).eval()
42
- qwen_processor = AutoProcessor.from_pretrained(MODEL_NAME)
43
- print("Model loaded successfully.")
44
  except Exception as e:
45
- print(f"Warning: Model loading failed (Check if weights exist). Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  # --- Utility Functions ---
@@ -63,7 +81,7 @@ def safe_parse_json(text: str):
63
 
64
  # --- Inference Generator (Streaming) ---
65
  @spaces.GPU(duration=120)
66
- def generate_inference_stream(image: Image.Image, category: str, prompt: str):
67
  if category == "Query":
68
  full_prompt = prompt
69
  elif category == "Caption":
@@ -75,51 +93,102 @@ def generate_inference_stream(image: Image.Image, category: str, prompt: str):
75
  else:
76
  full_prompt = prompt
77
 
78
- messages = [
79
- {
80
- "role": "user",
81
- "content": [
82
- {"type": "image", "image": image},
83
- {"type": "text", "text": full_prompt},
84
- ],
85
- }
86
- ]
87
-
88
- text_input = qwen_processor.apply_chat_template(
89
- messages, tokenize=False, add_generation_prompt=True
90
- )
91
-
92
- inputs = qwen_processor(
93
- text=[text_input], images=[image], return_tensors="pt", padding=True
94
- ).to(qwen_model.device)
95
-
96
- streamer = TextIteratorStreamer(
97
- qwen_processor.tokenizer,
98
- skip_prompt=True,
99
- skip_special_tokens=True,
100
- timeout=120,
101
- )
102
-
103
- thread = threading.Thread(
104
- target=qwen_model.generate,
105
- kwargs=dict(
106
- **inputs,
107
- streamer=streamer,
108
- max_new_tokens=1024,
109
- use_cache=True,
110
- temperature=1.5,
111
- min_p=0.1,
112
- ),
113
- )
114
- thread.start()
115
-
116
- # Stream out tokens as SSE (Server-Sent Events)
117
- for tok in streamer:
118
- if tok:
119
- # Yield as JSON chunks encoded for SSE
120
- yield f"data: {json.dumps({'chunk': tok})}\n\n"
121
-
122
- thread.join()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  yield "data: [DONE]\n\n"
124
 
125
 
@@ -129,393 +198,667 @@ async def run_inference(
129
  image: UploadFile = File(...),
130
  category: str = Form(...),
131
  prompt: str = Form(...),
 
132
  ):
133
  try:
134
- # Load and resize image
135
  img_bytes = await image.read()
136
  img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
137
  img.thumbnail((512, 512))
138
 
139
- # Return streaming response wrapping our inference generator
140
  return StreamingResponse(
141
- generate_inference_stream(img, category, prompt),
142
  media_type="text/event-stream"
143
  )
144
  except Exception as e:
145
  return JSONResponse({"error": str(e)}, status_code=500)
146
 
147
 
148
- # --- Frontend UI (HTML/JS/CSS) ---
149
  @app.get("/", response_class=HTMLResponse)
150
  async def homepage(request: Request):
151
- return f"""
152
  <!DOCTYPE html>
153
  <html lang="en">
154
  <head>
155
  <meta charset="UTF-8">
156
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
157
  <title>Multimodal-Edge-Comparator</title>
158
- <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
159
  <style>
160
- :root {{
161
- --bg-color: #121212;
162
- --grid-color: #242424;
163
- --node-bg: #1e1e1e;
164
- --node-header: #2a2a2a;
165
- --node-border: #3a3a3a;
166
- --accent: #5e81ac;
167
- --accent-hover: #81a1c1;
168
- --text-main: #eceff4;
169
- --text-muted: #8fbcbb;
170
- --port-color: #88c0d0;
171
- --wire-color: #4c566a;
172
- }}
173
-
174
- * {{ box-sizing: border-box; font-family: 'Inter', sans-serif; }}
175
-
176
- body {{
 
177
  margin: 0; padding: 0; overflow: hidden;
178
- background-color: var(--bg-color);
179
- background-image:
180
- linear-gradient(var(--grid-color) 1px, transparent 1px),
181
- linear-gradient(90deg, var(--grid-color) 1px, transparent 1px);
182
- background-size: 20px 20px;
183
- color: var(--text-main);
 
 
 
184
  user-select: none;
185
- }}
186
 
187
- .top-bar {{
188
- position: absolute; top: 0; left: 0; right: 0;
189
- height: 50px; background: rgba(18,18,18,0.8);
190
  border-bottom: 1px solid var(--node-border);
191
  display: flex; align-items: center; padding: 0 20px;
192
- font-weight: 700; font-size: 18px; z-index: 100;
193
- backdrop-filter: blur(4px);
194
- color: var(--port-color);
195
- pointer-events: none;
196
- }}
197
-
198
- /* Canvas & SVG Wires */
199
- #canvas {{ position: relative; width: 100vw; height: 100vh; }}
200
- svg {{ position: absolute; top: 0; left: 0; width: 100%; height: 100%; pointer-events: none; z-index: 1; }}
201
- path.wire {{ fill: none; stroke: var(--wire-color); stroke-width: 3; stroke-linecap: round; }}
202
- path.wire.active {{ stroke: var(--accent); stroke-width: 4; animation: flow 1s linear infinite; }}
203
-
204
- @keyframes flow {{
205
- 0% {{ stroke-dashoffset: 20; }}
206
- 100% {{ stroke-dashoffset: 0; }}
207
- }}
208
-
209
- /* Nodes */
210
- .node {{
211
- position: absolute; width: 340px;
212
- background: var(--node-bg); border: 1px solid var(--node-border);
213
- border-radius: 8px; box-shadow: 0 8px 24px rgba(0,0,0,0.5);
214
- z-index: 10; display: flex; flex-direction: column;
215
- }}
216
-
217
- .node-header {{
218
- background: var(--node-header); padding: 12px 15px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  border-bottom: 1px solid var(--node-border);
220
- border-radius: 8px 8px 0 0; font-weight: 600; font-size: 14px;
221
- cursor: grab; display: flex; justify-content: space-between;
222
- }}
223
- .node-header:active {{ cursor: grabbing; }}
224
-
225
- .node-body {{ padding: 15px; display: flex; flex-direction: column; gap: 12px; }}
226
-
227
- /* Ports */
228
- .port {{
229
- position: absolute; width: 14px; height: 14px;
230
- background: var(--node-bg); border: 2px solid var(--port-color);
231
- border-radius: 50%; z-index: 20; top: 50%; transform: translateY(-50%);
232
- }}
233
- .port.out {{ right: -8px; }}
234
- .port.in {{ left: -8px; }}
235
-
236
- /* Form Elements */
237
- label {{ font-size: 12px; color: var(--text-muted); font-weight: 500; display: block; margin-bottom: 4px; }}
238
-
239
- input[type="file"] {{ display: none; }}
240
- .file-upload {{
241
- border: 2px dashed var(--node-border); border-radius: 6px;
242
- padding: 20px; text-align: center; cursor: pointer;
243
- font-size: 13px; color: var(--text-muted); transition: border 0.2s;
244
- }}
245
- .file-upload:hover {{ border-color: var(--accent); }}
246
-
247
- .img-preview {{ width: 100%; height: auto; max-height: 280px; object-fit: contain; border-radius: 4px; display: none; margin-top: 10px; background: #000; }}
248
-
249
- select, textarea {{
250
- width: 100%; background: #121212; border: 1px solid var(--node-border);
251
- color: var(--text-main); padding: 10px; border-radius: 6px;
252
- outline: none; font-size: 13px; resize: none;
253
- }}
254
- select:focus, textarea:focus {{ border-color: var(--accent); }}
255
-
256
- button {{
257
- background: #ff69b4; color: #fff; border: none;
258
- padding: 10px; border-radius: 6px; font-weight: 600; font-size: 14px;
259
- cursor: pointer; transition: background 0.2s; display: flex; justify-content: center; gap: 8px; align-items: center;
260
- }}
261
- button:hover {{ background: #ff1493; }}
262
- button:disabled {{ background: var(--node-border); cursor: not-allowed; color: #888; }}
263
-
264
- .output-box {{
265
- background: #121212; border: 1px solid var(--node-border);
266
- border-radius: 6px; padding: 12px; min-height: 150px; max-height: 300px;
267
- overflow-y: auto; font-size: 13px; line-height: 1.5; color: #d8dee9;
268
- white-space: pre-wrap; user-select: text;
269
- }}
270
-
271
- .loader {{ width: 14px; height: 14px; border: 2px solid #fff; border-bottom-color: transparent; border-radius: 50%; display: inline-block; animation: rotation 1s linear infinite; display: none; }}
272
- @keyframes rotation {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  </style>
274
  </head>
275
  <body>
276
 
277
- <div class="top-bar">
278
- Multimodal Edge Comparator [Qwen3.5-2B]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  </div>
280
 
281
- <div id="canvas">
282
- <svg>
283
- <path id="wire1" class="wire" />
284
- <path id="wire2" class="wire" />
285
- </svg>
286
-
287
- <!-- Node 1: Image Input -->
288
- <div class="node" id="node-img" style="left: 100px; top: 150px;">
289
- <div class="node-header"><span>Input Node</span><span>ID: 01</span></div>
290
- <div class="node-body">
291
- <div>
292
- <label>Upload Image</label>
293
- <div class="file-upload" id="dropZone">
294
- Click or Drop Image Here
295
- <input type="file" id="fileInput" accept="image/*">
296
- </div>
297
- <img id="imgPreview" class="img-preview" />
298
- </div>
299
  </div>
300
- <div class="port out" id="port-img-out"></div>
301
  </div>
 
 
 
302
 
303
- <!-- Node 2: Task Execution -->
304
- <div class="node" id="node-task" style="left: 550px; top: 150px;">
305
- <div class="port in" id="port-task-in" style="top: 30%;"></div>
306
- <div class="node-header"><span>Task Node</span><span>ID: 02</span></div>
307
- <div class="node-body">
308
- <div>
309
- <label>Task Category</label>
310
- <select id="categorySelect">
311
- <option value="Query" selected>Query</option>
312
- <option value="Caption">Caption</option>
313
- <option value="Point">Point</option>
314
- <option value="Detect">Detect</option>
315
- </select>
316
- </div>
317
- <div>
318
- <label>Prompt Directive</label>
319
- <textarea id="promptInput" rows="4" placeholder="e.g., Count the total number of boats and describe the environment."></textarea>
320
- </div>
321
- <button id="runBtn">
322
- <span>Quick Execution</span>
323
- <span class="loader" id="btnLoader"></span>
324
- </button>
325
  </div>
326
- <div class="port out" id="port-task-out" style="top: 70%;"></div>
 
 
 
327
  </div>
 
 
 
328
 
329
- <!-- Node 3: Output Display -->
330
- <div class="node" id="node-out" style="left: 1000px; top: 150px; width: 380px;">
331
- <div class="port in" id="port-out-in"></div>
332
- <div class="node-header"><span>Output Node</span><span>ID: 03</span></div>
333
- <div class="node-body">
334
- <label>Streamed Result</label>
335
- <div class="output-box" id="outputBox">Results will stream here...</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  </div>
337
  </div>
338
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
- <script>
341
- // --- 1. Node Canvas & Dragging Logic ---
342
- function getCenter(elementId) {{
343
- const el = document.getElementById(elementId);
344
- const rect = el.getBoundingClientRect();
345
- return {{ x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 }};
346
- }}
347
-
348
- function drawCurve(p1, p2) {{
349
- const dx = Math.abs(p2.x - p1.x) * 0.5;
350
- return `M ${{p1.x}} ${{p1.y}} C ${{p1.x + dx}} ${{p1.y}}, ${{p2.x - dx}} ${{p2.y}}, ${{p2.x}} ${{p2.y}}`;
351
- }}
352
-
353
- function updateWires() {{
354
- const pImgOut = getCenter('port-img-out');
355
- const pTaskIn = getCenter('port-task-in');
356
- const pTaskOut = getCenter('port-task-out');
357
- const pOutIn = getCenter('port-out-in');
358
-
359
- document.getElementById('wire1').setAttribute('d', drawCurve(pImgOut, pTaskIn));
360
- document.getElementById('wire2').setAttribute('d', drawCurve(pTaskOut, pOutIn));
361
- }}
362
-
363
- // Initialize node dragging
364
- document.querySelectorAll('.node').forEach(node => {{
365
- const header = node.querySelector('.node-header');
366
- let isDragging = false, startX, startY, initialLeft, initialTop;
367
-
368
- header.addEventListener('mousedown', (e) => {{
369
- isDragging = true;
370
- startX = e.clientX; startY = e.clientY;
371
- initialLeft = parseInt(node.style.left || 0);
372
- initialTop = parseInt(node.style.top || 0);
373
- node.style.zIndex = 100; // bring to front
374
- }});
375
-
376
- document.addEventListener('mousemove', (e) => {{
377
- if (!isDragging) return;
378
- const dx = e.clientX - startX;
379
- const dy = e.clientY - startY;
380
- node.style.left = `${{initialLeft + dx}}px`;
381
- node.style.top = `${{initialTop + dy}}px`;
382
- updateWires();
383
- }});
384
-
385
- document.addEventListener('mouseup', () => {{
386
- if (isDragging) {{
387
- isDragging = false;
388
- node.style.zIndex = 10;
389
- }}
390
- }});
391
- }});
392
-
393
- window.addEventListener('resize', updateWires);
394
- updateWires(); // initial draw
395
-
396
-
397
- // --- 2. Application Logic ---
398
- let currentFile = null;
399
-
400
- // File Upload Handlers
401
- const dropZone = document.getElementById('dropZone');
402
- const fileInput = document.getElementById('fileInput');
403
- const imgPreview = document.getElementById('imgPreview');
404
-
405
- function handleFile(file) {{
406
- if (file && file.type.startsWith('image/')) {{
407
- currentFile = file;
408
- imgPreview.src = URL.createObjectURL(file);
409
- imgPreview.style.display = 'block';
410
- dropZone.style.display = 'none';
411
- updateWires(); // redraw wires because node size changed
412
- }}
413
- }}
414
-
415
- dropZone.onclick = () => fileInput.click();
416
- fileInput.onchange = e => handleFile(e.target.files[0]);
417
- dropZone.ondragover = e => {{ e.preventDefault(); dropZone.style.borderColor = "var(--accent)"; }};
418
- dropZone.ondragleave = e => {{ dropZone.style.borderColor = "var(--node-border)"; }};
419
- dropZone.ondrop = e => {{
420
- e.preventDefault();
421
- dropZone.style.borderColor = "var(--node-border)";
422
- if(e.dataTransfer.files.length) handleFile(e.dataTransfer.files[0]);
423
- }};
424
-
425
- // Category Selection Updates
426
- const categorySelect = document.getElementById('categorySelect');
427
- const promptInput = document.getElementById('promptInput');
428
- const placeholders = {{
429
- "Query": "e.g., Count the total number of boats and describe the environment.",
430
- "Caption": "e.g., short, normal, detailed",
431
- "Point": "e.g., The gun held by the person.",
432
- "Detect": "e.g., The headlight of the car."
433
- }};
434
-
435
- categorySelect.onchange = (e) => {{
436
- const val = e.target.value;
437
- promptInput.placeholder = placeholders[val] || "Enter your prompt here.";
438
- }};
439
-
440
- // Execution Logic
441
- const runBtn = document.getElementById('runBtn');
442
- const btnLoader = document.getElementById('btnLoader');
443
- const outputBox = document.getElementById('outputBox');
444
- const wire1 = document.getElementById('wire1');
445
- const wire2 = document.getElementById('wire2');
446
-
447
- runBtn.onclick = async () => {{
448
- if (!currentFile) return alert("Please upload an image into the Input Node.");
449
- const promptStr = promptInput.value.trim();
450
- if (!promptStr) return alert("Please enter a prompt directive in the Task Node.");
451
-
452
- // UI State updates
453
- runBtn.disabled = true;
454
- btnLoader.style.display = 'inline-block';
455
- outputBox.innerText = ""; // clear
456
- wire1.classList.add('active');
457
- wire2.classList.add('active');
458
-
459
- const formData = new FormData();
460
- formData.append("image", currentFile);
461
- formData.append("category", categorySelect.value);
462
- formData.append("prompt", promptStr);
463
-
464
- try {{
465
- const response = await fetch('/api/run', {{
466
- method: 'POST',
467
- body: formData
468
- }});
469
-
470
- if (!response.ok) {{
471
- const err = await response.json();
472
- throw new Error(err.error || "Execution failed.");
473
- }}
474
-
475
- // Streaming Reader setup
476
- const reader = response.body.getReader();
477
- const decoder = new TextDecoder('utf-8');
478
- let buffer = '';
479
-
480
- while (true) {{
481
- const {{ value, done }} = await reader.read();
482
- if (done) break;
483
-
484
- buffer += decoder.decode(value, {{ stream: true }});
485
-
486
- // Parse SSE chunks
487
- const lines = buffer.split('\\n\\n');
488
- buffer = lines.pop(); // keep last incomplete chunk
489
-
490
- for (const line of lines) {{
491
- if (line.startsWith('data: ')) {{
492
- const payload = line.replace('data: ', '');
493
- if (payload === '[DONE]') break;
494
-
495
- try {{
496
- const data = JSON.parse(payload);
497
- if (data.chunk) {{
498
- outputBox.innerText += data.chunk;
499
- outputBox.scrollTop = outputBox.scrollHeight; // auto-scroll
500
- }}
501
- }} catch(e) {{
502
- console.error("Chunk parsing error:", e);
503
- }}
504
- }}
505
- }}
506
- }}
507
- }} catch (error) {{
508
- outputBox.innerText = `[Error] ${{error.message}}`;
509
- outputBox.style.color = "#bf616a"; // Red error tint
510
- }} finally {{
511
- // Revert UI State
512
- runBtn.disabled = false;
513
- btnLoader.style.display = 'none';
514
- wire1.classList.remove('active');
515
- wire2.classList.remove('active');
516
- }}
517
- }};
518
- </script>
519
  </body>
520
  </html>
521
  """
 
18
  from transformers import (
19
  Qwen3_5ForConditionalGeneration,
20
  AutoProcessor,
21
+ AutoModelForImageTextToText,
22
  TextIteratorStreamer,
23
  )
24
 
 
32
  else torch.float16
33
  )
34
 
35
+ QWEN_MODEL_NAME = "Qwen/Qwen3.5-2B"
36
+ LFM_MODEL_NAME = "LiquidAI/LFM2.5-VL-450M"
37
  CATEGORIES = ["Query", "Caption", "Point", "Detect"]
38
 
39
+ print(f"Loading Qwen model: {QWEN_MODEL_NAME} on {DEVICE}...")
40
  try:
41
  qwen_model = Qwen3_5ForConditionalGeneration.from_pretrained(
42
+ QWEN_MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
43
  ).eval()
44
+ qwen_processor = AutoProcessor.from_pretrained(QWEN_MODEL_NAME)
45
+ print("Qwen model loaded successfully.")
46
  except Exception as e:
47
+ print(f"Warning: Qwen model loading failed. Error: {e}")
48
+ qwen_model = None
49
+ qwen_processor = None
50
+
51
+ print(f"Loading LFM model: {LFM_MODEL_NAME} on {DEVICE}...")
52
+ try:
53
+ lfm_model = AutoModelForImageTextToText.from_pretrained(
54
+ LFM_MODEL_NAME,
55
+ device_map="auto",
56
+ torch_dtype=torch.bfloat16,
57
+ ).eval()
58
+ lfm_processor = AutoProcessor.from_pretrained(LFM_MODEL_NAME)
59
+ print("LFM model loaded successfully.")
60
+ except Exception as e:
61
+ print(f"Warning: LFM model loading failed. Error: {e}")
62
+ lfm_model = None
63
+ lfm_processor = None
64
 
65
 
66
  # --- Utility Functions ---
 
81
 
82
  # --- Inference Generator (Streaming) ---
83
  @spaces.GPU(duration=120)
84
+ def generate_inference_stream(image: Image.Image, category: str, prompt: str, model_id: str = "qwen"):
85
  if category == "Query":
86
  full_prompt = prompt
87
  elif category == "Caption":
 
93
  else:
94
  full_prompt = prompt
95
 
96
+ if model_id == "lfm":
97
+ # LFM2.5-VL inference
98
+ if lfm_model is None or lfm_processor is None:
99
+ yield f"data: {json.dumps({'chunk': '[Error] LFM model not loaded.'})}\n\n"
100
+ yield "data: [DONE]\n\n"
101
+ return
102
+
103
+ conversation = [
104
+ {
105
+ "role": "user",
106
+ "content": [
107
+ {"type": "image", "image": image},
108
+ {"type": "text", "text": full_prompt},
109
+ ],
110
+ }
111
+ ]
112
+
113
+ inputs = lfm_processor.apply_chat_template(
114
+ conversation,
115
+ add_generation_prompt=True,
116
+ return_tensors="pt",
117
+ return_dict=True,
118
+ tokenize=True,
119
+ ).to(lfm_model.device)
120
+
121
+ streamer = TextIteratorStreamer(
122
+ lfm_processor.tokenizer,
123
+ skip_prompt=True,
124
+ skip_special_tokens=True,
125
+ timeout=120,
126
+ )
127
+
128
+ thread = threading.Thread(
129
+ target=lfm_model.generate,
130
+ kwargs=dict(
131
+ **inputs,
132
+ streamer=streamer,
133
+ max_new_tokens=1024,
134
+ use_cache=True,
135
+ ),
136
+ )
137
+ thread.start()
138
+ for tok in streamer:
139
+ if tok:
140
+ yield f"data: {json.dumps({'chunk': tok})}\n\n"
141
+ thread.join()
142
+
143
+ else:
144
+ # Qwen3.5 inference
145
+ if qwen_model is None or qwen_processor is None:
146
+ yield f"data: {json.dumps({'chunk': '[Error] Qwen model not loaded.'})}\n\n"
147
+ yield "data: [DONE]\n\n"
148
+ return
149
+
150
+ messages = [
151
+ {
152
+ "role": "user",
153
+ "content": [
154
+ {"type": "image", "image": image},
155
+ {"type": "text", "text": full_prompt},
156
+ ],
157
+ }
158
+ ]
159
+
160
+ text_input = qwen_processor.apply_chat_template(
161
+ messages, tokenize=False, add_generation_prompt=True
162
+ )
163
+
164
+ inputs = qwen_processor(
165
+ text=[text_input], images=[image], return_tensors="pt", padding=True
166
+ ).to(qwen_model.device)
167
+
168
+ streamer = TextIteratorStreamer(
169
+ qwen_processor.tokenizer,
170
+ skip_prompt=True,
171
+ skip_special_tokens=True,
172
+ timeout=120,
173
+ )
174
+
175
+ thread = threading.Thread(
176
+ target=qwen_model.generate,
177
+ kwargs=dict(
178
+ **inputs,
179
+ streamer=streamer,
180
+ max_new_tokens=1024,
181
+ use_cache=True,
182
+ temperature=1.5,
183
+ min_p=0.1,
184
+ ),
185
+ )
186
+ thread.start()
187
+ for tok in streamer:
188
+ if tok:
189
+ yield f"data: {json.dumps({'chunk': tok})}\n\n"
190
+ thread.join()
191
+
192
  yield "data: [DONE]\n\n"
193
 
194
 
 
198
  image: UploadFile = File(...),
199
  category: str = Form(...),
200
  prompt: str = Form(...),
201
+ model_id: str = Form("qwen"),
202
  ):
203
  try:
 
204
  img_bytes = await image.read()
205
  img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
206
  img.thumbnail((512, 512))
207
 
 
208
  return StreamingResponse(
209
+ generate_inference_stream(img, category, prompt, model_id),
210
  media_type="text/event-stream"
211
  )
212
  except Exception as e:
213
  return JSONResponse({"error": str(e)}, status_code=500)
214
 
215
 
216
+ # --- Frontend UI ---
217
  @app.get("/", response_class=HTMLResponse)
218
  async def homepage(request: Request):
219
+ return """
220
  <!DOCTYPE html>
221
  <html lang="en">
222
  <head>
223
  <meta charset="UTF-8">
224
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
225
  <title>Multimodal-Edge-Comparator</title>
226
+ <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;600;700&family=Space+Mono:wght@400;700&display=swap" rel="stylesheet">
227
  <style>
228
+ :root {
229
+ --bg: #0d0d0f;
230
+ --grid: #1a1a1f;
231
+ --node-bg: #13131a;
232
+ --node-header: #1c1c26;
233
+ --node-border: #2a2a3a;
234
+ --accent: #7c6af7;
235
+ --accent2: #4ecdc4;
236
+ --accent3: #ff6b6b;
237
+ --text: #e8e8f0;
238
+ --muted: #6b6b8a;
239
+ --port: #4ecdc4;
240
+ --wire: #2a2a4a;
241
+ --wire-active: #7c6af7;
242
+ }
243
+
244
+ * { box-sizing: border-box; }
245
+ body {
246
  margin: 0; padding: 0; overflow: hidden;
247
+ background: var(--bg);
248
+ background-image:
249
+ radial-gradient(circle at 20% 50%, rgba(124,106,247,0.04) 0%, transparent 50%),
250
+ radial-gradient(circle at 80% 20%, rgba(78,205,196,0.04) 0%, transparent 50%),
251
+ linear-gradient(var(--grid) 1px, transparent 1px),
252
+ linear-gradient(90deg, var(--grid) 1px, transparent 1px);
253
+ background-size: 100% 100%, 100% 100%, 24px 24px, 24px 24px;
254
+ color: var(--text);
255
+ font-family: 'JetBrains Mono', monospace;
256
  user-select: none;
257
+ }
258
 
259
+ .top-bar {
260
+ position: fixed; top: 0; left: 0; right: 0;
261
+ height: 48px; background: rgba(13,13,15,0.9);
262
  border-bottom: 1px solid var(--node-border);
263
  display: flex; align-items: center; padding: 0 20px;
264
+ gap: 12px; z-index: 1000;
265
+ backdrop-filter: blur(12px);
266
+ }
267
+ .top-bar .logo { font-size: 13px; font-weight: 700; color: var(--accent); letter-spacing: 0.05em; }
268
+ .top-bar .sep { color: var(--node-border); }
269
+ .top-bar .sub { font-size: 11px; color: var(--muted); }
270
+ .top-bar .badge {
271
+ margin-left: auto; background: rgba(124,106,247,0.15);
272
+ border: 1px solid rgba(124,106,247,0.3); padding: 3px 10px;
273
+ border-radius: 20px; font-size: 10px; color: var(--accent);
274
+ }
275
+
276
+ #canvas {
277
+ position: relative; width: 100vw; height: 100vh;
278
+ padding-top: 48px;
279
+ overflow: hidden;
280
+ }
281
+
282
+ svg.wires {
283
+ position: absolute; top: 0; left: 0;
284
+ width: 100%; height: 100%;
285
+ pointer-events: none; z-index: 2;
286
+ }
287
+
288
+ path.wire {
289
+ fill: none; stroke: var(--wire); stroke-width: 2.5;
290
+ stroke-linecap: round;
291
+ }
292
+ path.wire.active {
293
+ stroke: var(--wire-active); stroke-width: 3;
294
+ stroke-dasharray: 8 4;
295
+ animation: flow 0.6s linear infinite;
296
+ }
297
+ @keyframes flow { to { stroke-dashoffset: -24; } }
298
+
299
+ /* ─── Nodes ─── */
300
+ .node {
301
+ position: absolute;
302
+ width: 300px;
303
+ background: var(--node-bg);
304
+ border: 1px solid var(--node-border);
305
+ border-radius: 10px;
306
+ box-shadow: 0 12px 40px rgba(0,0,0,0.6), 0 0 0 0px rgba(124,106,247,0);
307
+ z-index: 10;
308
+ display: flex;
309
+ flex-direction: column;
310
+ transition: box-shadow 0.2s;
311
+ }
312
+ .node:hover {
313
+ box-shadow: 0 12px 40px rgba(0,0,0,0.6), 0 0 0 1px rgba(124,106,247,0.3);
314
+ }
315
+
316
+ /* Fixed height nodes */
317
+ .node.fixed-height {
318
+ height: 420px;
319
+ }
320
+
321
+ .node-header {
322
+ background: var(--node-header);
323
+ padding: 10px 14px;
324
  border-bottom: 1px solid var(--node-border);
325
+ border-radius: 10px 10px 0 0;
326
+ font-size: 11px; font-weight: 700;
327
+ cursor: grab; display: flex; justify-content: space-between; align-items: center;
328
+ flex-shrink: 0;
329
+ }
330
+ .node-header:active { cursor: grabbing; }
331
+ .node-header .id {
332
+ font-size: 10px; color: var(--muted);
333
+ background: rgba(255,255,255,0.04);
334
+ padding: 2px 8px; border-radius: 4px;
335
+ }
336
+
337
+ .node-body {
338
+ padding: 14px;
339
+ display: flex; flex-direction: column; gap: 12px;
340
+ flex: 1; overflow: hidden;
341
+ }
342
+
343
+ /* ─── Ports ─── */
344
+ .port {
345
+ position: absolute;
346
+ width: 12px; height: 12px;
347
+ background: var(--node-bg);
348
+ border: 2px solid var(--port);
349
+ border-radius: 50%;
350
+ z-index: 30;
351
+ }
352
+ .port.out { right: -7px; }
353
+ .port.in { left: -7px; }
354
+
355
+ /* ─── Form Elements ─── */
356
+ label {
357
+ font-size: 10px; color: var(--muted);
358
+ font-weight: 600; display: block; margin-bottom: 4px;
359
+ letter-spacing: 0.08em; text-transform: uppercase;
360
+ }
361
+
362
+ input[type="file"] { display: none; }
363
+
364
+ .file-upload {
365
+ border: 1.5px dashed var(--node-border);
366
+ border-radius: 8px; padding: 16px 12px;
367
+ text-align: center; cursor: pointer;
368
+ font-size: 11px; color: var(--muted);
369
+ transition: border-color 0.2s, background 0.2s;
370
+ background: rgba(255,255,255,0.01);
371
+ }
372
+ .file-upload:hover { border-color: var(--accent); background: rgba(124,106,247,0.04); }
373
+ .file-upload .icon { font-size: 22px; margin-bottom: 6px; }
374
+
375
+ .img-preview {
376
+ width: 100%; height: 190px;
377
+ object-fit: contain;
378
+ border-radius: 6px;
379
+ display: none;
380
+ background: #000;
381
+ border: 1px solid var(--node-border);
382
+ }
383
+
384
+ select, textarea {
385
+ width: 100%;
386
+ background: rgba(0,0,0,0.3);
387
+ border: 1px solid var(--node-border);
388
+ color: var(--text); padding: 9px 11px;
389
+ border-radius: 6px; outline: none;
390
+ font-size: 12px; font-family: 'JetBrains Mono', monospace;
391
+ resize: none; transition: border-color 0.2s;
392
+ }
393
+ select:focus, textarea:focus { border-color: var(--accent); }
394
+ select option { background: #1c1c26; }
395
+
396
+ button.run-btn {
397
+ background: linear-gradient(135deg, var(--accent), #9b59b6);
398
+ color: #fff; border: none;
399
+ padding: 10px; border-radius: 7px;
400
+ font-weight: 700; font-size: 12px;
401
+ font-family: 'JetBrains Mono', monospace;
402
+ cursor: pointer;
403
+ transition: opacity 0.2s, transform 0.1s;
404
+ display: flex; justify-content: center; align-items: center; gap: 8px;
405
+ letter-spacing: 0.04em;
406
+ }
407
+ button.run-btn:hover { opacity: 0.9; }
408
+ button.run-btn:active { transform: scale(0.98); }
409
+ button.run-btn:disabled { background: var(--node-border); cursor: not-allowed; color: #555; }
410
+
411
+ .output-box {
412
+ background: rgba(0,0,0,0.4);
413
+ border: 1px solid var(--node-border);
414
+ border-radius: 6px; padding: 12px;
415
+ flex: 1;
416
+ overflow-y: auto;
417
+ font-size: 12px; line-height: 1.6;
418
+ color: #c8c8e0; white-space: pre-wrap;
419
+ user-select: text;
420
+ font-family: 'JetBrains Mono', monospace;
421
+ }
422
+
423
+ /* Grounding canvas */
424
+ .ground-canvas-wrap {
425
+ position: relative; flex: 1;
426
+ border: 1px solid var(--node-border);
427
+ border-radius: 6px; overflow: hidden;
428
+ background: #000;
429
+ min-height: 0;
430
+ }
431
+ .ground-canvas-wrap canvas {
432
+ width: 100%; height: 100%;
433
+ object-fit: contain;
434
+ }
435
+ .ground-placeholder {
436
+ position: absolute; inset: 0;
437
+ display: flex; align-items: center; justify-content: center;
438
+ font-size: 11px; color: var(--muted); text-align: center;
439
+ padding: 12px;
440
+ }
441
+
442
+ .loader {
443
+ width: 12px; height: 12px;
444
+ border: 2px solid rgba(255,255,255,0.3);
445
+ border-top-color: #fff;
446
+ border-radius: 50%;
447
+ animation: spin 0.7s linear infinite;
448
+ display: none;
449
+ }
450
+ @keyframes spin { to { transform: rotate(360deg); } }
451
+
452
+ /* Status dot */
453
+ .status-dot {
454
+ width: 7px; height: 7px; border-radius: 50%;
455
+ background: var(--muted); display: inline-block;
456
+ margin-right: 6px;
457
+ }
458
+ .status-dot.active { background: var(--accent2); box-shadow: 0 0 6px var(--accent2); }
459
+
460
+ /* Model badge */
461
+ .model-badge {
462
+ display: inline-block; padding: 2px 8px;
463
+ border-radius: 4px; font-size: 9px; font-weight: 700;
464
+ letter-spacing: 0.06em; text-transform: uppercase;
465
+ }
466
+ .model-badge.qwen { background: rgba(124,106,247,0.2); color: var(--accent); border: 1px solid rgba(124,106,247,0.3); }
467
+ .model-badge.lfm { background: rgba(78,205,196,0.15); color: var(--accent2); border: 1px solid rgba(78,205,196,0.3); }
468
  </style>
469
  </head>
470
  <body>
471
 
472
+ <div class="top-bar">
473
+ <span class="logo">MULTIMODAL EDGE</span>
474
+ <span class="sep">|</span>
475
+ <span class="sub">Node-Based Inference Canvas</span>
476
+ <span class="badge">v2.0 — DUAL MODEL</span>
477
+ </div>
478
+
479
+ <div id="canvas">
480
+ <svg class="wires">
481
+ <!-- Left col → Task node -->
482
+ <path id="wire-img-task" class="wire" />
483
+ <path id="wire-model-task" class="wire" />
484
+ <!-- Task → Output -->
485
+ <path id="wire-task-out" class="wire" />
486
+ <!-- Task → Ground -->
487
+ <path id="wire-task-gnd" class="wire" />
488
+ </svg>
489
+
490
+ <!-- ─── ID 01 : Image Input (left col, top) ─── -->
491
+ <div class="node fixed-height" id="node-img" style="left:60px; top:68px;">
492
+ <div class="node-header">
493
+ <span><span class="status-dot" id="dot-img"></span>Input Image</span>
494
+ <span class="id">ID: 01</span>
495
+ </div>
496
+ <div class="node-body">
497
+ <div>
498
+ <label>Upload Image</label>
499
+ <div class="file-upload" id="dropZone">
500
+ <div class="icon">🖼️</div>
501
+ Click or drop image here
502
+ <input type="file" id="fileInput" accept="image/*">
503
+ </div>
504
+ <img id="imgPreview" class="img-preview" />
505
+ </div>
506
+ </div>
507
+ <!-- OUT port: vertically centered on right edge -->
508
+ <div class="port out" id="port-img-out" style="top:50%; transform:translateY(-50%);"></div>
509
  </div>
510
 
511
+ <!-- ─── ID 02 : Model Selector (left col, bottom) ─── -->
512
+ <div class="node fixed-height" id="node-model" style="left:60px; top:508px;">
513
+ <div class="node-header">
514
+ <span><span class="status-dot" id="dot-model"></span>Model Selector</span>
515
+ <span class="id">ID: 02</span>
516
+ </div>
517
+ <div class="node-body">
518
+ <div>
519
+ <label>Active Model</label>
520
+ <select id="modelSelect">
521
+ <option value="qwen">Qwen3.5-2B (Vision-Language)</option>
522
+ <option value="lfm">LFM2.5-VL-450M (LiquidAI)</option>
523
+ </select>
524
+ </div>
525
+ <div id="modelInfoBox" style="background:rgba(124,106,247,0.07); border:1px solid rgba(124,106,247,0.2); border-radius:6px; padding:10px; font-size:10px; color:var(--muted); line-height:1.6;">
526
+ <span class="model-badge qwen">QWEN</span>
527
+ <br><br>
528
+ Qwen3.5 2B parameter multimodal model by Alibaba Cloud. Supports Query, Caption, Point & Detect tasks with streaming output.
529
  </div>
530
+ <div style="flex:1;"></div>
531
  </div>
532
+ <!-- OUT port: vertically centered on right edge -->
533
+ <div class="port out" id="port-model-out" style="top:50%; transform:translateY(-50%);"></div>
534
+ </div>
535
 
536
+ <!-- ─── ID 03 : Task Node (right col, row 1) ─── -->
537
+ <div class="node fixed-height" id="node-task" style="left:460px; top:68px;">
538
+ <!-- IN port aligned to accept wires from ID01 and ID02 -->
539
+ <div class="port in" id="port-task-in" style="top:50%; transform:translateY(-50%);"></div>
540
+ <div class="node-header">
541
+ <span><span class="status-dot" id="dot-task"></span>Task Config</span>
542
+ <span class="id">ID: 03</span>
543
+ </div>
544
+ <div class="node-body">
545
+ <div>
546
+ <label>Task Category</label>
547
+ <select id="categorySelect">
548
+ <option value="Query">Query</option>
549
+ <option value="Caption">Caption</option>
550
+ <option value="Point">Point</option>
551
+ <option value="Detect">Detect</option>
552
+ </select>
553
+ </div>
554
+ <div>
555
+ <label>Prompt Directive</label>
556
+ <textarea id="promptInput" rows="5" placeholder="e.g., Count the total number of boats and describe the environment."></textarea>
 
557
  </div>
558
+ <button class="run-btn" id="runBtn">
559
+ <span>Execute</span>
560
+ <span class="loader" id="btnLoader"></span>
561
+ </button>
562
  </div>
563
+ <!-- OUT port -->
564
+ <div class="port out" id="port-task-out" style="top:50%; transform:translateY(-50%);"></div>
565
+ </div>
566
 
567
+ <!-- ─── ID 04 : Output Node (right col, row 1, further right) ─── -->
568
+ <div class="node fixed-height" id="node-out" style="left:860px; top:68px;">
569
+ <div class="port in" id="port-out-in" style="top:50%; transform:translateY(-50%);"></div>
570
+ <div class="node-header">
571
+ <span><span class="status-dot" id="dot-out"></span>Output Stream</span>
572
+ <span class="id">ID: 04</span>
573
+ </div>
574
+ <div class="node-body">
575
+ <label>Streamed Result</label>
576
+ <div class="output-box" id="outputBox">Results will stream here...</div>
577
+ </div>
578
+ </div>
579
+
580
+ <!-- ─── ID 05 : Grounding Visualiser (right col, row 2) ─── -->
581
+ <div class="node fixed-height" id="node-gnd" style="left:860px; top:508px;">
582
+ <div class="port in" id="port-gnd-in" style="top:50%; transform:translateY(-50%);"></div>
583
+ <div class="node-header">
584
+ <span><span class="status-dot" id="dot-gnd"></span>View Grounding</span>
585
+ <span class="id">ID: 05</span>
586
+ </div>
587
+ <div class="node-body">
588
+ <label>Point / Detect Overlay</label>
589
+ <div class="ground-canvas-wrap">
590
+ <canvas id="groundCanvas"></canvas>
591
+ <div class="ground-placeholder" id="groundPlaceholder">
592
+ Active for Point / Detect tasks.<br>Run inference to visualise.
593
+ </div>
594
  </div>
595
  </div>
596
  </div>
597
+ </div>
598
+
599
+ <script>
600
+ // ══════════════════════════════════════════════
601
+ // WIRE DRAWING
602
+ // ══════════════════════════════════════════════
603
// Returns the centre point of the element with the given id, in the
// coordinates reported by getBoundingClientRect(). Falls back to the
// origin when no element with that id exists.
function portCenter(id) {
  const port = document.getElementById(id);
  if (!port) {
    return { x: 0, y: 0 };
  }
  const { left, top, width, height } = port.getBoundingClientRect();
  return {
    x: left + width / 2,
    y: top + height / 2,
  };
}
609
+
610
// Builds the SVG path data for a cubic curve from p1 to p2. Both control
// points keep their endpoint's y and are pushed horizontally (right of p1,
// left of p2) by 55% of the horizontal distance between the endpoints.
function bezier(p1, p2) {
  const handle = 0.55 * Math.abs(p2.x - p1.x);
  const start = `${p1.x} ${p1.y}`;
  const ctrl1 = `${p1.x + handle} ${p1.y}`;
  const ctrl2 = `${p2.x - handle} ${p2.y}`;
  const end = `${p2.x} ${p2.y}`;
  return `M ${start} C ${ctrl1}, ${ctrl2}, ${end}`;
}
614
+
615
// Recomputes the `d` attribute of every wire <path> from the current
// positions of its two ports. Wires whose <path> element is missing are
// skipped.
function updateWires() {
  // wire path id -> [source port id, destination port id]
  const links = {
    'wire-img-task':   ['port-img-out',   'port-task-in'],
    'wire-model-task': ['port-model-out', 'port-task-in'],
    'wire-task-out':   ['port-task-out',  'port-out-in'],
    'wire-task-gnd':   ['port-task-out',  'port-gnd-in'],
  };
  Object.entries(links).forEach(([wireId, [fromPort, toPort]]) => {
    const path = document.getElementById(wireId);
    if (!path) return;
    path.setAttribute('d', bezier(portCenter(fromPort), portCenter(toPort)));
  });
}
627
+
628
+ // ══════════════════════════════════════════════
629
+ // DRAGGING
630
+ // ══════════════════════════════════════════════
631
// Makes every .node draggable by its header. Each node keeps its own drag
// state in the closure below and registers its own document-level
// mousemove / mouseup listeners.
for (const node of document.querySelectorAll('.node')) {
  const header = node.querySelector('.node-header');
  let dragging = false;
  let startX, startY, originLeft, originTop;

  const beginDrag = (ev) => {
    dragging = true;
    startX = ev.clientX;
    startY = ev.clientY;
    // Inline left/top hold the node position; an unset value counts as 0.
    originLeft = parseInt(node.style.left) || 0;
    originTop = parseInt(node.style.top) || 0;
    node.style.zIndex = 100; // lift the dragged node above its siblings
    ev.preventDefault();
  };

  const moveDrag = (ev) => {
    if (!dragging) return;
    node.style.left = `${originLeft + ev.clientX - startX}px`;
    node.style.top = `${originTop + ev.clientY - startY}px`;
    updateWires();
  };

  const endDrag = () => {
    if (!dragging) return;
    dragging = false;
    node.style.zIndex = 10;
  };

  header.addEventListener('mousedown', beginDrag);
  document.addEventListener('mousemove', moveDrag);
  document.addEventListener('mouseup', endDrag);
}
652
+
653
// Redraw the wires whenever the window is resized.
window.addEventListener('resize', updateWires);
// The first draw is deferred by one animation frame so it runs after layout.
requestAnimationFrame(() => updateWires());
656
+
657
+ // ══════════════════════════════════════════════
658
+ // FILE UPLOAD
659
+ // ══════════════════════════════════════════════
660
// The image most recently accepted by handleFile(); null until one is chosen.
let currentFile = null;
const dropZone   = document.getElementById('dropZone');
const fileInput  = document.getElementById('fileInput');
const imgPreview = document.getElementById('imgPreview');
const dotImg     = document.getElementById('dot-img');

// Accepts a user-chosen file: anything missing or without an image/* MIME
// type is ignored. Otherwise the file is remembered, shown in the preview
// (which replaces the drop zone) and the node's status dot is lit.
function handleFile(file) {
  const isImage = Boolean(file) && file.type.startsWith('image/');
  if (!isImage) return;

  currentFile = file;

  imgPreview.src = URL.createObjectURL(file);
  imgPreview.style.display = 'block';
  dropZone.style.display = 'none';
  dotImg.classList.add('active');

  // Redraw the wires on the next frame, once the preview has been laid out.
  requestAnimationFrame(updateWires);
}
676
+
677
// Clicking the drop zone opens the file picker of the <input> inside it.
dropZone.onclick = function () {
  fileInput.click();
};

// File chosen through the picker.
fileInput.onchange = function (ev) {
  handleFile(ev.target.files[0]);
};

// Highlight the zone while a drag hovers over it; preventDefault marks the
// zone as a valid drop target.
dropZone.ondragover = function (ev) {
  ev.preventDefault();
  dropZone.style.borderColor = 'var(--accent)';
};

dropZone.ondragleave = function () {
  dropZone.style.borderColor = '';
};

// File dropped onto the zone: clear the highlight and take the first file.
dropZone.ondrop = function (ev) {
  ev.preventDefault();
  dropZone.style.borderColor = '';
  const dropped = ev.dataTransfer.files;
  if (dropped.length) {
    handleFile(dropped[0]);
  }
};
685
+
686
+ // ══════════════════════════════════════════════
687
+ // MODEL SELECTOR INFO
688
+ // ══════════════════════════════════════════════
689
// Description markup shown in the Model Selector node, keyed by the
// <option> value of #modelSelect.
const MODEL_INFO = {
  qwen: `<span class="model-badge qwen">QWEN</span><br><br>Qwen3.5 2B parameter multimodal model by Alibaba Cloud. Supports Query, Caption, Point & Detect tasks with streaming output.`,
  lfm:  `<span class="model-badge lfm">LFM</span><br><br>LFM2.5-VL 450M parameter vision-language model by LiquidAI. Ultra-lightweight edge model with strong grounding capabilities.`,
};

const modelSelect  = document.getElementById('modelSelect');
const modelInfoBox = document.getElementById('modelInfoBox');
const dotModel     = document.getElementById('dot-model');

// The model node's status dot is lit as soon as the script runs.
dotModel.classList.add('active');

// Swap the description when another model is picked; unknown values clear it.
modelSelect.onchange = function () {
  const description = MODEL_INFO[modelSelect.value];
  modelInfoBox.innerHTML = description || '';
};
702
+
703
+ // ══════════════════════════════════════════════
704
+ // CATEGORY → PLACEHOLDER
705
+ // ══════════════════════════════════════════════
706
const categorySelect = document.getElementById('categorySelect');
const promptInput    = document.getElementById('promptInput');

// Example prompt shown as the textarea placeholder for each task category.
const PLACEHOLDERS = {
  Query:   'e.g., Count the total number of boats and describe the environment.',
  Caption: 'e.g., short | normal | detailed',
  Point:   'e.g., The gun held by the person.',
  Detect:  'e.g., The headlight of the car.',
};

// Keep the placeholder in step with the selected category; categories
// without an entry get an empty placeholder.
categorySelect.onchange = function (ev) {
  const hint = PLACEHOLDERS[ev.target.value];
  promptInput.placeholder = hint || '';
};
717
+
718
+ // ══════════════════════════════════════════════
719
+ // GROUNDING VISUALIZER
720
+ // ══════════════════════════════════════════════
721
const groundCanvas      = document.getElementById('groundCanvas');
const groundPlaceholder = document.getElementById('groundPlaceholder');
const gCtx              = groundCanvas.getContext('2d');

// Normalises one parsed grounding entry to a plain numeric array:
//   [x, y]           or {x, y}             -> [x, y]            (point)
//   [x1, y1, x2, y2] or {x1, y1, x2, y2}   -> [x1, y1, x2, y2]  (box)
// Returns null for any other shape or when a coordinate is not numeric.
function groundingCoords(item) {
  let coords = null;
  if (Array.isArray(item)) {
    coords = item;
  } else if (item && typeof item === 'object') {
    if ('x1' in item) {
      coords = [item.x1, item.y1, item.x2, item.y2];
    } else if ('x' in item) {
      coords = [item.x, item.y];
    }
  }
  if (!coords || (coords.length !== 2 && coords.length !== 4)) return null;
  const nums = coords.map(Number);
  return nums.every(Number.isFinite) ? nums : null;
}

// Draws the image at `imgSrc` onto #groundCanvas and overlays the points /
// boxes described by `jsonText` (parsed with safeParseJSON).
//
// Accepted shapes: a list of points/boxes in array or object form (see
// groundingCoords), or one bare point/box. Nothing is drawn when the text
// cannot be parsed or parses to an empty list.
//
// NOTE(review): coordinates are multiplied by the image width/height, i.e.
// they are assumed to be normalised to [0, 1] - confirm against the prompt
// the backend sends for Point / Detect.
// NOTE(review): other object layouts (e.g. a nested coordinate array under a
// named key) are ignored - confirm what the models actually emit.
function drawGrounding(imgSrc, jsonText) {
  const parsed = safeParseJSON(jsonText);
  if (!parsed || (Array.isArray(parsed) && parsed.length === 0)) return;

  // A flat [x, y] / [x1, y1, x2, y2] at the top level is ONE shape, not a
  // list of shapes; without this check each number would be treated as its
  // own (undrawable) item.
  const isSingleFlatShape = Array.isArray(parsed)
    && (parsed.length === 2 || parsed.length === 4)
    && parsed.every(v => typeof v === 'number');
  const items = Array.isArray(parsed) && !isSingleFlatShape ? parsed : [parsed];

  const img = new Image();
  img.onload = () => {
    const W = img.naturalWidth, H = img.naturalHeight;

    // Size the canvas to the image's natural resolution.
    groundCanvas.width  = W;
    groundCanvas.height = H;
    gCtx.drawImage(img, 0, 0);
    groundPlaceholder.style.display = 'none';

    const stroke = '#4ecdc4';
    const fill   = 'rgba(78,205,196,0.25)';
    // Keep the size in a variable so the label offset below does not have
    // to re-parse it out of the context's font string.
    const fontPx = Math.max(12, W/40);

    gCtx.strokeStyle = stroke;
    gCtx.lineWidth   = Math.max(2, W/200);
    gCtx.fillStyle   = fill;
    gCtx.font        = `bold ${fontPx}px JetBrains Mono, monospace`;

    items.forEach((item, i) => {
      const coords = groundingCoords(item);
      if (!coords) return;

      if (coords.length === 2) {
        // Point: filled, outlined circle.
        const [x, y] = coords;
        gCtx.beginPath();
        gCtx.arc(x * W, y * H, Math.max(8, W/60), 0, Math.PI*2);
        gCtx.fill();
        gCtx.stroke();
        return;
      }

      // Box: translucent rectangle, outline and a 1-based index label.
      const [x1, y1, x2, y2] = coords;
      const bx = x1*W, by = y1*H, bw = (x2-x1)*W, bh = (y2-y1)*H;
      gCtx.fillRect(bx, by, bw, bh);
      gCtx.strokeRect(bx, by, bw, bh);
      gCtx.fillStyle = stroke;
      gCtx.fillText(`${i+1}`, bx+4, by + fontPx*1.1);
      gCtx.fillStyle = fill; // restore the translucent fill for the next shape
    });
  };
  img.src = imgSrc;
}
767
+
768
// Parses model output that is expected to contain JSON.
//
// Surrounding whitespace and a Markdown code fence (``` or ```json) are
// stripped first. Strict JSON is tried, then a lenient pass that accepts
// single-quoted strings, unquoted object keys and trailing commas.
// Returns the parsed value, or null when `text` is not a string or cannot
// be parsed.
//
// SECURITY: the lenient pass used to be eval('(' + text + ')'). The text is
// model-generated, so eval would execute whatever script the model emitted
// in the page. The fallback is now data-only; expression-like output that
// eval accepted now yields null.
//
// NOTE(review): the regex escapes below are limited to \s and \] on purpose.
// This script appears to live inside a non-raw Python string, which passes
// those two through untouched - confirm before adding other escapes.
function safeParseJSON(text) {
  if (typeof text !== 'string') return null;

  const body = text.trim().replace(/^```(json)?/, '').replace(/```$/, '').trim();
  if (!body) return null;

  try { return JSON.parse(body); } catch (_) { /* fall through to the lenient pass */ }

  const relaxed = body
    .replace(/'/g, '"')                                                     // 'str'  -> "str"
    .replace(/([{,]\s*)([A-Za-z_$][A-Za-z0-9_$]*)(\s*:)/g, '$1"$2"$3')      // {k: 1} -> {"k": 1}
    .replace(/,(\s*[\]}])/g, '$1');                                         // [1, ]  -> [1 ]
  try { return JSON.parse(relaxed); } catch (_) { return null; }
}
773
+
774
+ // ══════════════════════════════════════════════
775
+ // RUN INFERENCE
776
+ // ══════════════════════════════════════════════
777
const runBtn    = document.getElementById('runBtn');
const btnLoader = document.getElementById('btnLoader');
const outputBox = document.getElementById('outputBox');
const allWires  = ['wire-img-task','wire-model-task','wire-task-out','wire-task-gnd'];
const dotTask   = document.getElementById('dot-task');
const dotOut    = document.getElementById('dot-out');
const dotGnd    = document.getElementById('dot-gnd');

// Extracts a human-readable message from a failed /api/run response.
// The body is expected to be JSON with an `error` field; if it is not JSON
// at all (e.g. a proxy error page) the HTTP status is reported instead of
// letting the JSON parse error mask the real failure.
async function readRunError(response) {
  try {
    const err = await response.json();
    return (err && err.error) || 'Execution failed.';
  } catch (_) {
    return `Execution failed (HTTP ${response.status}).`;
  }
}

// Execute button: validates the inputs, POSTs image + task to /api/run,
// streams the "data: ..." events into the output box and, for Point /
// Detect, draws the grounding overlay once the stream has finished.
runBtn.onclick = async () => {
  if (!currentFile) { alert('Please upload an image into the Input Node.'); return; }
  const promptStr = promptInput.value.trim();
  if (!promptStr) { alert('Please enter a prompt directive.'); return; }

  // Snapshot the category now: the select stays editable while the response
  // streams, and the overlay decision must match what was submitted.
  const category = categorySelect.value;

  // UI: running state
  runBtn.disabled = true;
  btnLoader.style.display = 'inline-block';
  outputBox.innerText = '';
  outputBox.style.color = '';
  groundPlaceholder.style.display = 'flex';
  gCtx.clearRect(0, 0, groundCanvas.width, groundCanvas.height);
  dotTask.classList.add('active');
  dotOut.classList.remove('active');
  dotGnd.classList.remove('active');
  allWires.forEach(id => document.getElementById(id)?.classList.add('active'));

  const formData = new FormData();
  formData.append('image', currentFile);
  formData.append('category', category);
  formData.append('prompt', promptStr);
  formData.append('model_id', modelSelect.value);

  let fullText = '';

  // Handles one event block. Returns true when it is the '[DONE]' sentinel.
  // Blocks without the 'data: ' prefix and payloads that are not valid JSON
  // are ignored.
  const consumeEvent = (block) => {
    if (!block.startsWith('data: ')) return false;
    const payload = block.slice('data: '.length);
    if (payload === '[DONE]') return true;
    try {
      const data = JSON.parse(payload);
      if (data.chunk) {
        fullText += data.chunk;
        outputBox.innerText = fullText;
        outputBox.scrollTop = outputBox.scrollHeight;
      }
    } catch (_) { /* malformed event payload: skip it */ }
    return false;
  };

  try {
    const response = await fetch('/api/run', { method: 'POST', body: formData });
    if (!response.ok) {
      throw new Error(await readRunError(response));
    }

    const reader  = response.body.getReader();
    const decoder = new TextDecoder('utf-8');
    let buffer = '';
    let streamEnded = false;
    let sawDone = false;

    while (!streamEnded && !sawDone) {
      const { value, done } = await reader.read();
      streamEnded = done;
      // At end of stream, flush whatever the decoder is still holding.
      buffer += done ? decoder.decode() : decoder.decode(value, { stream: true });

      // NOTE(review): the doubled backslashes are kept exactly as in the
      // original. This script appears to be embedded in a Python string
      // literal, where they become the JS escape for a blank-line
      // separator - confirm before changing.
      const blocks = buffer.split('\\n\\n');
      // Mid-stream the last piece may be an incomplete event, so keep it
      // for the next read; at end of stream everything left is processed.
      buffer = done ? '' : blocks.pop();

      for (const block of blocks) {
        if (consumeEvent(block)) { sawDone = true; break; }
      }
    }

    // The sentinel arrived before the server closed the stream: release it.
    if (!streamEnded) reader.cancel().catch(() => {});

    dotOut.classList.add('active');

    // Visualise grounding if applicable
    if ((category === 'Point' || category === 'Detect') && fullText.trim()) {
      dotGnd.classList.add('active');
      // Reuse the preview's object URL (set from this same file when it was
      // uploaded) instead of minting a new, never-revoked blob URL per run.
      drawGrounding(imgPreview.src, fullText);
    }

  } catch (err) {
    outputBox.innerText = `[Error] ${err.message}`;
    outputBox.style.color = '#ff6b6b';
  } finally {
    runBtn.disabled = false;
    btnLoader.style.display = 'none';
    dotTask.classList.remove('active');
    allWires.forEach(id => document.getElementById(id)?.classList.remove('active'));
  }
};
861
+ </script>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
862
  </body>
863
  </html>
864
  """