fix-bot committed on
Commit ·
4c3d7bf
1
Parent(s): 605f703
ui: restore staggered detection pop-in and add usage guide
Browse files
Bring back one-by-one overlay animation and decoding trace from the earlier demo, and add Quick Start guidance for categories and parameters.
- app.py +118 -6
- index.html +117 -22
app.py
CHANGED
|
@@ -306,6 +306,107 @@ def _parse_out_info_dict(out_info: str) -> dict:
|
|
| 306 |
return stats
|
| 307 |
|
| 308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
def generate_raw_prompt(task_type, category):
|
| 310 |
if not category:
|
| 311 |
category = "objects"
|
|
@@ -407,7 +508,8 @@ def run_image_gpu_api(
|
|
| 407 |
"coords": [round(c, 2) for c in det.get("coords", [])]
|
| 408 |
})
|
| 409 |
|
| 410 |
-
|
|
|
|
| 411 |
|
| 412 |
|
| 413 |
@spaces.GPU(duration=240, size="xlarge")
|
|
@@ -558,7 +660,15 @@ def run_video_gpu_api(
|
|
| 558 |
"early_stop_reason": early_stop_reason
|
| 559 |
}
|
| 560 |
|
| 561 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
|
| 563 |
|
| 564 |
# ============================================================
|
|
@@ -623,7 +733,7 @@ def run_inference_api(
|
|
| 623 |
if not img_path or not os.path.exists(img_path):
|
| 624 |
return None, None, {"success": False, "error": f"Invalid image file path: {img_path}"}
|
| 625 |
|
| 626 |
-
out_img_path, stats, raw_text, detections = run_image_gpu_api(
|
| 627 |
img_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt
|
| 628 |
)
|
| 629 |
|
|
@@ -633,7 +743,8 @@ def run_inference_api(
|
|
| 633 |
"stats": stats,
|
| 634 |
"raw_text": raw_text,
|
| 635 |
"detections": detections,
|
| 636 |
-
"final_prompt": final_prompt
|
|
|
|
| 637 |
}
|
| 638 |
return FileData(path=out_img_path), None, meta
|
| 639 |
|
|
@@ -652,7 +763,7 @@ def run_inference_api(
|
|
| 652 |
if not vid_path or not os.path.exists(vid_path):
|
| 653 |
return None, None, {"success": False, "error": f"Invalid video file path: {vid_path}"}
|
| 654 |
|
| 655 |
-
out_vid_path, stats, raw_text, detections = run_video_gpu_api(
|
| 656 |
vid_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt, max_video_frames
|
| 657 |
)
|
| 658 |
|
|
@@ -662,7 +773,8 @@ def run_inference_api(
|
|
| 662 |
"stats": stats,
|
| 663 |
"raw_text": raw_text,
|
| 664 |
"detections": detections,
|
| 665 |
-
"final_prompt": final_prompt
|
|
|
|
| 666 |
}
|
| 667 |
return None, FileData(path=out_vid_path), meta
|
| 668 |
|
|
|
|
| 306 |
return stats
|
| 307 |
|
| 308 |
|
| 309 |
+
def generate_dynamic_html(token_sequence, out_info, raw_text):
    """Build the self-contained "Decoding Trace" HTML panel.

    The returned fragment carries its own ``<style>`` block whose class names
    are suffixed with a per-call id, followed by:

    * one animated chip per decoded token (staggered by 0.06s per token),
    * an optional statistics pill derived from ``out_info``,
    * an optional collapsible block with the raw model response.

    Args:
        token_sequence: Iterable of ``(decode_type, text)`` pairs (list or
            tuple, at least two elements). A ``decode_type`` of ``"ar"``
            (case-insensitive) is styled as an AR fallback chip; anything
            else is styled as an MTP chip. Malformed items are skipped.
            May be empty or ``None``.
        out_info: Statistics string handed to ``_parse_out_info_dict``. When
            none of the known keys are present, the stripped string itself is
            shown. Falsy values suppress the statistics pill.
        raw_text: Raw response text for the collapsible section. Falsy values
            suppress the section.

    Returns:
        str: The HTML fragment (CSS + markup).
    """
    # Function-scope import: other code in this file binds ``html`` as a
    # local variable name, so only ``escape`` is pulled in here.
    from html import escape

    # Millisecond timestamp keeps the class names of successive panels apart.
    uid = f"a{int(time.time() * 1000)}"
    css = f"""
    <style>
    .dc-root-{uid} {{
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
        border: 1px solid rgba(118, 185, 0, 0.25); border-radius: 12px;
        background: rgba(0, 0, 0, 0.55); overflow: hidden;
    }}
    .dc-header-{uid} {{
        display: flex; align-items: center; justify-content: space-between; flex-wrap: wrap; gap: 8px;
        padding: 10px 14px;
        background: linear-gradient(135deg, rgba(118, 185, 0, 0.25) 0%, rgba(63, 98, 0, 0.35) 100%);
        border-bottom: 1px solid rgba(118, 185, 0, 0.2);
    }}
    .dc-header-title-{uid} {{ font-weight: 700; font-size: 0.82em; color: #d9f99d; letter-spacing: 0.04em; text-transform: uppercase; }}
    .dc-legend-{uid} {{ display: flex; gap: 12px; align-items: center; flex-wrap: wrap; }}
    .dc-legend-item-{uid} {{ display: flex; align-items: center; gap: 5px; font-size: 0.72em; color: rgba(226, 232, 240, 0.85); }}
    .dc-legend-dot-{uid} {{ width: 8px; height: 8px; border-radius: 2px; display: inline-block; }}
    .dc-row-{uid} {{ display: flex; gap: 10px; padding: 12px 14px; border-bottom: 1px solid rgba(255,255,255,0.05); }}
    .dc-row-{uid}:last-child {{ border-bottom: none; }}
    .dc-val-{uid} {{ flex: 1; line-height: 2.2; word-wrap: break-word; color: #cbd5e1; font-size: 0.85em; }}
    @keyframes tk-{uid} {{
        0% {{ opacity: 0; transform: translateY(8px) scale(0.92); }}
        60% {{ opacity: 1; transform: translateY(-2px) scale(1.02); }}
        100% {{ opacity: 1; transform: translateY(0) scale(1); }}
    }}
    .tk-mtp-{uid}, .tk-ar-{uid} {{
        opacity: 0; animation: tk-{uid} 0.35s ease-out forwards;
        border-radius: 5px; padding: 2px 7px; margin: 2px 1px; display: inline-block;
        font-size: 0.78em; font-weight: 600;
        font-family: 'Fira Code', Consolas, monospace; white-space: nowrap;
    }}
    .tk-mtp-{uid} {{ background: rgba(118, 185, 0, 0.15); border: 1px solid rgba(118, 185, 0, 0.55); color: #bbf7d0; }}
    .tk-ar-{uid} {{ background: rgba(230, 81, 0, 0.15); border: 1px solid rgba(230, 81, 0, 0.55); color: #fed7aa; }}
    .tk-stat-{uid} {{
        opacity: 0; animation: tk-{uid} 0.4s ease-out forwards;
        background: rgba(118, 185, 0, 0.12); border: 1px solid rgba(118, 185, 0, 0.35); border-radius: 6px;
        padding: 4px 12px; display: inline-block; font-size: 0.78em; color: #d9f99d; font-weight: 600;
    }}
    .dc-raw-{uid} {{ padding: 0 14px 12px; }}
    .dc-raw-{uid} summary {{ cursor: pointer; color: #94a3b8; font-size: 0.78em; user-select: none; }}
    .dc-raw-{uid} summary:hover {{ color: #76b900; }}
    .dc-raw-pre-{uid} {{
        background: rgba(0,0,0,0.45); border: 1px solid rgba(255,255,255,0.08); border-radius: 6px;
        padding: 10px; margin-top: 8px;
        font-family: 'Fira Code', Consolas, monospace;
        font-size: 0.74em; color: #cbd5e1; white-space: pre-wrap; word-break: break-all;
        max-height: 180px; overflow-y: auto;
    }}
    </style>
    """

    # Collect fragments and join once at the end instead of repeated "+=".
    parts = [
        css,
        f'<div class="dc-root-{uid}">',
        f'<div class="dc-header-{uid}">'
        f'<span class="dc-header-title-{uid}">Decoding Trace</span>'
        f'<div class="dc-legend-{uid}">'
        f'<div class="dc-legend-item-{uid}"><span class="dc-legend-dot-{uid}" style="background:#76b900;"></span>MTP Parallel</div>'
        f'<div class="dc-legend-item-{uid}"><span class="dc-legend-dot-{uid}" style="background:#e65100;"></span>AR Fallback</div>'
        f'</div></div>',
        f'<div class="dc-row-{uid}"><div class="dc-val-{uid}">',
    ]

    tok_idx = 0
    for item in token_sequence or ():
        if not isinstance(item, (list, tuple)) or len(item) < 2:
            continue
        decode_type = str(item[0]).lower()
        # Token text comes from the model: escape "&", "<" and ">" so it can
        # never be interpreted as markup by the browser.
        safe = escape(str(item[1]), quote=False)
        delay = f"{tok_idx * 0.06:.2f}s"
        cls = f"tk-ar-{uid}" if decode_type == "ar" else f"tk-mtp-{uid}"
        parts.append(f'<span class="{cls}" style="animation-delay:{delay}">{safe}</span> ')
        tok_idx += 1
    parts.append('</div></div>')

    if out_info:
        stats = _parse_out_info_dict(out_info)
        bits = []
        if "forward_step" in stats:
            bits.append(f"{stats['forward_step']} steps")
        if "num_tokens" in stats:
            bits.append(f"{stats['num_tokens']} tokens")
        if "num_boxes" in stats:
            bits.append(f"{stats['num_boxes']} boxes")
        if "switch_to_ar" in stats:
            n = stats["switch_to_ar"]
            # Compare as text so both "1" and 1 yield the singular form.
            bits.append(f"{n} AR fallback{'s' if str(n) != '1' else ''}")
        if "tps" in stats:
            bits.append(f"{stats['tps']} tok/s")
        if "bps" in stats:
            bits.append(f"{stats['bps']} box/s")
        # "\u00b7" is the middle dot; written as an escape so the separator
        # survives any re-encoding of this file.
        summary = " \u00b7 ".join(bits) if bits else out_info.strip()
        # The fallback shows out_info verbatim, so the summary is escaped too.
        safe_summary = escape(summary, quote=False)
        # The stats pill appears shortly after the last token chip.
        stat_delay = f"{tok_idx * 0.06 + 0.3:.2f}s"
        parts.append(
            f'<div class="dc-row-{uid}" style="justify-content:flex-end;padding-top:4px;padding-bottom:8px;border-bottom:none;">'
            f'<span class="tk-stat-{uid}" style="animation-delay:{stat_delay}">{safe_summary}</span></div>'
        )

    if raw_text:
        safe_raw = escape(str(raw_text), quote=False)
        parts.append(
            f'<div class="dc-raw-{uid}"><details><summary>Show Raw Response</summary>'
            f'<div class="dc-raw-pre-{uid}">{safe_raw}</div></details></div>'
        )

    parts.append('</div>')
    return "".join(parts)
|
| 408 |
+
|
| 409 |
+
|
| 410 |
def generate_raw_prompt(task_type, category):
|
| 411 |
if not category:
|
| 412 |
category = "objects"
|
|
|
|
| 508 |
"coords": [round(c, 2) for c in det.get("coords", [])]
|
| 509 |
})
|
| 510 |
|
| 511 |
+
html = generate_dynamic_html(token_sequence, out_info, output_text)
|
| 512 |
+
return out_img_path, stats, output_text, detections_summary, html
|
| 513 |
|
| 514 |
|
| 515 |
@spaces.GPU(duration=240, size="xlarge")
|
|
|
|
| 660 |
"early_stop_reason": early_stop_reason
|
| 661 |
}
|
| 662 |
|
| 663 |
+
raw_combined = "\n---\n".join(inference_results_for_draw)
|
| 664 |
+
timing_summary = (
|
| 665 |
+
f"Processed {processed_count}/{n_sampled} sampled frames "
|
| 666 |
+
f"({total} total) in {total_time:.1f}s"
|
| 667 |
+
)
|
| 668 |
+
if early_stopped:
|
| 669 |
+
timing_summary += f" — {early_stop_reason}"
|
| 670 |
+
html = generate_dynamic_html([], "", timing_summary + "\n\n" + raw_combined)
|
| 671 |
+
return out_video_path, stats, raw_combined, detections_summary, html
|
| 672 |
|
| 673 |
|
| 674 |
# ============================================================
|
|
|
|
| 733 |
if not img_path or not os.path.exists(img_path):
|
| 734 |
return None, None, {"success": False, "error": f"Invalid image file path: {img_path}"}
|
| 735 |
|
| 736 |
+
out_img_path, stats, raw_text, detections, html = run_image_gpu_api(
|
| 737 |
img_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt
|
| 738 |
)
|
| 739 |
|
|
|
|
| 743 |
"stats": stats,
|
| 744 |
"raw_text": raw_text,
|
| 745 |
"detections": detections,
|
| 746 |
+
"final_prompt": final_prompt,
|
| 747 |
+
"html": html,
|
| 748 |
}
|
| 749 |
return FileData(path=out_img_path), None, meta
|
| 750 |
|
|
|
|
| 763 |
if not vid_path or not os.path.exists(vid_path):
|
| 764 |
return None, None, {"success": False, "error": f"Invalid video file path: {vid_path}"}
|
| 765 |
|
| 766 |
+
out_vid_path, stats, raw_text, detections, html = run_video_gpu_api(
|
| 767 |
vid_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt, max_video_frames
|
| 768 |
)
|
| 769 |
|
|
|
|
| 773 |
"stats": stats,
|
| 774 |
"raw_text": raw_text,
|
| 775 |
"detections": detections,
|
| 776 |
+
"final_prompt": final_prompt,
|
| 777 |
+
"html": html,
|
| 778 |
}
|
| 779 |
return None, FileData(path=out_vid_path), meta
|
| 780 |
|
index.html
CHANGED
|
@@ -184,6 +184,20 @@
|
|
| 184 |
border-color: #76b900 !important;
|
| 185 |
background: rgba(118, 185, 0, 0.04) !important;
|
| 186 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
</style>
|
| 188 |
</head>
|
| 189 |
<body class="text-slate-100 font-sans min-h-screen pb-16 carbon-grid">
|
|
@@ -270,6 +284,9 @@
|
|
| 270 |
<p class="text-xs text-slate-400 max-w-sm font-medium leading-relaxed">
|
| 271 |
NVIDIA's advanced 3B vision-language model. Locate any object, UI target, or text in images and videos with natural language.
|
| 272 |
</p>
|
|
|
|
|
|
|
|
|
|
| 273 |
</div>
|
| 274 |
|
| 275 |
<!-- Setup Glass Card Controls -->
|
|
@@ -367,6 +384,23 @@
|
|
| 367 |
|
| 368 |
</div>
|
| 369 |
</details>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
</div>
|
| 371 |
|
| 372 |
<!-- CTA Action Button (Floats at bottom-left corner of visual container) -->
|
|
@@ -380,7 +414,7 @@
|
|
| 380 |
</div>
|
| 381 |
|
| 382 |
<!-- 3. Floating Categories Search Bar Overlay (Right/Center side, extremely clean glass box) -->
|
| 383 |
-
<div class="absolute top-1/2 right-6 lg:right-16 -translate-y-1/2 z-30 flex
|
| 384 |
<div class="sam-input-bar rounded-2xl px-3.5 py-2.5 flex items-center gap-2 w-full pointer-events-auto">
|
| 385 |
<svg class="h-4 w-4 text-nvidia-brand shrink-0" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5">
|
| 386 |
<path stroke-linecap="round" stroke-linejoin="round" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" />
|
|
@@ -392,6 +426,9 @@
|
|
| 392 |
</svg>
|
| 393 |
</button>
|
| 394 |
</div>
|
|
|
|
|
|
|
|
|
|
| 395 |
</div>
|
| 396 |
|
| 397 |
<!-- Floating Workspace Status -->
|
|
@@ -459,20 +496,23 @@
|
|
| 459 |
</div>
|
| 460 |
|
| 461 |
<!-- Tag drawer box list (Grid: 7) -->
|
| 462 |
-
<div class="sm:col-span-7 bg-black/60 rounded-xl p-4 border border-white/5 flex flex-col">
|
| 463 |
<div class="text-nvidia-brand font-mono font-bold border-b border-white/5 pb-1 mb-2 uppercase tracking-widest text-[9px] flex justify-between shrink-0">
|
| 464 |
<span>🎯 Detected Target Overlays</span>
|
| 465 |
<span id="detection-count-badge" class="text-[8px] bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 px-1.5 py-0.5 rounded-full font-bold">0</span>
|
| 466 |
</div>
|
| 467 |
-
<div id="detection-tags-wrapper" class="flex-1 flex flex-
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
| 469 |
</div>
|
| 470 |
</div>
|
| 471 |
|
| 472 |
</div>
|
| 473 |
|
| 474 |
<!-- Optional dynamic trace wrapper -->
|
| 475 |
-
<div id="rich-trace-log" class="hidden border-t border-white/5 pt-3"></div>
|
| 476 |
</div>
|
| 477 |
</div>
|
| 478 |
|
|
@@ -599,6 +639,72 @@
|
|
| 599 |
}
|
| 600 |
}
|
| 601 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
// Switch workspace input styles without clearing
|
| 603 |
function setMediaType(type) {
|
| 604 |
selectedMediaType = type;
|
|
@@ -637,6 +743,7 @@
|
|
| 637 |
outputVideo.src = "";
|
| 638 |
outputVideo.classList.add("hidden");
|
| 639 |
workspaceStatus.textContent = "Workspace Cleared";
|
|
|
|
| 640 |
}
|
| 641 |
|
| 642 |
// Drag and drop utilities
|
|
@@ -780,8 +887,9 @@
|
|
| 780 |
richTraceLog.classList.add("hidden");
|
| 781 |
metaStatus.textContent = "Processing...";
|
| 782 |
metaStatus.className = "text-yellow-500 font-semibold";
|
| 783 |
-
detectionTagsWrapper.innerHTML = "Processing objects in backend...
|
| 784 |
detectionCountBadge.textContent = "0";
|
|
|
|
| 785 |
|
| 786 |
try {
|
| 787 |
const clientInstance = await getClient();
|
|
@@ -855,23 +963,10 @@
|
|
| 855 |
metaBps.textContent = stats.bps || "-";
|
| 856 |
metaTime.textContent = stats.total_time_seconds ? `${stats.total_time_seconds}s` : "Optimal";
|
| 857 |
|
| 858 |
-
// Render detection tags
|
| 859 |
-
|
| 860 |
-
detectionCountBadge.textContent = detections.length;
|
| 861 |
-
|
| 862 |
-
if (detections.length === 0) {
|
| 863 |
-
detectionTagsWrapper.innerHTML = "No objects matched categories.";
|
| 864 |
-
} else {
|
| 865 |
-
detectionTagsWrapper.innerHTML = "";
|
| 866 |
-
detections.forEach(det => {
|
| 867 |
-
const tag = document.createElement("span");
|
| 868 |
-
tag.className = "px-2 py-0.5 rounded bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 font-bold uppercase tracking-wider text-[8px] animate-fade-in";
|
| 869 |
-
tag.textContent = det.frame ? `[Frame ${det.frame}] ${det.label}` : det.label;
|
| 870 |
-
detectionTagsWrapper.appendChild(tag);
|
| 871 |
-
});
|
| 872 |
-
}
|
| 873 |
|
| 874 |
-
// Render
|
| 875 |
if (meta.html) {
|
| 876 |
richTraceLog.innerHTML = meta.html;
|
| 877 |
richTraceLog.classList.remove("hidden");
|
|
|
|
| 184 |
border-color: #76b900 !important;
|
| 185 |
background: rgba(118, 185, 0, 0.04) !important;
|
| 186 |
}
|
| 187 |
+
|
| 188 |
+
/* Detection overlay tag pop-in (restored from previous demo) */
|
| 189 |
+
@keyframes det-pop {
|
| 190 |
+
0% { opacity: 0; transform: translateY(10px) scale(0.88); }
|
| 191 |
+
60% { opacity: 1; transform: translateY(-2px) scale(1.03); }
|
| 192 |
+
100% { opacity: 1; transform: translateY(0) scale(1); }
|
| 193 |
+
}
|
| 194 |
+
.det-tag-pop {
|
| 195 |
+
opacity: 0;
|
| 196 |
+
animation: det-pop 0.38s cubic-bezier(0.16, 1, 0.3, 1) forwards;
|
| 197 |
+
}
|
| 198 |
+
.det-count-pop {
|
| 199 |
+
animation: det-pop 0.35s cubic-bezier(0.16, 1, 0.3, 1) forwards;
|
| 200 |
+
}
|
| 201 |
</style>
|
| 202 |
</head>
|
| 203 |
<body class="text-slate-100 font-sans min-h-screen pb-16 carbon-grid">
|
|
|
|
| 284 |
<p class="text-xs text-slate-400 max-w-sm font-medium leading-relaxed">
|
| 285 |
NVIDIA's advanced 3B vision-language model. Locate any object, UI target, or text in images and videos with natural language.
|
| 286 |
</p>
|
| 287 |
+
<p class="text-[9px] text-slate-500 max-w-sm leading-relaxed border-l-2 border-nvidia-brand/30 pl-2.5">
|
| 288 |
+
Note: inputs larger than 1K are auto-resized in this Space demo. For full-resolution inference, download the weights and run locally.
|
| 289 |
+
</p>
|
| 290 |
</div>
|
| 291 |
|
| 292 |
<!-- Setup Glass Card Controls -->
|
|
|
|
| 384 |
|
| 385 |
</div>
|
| 386 |
</details>
|
| 387 |
+
|
| 388 |
+
<!-- Quick Start Guide -->
|
| 389 |
+
<details class="group border-t border-white/5 pt-3" open>
|
| 390 |
+
<summary class="list-none flex justify-between items-center cursor-pointer select-none text-[8px] font-bold text-nvidia-brand tracking-wider uppercase hover:text-nvidia-hover transition-colors">
|
| 391 |
+
<span>📖 How to Use</span>
|
| 392 |
+
<svg class="h-3 w-3 transform group-open:rotate-180 transition-transform text-slate-500" fill="none" viewBox="0 0 24 24" stroke="currentColor">
|
| 393 |
+
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 9l-7 7-7-7" />
|
| 394 |
+
</svg>
|
| 395 |
+
</summary>
|
| 396 |
+
<ol class="space-y-1.5 pt-2.5 text-[9px] text-slate-400 leading-relaxed list-decimal list-inside marker:text-nvidia-brand/70">
|
| 397 |
+
<li>Upload an <strong class="text-slate-300">Image</strong> or <strong class="text-slate-300">Video</strong>, or pick a Quick Sandbox example below.</li>
|
| 398 |
+
<li>Choose a <strong class="text-slate-300">Task Type</strong>: Detection · Grounding · OCR · GUI · Pointing.</li>
|
| 399 |
+
<li>Enter <strong class="text-slate-300">Categories</strong> in the search bar (comma-separated, e.g. <code class="text-nvidia-brand/80">car, person</code>).</li>
|
| 400 |
+
<li>Optionally tune <strong class="text-slate-300">Advanced parameters</strong> above (mode, resize, temperature, etc.).</li>
|
| 401 |
+
<li>Click <strong class="text-nvidia-brand">Run Inference</strong> or press <kbd class="px-1 py-0.5 rounded bg-white/5 border border-white/10 text-[8px]">Enter</kbd> in the search bar.</li>
|
| 402 |
+
</ol>
|
| 403 |
+
</details>
|
| 404 |
</div>
|
| 405 |
|
| 406 |
<!-- CTA Action Button (Floats at bottom-left corner of visual container) -->
|
|
|
|
| 414 |
</div>
|
| 415 |
|
| 416 |
<!-- 3. Floating Categories Search Bar Overlay (Right/Center side, extremely clean glass box) -->
|
| 417 |
+
<div class="absolute top-1/2 right-6 lg:right-16 -translate-y-1/2 z-30 flex flex-col items-end gap-2 pointer-events-none w-full max-w-xs">
|
| 418 |
<div class="sam-input-bar rounded-2xl px-3.5 py-2.5 flex items-center gap-2 w-full pointer-events-auto">
|
| 419 |
<svg class="h-4 w-4 text-nvidia-brand shrink-0" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5">
|
| 420 |
<path stroke-linecap="round" stroke-linejoin="round" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" />
|
|
|
|
| 426 |
</svg>
|
| 427 |
</button>
|
| 428 |
</div>
|
| 429 |
+
<p class="text-[9px] text-slate-500 text-right leading-relaxed pointer-events-none px-1 max-w-full">
|
| 430 |
+
Comma-separated targets · supports English &amp; Chinese · press <span class="text-slate-400">Enter</span> to run
|
| 431 |
+
</p>
|
| 432 |
</div>
|
| 433 |
|
| 434 |
<!-- Floating Workspace Status -->
|
|
|
|
| 496 |
</div>
|
| 497 |
|
| 498 |
<!-- Tag drawer box list (Grid: 7) -->
|
| 499 |
+
<div class="sm:col-span-7 bg-black/60 rounded-xl p-4 border border-white/5 flex flex-col min-h-[140px]">
|
| 500 |
<div class="text-nvidia-brand font-mono font-bold border-b border-white/5 pb-1 mb-2 uppercase tracking-widest text-[9px] flex justify-between shrink-0">
|
| 501 |
<span>🎯 Detected Target Overlays</span>
|
| 502 |
<span id="detection-count-badge" class="text-[8px] bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 px-1.5 py-0.5 rounded-full font-bold">0</span>
|
| 503 |
</div>
|
| 504 |
+
<div id="detection-tags-wrapper" class="flex-1 flex flex-col gap-1.5 max-h-[120px] overflow-y-auto pt-1 text-[10px] text-slate-500">
|
| 505 |
+
<div id="detection-empty-hint" class="space-y-1.5 leading-relaxed">
|
| 506 |
+
<p>Run inference to populate detected targets here — each result will pop in one by one.</p>
|
| 507 |
+
<p class="text-[9px] text-slate-600">Adjustable: Task Type · Categories · Inference Mode · Resize Cap · Temperature · Top P/K · Max Video Frames</p>
|
| 508 |
+
</div>
|
| 509 |
</div>
|
| 510 |
</div>
|
| 511 |
|
| 512 |
</div>
|
| 513 |
|
| 514 |
<!-- Optional dynamic trace wrapper -->
|
| 515 |
+
<div id="rich-trace-log" class="hidden border-t border-white/5 pt-3 text-[10px]"></div>
|
| 516 |
</div>
|
| 517 |
</div>
|
| 518 |
|
|
|
|
| 639 |
}
|
| 640 |
}
|
| 641 |
|
| 642 |
+
/**
 * Format a detection's coordinates for display.
 *
 * Finite numbers are rounded to the nearest integer; any other entry is
 * passed through unchanged. Entries are joined with ", ".
 *
 * @param {{coords?: Array}} det - Detection record; `coords` may be absent.
 * @returns {string} The joined coordinates, or "" when there are none.
 */
function formatDetectionCoords(det) {
    const values = det.coords || [];
    if (!values.length) {
        return "";
    }
    const roundIfNumeric = (value) => (Number.isFinite(value) ? Math.round(value) : value);
    return values.map(roundIfNumeric).join(", ");
}
|
| 648 |
+
|
| 649 |
+
/**
 * Render detection result cards with a staggered pop-in animation.
 *
 * Clears the tag wrapper and count badge, then appends one card per
 * detection, 80 ms apart. The badge is updated (and animated) 120 ms after
 * the last card is scheduled.
 *
 * All cards of one call are appended to a dedicated batch container. If the
 * wrapper's content is replaced while timers are still pending (a new run,
 * the "processing" placeholder, or a workspace clear), the batch is detached
 * from the document and the pending callbacks become no-ops. Without this,
 * stale cards and a stale count would land in the freshly reset panel.
 *
 * @param {Array<Object>} detections - Detection records; each may carry
 *     `type`, `label`, `frame` and `coords`. A non-array value is treated as
 *     an empty list.
 */
function renderDetectionTags(detections) {
    const items = Array.isArray(detections) ? detections : [];

    detectionTagsWrapper.innerHTML = "";
    detectionCountBadge.textContent = "0";
    detectionCountBadge.classList.remove("det-count-pop");

    if (!items.length) {
        detectionTagsWrapper.innerHTML = '<p class="text-slate-500">No objects matched the given categories.</p>';
        return;
    }

    // "display: contents" makes the batch transparent to layout, so the
    // cards still take part in the wrapper's flex column and gap.
    const batch = document.createElement("div");
    batch.style.display = "contents";
    detectionTagsWrapper.appendChild(batch);

    // Animate count badge after tags finish popping in
    const countDelay = items.length * 80 + 120;
    setTimeout(() => {
        if (!batch.isConnected) return; // superseded by a reset or newer render
        detectionCountBadge.textContent = items.length;
        detectionCountBadge.classList.add("det-count-pop");
    }, countDelay);

    items.forEach((det, idx) => {
        setTimeout(() => {
            if (!batch.isConnected) return; // superseded by a reset or newer render

            const card = document.createElement("div");
            card.className = "det-tag-pop flex items-center justify-between gap-2 px-2 py-1.5 rounded-lg bg-nvidia-brand/8 border border-nvidia-brand/20 hover:border-nvidia-brand/40 transition-colors";
            // The stagger comes from the timer, so the CSS animation itself starts at once.
            card.style.animationDelay = "0s";

            const labelWrap = document.createElement("div");
            labelWrap.className = "flex items-center gap-1.5 min-w-0";

            const typeBadge = document.createElement("span");
            typeBadge.className = "shrink-0 px-1 py-0.5 rounded text-[7px] font-bold uppercase tracking-wider bg-black/40 text-nvidia-brand border border-nvidia-brand/25";
            typeBadge.textContent = det.type || "box";

            const label = document.createElement("span");
            label.className = "font-bold uppercase tracking-wider text-[9px] text-nvidia-brand truncate";
            // Apply the "object" fallback before adding the frame prefix so a
            // missing label never renders as "[F3] undefined".
            const name = det.label || "object";
            label.textContent = det.frame ? `[F${det.frame}] ${name}` : name;

            labelWrap.appendChild(typeBadge);
            labelWrap.appendChild(label);

            const coords = document.createElement("span");
            coords.className = "shrink-0 font-mono text-[8px] text-slate-500";
            const coordStr = formatDetectionCoords(det);
            coords.textContent = coordStr ? `[${coordStr}]` : "";

            card.appendChild(labelWrap);
            card.appendChild(coords);
            batch.appendChild(card);
        }, idx * 80);
    });
}
|
| 697 |
+
|
| 698 |
+
/**
 * Restore the detection panel to its initial, pre-inference state.
 *
 * Replaces the tag wrapper's content with the usage hint and resets the
 * count badge to "0" without its pop animation class.
 *
 * The dash and separator dots are written as HTML entities (&mdash;,
 * &middot;) so the text renders correctly regardless of how this file is
 * encoded or served.
 */
function resetDetectionTagsPlaceholder() {
    detectionTagsWrapper.innerHTML = `
        <div id="detection-empty-hint" class="space-y-1.5 leading-relaxed">
            <p>Run inference to populate detected targets here &mdash; each result will pop in one by one.</p>
            <p class="text-[9px] text-slate-600">Adjustable: Task Type &middot; Categories &middot; Inference Mode &middot; Resize Cap &middot; Temperature &middot; Top P/K &middot; Max Video Frames</p>
        </div>`;
    detectionCountBadge.textContent = "0";
    detectionCountBadge.classList.remove("det-count-pop");
}
|
| 707 |
+
|
| 708 |
// Switch workspace input styles without clearing
|
| 709 |
function setMediaType(type) {
|
| 710 |
selectedMediaType = type;
|
|
|
|
| 743 |
outputVideo.src = "";
|
| 744 |
outputVideo.classList.add("hidden");
|
| 745 |
workspaceStatus.textContent = "Workspace Cleared";
|
| 746 |
+
resetDetectionTagsPlaceholder();
|
| 747 |
}
|
| 748 |
|
| 749 |
// Drag and drop utilities
|
|
|
|
| 887 |
richTraceLog.classList.add("hidden");
|
| 888 |
metaStatus.textContent = "Processing...";
|
| 889 |
metaStatus.className = "text-yellow-500 font-semibold";
|
| 890 |
+
detectionTagsWrapper.innerHTML = '<p class="text-slate-400 animate-pulse">Processing objects in backend...</p>';
|
| 891 |
detectionCountBadge.textContent = "0";
|
| 892 |
+
detectionCountBadge.classList.remove("det-count-pop");
|
| 893 |
|
| 894 |
try {
|
| 895 |
const clientInstance = await getClient();
|
|
|
|
| 963 |
metaBps.textContent = stats.bps || "-";
|
| 964 |
metaTime.textContent = stats.total_time_seconds ? `${stats.total_time_seconds}s` : "Optimal";
|
| 965 |
|
| 966 |
+
// Render detection tags with staggered pop-in animation
|
| 967 |
+
renderDetectionTags(meta.detections || []);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 968 |
|
| 969 |
+
// Render decoding trace (token-by-token pop animation from previous version)
|
| 970 |
if (meta.html) {
|
| 971 |
richTraceLog.innerHTML = meta.html;
|
| 972 |
richTraceLog.classList.remove("hidden");
|