fix-bot committed on
Commit
4c3d7bf
·
1 Parent(s): 605f703

ui: restore staggered detection pop-in and add usage guide

Browse files

Bring back one-by-one overlay animation and decoding trace from the earlier demo, and add Quick Start guidance for categories and parameters.

Files changed (2) hide show
  1. app.py +118 -6
  2. index.html +117 -22
app.py CHANGED
@@ -306,6 +306,107 @@ def _parse_out_info_dict(out_info: str) -> dict:
306
  return stats
307
 
308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  def generate_raw_prompt(task_type, category):
310
  if not category:
311
  category = "objects"
@@ -407,7 +508,8 @@ def run_image_gpu_api(
407
  "coords": [round(c, 2) for c in det.get("coords", [])]
408
  })
409
 
410
- return out_img_path, stats, output_text, detections_summary
 
411
 
412
 
413
  @spaces.GPU(duration=240, size="xlarge")
@@ -558,7 +660,15 @@ def run_video_gpu_api(
558
  "early_stop_reason": early_stop_reason
559
  }
560
 
561
- return out_video_path, stats, "\n---\n".join(inference_results_for_draw), detections_summary
 
 
 
 
 
 
 
 
562
 
563
 
564
  # ============================================================
@@ -623,7 +733,7 @@ def run_inference_api(
623
  if not img_path or not os.path.exists(img_path):
624
  return None, None, {"success": False, "error": f"Invalid image file path: {img_path}"}
625
 
626
- out_img_path, stats, raw_text, detections = run_image_gpu_api(
627
  img_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt
628
  )
629
 
@@ -633,7 +743,8 @@ def run_inference_api(
633
  "stats": stats,
634
  "raw_text": raw_text,
635
  "detections": detections,
636
- "final_prompt": final_prompt
 
637
  }
638
  return FileData(path=out_img_path), None, meta
639
 
@@ -652,7 +763,7 @@ def run_inference_api(
652
  if not vid_path or not os.path.exists(vid_path):
653
  return None, None, {"success": False, "error": f"Invalid video file path: {vid_path}"}
654
 
655
- out_vid_path, stats, raw_text, detections = run_video_gpu_api(
656
  vid_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt, max_video_frames
657
  )
658
 
@@ -662,7 +773,8 @@ def run_inference_api(
662
  "stats": stats,
663
  "raw_text": raw_text,
664
  "detections": detections,
665
- "final_prompt": final_prompt
 
666
  }
667
  return None, FileData(path=out_vid_path), meta
668
 
 
306
  return stats
307
 
308
 
309
+ def generate_dynamic_html(token_sequence, out_info, raw_text):
310
+ uid = f"a{int(time.time() * 1000)}"
311
+ css = f"""
312
+ <style>
313
+ .dc-root-{uid} {{
314
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
315
+ border: 1px solid rgba(118, 185, 0, 0.25); border-radius: 12px;
316
+ background: rgba(0, 0, 0, 0.55); overflow: hidden;
317
+ }}
318
+ .dc-header-{uid} {{
319
+ display: flex; align-items: center; justify-content: space-between; flex-wrap: wrap; gap: 8px;
320
+ padding: 10px 14px;
321
+ background: linear-gradient(135deg, rgba(118, 185, 0, 0.25) 0%, rgba(63, 98, 0, 0.35) 100%);
322
+ border-bottom: 1px solid rgba(118, 185, 0, 0.2);
323
+ }}
324
+ .dc-header-title-{uid} {{ font-weight: 700; font-size: 0.82em; color: #d9f99d; letter-spacing: 0.04em; text-transform: uppercase; }}
325
+ .dc-legend-{uid} {{ display: flex; gap: 12px; align-items: center; flex-wrap: wrap; }}
326
+ .dc-legend-item-{uid} {{ display: flex; align-items: center; gap: 5px; font-size: 0.72em; color: rgba(226, 232, 240, 0.85); }}
327
+ .dc-legend-dot-{uid} {{ width: 8px; height: 8px; border-radius: 2px; display: inline-block; }}
328
+ .dc-row-{uid} {{ display: flex; gap: 10px; padding: 12px 14px; border-bottom: 1px solid rgba(255,255,255,0.05); }}
329
+ .dc-row-{uid}:last-child {{ border-bottom: none; }}
330
+ .dc-val-{uid} {{ flex: 1; line-height: 2.2; word-wrap: break-word; color: #cbd5e1; font-size: 0.85em; }}
331
+ @keyframes tk-{uid} {{
332
+ 0% {{ opacity: 0; transform: translateY(8px) scale(0.92); }}
333
+ 60% {{ opacity: 1; transform: translateY(-2px) scale(1.02); }}
334
+ 100% {{ opacity: 1; transform: translateY(0) scale(1); }}
335
+ }}
336
+ .tk-mtp-{uid}, .tk-ar-{uid} {{
337
+ opacity: 0; animation: tk-{uid} 0.35s ease-out forwards;
338
+ border-radius: 5px; padding: 2px 7px; margin: 2px 1px; display: inline-block;
339
+ font-size: 0.78em; font-weight: 600;
340
+ font-family: 'Fira Code', Consolas, monospace; white-space: nowrap;
341
+ }}
342
+ .tk-mtp-{uid} {{ background: rgba(118, 185, 0, 0.15); border: 1px solid rgba(118, 185, 0, 0.55); color: #bbf7d0; }}
343
+ .tk-ar-{uid} {{ background: rgba(230, 81, 0, 0.15); border: 1px solid rgba(230, 81, 0, 0.55); color: #fed7aa; }}
344
+ .tk-stat-{uid} {{
345
+ opacity: 0; animation: tk-{uid} 0.4s ease-out forwards;
346
+ background: rgba(118, 185, 0, 0.12); border: 1px solid rgba(118, 185, 0, 0.35); border-radius: 6px;
347
+ padding: 4px 12px; display: inline-block; font-size: 0.78em; color: #d9f99d; font-weight: 600;
348
+ }}
349
+ .dc-raw-{uid} {{ padding: 0 14px 12px; }}
350
+ .dc-raw-{uid} summary {{ cursor: pointer; color: #94a3b8; font-size: 0.78em; user-select: none; }}
351
+ .dc-raw-{uid} summary:hover {{ color: #76b900; }}
352
+ .dc-raw-pre-{uid} {{
353
+ background: rgba(0,0,0,0.45); border: 1px solid rgba(255,255,255,0.08); border-radius: 6px;
354
+ padding: 10px; margin-top: 8px;
355
+ font-family: 'Fira Code', Consolas, monospace;
356
+ font-size: 0.74em; color: #cbd5e1; white-space: pre-wrap; word-break: break-all;
357
+ max-height: 180px; overflow-y: auto;
358
+ }}
359
+ </style>
360
+ """
361
+ h = css + f'<div class="dc-root-{uid}">'
362
+ h += (f'<div class="dc-header-{uid}">'
363
+ f'<span class="dc-header-title-{uid}">Decoding Trace</span>'
364
+ f'<div class="dc-legend-{uid}">'
365
+ f'<div class="dc-legend-item-{uid}"><span class="dc-legend-dot-{uid}" style="background:#76b900;"></span>MTP Parallel</div>'
366
+ f'<div class="dc-legend-item-{uid}"><span class="dc-legend-dot-{uid}" style="background:#e65100;"></span>AR Fallback</div>'
367
+ f'</div></div>')
368
+ h += f'<div class="dc-row-{uid}"><div class="dc-val-{uid}">'
369
+ tok_idx = 0
370
+ if token_sequence:
371
+ for item in token_sequence:
372
+ if not isinstance(item, (list, tuple)) or len(item) < 2:
373
+ continue
374
+ decode_type = str(item[0]).lower()
375
+ text = str(item[1])
376
+ safe = text.replace("<", "&lt;").replace(">", "&gt;")
377
+ delay = f"{tok_idx * 0.06:.2f}s"
378
+ cls = f"tk-ar-{uid}" if decode_type == "ar" else f"tk-mtp-{uid}"
379
+ h += f'<span class="{cls}" style="animation-delay:{delay}">{safe}</span> '
380
+ tok_idx += 1
381
+ h += '</div></div>'
382
+ if out_info:
383
+ stats = _parse_out_info_dict(out_info)
384
+ bits = []
385
+ if "forward_step" in stats:
386
+ bits.append(f"{stats['forward_step']} steps")
387
+ if "num_tokens" in stats:
388
+ bits.append(f"{stats['num_tokens']} tokens")
389
+ if "num_boxes" in stats:
390
+ bits.append(f"{stats['num_boxes']} boxes")
391
+ if "switch_to_ar" in stats:
392
+ n = stats["switch_to_ar"]
393
+ bits.append(f"{n} AR fallback{'s' if n != '1' else ''}")
394
+ if "tps" in stats:
395
+ bits.append(f"{stats['tps']} tok/s")
396
+ if "bps" in stats:
397
+ bits.append(f"{stats['bps']} box/s")
398
+ summary = " · ".join(bits) if bits else out_info.strip()
399
+ stat_delay = f"{tok_idx * 0.06 + 0.3:.2f}s"
400
+ h += (f'<div class="dc-row-{uid}" style="justify-content:flex-end;padding-top:4px;padding-bottom:8px;border-bottom:none;">'
401
+ f'<span class="tk-stat-{uid}" style="animation-delay:{stat_delay}">{summary}</span></div>')
402
+ if raw_text:
403
+ safe_raw = raw_text.replace("<", "&lt;").replace(">", "&gt;")
404
+ h += (f'<div class="dc-raw-{uid}"><details><summary>Show Raw Response</summary>'
405
+ f'<div class="dc-raw-pre-{uid}">{safe_raw}</div></details></div>')
406
+ h += '</div>'
407
+ return h
408
+
409
+
410
  def generate_raw_prompt(task_type, category):
411
  if not category:
412
  category = "objects"
 
508
  "coords": [round(c, 2) for c in det.get("coords", [])]
509
  })
510
 
511
+ html = generate_dynamic_html(token_sequence, out_info, output_text)
512
+ return out_img_path, stats, output_text, detections_summary, html
513
 
514
 
515
  @spaces.GPU(duration=240, size="xlarge")
 
660
  "early_stop_reason": early_stop_reason
661
  }
662
 
663
+ raw_combined = "\n---\n".join(inference_results_for_draw)
664
+ timing_summary = (
665
+ f"Processed {processed_count}/{n_sampled} sampled frames "
666
+ f"({total} total) in {total_time:.1f}s"
667
+ )
668
+ if early_stopped:
669
+ timing_summary += f" — {early_stop_reason}"
670
+ html = generate_dynamic_html([], "", timing_summary + "\n\n" + raw_combined)
671
+ return out_video_path, stats, raw_combined, detections_summary, html
672
 
673
 
674
  # ============================================================
 
733
  if not img_path or not os.path.exists(img_path):
734
  return None, None, {"success": False, "error": f"Invalid image file path: {img_path}"}
735
 
736
+ out_img_path, stats, raw_text, detections, html = run_image_gpu_api(
737
  img_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt
738
  )
739
 
 
743
  "stats": stats,
744
  "raw_text": raw_text,
745
  "detections": detections,
746
+ "final_prompt": final_prompt,
747
+ "html": html,
748
  }
749
  return FileData(path=out_img_path), None, meta
750
 
 
763
  if not vid_path or not os.path.exists(vid_path):
764
  return None, None, {"success": False, "error": f"Invalid video file path: {vid_path}"}
765
 
766
+ out_vid_path, stats, raw_text, detections, html = run_video_gpu_api(
767
  vid_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt, max_video_frames
768
  )
769
 
 
773
  "stats": stats,
774
  "raw_text": raw_text,
775
  "detections": detections,
776
+ "final_prompt": final_prompt,
777
+ "html": html,
778
  }
779
  return None, FileData(path=out_vid_path), meta
780
 
index.html CHANGED
@@ -184,6 +184,20 @@
184
  border-color: #76b900 !important;
185
  background: rgba(118, 185, 0, 0.04) !important;
186
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  </style>
188
  </head>
189
  <body class="text-slate-100 font-sans min-h-screen pb-16 carbon-grid">
@@ -270,6 +284,9 @@
270
  <p class="text-xs text-slate-400 max-w-sm font-medium leading-relaxed">
271
  NVIDIA's advanced 3B vision-language model. Locate any object, UI target, or text in images and videos with natural language.
272
  </p>
 
 
 
273
  </div>
274
 
275
  <!-- Setup Glass Card Controls -->
@@ -367,6 +384,23 @@
367
 
368
  </div>
369
  </details>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  </div>
371
 
372
  <!-- CTA Action Button (Floats at bottom-left corner of visual container) -->
@@ -380,7 +414,7 @@
380
  </div>
381
 
382
  <!-- 3. Floating Categories Search Bar Overlay (Right/Center side, extremely clean glass box) -->
383
- <div class="absolute top-1/2 right-6 lg:right-16 -translate-y-1/2 z-30 flex justify-center pointer-events-none w-full max-w-xs">
384
  <div class="sam-input-bar rounded-2xl px-3.5 py-2.5 flex items-center gap-2 w-full pointer-events-auto">
385
  <svg class="h-4 w-4 text-nvidia-brand shrink-0" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5">
386
  <path stroke-linecap="round" stroke-linejoin="round" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" />
@@ -392,6 +426,9 @@
392
  </svg>
393
  </button>
394
  </div>
 
 
 
395
  </div>
396
 
397
  <!-- Floating Workspace Status -->
@@ -459,20 +496,23 @@
459
  </div>
460
 
461
  <!-- Tag drawer box list (Grid: 7) -->
462
- <div class="sm:col-span-7 bg-black/60 rounded-xl p-4 border border-white/5 flex flex-col">
463
  <div class="text-nvidia-brand font-mono font-bold border-b border-white/5 pb-1 mb-2 uppercase tracking-widest text-[9px] flex justify-between shrink-0">
464
  <span>🎯 Detected Target Overlays</span>
465
  <span id="detection-count-badge" class="text-[8px] bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 px-1.5 py-0.5 rounded-full font-bold">0</span>
466
  </div>
467
- <div id="detection-tags-wrapper" class="flex-1 flex flex-wrap gap-1.5 max-h-[100px] overflow-y-auto pt-1 align-content-start text-[10px] text-slate-500">
468
- Run inference to populate target tags here.
 
 
 
469
  </div>
470
  </div>
471
 
472
  </div>
473
 
474
  <!-- Optional dynamic trace wrapper -->
475
- <div id="rich-trace-log" class="hidden border-t border-white/5 pt-3"></div>
476
  </div>
477
  </div>
478
 
@@ -599,6 +639,72 @@
599
  }
600
  }
601
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
602
  // Switch workspace input styles without clearing
603
  function setMediaType(type) {
604
  selectedMediaType = type;
@@ -637,6 +743,7 @@
637
  outputVideo.src = "";
638
  outputVideo.classList.add("hidden");
639
  workspaceStatus.textContent = "Workspace Cleared";
 
640
  }
641
 
642
  // Drag and drop utilities
@@ -780,8 +887,9 @@
780
  richTraceLog.classList.add("hidden");
781
  metaStatus.textContent = "Processing...";
782
  metaStatus.className = "text-yellow-500 font-semibold";
783
- detectionTagsWrapper.innerHTML = "Processing objects in backend...";
784
  detectionCountBadge.textContent = "0";
 
785
 
786
  try {
787
  const clientInstance = await getClient();
@@ -855,23 +963,10 @@
855
  metaBps.textContent = stats.bps || "-";
856
  metaTime.textContent = stats.total_time_seconds ? `${stats.total_time_seconds}s` : "Optimal";
857
 
858
- // Render detection tags
859
- const detections = meta.detections || [];
860
- detectionCountBadge.textContent = detections.length;
861
-
862
- if (detections.length === 0) {
863
- detectionTagsWrapper.innerHTML = "No objects matched categories.";
864
- } else {
865
- detectionTagsWrapper.innerHTML = "";
866
- detections.forEach(det => {
867
- const tag = document.createElement("span");
868
- tag.className = "px-2 py-0.5 rounded bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 font-bold uppercase tracking-wider text-[8px] animate-fade-in";
869
- tag.textContent = det.frame ? `[Frame ${det.frame}] ${det.label}` : det.label;
870
- detectionTagsWrapper.appendChild(tag);
871
- });
872
- }
873
 
874
- // Render logs trace
875
  if (meta.html) {
876
  richTraceLog.innerHTML = meta.html;
877
  richTraceLog.classList.remove("hidden");
 
184
  border-color: #76b900 !important;
185
  background: rgba(118, 185, 0, 0.04) !important;
186
  }
187
+
188
+ /* Detection overlay tag pop-in (restored from previous demo) */
189
+ @keyframes det-pop {
190
+ 0% { opacity: 0; transform: translateY(10px) scale(0.88); }
191
+ 60% { opacity: 1; transform: translateY(-2px) scale(1.03); }
192
+ 100% { opacity: 1; transform: translateY(0) scale(1); }
193
+ }
194
+ .det-tag-pop {
195
+ opacity: 0;
196
+ animation: det-pop 0.38s cubic-bezier(0.16, 1, 0.3, 1) forwards;
197
+ }
198
+ .det-count-pop {
199
+ animation: det-pop 0.35s cubic-bezier(0.16, 1, 0.3, 1) forwards;
200
+ }
201
  </style>
202
  </head>
203
  <body class="text-slate-100 font-sans min-h-screen pb-16 carbon-grid">
 
284
  <p class="text-xs text-slate-400 max-w-sm font-medium leading-relaxed">
285
  NVIDIA's advanced 3B vision-language model. Locate any object, UI target, or text in images and videos with natural language.
286
  </p>
287
+ <p class="text-[9px] text-slate-500 max-w-sm leading-relaxed border-l-2 border-nvidia-brand/30 pl-2.5">
288
+ Note: inputs larger than 1K are auto-resized in this Space demo. For full-resolution inference, download the weights and run locally.
289
+ </p>
290
  </div>
291
 
292
  <!-- Setup Glass Card Controls -->
 
384
 
385
  </div>
386
  </details>
387
+
388
+ <!-- Quick Start Guide -->
389
+ <details class="group border-t border-white/5 pt-3" open>
390
+ <summary class="list-none flex justify-between items-center cursor-pointer select-none text-[8px] font-bold text-nvidia-brand tracking-wider uppercase hover:text-nvidia-hover transition-colors">
391
+ <span>📖 How to Use</span>
392
+ <svg class="h-3 w-3 transform group-open:rotate-180 transition-transform text-slate-500" fill="none" viewBox="0 0 24 24" stroke="currentColor">
393
+ <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 9l-7 7-7-7" />
394
+ </svg>
395
+ </summary>
396
+ <ol class="space-y-1.5 pt-2.5 text-[9px] text-slate-400 leading-relaxed list-decimal list-inside marker:text-nvidia-brand/70">
397
+ <li>Upload an <strong class="text-slate-300">Image</strong> or <strong class="text-slate-300">Video</strong>, or pick a Quick Sandbox example below.</li>
398
+ <li>Choose a <strong class="text-slate-300">Task Type</strong>: Detection · Grounding · OCR · GUI · Pointing.</li>
399
+ <li>Enter <strong class="text-slate-300">Categories</strong> in the search bar (comma-separated, e.g. <code class="text-nvidia-brand/80">car, person</code>).</li>
400
+ <li>Optionally tune <strong class="text-slate-300">Advanced parameters</strong> above (mode, resize, temperature, etc.).</li>
401
+ <li>Click <strong class="text-nvidia-brand">Run Inference</strong> or press <kbd class="px-1 py-0.5 rounded bg-white/5 border border-white/10 text-[8px]">Enter</kbd> in the search bar.</li>
402
+ </ol>
403
+ </details>
404
  </div>
405
 
406
  <!-- CTA Action Button (Floats at bottom-left corner of visual container) -->
 
414
  </div>
415
 
416
  <!-- 3. Floating Categories Search Bar Overlay (Right/Center side, extremely clean glass box) -->
417
+ <div class="absolute top-1/2 right-6 lg:right-16 -translate-y-1/2 z-30 flex flex-col items-end gap-2 pointer-events-none w-full max-w-xs">
418
  <div class="sam-input-bar rounded-2xl px-3.5 py-2.5 flex items-center gap-2 w-full pointer-events-auto">
419
  <svg class="h-4 w-4 text-nvidia-brand shrink-0" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5">
420
  <path stroke-linecap="round" stroke-linejoin="round" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" />
 
426
  </svg>
427
  </button>
428
  </div>
429
+ <p class="text-[9px] text-slate-500 text-right leading-relaxed pointer-events-none px-1 max-w-full">
430
+ Comma-separated targets · supports English &amp; Chinese · press <span class="text-slate-400">Enter</span> to run
431
+ </p>
432
  </div>
433
 
434
  <!-- Floating Workspace Status -->
 
496
  </div>
497
 
498
  <!-- Tag drawer box list (Grid: 7) -->
499
+ <div class="sm:col-span-7 bg-black/60 rounded-xl p-4 border border-white/5 flex flex-col min-h-[140px]">
500
  <div class="text-nvidia-brand font-mono font-bold border-b border-white/5 pb-1 mb-2 uppercase tracking-widest text-[9px] flex justify-between shrink-0">
501
  <span>🎯 Detected Target Overlays</span>
502
  <span id="detection-count-badge" class="text-[8px] bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 px-1.5 py-0.5 rounded-full font-bold">0</span>
503
  </div>
504
+ <div id="detection-tags-wrapper" class="flex-1 flex flex-col gap-1.5 max-h-[120px] overflow-y-auto pt-1 text-[10px] text-slate-500">
505
+ <div id="detection-empty-hint" class="space-y-1.5 leading-relaxed">
506
+ <p>Run inference to populate detected targets here — each result will pop in one by one.</p>
507
+ <p class="text-[9px] text-slate-600">Adjustable: Task Type · Categories · Inference Mode · Resize Cap · Temperature · Top P/K · Max Video Frames</p>
508
+ </div>
509
  </div>
510
  </div>
511
 
512
  </div>
513
 
514
  <!-- Optional dynamic trace wrapper -->
515
+ <div id="rich-trace-log" class="hidden border-t border-white/5 pt-3 text-[10px]"></div>
516
  </div>
517
  </div>
518
 
 
639
  }
640
  }
641
 
642
+ function formatDetectionCoords(det) {
643
+ const coords = det.coords || [];
644
+ if (!coords.length) return "";
645
+ const rounded = coords.map(c => Number.isFinite(c) ? Math.round(c) : c);
646
+ return rounded.join(", ");
647
+ }
648
+
649
+ function renderDetectionTags(detections) {
650
+ detectionTagsWrapper.innerHTML = "";
651
+ detectionCountBadge.textContent = "0";
652
+ detectionCountBadge.classList.remove("det-count-pop");
653
+
654
+ if (!detections.length) {
655
+ detectionTagsWrapper.innerHTML = '<p class="text-slate-500">No objects matched the given categories.</p>';
656
+ return;
657
+ }
658
+
659
+ // Animate count badge after tags finish popping in
660
+ const countDelay = detections.length * 80 + 120;
661
+ setTimeout(() => {
662
+ detectionCountBadge.textContent = detections.length;
663
+ detectionCountBadge.classList.add("det-count-pop");
664
+ }, countDelay);
665
+
666
+ detections.forEach((det, idx) => {
667
+ setTimeout(() => {
668
+ const card = document.createElement("div");
669
+ card.className = "det-tag-pop flex items-center justify-between gap-2 px-2 py-1.5 rounded-lg bg-nvidia-brand/8 border border-nvidia-brand/20 hover:border-nvidia-brand/40 transition-colors";
670
+ card.style.animationDelay = "0s";
671
+
672
+ const labelWrap = document.createElement("div");
673
+ labelWrap.className = "flex items-center gap-1.5 min-w-0";
674
+
675
+ const typeBadge = document.createElement("span");
676
+ typeBadge.className = "shrink-0 px-1 py-0.5 rounded text-[7px] font-bold uppercase tracking-wider bg-black/40 text-nvidia-brand border border-nvidia-brand/25";
677
+ typeBadge.textContent = det.type || "box";
678
+
679
+ const label = document.createElement("span");
680
+ label.className = "font-bold uppercase tracking-wider text-[9px] text-nvidia-brand truncate";
681
+ label.textContent = det.frame ? `[F${det.frame}] ${det.label}` : (det.label || "object");
682
+
683
+ labelWrap.appendChild(typeBadge);
684
+ labelWrap.appendChild(label);
685
+
686
+ const coords = document.createElement("span");
687
+ coords.className = "shrink-0 font-mono text-[8px] text-slate-500";
688
+ const coordStr = formatDetectionCoords(det);
689
+ coords.textContent = coordStr ? `[${coordStr}]` : "";
690
+
691
+ card.appendChild(labelWrap);
692
+ card.appendChild(coords);
693
+ detectionTagsWrapper.appendChild(card);
694
+ }, idx * 80);
695
+ });
696
+ }
697
+
698
+ function resetDetectionTagsPlaceholder() {
699
+ detectionTagsWrapper.innerHTML = `
700
+ <div id="detection-empty-hint" class="space-y-1.5 leading-relaxed">
701
+ <p>Run inference to populate detected targets here — each result will pop in one by one.</p>
702
+ <p class="text-[9px] text-slate-600">Adjustable: Task Type · Categories · Inference Mode · Resize Cap · Temperature · Top P/K · Max Video Frames</p>
703
+ </div>`;
704
+ detectionCountBadge.textContent = "0";
705
+ detectionCountBadge.classList.remove("det-count-pop");
706
+ }
707
+
708
  // Switch workspace input styles without clearing
709
  function setMediaType(type) {
710
  selectedMediaType = type;
 
743
  outputVideo.src = "";
744
  outputVideo.classList.add("hidden");
745
  workspaceStatus.textContent = "Workspace Cleared";
746
+ resetDetectionTagsPlaceholder();
747
  }
748
 
749
  // Drag and drop utilities
 
887
  richTraceLog.classList.add("hidden");
888
  metaStatus.textContent = "Processing...";
889
  metaStatus.className = "text-yellow-500 font-semibold";
890
+ detectionTagsWrapper.innerHTML = '<p class="text-slate-400 animate-pulse">Processing objects in backend...</p>';
891
  detectionCountBadge.textContent = "0";
892
+ detectionCountBadge.classList.remove("det-count-pop");
893
 
894
  try {
895
  const clientInstance = await getClient();
 
963
  metaBps.textContent = stats.bps || "-";
964
  metaTime.textContent = stats.total_time_seconds ? `${stats.total_time_seconds}s` : "Optimal";
965
 
966
+ // Render detection tags with staggered pop-in animation
967
+ renderDetectionTags(meta.detections || []);
 
 
 
 
 
 
 
 
 
 
 
 
 
968
 
969
+ // Render decoding trace (token-by-token pop animation from previous version)
970
  if (meta.html) {
971
  richTraceLog.innerHTML = meta.html;
972
  richTraceLog.classList.remove("hidden");