<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>AVIP – Audio–Vision Interaction Probe (Benchmark)</title>
<meta name="description" content="AVIP is a lightweight benchmark to check whether multimodal models truly use audio in videos.">

<!-- Link-preview metadata (Open Graph / Twitter card) -->
<meta property="og:title" content="AVIP Benchmark">
<meta property="og:description" content="Do multimodal models actually use audio in videos?">
<meta property="og:type" content="website">
<!-- og:video has to be a URL; the earlier value "thumbnail" resolved to nothing.
     It now points at the hero clip used further down the page.
     NOTE(review): most crawlers want an absolute URL; prefix the deployed origin once it is known. -->
<meta property="og:video" content="thumbnail.mp4">
<meta property="og:video:type" content="video/mp4">
<!-- NOTE(review): summary_large_image normally needs an og:image / twitter:image; none is declared here. -->
<meta name="twitter:card" content="summary_large_image">

<link rel="icon" href="assets/favicon.png">
<style>
/* ---------------------------------------------------------------------------
   Theme tokens. Dark values are the default; the light scheme overrides them.
   --line-rgb is the RGB triplet used for hairlines and control borders so they
   stay visible in both schemes (white-alpha lines disappear on a white page).
   --------------------------------------------------------------------------- */
:root{
  --bg:#0b0b0c;
  --surface:#111216;
  --text:#e9e9ee;
  --muted:#9aa0a6;
  --accent:#6ee7ff;
  --ring:rgba(110,231,255,.35);
  --line-rgb:255,255,255;
  --card:color-mix(in oklab, var(--surface), transparent 8%);
  --border:1px solid rgba(255,255,255,.08);
}
@media (prefers-color-scheme: light){
  :root{
    --bg:#fafafa;
    --surface:#ffffff;
    --text:#101114;
    --muted:#5f6368;
    --accent:#0078ff;
    --ring:rgba(0,120,255,.2);
    --line-rgb:0,0,0;
    --card:color-mix(in oklab, var(--surface), transparent 4%);
    --border:1px solid rgba(0,0,0,.08);
  }
}

/* ---------- Base ---------- */
*,*::before,*::after{ box-sizing:border-box }
html,body{ height:100% }
body{
  margin:0;
  font:16px/1.6 system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif;
  color:var(--text);
  /* two soft accent glows layered over the flat page colour */
  background:
    radial-gradient(1200px 800px at 10% -5%, rgba(110,231,255,.08), transparent 40%),
    radial-gradient(900px 700px at 110% 10%, rgba(110,231,255,.06), transparent 40%),
    var(--bg);
}

/* ---------- Sticky header and navigation ---------- */
.site-header{
  position:sticky; top:0; z-index:20;
  display:flex; align-items:center; gap:1rem; justify-content:space-between;
  padding:.8rem 1rem;
  border-bottom:var(--border);
  background:color-mix(in oklab, var(--surface), transparent 35%);
  /* prefixed form for Safari versions that predate the unprefixed property */
  -webkit-backdrop-filter:saturate(1.2) blur(8px);
  backdrop-filter:saturate(1.2) blur(8px);
}
.logo{
  font-weight:800; letter-spacing:.2px; text-decoration:none; color:var(--text);
  display:flex; align-items:center; gap:.55rem;
}
.logo .dot{
  width:.7rem; height:.7rem; border-radius:999px;
  background:var(--accent); box-shadow:0 0 16px var(--ring);
}
.logo span{ color:var(--accent) }
.site-nav{ display:flex; gap:.75rem }
.site-nav a{ color:var(--text); text-decoration:none; padding:.4rem .6rem; border-radius:.5rem }
.site-nav a:hover{ outline:2px solid var(--ring); outline-offset:2px }
.nav-toggle{
  display:none;
  background:transparent;
  border:1px solid rgba(var(--line-rgb),.2);
  color:var(--text);
  border-radius:.5rem;
  padding:.4rem .6rem;
}
@media (max-width: 720px){
  /* narrow screens: the nav becomes a dropdown shown via the .open class */
  .nav-toggle{ display:inline-block }
  .site-nav{
    position:absolute; right:1rem; top:3.4rem;
    flex-direction:column; padding:.6rem;
    background:var(--surface); border:var(--border); border-radius:.6rem;
    display:none;
  }
  .site-nav.open{ display:flex }
}

/* ---------- Hero ---------- */
.hero{ text-align:center; padding:4.5rem 1rem 2rem; max-width:1060px; margin:0 auto }
.hero h1{ font-size:clamp(2rem,3.6vw,3rem); margin:0 0 .5rem }
.hero p{ margin:0 0 1.5rem; color:var(--muted) }
.btn{
  display:inline-block; padding:.72rem 1rem; border-radius:.6rem;
  background:var(--accent); color:#06141b; text-decoration:none; font-weight:700;
}
.cover{
  margin:1.2rem auto 0; max-width:980px; aspect-ratio:16/9;
  border-radius:.8rem; overflow:hidden; border:var(--border); background:var(--card);
}
.cover img{ width:100%; height:100%; object-fit:cover; display:block }

/* ---------- Generic sections ---------- */
.section{ padding:2.6rem 1rem; max-width:1060px; margin:0 auto }
.section h2{ font-size:clamp(1.4rem,2.2vw,1.8rem); margin:0 0 .4rem }
.muted{ color:var(--muted) }

.features{
  display:grid; gap:.8rem;
  grid-template-columns:repeat(auto-fill,minmax(230px,1fr));
  margin:1rem 0 0;
}
.card{ background:var(--card); border:var(--border); border-radius:.8rem; padding:1rem }

/* ---------- Leaderboard table ---------- */
.table-wrap{ overflow-x:auto; background:var(--card); border-radius:.8rem; border:var(--border) }
table{ width:100%; border-collapse:collapse }
th,td{ padding:.7rem .8rem; border-bottom:1px solid rgba(var(--line-rgb),.06); text-align:left }
th{ font-weight:700 }
.pill{
  display:inline-block; padding:.2rem .5rem; border-radius:.5rem;
  background:rgba(110,231,255,.18); color:var(--text); font-size:.8rem;
}

/* ---------- Media / examples ---------- */
.media-grid{ display:grid; grid-template-columns:2fr 1fr; gap:1rem }
@media (max-width: 920px){ .media-grid{ grid-template-columns:1fr } }
figure{ margin:0 }
.media-card{ background:var(--card); border:var(--border); border-radius:.8rem; padding:.8rem }
video{ width:100%; max-height:520px; background:#000; border-radius:.6rem }
.toolbar{ display:flex; gap:.5rem; flex-wrap:wrap; margin:.6rem 0 }
.toolbar button{
  background:transparent; color:var(--text);
  border:1px solid rgba(var(--line-rgb),.25);
  padding:.38rem .6rem; border-radius:.5rem; cursor:pointer;
}
.toolbar button[aria-pressed="true"]{ outline:2px solid var(--ring) }

/* ---------- Footer ---------- */
.site-footer{ padding:2rem 1rem 4rem; text-align:center; color:var(--muted) }

/* Honour the user's reduced-motion preference for CSS motion */
@media (prefers-reduced-motion: reduce){ *{ transition:none!important; animation:none!important } }

/* ---------- Result figures ---------- */
.results-group{ margin-top:.8rem }
.results-group h3{ margin:.2rem 0 .4rem; font-size:1rem; color:var(--muted) }
.grid-2{ display:grid; grid-template-columns:repeat(2,1fr); gap:1rem }
.grid-3{ display:grid; grid-template-columns:repeat(3,1fr); gap:1rem }
@media (max-width: 900px){
  .grid-2, .grid-3{ grid-template-columns:1fr }
}
.figure-card img{
  width:100%; height:auto; border-radius:.6rem; display:block;
  box-shadow:0 2px 12px rgba(0,0,0,.08);
  cursor:zoom-in; /* figures open in the enlarge overlay */
}
.figure-card figcaption{ font-size:.9rem; color:var(--muted); margin-top:.35rem }

/* Item that spans every column of .media-grid (no-op once the grid is one column) */
.full-span{ grid-column:1 / -1 }
@media (max-width: 920px){ .full-span{ grid-column:auto } }

/* ---------- Enlarged-image overlay ---------- */
.img-modal{
  display:none; position:fixed; inset:0; z-index:1000;
  background:rgba(0,0,0,.75);
  align-items:center; justify-content:center;
  padding:2rem;
}
.img-modal.open{ display:flex }
.img-modal img{
  max-width:92vw; max-height:92vh;
  border-radius:.6rem; box-shadow:0 6px 28px rgba(0,0,0,.4);
}
.img-modal .close{
  position:absolute; top:12px; right:16px;
  font-size:1.6rem; color:#fff; background:transparent; border:0; cursor:pointer;
}
body.modal-open{ overflow:hidden } /* stop the page scrolling behind the overlay */

/* ---------- Hero buttons and clip ---------- */
.btn-row{ display:flex; gap:.6rem; justify-content:center; flex-wrap:wrap; margin:.6rem 0 .2rem }
.btn.btn-disabled{ opacity:.6; cursor:not-allowed; pointer-events:none }

.hero > video{
  margin-top:.8rem; display:block; max-width:980px; width:100%;
  border-radius:.8rem; border:var(--border); background:var(--card);
}

/* Reading hint shown with a figure */
.hint{
  margin-top:.35rem; font-size:.88rem; color:var(--muted);
  border-left:3px solid color-mix(in oklab, var(--accent), transparent 65%);
  padding:.2rem .6rem; line-height:1.4;
}
</style>
</head>
<body>
|
|
<!-- Sticky site header: logo plus in-page navigation -->
<header class="site-header">
  <a href="#home" class="logo" aria-label="AVIP Benchmark home">
    <span class="dot" aria-hidden="true"></span> AVIP<span>Benchmark</span>
  </a>
  <!-- Shown only on narrow screens. type="button" keeps it from ever acting as a
       submit control; aria-label gives the "☰" glyph a readable name. -->
  <button type="button" class="nav-toggle" aria-label="Toggle navigation" aria-expanded="false" aria-controls="site-nav">☰</button>
  <nav id="site-nav" class="site-nav" aria-label="Main navigation">
    <a href="#home">Home</a>
    <a href="#about">About</a>
    <a href="#leaderboard">Leaderboard</a>
    <a href="#media">Examples</a>
    <a href="#contact">Contact</a>
  </nav>
</header>
|
|
<main id="home">
<!-- Hero: tagline, calls to action, looping teaser clip -->
<section class="hero" aria-labelledby="tagline">
  <h1 id="tagline">Do you hear it? Meet AVIP-Bench</h1>
  <p style="font-style:italic;">
    A controlled benchmark for evaluating intuitive physics from video &amp; sound.
  </p>
  <p>
    Objects crash, bounce, and shatter - our benchmark of audiovisual object drops
    probes whether models benefit from adding <strong>sound</strong> when reasoning about physics.
  </p>

  <div class="btn-row">
    <a class="btn" href="#media">See example Videos and Results</a>
    <!-- Deliberately has no href: an href-less <a> is an inert placeholder, so it
         cannot be focused or activated from the keyboard while "disabled".
         The page script assigns the real href once the PDF is reachable. -->
    <a id="paperLink" class="btn btn-disabled" aria-disabled="true">📄 PDF coming soon</a>
  </div>

  <!-- Muted, looping teaser; MP4 first, WebM as the alternative source -->
  <video autoplay muted loop playsinline>
    <source src="thumbnail.mp4" type="video/mp4">
    <source src="thumbnail.webm" type="video/webm">
  </video>
</section>
|
|
<!-- About: what the benchmark is, its key properties, and a short method note -->
<section id="about" class="section">
  <h2>What is AVIP?</h2>
  <p class="muted">
    A tiny, controlled benchmark with triplet videos per clip:
    <span class="pill">A</span> audio-only,
    <span class="pill">V</span> video-only, and
    <span class="pill">AV</span> audio+video.
    Tasks: <em>object</em>, <em>material</em>, <em>outcome</em>.
    We check top‑1 predictions vs. ground truth and look for cross‑modal gains.
  </p>

  <ul class="features">
    <li class="card">
      📦 <strong>Minimal, reproducible clips</strong><br>
      Short single‑impact scenes recorded in a controlled setup.
    </li>
    <li class="card">
      🔊 <strong>Modality toggles</strong><br>
      Each clip exists as A, V, and AV to test true audio usage.
    </li>
    <li class="card">
      📈 <strong>Metrics</strong><br>
      Top‑1 accuracy per task and an <em>AV − max(A,V)</em> cross‑modal gain.
    </li>
    <li class="card">
      🧪 <strong>Probe‑style prompts</strong><br>
      Strict label sets & JSON outputs to avoid prompt drift.
    </li>
  </ul>

  <!-- Collapsed by default; expands to the three evaluation steps -->
  <details class="card" style="margin-top:1rem">
    <summary><strong>Method (short)</strong></summary>
    <ol>
      <li>For each clip, run models on A, V, and AV variants with the same instruction-style prompt.</li>
      <li>Decode model outputs into <code>{object, material, outcome}</code> and compare against labels.</li>
      <li>Compute per-task Top-1 and Top-5 accuracy and cross-modal gain per clip and in aggregate; additionally report calibration/confidence metrics (ECE, Brier, margin, entropy, Top-1 probability) and probing-based audio reliance via fixed cue selection and A/V/AV consistency; all metrics computed on the paired clip set (A∩V∩AV) with 95% confidence intervals.</li>
    </ol>
  </details>
</section>
<!-- Leaderboard: the <tbody> is filled at load time from the JSON block below -->
<section id="leaderboard" class="section" aria-labelledby="lb-title">
  <h2 id="lb-title">Leaderboard</h2>
  <p class="muted">Per‑Modality (A / V / AV)</p>
  <div class="table-wrap">
    <!-- aria-labelledby (not aria-describedby) so the heading is the table's accessible name -->
    <table aria-labelledby="lb-title">
      <thead>
        <tr>
          <th scope="col">Model</th>
          <th scope="col">Modality</th>
          <th scope="col">N</th>
          <th scope="col">Top‑1 Acc (%)</th>
          <th scope="col">Updated</th>
        </tr>
      </thead>
      <tbody id="leaderboard-body"></tbody>
    </table>
  </div>

  <!-- Leaderboard data: one row per model and modality. Read by the page script
       via the element id; an optional "updated" field per row is also read there. -->
  <script id="leaderboard-data" type="application/json">{
  "rows": [
    {"model":"Gemini‑2.5 Flash (no think)", "modality":"A", "Top1AccuracyinPercent":20.0, "N":993},
    {"model":"Gemini‑2.5 Flash (no think)", "modality":"AV", "Top1AccuracyinPercent":53.4, "N":993},
    {"model":"Gemini‑2.5 Flash (no think)", "modality":"V", "Top1AccuracyinPercent":48.2, "N":993},

    {"model":"Gemini‑2.5 Flash (think)", "modality":"A", "Top1AccuracyinPercent":24.1, "N":990},
    {"model":"Gemini‑2.5 Flash (think)", "modality":"AV", "Top1AccuracyinPercent":58.5, "N":993},
    {"model":"Gemini‑2.5 Flash (think)", "modality":"V", "Top1AccuracyinPercent":50.9, "N":993},

    {"model":"Gemini‑2.5 Pro (think)", "modality":"A", "Top1AccuracyinPercent":17.3, "N":819},
    {"model":"Gemini‑2.5 Pro (think)", "modality":"AV", "Top1AccuracyinPercent":61.8, "N":807},
    {"model":"Gemini‑2.5 Pro (think)", "modality":"V", "Top1AccuracyinPercent":56.3, "N":807},

    {"model":"Qwen2.5‑Omni 7B (local)", "modality":"A", "Top1AccuracyinPercent":10.9, "N":993},
    {"model":"Qwen2.5‑Omni 7B (local)", "modality":"AV", "Top1AccuracyinPercent":38.7, "N":993},
    {"model":"Qwen2.5‑Omni 7B (local)", "modality":"V", "Top1AccuracyinPercent":38.5, "N":993}
  ]
}</script>
</section>
|
|
<!-- Examples: one sample clip with A / V / AV variants, followed by result plots -->
<section id="media" class="section" aria-labelledby="ex-title">
  <h2 id="ex-title">Example clips and Plots</h2>
  <div class="media-grid">

    <!-- Sample clip; the buttons below swap the <source> through the page script -->
    <figure class="media-card">
      <video id="sampleVideo" controls preload="metadata" playsinline poster="assets/sample_poster.jpg">
        <source src="paperbox_high_1.MP4" type="video/mp4">
        Your browser doesn’t support HTML5 video.
      </video>
      <!-- role="group" rather than "toolbar": the toolbar role implies arrow-key
           navigation between the controls, which is not implemented here. -->
      <div class="toolbar" role="group" aria-label="Version selector">
        <button type="button" class="ver" data-src="paperbox_high_1_A.mp4" aria-pressed="false" aria-label="Audio only (A)">A</button>
        <button type="button" class="ver" data-src="paperbox_high_1_V.mp4" aria-pressed="false" aria-label="Video only (V)">V</button>
        <button type="button" class="ver" data-src="paperbox_high_1.MP4" aria-pressed="true" aria-label="Audio + Video (AV)">AV</button>
        <span class="muted" id="verStatus" aria-live="polite" style="margin-left:.4rem">Now showing: AV</span>
      </div>
      <figcaption class="muted">Task labels (demo): <strong>object</strong>=<code>paperbox</code>, <strong>material</strong>=<code>cardboard</code>, <strong>outcome</strong>=<code>bounce</code></figcaption>
    </figure>

    <!-- In the figures below the reading hint lives inside <figcaption>:
         a figcaption must be the first or last child of its <figure>. -->
    <div class="results-group">
      <h3>Cross-Modal Gain (CMG)</h3>
      <figure class="figure-card">
        <img src="xmod_cis.png" alt="Cross-Modal Gain heatmap" loading="lazy">
        <figcaption>
          CMG in percentage points per engine; horizontal bars are 95% paired-bootstrap CIs on the paired clip set.
          <div class="hint">Look for positive values: these mean AV was better than either audio or video alone. Gains usually appear for outcome prediction, but rarely for object or material recognition.</div>
        </figcaption>
      </figure>
    </div>

    <div class="results-group">
      <h3>Average modality attribution (AV)</h3>
      <div class="grid-2">
        <figure class="figure-card">
          <img src="Heatmap_Audio.png" alt="Average audio weight across models" loading="lazy">
          <figcaption>
            Audio weight by model.
            <div class="hint">What to look for: Red = model relies more on audio, Blue = model relies less.
              Engines that “listen” more may gain on outcome prediction, but not always.</div>
          </figcaption>
        </figure>
        <figure class="figure-card">
          <img src="Heatmap_Video.png" alt="Average video weight across models" loading="lazy">
          <figcaption>
            Video weight by model.
            <div class="hint">What to look for: Red = model relies more on video, Blue = model relies less.
              Engines that “look” more often ignore sound, which can explain weak cross-modal gains.</div>
          </figcaption>
        </figure>
      </div>
    </div>

    <!-- Full-width card spanning both grid columns -->
    <div class="media-card full-span">
      <div class="results-group">
        <h3>Top-1 accuracy by task</h3>
        <figure class="figure-card">
          <img src="accuracy_micro_macro_cis.png" alt="Top-1 accuracy per model across object, material, and outcome for A, V, AV" loading="lazy">
          <figcaption>
            Top-1 accuracy with 95% CIs (A, V, AV) across tasks and models.
            <div class="hint">What to look for: V is usually highest; AV improves over A and sometimes nudges past V on outcome.
              Big gaps A→AV mean sound is helpful; AV≈V means little extra benefit.</div>
          </figcaption>
        </figure>
      </div>
    </div>

  </div>
</section>
|
|
<!-- Contact details -->
<section id="contact" class="section">
  <h2>Contact</h2>
  <p>
    Questions?
    <a href="mailto:bramo.g@protonmail.com">bramo.g@protonmail.com</a>
  </p>
  <p class="muted">
    <a href="https://huggingface.co/Grets/AVIP">huggingface.co/Grets/AVIP</a>
  </p>
</section>
</main>
|
|
<!-- Footer; the #year span is filled in by the page script -->
<footer class="site-footer">
  <small>
    © <span id="year"></span> Grets. Rendered by Hugging Face Spaces.
  </small>
</footer>
<script>
// ---- Mobile navigation ----
// The toggle button shows or hides #site-nav by flipping its "open" class and
// mirrors that state in aria-expanded.
const navToggle = document.querySelector('.nav-toggle');
const nav = document.getElementById('site-nav');
if (navToggle && nav) {
  // Single place that changes the menu state, so class and ARIA never drift apart.
  const setNavOpen = (open) => {
    nav.classList.toggle('open', open);
    navToggle.setAttribute('aria-expanded', String(open));
  };

  navToggle.addEventListener('click', () => {
    setNavOpen(!nav.classList.contains('open'));
  });

  // The nav links are in-page anchors, so the page does not reload after a
  // choice; collapse the menu explicitly or it stays open over the content.
  nav.addEventListener('click', (e) => {
    if (e.target.closest('a')) setNavOpen(false);
  });

  // Escape dismisses the open menu and hands focus back to the toggle.
  document.addEventListener('keydown', (e) => {
    if (e.key === 'Escape' && nav.classList.contains('open')) {
      setNavOpen(false);
      navToggle.focus();
    }
  });
}

// ---- Footer year ----
const y = document.getElementById('year');
if (y) y.textContent = new Date().getFullYear();
</script>
<script>
/**
 * Fill #leaderboard-body from the JSON held in #leaderboard-data.
 *
 * Rows are ordered by model name, then by modality (AV, V, A; anything else
 * last). Cells are created with textContent, so values from the data block are
 * never interpreted as markup. Does nothing if either element is missing.
 */
(function renderLeaderboard(){
  const el = document.getElementById('leaderboard-body');
  const dataEl = document.getElementById('leaderboard-data');
  if (!el || !dataEl) return;

  // Parse defensively: a malformed block yields an empty table plus a console hint.
  let rows = [];
  try {
    const parsed = JSON.parse(dataEl.textContent.trim());
    rows = Array.isArray(parsed?.rows)
      ? parsed.rows.filter(r => r !== null && typeof r === 'object')
      : [];
  } catch (err) {
    console.warn('Leaderboard data could not be parsed:', err);
  }

  const order = ['AV', 'V', 'A'];
  // Unknown modalities rank after the known ones instead of before them
  // (indexOf would otherwise return -1 and sort them first).
  const rank = (modality) => {
    const i = order.indexOf(modality);
    return i === -1 ? order.length : i;
  };
  rows.sort((a, b) =>
    String(a.model).localeCompare(String(b.model)) ||
    rank(a.modality) - rank(b.modality)
  );

  const DASH = '—';
  // One decimal place plus "%"; missing or non-numeric values show a dash, not "NaN%".
  const fmtPct = (v) => {
    const n = (v == null || v === '') ? NaN : Number(v);
    return Number.isFinite(n) ? n.toFixed(1) + '%' : DASH;
  };
  const cell = (value) => {
    const td = document.createElement('td');
    td.textContent = String(value ?? DASH);
    return td;
  };

  const frag = document.createDocumentFragment();
  for (const r of rows) {
    const tr = document.createElement('tr');
    tr.append(
      cell(r.model),
      cell(r.modality),
      cell(r.N),
      cell(fmtPct(r.Top1AccuracyinPercent)),
      // Only show a date the data actually records; falling back to today's
      // date would present every row as freshly updated on each page load.
      cell(r.updated)
    );
    frag.appendChild(tr);
  }
  el.replaceChildren(frag);
})();
</script>
<script>
/**
 * Wire the A / V / AV buttons (.ver) to the sample video.
 *
 * Each button names its clip in data-src. Choosing one swaps the video's
 * <source>, reloads it, resumes playback if the clip was playing, and updates
 * both the pressed state of the buttons and the #verStatus live region.
 */
(function(){
  const video = document.getElementById('sampleVideo');
  const verButtons = document.querySelectorAll('.ver');
  const verStatus = document.getElementById('verStatus');
  const source = video ? video.querySelector('source') : null;
  // Without a video and a <source> there is nothing to switch; leave the
  // buttons untouched rather than letting them report a state that is not real.
  if (!video || !source) return;

  verButtons.forEach(btn => {
    btn.addEventListener('click', () => {
      const src = btn.getAttribute('data-src');
      if (!src) return;

      // Re-selecting the variant that is already loaded would only restart it.
      if (source.getAttribute('src') === src) return;

      verButtons.forEach(b => b.setAttribute('aria-pressed', String(b === btn)));

      const wasPlaying = !video.paused && !video.ended;
      video.pause();
      source.src = src;
      video.load(); // required for the element to pick up the new <source>
      // play() may be rejected (e.g. by autoplay policy); that is not an error here.
      if (wasPlaying) video.play().catch(()=>{});

      if (verStatus) verStatus.textContent = `Now showing: ${btn.textContent.trim()}`;
    });
  });
})();
</script>
<!-- Enlarged-figure overlay. Hidden until a .figure-card image is activated. -->
<div class="img-modal" id="imgModal" role="dialog" aria-modal="true" aria-label="Enlarged figure" aria-hidden="true">
  <button type="button" class="close" aria-label="Close">×</button>
  <img id="imgModalImg" alt="">
</div>

<script>
/**
 * Click-to-enlarge for figure images.
 *
 * Opens #imgModal with the activated image (data-full if present, else its own
 * src). Closes on the close button, a click on the backdrop, or Escape. Focus
 * moves to the close button while open and returns to the image on close.
 */
(function(){
  const modal = document.getElementById('imgModal');
  const modalImg = document.getElementById('imgModalImg');
  if (!modal || !modalImg) return;

  const closeBtn = modal.querySelector('.close');
  const THUMB = '.figure-card img';
  let opener = null; // element to give focus back to when the overlay closes

  // Make the thumbnails reachable and operable from the keyboard; a click
  // handler alone only serves pointer users.
  document.querySelectorAll(THUMB).forEach(img => {
    img.tabIndex = 0;
    img.setAttribute('role', 'button');
    img.setAttribute('aria-haspopup', 'dialog');
  });

  function openModal(img){
    opener = img;
    modalImg.src = img.getAttribute('data-full') || img.src;
    modalImg.alt = img.alt || '';
    modal.classList.add('open');
    document.body.classList.add('modal-open');
    modal.setAttribute('aria-hidden','false');
    if (closeBtn) closeBtn.focus();
  }

  function closeModal(){
    modal.classList.remove('open');
    document.body.classList.remove('modal-open');
    modal.setAttribute('aria-hidden','true');
    // Drop the attribute instead of assigning '' so no empty-URL image load is attempted.
    modalImg.removeAttribute('src');
    if (opener && typeof opener.focus === 'function') opener.focus();
    opener = null;
  }

  document.addEventListener('click', (e)=>{
    const img = e.target.closest(THUMB);
    if (img) openModal(img);
  });

  modal.addEventListener('click', (e)=>{
    if (e.target === modal || e.target.classList.contains('close')) closeModal();
  });

  document.addEventListener('keydown', (e)=>{
    const isOpen = modal.classList.contains('open');
    if (isOpen) {
      if (e.key === 'Escape') closeModal();
      // The close button is the only control in the dialog; keep Tab on it so
      // focus cannot wander into the page behind the overlay.
      else if (e.key === 'Tab') { e.preventDefault(); if (closeBtn) closeBtn.focus(); }
      return;
    }
    const onThumb = e.target instanceof Element && e.target.matches(THUMB);
    if (onThumb && (e.key === 'Enter' || e.key === ' ')) {
      e.preventDefault(); // Space would otherwise scroll the page
      openModal(e.target);
    }
  });
})();
</script>
|
|
<script>
// Relative location of the paper. The hero button stays in its
// "coming soon" state until this file answers a HEAD request.
const PDF_PATH = "AVIP_gbramow_lbreitkopf_iberger.pdf";

/**
 * Probe for the PDF and, when it is reachable, turn #paperLink into a live
 * link that opens the file in a new tab. Any failure (network error, non-OK
 * status, missing element) leaves the button as it was.
 */
async function enablePdfButton(){
  try{
    const probe = await fetch(PDF_PATH, { method:"HEAD", cache:"no-store" });
    const link = probe.ok ? document.getElementById("paperLink") : null;
    if (link) {
      Object.assign(link, {
        href: PDF_PATH,
        target: "_blank",
        rel: "noopener",
        textContent: "📄 View PDF",
      });
      link.classList.remove("btn-disabled");
      link.setAttribute("aria-disabled","false");
    }
  }catch(e){
    // best effort only: keep the placeholder on any error
  }
}

document.addEventListener("DOMContentLoaded", enablePdfButton);
</script>
|
|
|
|
</body>
</html>