| | <!doctype html> |
| | <html lang="en"> |
| | <head> |
| | <meta charset="utf-8" /> |
| | <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| | <title>AVIP – Audio–Vision Interaction Probe (Benchmark)</title> |
| | <meta name="description" content="AVIP is a lightweight benchmark to check whether multimodal models truly use audio in videos." /> |
| |
|
| | |
| | <meta property="og:title" content="AVIP Benchmark" /> |
| | <meta property="og:description" content="Do multimodal models actually use audio in videos?" /> |
| | <meta property="og:type" content="website" /> |
| | <meta property="og:video" content="thumbnail" /> |
| | <meta name="twitter:card" content="summary_large_image" /> |
| |
|
| | <link rel="icon" href="assets/favicon.png" /> |
| | <style> |
| | |
| | |
| | /* Theme tokens. The dark palette is the default; the prefers-color-scheme: light query below redefines the same custom properties. --card is --surface mixed with a little transparency, and --border holds a complete border value (width, style, color). */ |
| | :root{ |
| | --bg:#0b0b0c; --surface:#111216; --text:#e9e9ee; --muted:#9aa0a6; --accent:#6ee7ff; --ring:rgba(110,231,255,.35); |
| | --card: color-mix(in oklab, var(--surface), transparent 8%); |
| | --border: 1px solid rgba(255,255,255,.08); |
| | } |
| | @media (prefers-color-scheme: light){ |
| | :root{ --bg:#fafafa; --surface:#ffffff; --text:#101114; --muted:#5f6368; --accent:#0078ff; --ring:rgba(0,120,255,.2); |
| | --card: color-mix(in oklab, var(--surface), transparent 4%); |
| | --border: 1px solid rgba(0,0,0,.08); |
| | } |
| | } |
| | *,*::before,*::after{ box-sizing:border-box } |
| | html,body{ height:100% } |
| | body{ |
| | margin:0; font:16px/1.6 system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif; color:var(--text); |
| | background: |
| | radial-gradient(1200px 800px at 10% -5%, rgba(110,231,255,.08), transparent 40%), |
| | radial-gradient(900px 700px at 110% 10%, rgba(110,231,255,.06), transparent 40%), |
| | var(--bg); |
| | } |
| | .site-header{ position:sticky; top:0; z-index:20; display:flex; align-items:center; gap:1rem; justify-content:space-between; |
| | padding:.8rem 1rem; border-bottom:var(--border); |
| | background:color-mix(in oklab, var(--surface), transparent 35%); |
| | backdrop-filter:saturate(1.2) blur(8px); |
| | } |
| | .logo{ font-weight:800; letter-spacing:.2px; text-decoration:none; color:var(--text); display:flex; align-items:center; gap:.55rem } |
| | .logo .dot{ width:.7rem; height:.7rem; border-radius:999px; background:var(--accent); box-shadow:0 0 16px var(--ring) } |
| | .logo span{ color:var(--accent) } |
| | /* Primary navigation: an inline row on wide screens, a toggle-driven dropdown at 720px and below. */ |
| | .site-nav{ display:flex; gap:.75rem } |
| | .site-nav a{ color:var(--text); text-decoration:none; padding:.4rem .6rem; border-radius:.5rem } |
| | /* Keyboard focus gets the same ring as pointer hover. */ |
| | .site-nav a:hover, .site-nav a:focus-visible{ outline:2px solid var(--ring); outline-offset:2px } |
| | /* The border is derived from --text so it stays visible in both themes; a fixed white border vanished on the light surface. */ |
| | .nav-toggle{ display:none; background:transparent; border:1px solid color-mix(in oklab, var(--text), transparent 80%); color:var(--text); border-radius:.5rem; padding:.4rem .6rem } |
| | @media (max-width: 720px){ |
| | .nav-toggle{ display:inline-block } |
| | /* Hidden until the script adds the .open class. */ |
| | .site-nav{ position:absolute; right:1rem; top:3.4rem; flex-direction:column; padding:.6rem; background:var(--surface); border:var(--border); border-radius:.6rem; display:none } |
| | .site-nav.open{ display:flex } |
| | } |
| | |
| | .hero{ text-align:center; padding:4.5rem 1rem 2rem; max-width:1060px; margin:0 auto } |
| | .hero h1{ font-size:clamp(2rem,3.6vw,3rem); margin:0 0 .5rem } |
| | .hero p{ margin:0 0 1.5rem; color:var(--muted) } |
| | .btn{ display:inline-block; padding:.72rem 1rem; border-radius:.6rem; background:var(--accent); color:#06141b; text-decoration:none; font-weight:700 } |
| | .cover{ margin:1.2rem auto 0; max-width:980px; aspect-ratio: 16/9; border-radius:.8rem; overflow:hidden; border:var(--border); background:var(--card) } |
| | .cover img{ width:100%; height:100%; object-fit:cover; display:block } |
| | |
| | .section{ padding:2.6rem 1rem; max-width:1060px; margin:0 auto } |
| | .section h2{ font-size:clamp(1.4rem,2.2vw,1.8rem); margin:0 0 .4rem } |
| | .muted{ color:var(--muted) } |
| | |
| | .features{ display:grid; gap:.8rem; grid-template-columns:repeat(auto-fill,minmax(230px,1fr)); margin:1rem 0 0 } |
| | .card{ background:var(--card); border:var(--border); border-radius:.8rem; padding:1rem } |
| | |
| | |
| | /* Leaderboard table: the wrapper scrolls horizontally when the table is wider than the viewport. */ |
| | .table-wrap{ overflow-x:auto; background:var(--card); border-radius:.8rem; border:var(--border) } |
| | table{ width:100%; border-collapse:collapse } |
| | /* Row separators use the theme-aware --border token; a fixed white line was invisible in the light theme. */ |
| | th,td{ padding:.7rem .8rem; border-bottom:var(--border); text-align:left } |
| | th{ font-weight:700 } |
| | /* Small inline badge, used for the A / V / AV modality labels. */ |
| | .pill{ display:inline-block; padding:.2rem .5rem; border-radius:.5rem; background:rgba(110,231,255,.18); color:var(--text); font-size:.8rem } |
| | |
| | |
| | .media-grid{ display:grid; grid-template-columns:2fr 1fr; gap:1rem } |
| | @media (max-width: 920px){ .media-grid{ grid-template-columns:1fr } } |
| | figure{ margin:0 } |
| | .media-card{ background:var(--card); border:var(--border); border-radius:.8rem; padding:.8rem } |
| | video{ width:100%; max-height:520px; background:#000; border-radius:.6rem } |
| | /* Version selector under the sample video. */ |
| | .toolbar{ display:flex; gap:.5rem; flex-wrap:wrap; margin:.6rem 0 } |
| | /* The border is mixed from --text so the buttons keep a visible outline in the light theme as well as the dark one. */ |
| | .toolbar button{ background:transparent; color:var(--text); border:1px solid color-mix(in oklab, var(--text), transparent 75%); padding:.38rem .6rem; border-radius:.5rem; cursor:pointer } |
| | /* The active version is marked through aria-pressed, which the script keeps in sync. */ |
| | .toolbar button[aria-pressed="true"]{ outline:2px solid var(--ring) } |
| | |
| | |
| | .site-footer{ padding:2rem 1rem 4rem; text-align:center; color:var(--muted) } |
| | |
| | |
| | @media (prefers-reduced-motion: reduce){ *{ transition:none!important; animation:none!important } } |
| | |
| | |
| | .results-group{ margin-top:.8rem } |
| | .results-group h3{ margin:.2rem 0 .4rem; font-size:1rem; color:var(--muted) } |
| | .grid-2{ |
| | display:grid; grid-template-columns:repeat(2,1fr); gap:1rem; |
| | } |
| | .grid-3{ |
| | display:grid; grid-template-columns:repeat(3,1fr); gap:1rem; |
| | } |
| | @media (max-width: 900px){ |
| | .grid-2, .grid-3{ grid-template-columns:1fr } |
| | } |
| | .figure-card img{ |
| | width:100%; height:auto; border-radius:.6rem; display:block; |
| | box-shadow:0 2px 12px rgba(0,0,0,.08); |
| | } |
| | .figure-card figcaption{ |
| | font-size:.9rem; color:var(--muted); margin-top:.35rem; |
| | } |
| | |
| | |
| | .full-span { grid-column: 1 / -1; } |
| | @media (max-width: 920px){ .full-span { grid-column: auto; } } |
| | |
| | |
| | .img-modal { |
| | display:none; position:fixed; inset:0; z-index:1000; |
| | background:rgba(0,0,0,.75); |
| | align-items:center; justify-content:center; |
| | padding:2rem; |
| | } |
| | .img-modal.open { display:flex; } |
| | .img-modal img { |
| | max-width:92vw; max-height:92vh; |
| | border-radius:.6rem; box-shadow:0 6px 28px rgba(0,0,0,.4); |
| | } |
| | .img-modal .close { |
| | position:absolute; top:12px; right:16px; |
| | font-size:1.6rem; color:#fff; background:transparent; border:0; cursor:pointer; |
| | } |
| | .figure-card img { cursor: zoom-in; } |
| | body.modal-open { overflow:hidden; } |
| | |
| | .btn-row{ display:flex; gap:.6rem; justify-content:center; flex-wrap:wrap; margin:.6rem 0 .2rem; } |
| | .btn.btn-disabled{ opacity:.6; cursor:not-allowed; pointer-events:none; } |
| | |
| | |
| | .hero > video{ margin-top:.8rem; display:block; max-width:980px; width:100%; border-radius:.8rem; border:var(--border); background:var(--card); } |
| | |
| | .hint{ |
| | margin-top:.35rem; font-size:.88rem; color:var(--muted); |
| | border-left:3px solid color-mix(in oklab, var(--accent), transparent 65%); |
| | padding:.2rem .6rem; line-height:1.4; |
| | } |
| | |
| | |
| | |
| | </style> |
| | </head> |
| | <body> |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | <header class="site-header"> |
| | <a href="#home" class="logo" aria-label="AVIP Benchmark home"><span class="dot" aria-hidden="true"></span> AVIP<span>Benchmark</span></a> |
| | <!-- Mobile menu toggle. The glyph alone is not a usable accessible name, so one is supplied via aria-label. --> |
| | <button class="nav-toggle" type="button" aria-label="Toggle navigation" aria-expanded="false" aria-controls="site-nav">☰</button> |
| | <nav id="site-nav" class="site-nav" aria-label="Main navigation"> |
| | <a href="#home">Home</a> |
| | <a href="#about">About</a> |
| | <a href="#leaderboard">Leaderboard</a> |
| | <a href="#media">Examples</a> |
| | <a href="#contact">Contact</a> |
| | </nav> |
| | </header> |
| |
|
| | <main id="home"> |
| | <section class="hero" aria-labelledby="tagline"> |
| | <h1 id="tagline">Do you hear it? Meet AVIP-Bench</h1> |
| | <p style="font-style:italic;"> |
| | A controlled benchmark for evaluating intuitive physics from video & sound. |
| | </p> |
| | <p> |
| | Objects crash, bounce, and shatter — our benchmark of audiovisual object drops |
| | probes whether models benefit from adding <strong>sound</strong> when reasoning about physics. |
| | </p> |
| | |
| | <div class="btn-row"> |
| | <a class="btn" href="#media">See example videos and results</a> |
| | <!-- Placeholder until the PDF exists. It has no href, so it cannot be activated or jump to the page top; the script assigns href once the file is found. --> |
| | <a id="paperLink" class="btn btn-disabled" role="link" aria-disabled="true">📄 PDF coming soon</a> |
| | </div> |
| |
|
| | |
| | <!-- Hero preview loop. The empty poster attribute was dropped: when present, poster must be a non-empty URL. --> |
| | <video autoplay muted loop playsinline> |
| | <source src="thumbnail.mp4" type="video/mp4" /> |
| | <source src="thumbnail.webm" type="video/webm" /> |
| | </video> |
| | </section> |
| |
|
| | <section id="about" class="section"> |
| | <h2>What is AVIP?</h2> |
| | <p class="muted">A tiny, controlled benchmark with triplet videos per clip: <span class="pill">A</span> audio-only, <span class="pill">V</span> video-only, and <span class="pill">AV</span> audio+video. Tasks: <em>object</em>, <em>material</em>, <em>outcome</em>. We check top‑1 predictions vs. ground truth and look for cross‑modal gains.</p> |
| | <ul class="features"> |
| | <li class="card">📦 <strong>Minimal, reproducible clips</strong><br/>Short single‑impact scenes recorded in a controlled setup.</li> |
| | <li class="card">🔊 <strong>Modality toggles</strong><br/>Each clip exists as A, V, and AV to test true audio usage.</li> |
| | <li class="card">📈 <strong>Metrics</strong><br/>Top‑1 accuracy per task and an <em>AV − max(A,V)</em> cross‑modal gain.</li> |
| | <li class="card">🧪 <strong>Probe‑style prompts</strong><br/>Strict label sets & JSON outputs to avoid prompt drift.</li> |
| | </ul> |
| | <details class="card" style="margin-top:1rem"> |
| | <summary><strong>Method (short)</strong></summary> |
| | <ol> |
| | <li>For each clip, run models on A, V, and AV variants with the same instruction-style prompt.</li> |
| | <li>Decode model outputs into <code>{object, material, outcome}</code> and compare against labels.</li> |
| | <li>Compute per-task Top-1 and Top-5 accuracy and cross-modal gain per clip and in aggregate; additionally report calibration/confidence metrics (ECE, Brier, margin, entropy, Top-1 probability) and probing-based audio reliance via fixed cue selection and A/V/AV consistency; all metrics computed on the paired clip set (A∩V∩AV) with 95% confidence intervals.</li> |
| | </ol> |
| | </details> |
| | </section> |
| | |
| | <section id="leaderboard" class="section" aria-labelledby="lb-title"> |
| | <h2 id="lb-title">Leaderboard</h2> |
| | <p class="muted">Per‑Modality (A / V / AV)</p> |
| | <div class="table-wrap"> |
| | <table aria-describedby="lb-title"> |
| | <thead> |
| | <tr> |
| | <th scope="col">Model</th> |
| | <th scope="col">Modality</th> |
| | <th scope="col">N</th> |
| | <th scope="col">Top‑1 Acc (%)</th> |
| | <th scope="col">Updated</th> |
| | </tr> |
| | </thead> |
| | <tbody id="leaderboard-body"></tbody> |
| | </table> |
| | </div> |
| | |
| | |
| | <script id="leaderboard-data" type="application/json">{ |
| | "rows": [ |
| | {"model":"Gemini‑2.5 Flash (no think)", "modality":"A", "Top1AccuracyinPercent":20.0, "N":993}, |
| | {"model":"Gemini‑2.5 Flash (no think)", "modality":"AV", "Top1AccuracyinPercent":53.4, "N":993}, |
| | {"model":"Gemini‑2.5 Flash (no think)", "modality":"V", "Top1AccuracyinPercent":48.2, "N":993}, |
| | |
| | {"model":"Gemini‑2.5 Flash (think)", "modality":"A", "Top1AccuracyinPercent":24.1, "N":990}, |
| | {"model":"Gemini‑2.5 Flash (think)", "modality":"AV", "Top1AccuracyinPercent":58.5, "N":993}, |
| | {"model":"Gemini‑2.5 Flash (think)", "modality":"V", "Top1AccuracyinPercent":50.9, "N":993}, |
| | |
| | {"model":"Gemini‑2.5 Pro (think)", "modality":"A", "Top1AccuracyinPercent":17.3, "N":819}, |
| | {"model":"Gemini‑2.5 Pro (think)", "modality":"AV", "Top1AccuracyinPercent":61.8, "N":807}, |
| | {"model":"Gemini‑2.5 Pro (think)", "modality":"V", "Top1AccuracyinPercent":56.3, "N":807}, |
| | |
| | {"model":"Qwen2.5‑Omni 7B (local)", "modality":"A", "Top1AccuracyinPercent":10.9, "N":993}, |
| | {"model":"Qwen2.5‑Omni 7B (local)", "modality":"AV", "Top1AccuracyinPercent":38.7, "N":993}, |
| | {"model":"Qwen2.5‑Omni 7B (local)", "modality":"V", "Top1AccuracyinPercent":38.5, "N":993} |
| | ] |
| | }</script> |
| | </section> |
| |
|
| | <section id="media" class="section" aria-labelledby="ex-title"> |
| | <h2 id="ex-title">Example clips and plots</h2> |
| | <div class="media-grid"> |
| | <figure class="media-card"> |
| | <video id="sampleVideo" controls preload="metadata" playsinline poster="assets/sample_poster.jpg"> |
| | <source src="paperbox_high_1.MP4" type="video/mp4" /> |
| | Your browser doesn’t support HTML5 video. |
| | </video> |
| | <div class="toolbar" role="toolbar" aria-label="Version selector"> |
| | <button type="button" class="ver" data-src="paperbox_high_1_A.mp4" aria-pressed="false" aria-label="Audio only (A)">A</button> |
| | <button type="button" class="ver" data-src="paperbox_high_1_V.mp4" aria-pressed="false" aria-label="Video only (V)">V</button> |
| | <button type="button" class="ver" data-src="paperbox_high_1.MP4" aria-pressed="true" aria-label="Audio + Video (AV)">AV</button> |
| | <span class="muted" id="verStatus" aria-live="polite" style="margin-left:.4rem">Now showing: AV</span> |
| | </div> |
| | <figcaption class="muted">Task labels (demo): <strong>object</strong>=<code>paperbox</code>, <strong>material</strong>=<code>cardboard</code>, <strong>outcome</strong>=<code>bounce</code></figcaption> |
| | </figure> |
| |
|
| | |
| | <div class="results-group"> |
| | <h3>Cross-Modal Gain (CMG)</h3> |
| | <figure class="figure-card"> |
| | <img src="xmod_cis.png" alt="Cross-Modal Gain heatmap" loading="lazy"> |
| | <figcaption>CMG in percentage points per engine; horizontal bars are 95% paired-bootstrap CIs on the paired clip set.</figcaption> |
| | <div class="hint">Look for positive values: these mean AV was better than either audio or video alone. Gains usually appear for outcome prediction, but rarely for object or material recognition.</div> |
| | </figure> |
| | </div> |
| |
|
| | |
| | <div class="results-group"> |
| | <h3>Average modality attribution (AV)</h3> |
| | <div class="grid-2"> |
| | <figure class="figure-card"> |
| | <img src="Heatmap_Audio.png" alt="Average audio weight across models" loading="lazy"> |
| | <figcaption>Audio weight by model.</figcaption> |
| | <div class="hint">What to look for: Red = model relies more on audio, Blue = model relies less. |
| | Engines that “listen” more may gain on outcome prediction, but not always.</div> |
| | </figure> |
| | <figure class="figure-card"> |
| | <img src="Heatmap_Video.png" alt="Average video weight across models" loading="lazy"> |
| | <figcaption>Video weight by model.</figcaption> |
| | <div class="hint">What to look for: Red = model relies more on video, Blue = model relies less. |
| | Engines that “look” more often ignore sound, which can explain weak cross-modal gains.</div> |
| | </figure> |
| | </div> |
| | </div> |
| |
|
| | |
| | <div class="media-card full-span"> |
| | <div class="results-group"> |
| | <h3>Top-1 accuracy by task</h3> |
| | <figure class="figure-card"> |
| | <img src="accuracy_micro_macro_cis.png" alt="Top-1 accuracy per model across object, material, and outcome for A, V, AV" loading="lazy"> |
| | <figcaption>Top-1 accuracy with 95% CIs (A, V, AV) across tasks and models.</figcaption> |
| | <div class="hint">What to look for: V is usually highest; AV improves over A and sometimes nudges past V on outcome. |
| | Big gaps A→AV mean sound is helpful; AV≈V means little extra benefit.</div> |
| | </figure> |
| | </div> |
| | </div> |
| |
|
| |
|
| | </section> |
| |
|
| | <section id="contact" class="section"> |
| | <h2>Contact</h2> |
| | <p>Questions? <a href="mailto:bramo.g@protonmail.com">bramo.g@protonmail.com</a></p> |
| | <p class="muted"><a href="https://huggingface.co/Grets/AVIP">huggingface.co/Grets/AVIP</a></p> |
| | </section> |
| | </main> |
| |
|
| | <footer class="site-footer"> |
| | <small>© <span id="year"></span> Grets. Rendered by Hugging Face Spaces.</small> |
| | </footer> |
| |
|
| | |
| | <script> |
| | |
| | const navToggle = document.querySelector('.nav-toggle'); |
| | const nav = document.getElementById('site-nav'); |
| | if (navToggle && nav) { |
| | navToggle.addEventListener('click', () => { |
| | const open = nav.classList.toggle('open'); |
| | navToggle.setAttribute('aria-expanded', String(open)); |
| | }); |
| | } |
| | |
| | const y = document.getElementById('year'); |
| | if (y) y.textContent = new Date().getFullYear(); |
| | </script> |
| |
|
| | |
| | <script> |
| | (function renderLeaderboard(){ |
| | const el = document.getElementById('leaderboard-body'); |
| | const dataEl = document.getElementById('leaderboard-data'); |
| | if (!el || !dataEl) return; |
| | |
| | |
| | let rows = []; |
| | try { |
| | const parsed = JSON.parse(dataEl.textContent.trim()); |
| | rows = Array.isArray(parsed.rows) ? parsed.rows : []; |
| | } catch (_) {} |
| | |
| | |
| | const order = ['AV', 'V', 'A']; |
| | rows.sort((a,b) => |
| | String(a.model).localeCompare(String(b.model)) || |
| | order.indexOf(a.modality) - order.indexOf(b.modality) |
| | ); |
| | |
| | const today = new Date().toISOString().slice(0,10); |
| | const fmtPct = v => (v==null || v==='') ? '—' : (Number(v).toFixed(1) + '%'); |
| | |
| | |
| | el.innerHTML = rows.map(r => ` |
| | <tr> |
| | <td>${r.model}</td> |
| | <td>${r.modality}</td> |
| | <td>${r.N ?? '—'}</td> |
| | <td>${fmtPct(r.Top1AccuracyinPercent)}</td> |
| | <td>${r.updated ?? today}</td> |
| | </tr> |
| | `).join(''); |
| | })(); |
| | </script> |
| |
|
| | |
| | <script> |
| | (function(){ |
| | const video = document.getElementById('sampleVideo'); |
| | const verButtons = document.querySelectorAll('.ver'); |
| | const verStatus = document.getElementById('verStatus'); |
| | verButtons.forEach(btn => { |
| | btn.addEventListener('click', () => { |
| | verButtons.forEach(b => b.setAttribute('aria-pressed','false')); |
| | btn.setAttribute('aria-pressed','true'); |
| | const src = btn.getAttribute('data-src'); |
| | const label = btn.textContent.trim(); |
| | if (src && video) { |
| | const wasPlaying = !video.paused && !video.ended; |
| | video.pause(); |
| | video.querySelector('source').src = src; |
| | video.load(); |
| | if (wasPlaying) video.play().catch(()=>{}); |
| | if (verStatus) verStatus.textContent = `Now showing: ${label}`; |
| | } |
| | }); |
| | }); |
| | })(); |
| | </script> |
| |
|
| | |
| | <!-- Lightbox overlay for figure images. It is shown when the script adds the `open` class, and is exposed to assistive technology as a modal dialog. --> |
| | <div class="img-modal" id="imgModal" role="dialog" aria-modal="true" aria-label="Enlarged figure" aria-hidden="true"> |
| | <button type="button" class="close" aria-label="Close">×</button> |
| | <img id="imgModalImg" alt=""> |
| | </div> |
| |
|
| | <script> |
| | (function(){ |
| | const modal = document.getElementById('imgModal'); |
| | const modalImg = document.getElementById('imgModalImg'); |
| | if (!modal || !modalImg) return; |
| | |
| | document.addEventListener('click', (e)=>{ |
| | const img = e.target.closest('.figure-card img'); |
| | if (!img) return; |
| | const full = img.getAttribute('data-full'); |
| | modalImg.src = full || img.src; |
| | modalImg.alt = img.alt || ''; |
| | modal.classList.add('open'); |
| | document.body.classList.add('modal-open'); |
| | modal.setAttribute('aria-hidden','false'); |
| | }); |
| | |
| | modal.addEventListener('click', (e)=>{ |
| | if (e.target === modal || e.target.classList.contains('close')) closeModal(); |
| | }); |
| | |
| | document.addEventListener('keydown', (e)=>{ |
| | if (e.key === 'Escape' && modal.classList.contains('open')) closeModal(); |
| | }); |
| | |
| | function closeModal(){ |
| | modal.classList.remove('open'); |
| | document.body.classList.remove('modal-open'); |
| | modal.setAttribute('aria-hidden','true'); |
| | modalImg.src = ''; |
| | } |
| | })(); |
| | </script> |
| |
|
| | <script> |
| | |
| | const PDF_PATH = "AVIP_gbramow_lbreitkopf_iberger.pdf"; |
| | |
| | async function enablePdfButton(){ |
| | try{ |
| | const res = await fetch(PDF_PATH, { method:"HEAD", cache:"no-store" }); |
| | if(!res.ok) return; |
| | const a = document.getElementById("paperLink"); |
| | if(!a) return; |
| | a.href = PDF_PATH; |
| | a.target = "_blank"; |
| | a.rel = "noopener"; |
| | a.textContent = "📄 View PDF"; |
| | a.classList.remove("btn-disabled"); |
| | a.setAttribute("aria-disabled","false"); |
| | }catch(e){ } |
| | } |
| | document.addEventListener("DOMContentLoaded", enablePdfButton); |
| | </script> |
| |
|
| |
|
| | </body> |
| | </html> |