<!-- AVIP / index.html — Grets — "Update index.html" (commit 7b93b09, verified) -->
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>AVIP – Audio–Vision Interaction Probe (Benchmark)</title>
<meta name="description" content="AVIP is a lightweight benchmark to check whether multimodal models truly use audio in videos." />
<!-- Social cards (add your own image at assets/cover.jpg) -->
<meta property="og:title" content="AVIP Benchmark" />
<meta property="og:description" content="Do multimodal models actually use audio in videos?" />
<meta property="og:type" content="website" />
<meta property="og:image" content="assets/cover.jpg" />
<meta name="twitter:card" content="summary_large_image" />
<link rel="icon" href="assets/favicon.png" />
<style>
/* =====================
THEME & BASICS
===================== */
:root{
--bg:#0b0b0c; --surface:#111216; --text:#e9e9ee; --muted:#9aa0a6; --accent:#6ee7ff; --ring:rgba(110,231,255,.35);
--card: color-mix(in oklab, var(--surface), transparent 8%);
--border: 1px solid rgba(255,255,255,.08);
}
@media (prefers-color-scheme: light){
:root{ --bg:#fafafa; --surface:#ffffff; --text:#101114; --muted:#5f6368; --accent:#0078ff; --ring:rgba(0,120,255,.2);
--card: color-mix(in oklab, var(--surface), transparent 4%);
--border: 1px solid rgba(0,0,0,.08);
}
}
*,*::before,*::after{ box-sizing:border-box }
html,body{ height:100% }
body{
margin:0; font:16px/1.6 system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif; color:var(--text);
background:
radial-gradient(1200px 800px at 10% -5%, rgba(110,231,255,.08), transparent 40%),
radial-gradient(900px 700px at 110% 10%, rgba(110,231,255,.06), transparent 40%),
var(--bg);
}
.site-header{ position:sticky; top:0; z-index:20; display:flex; align-items:center; gap:1rem; justify-content:space-between;
padding:.8rem 1rem; border-bottom:var(--border);
background:color-mix(in oklab, var(--surface), transparent 35%);
backdrop-filter:saturate(1.2) blur(8px);
}
.logo{ font-weight:800; letter-spacing:.2px; text-decoration:none; color:var(--text); display:flex; align-items:center; gap:.55rem }
.logo .dot{ width:.7rem; height:.7rem; border-radius:999px; background:var(--accent); box-shadow:0 0 16px var(--ring) }
.logo span{ color:var(--accent) }
.site-nav{ display:flex; gap:.75rem }
.site-nav a{ color:var(--text); text-decoration:none; padding:.4rem .6rem; border-radius:.5rem }
.site-nav a:hover{ outline:2px solid var(--ring); outline-offset:2px }
.nav-toggle{ display:none; background:transparent; border:1px solid rgba(255,255,255,.2); color:var(--text); border-radius:.5rem; padding:.4rem .6rem }
@media (max-width: 720px){
.nav-toggle{ display:inline-block }
.site-nav{ position:absolute; right:1rem; top:3.4rem; flex-direction:column; padding:.6rem; background:var(--surface); border:var(--border); border-radius:.6rem; display:none }
.site-nav.open{ display:flex }
}
.hero{ text-align:center; padding:4.5rem 1rem 2rem; max-width:1060px; margin:0 auto }
.hero h1{ font-size:clamp(2rem,3.6vw,3rem); margin:0 0 .5rem }
.hero p{ margin:0 0 1.5rem; color:var(--muted) }
.btn{ display:inline-block; padding:.72rem 1rem; border-radius:.6rem; background:var(--accent); color:#06141b; text-decoration:none; font-weight:700 }
.cover{ margin:1.2rem auto 0; max-width:980px; aspect-ratio: 16/9; border-radius:.8rem; overflow:hidden; border:var(--border); background:var(--card) }
.cover img{ width:100%; height:100%; object-fit:cover; display:block }
.section{ padding:2.6rem 1rem; max-width:1060px; margin:0 auto }
.section h2{ font-size:clamp(1.4rem,2.2vw,1.8rem); margin:0 0 .4rem }
.muted{ color:var(--muted) }
.features{ display:grid; gap:.8rem; grid-template-columns:repeat(auto-fill,minmax(230px,1fr)); margin:1rem 0 0 }
.card{ background:var(--card); border:var(--border); border-radius:.8rem; padding:1rem }
/* Leaderboard */
.table-wrap{ overflow-x:auto; background:var(--card); border-radius:.8rem; border:var(--border) }
table{ width:100%; border-collapse:collapse }
th,td{ padding:.7rem .8rem; border-bottom:1px solid rgba(255,255,255,.06); text-align:left }
th{ font-weight:700 }
.pill{ display:inline-block; padding:.2rem .5rem; border-radius:.5rem; background:rgba(110,231,255,.18); color:var(--text); font-size:.8rem }
/* Media showcase */
.media-grid{ display:grid; grid-template-columns:2fr 1fr; gap:1rem }
@media (max-width: 920px){ .media-grid{ grid-template-columns:1fr } }
figure{ margin:0 }
.media-card{ background:var(--card); border:var(--border); border-radius:.8rem; padding:.8rem }
video{ width:100%; max-height:520px; background:#000; border-radius:.6rem }
.toolbar{ display:flex; gap:.5rem; flex-wrap:wrap; margin:.6rem 0 }
.toolbar button{ background:transparent; color:var(--text); border:1px solid rgba(255,255,255,.25); padding:.38rem .6rem; border-radius:.5rem; cursor:pointer }
.toolbar button[aria-pressed="true"]{ outline:2px solid var(--ring) }
/* Footer */
.site-footer{ padding:2rem 1rem 4rem; text-align:center; color:var(--muted) }
/* A11y: reduce motion */
@media (prefers-reduced-motion: reduce){ *{ transition:none!important; animation:none!important } }
/* --- Results grids (for figures) --- */
.results-group{ margin-top:.8rem }
.results-group h3{ margin:.2rem 0 .4rem; font-size:1rem; color:var(--muted) }
.grid-2{
display:grid; grid-template-columns:repeat(2,1fr); gap:1rem;
}
.grid-3{
display:grid; grid-template-columns:repeat(3,1fr); gap:1rem;
}
@media (max-width: 900px){
.grid-2, .grid-3{ grid-template-columns:1fr }
}
.figure-card img{
width:100%; height:auto; border-radius:.6rem; display:block;
box-shadow:0 2px 12px rgba(0,0,0,.08);
}
.figure-card figcaption{
font-size:.9rem; color:var(--muted); margin-top:.35rem;
}
/* make a card span both columns of .media-grid */
.full-span { grid-column: 1 / -1; }
@media (max-width: 920px){ .full-span { grid-column: auto; } }
/* ===== Image modal (click-to-zoom) ===== */
.img-modal {
display:none; position:fixed; inset:0; z-index:1000;
background:rgba(0,0,0,.75);
align-items:center; justify-content:center;
padding:2rem;
}
.img-modal.open { display:flex; }
.img-modal img {
max-width:92vw; max-height:92vh;
border-radius:.6rem; box-shadow:0 6px 28px rgba(0,0,0,.4);
}
.img-modal .close {
position:absolute; top:12px; right:16px;
font-size:1.6rem; color:#fff; background:transparent; border:0; cursor:pointer;
}
.figure-card img { cursor: zoom-in; }
body.modal-open { overflow:hidden; } /* prevent background scroll */
.btn-row{ display:flex; gap:.6rem; justify-content:center; flex-wrap:wrap; margin:.6rem 0 .2rem; }
.btn.btn-disabled{ opacity:.6; cursor:not-allowed; pointer-events:none; }
/* push the hero video down so the button never overlaps */
.hero > video{ margin-top:.8rem; display:block; max-width:980px; width:100%; border-radius:.8rem; border:var(--border); background:var(--card); }
.hint{
margin-top:.35rem; font-size:.88rem; color:var(--muted);
border-left:3px solid color-mix(in oklab, var(--accent), transparent 65%);
padding:.2rem .6rem; line-height:1.4;
}
</style>
</head>
<body>
<!--
ASSETS CHECKLIST (drop into your Space):
- assets/cover.jpg (hero title image)
- assets/sample_AV.mp4 (example clip – audio+video)
- assets/sample_A.mp4 (audio-only version)
- assets/sample_V.mp4 (video-only version)
- assets/sample_poster.jpg (poster frame for the video)
- assets/heatmap.png (overall heatmap graphic)
- assets/confusion.png (confusion matrix or similar)
- assets/favicon.png (16–64px)
-->
<header class="site-header">
<a href="#home" class="logo" aria-label="AVIP Benchmark home"><span class="dot" aria-hidden="true"></span> AVIP<span>Benchmark</span></a>
<button class="nav-toggle" aria-expanded="false" aria-controls="site-nav" aria-label="Toggle navigation">☰</button>
<nav id="site-nav" class="site-nav" aria-label="Main navigation">
<a href="#home">Home</a>
<a href="#about">About</a>
<a href="#leaderboard">Leaderboard</a>
<a href="#media">Examples</a>
<a href="#contact">Contact</a>
</nav>
</header>
<main id="home">
<section class="hero" aria-labelledby="tagline">
<h1 id="tagline">Do you hear it? Meet AVIP-Bench</h1>
<p style="font-style:italic;">
A controlled benchmark for evaluating intuitive physics from video &amp; sound.
</p>
<p>
Objects crash, bounce, and shatter - our benchmark of audiovisual object drops
probes whether models benefit from adding <strong>sound</strong> when reasoning about physics.
</p>
<!-- Button row: examples + PDF -->
<div class="btn-row">
<a class="btn" href="#media">See example Videos and Results</a>
<a id="paperLink" href="#" class="btn btn-disabled" aria-disabled="true">📄 PDF coming soon</a>
</div>
<!-- Hero video (pushed down a bit) -->
<video autoplay muted loop playsinline>
<source src="thumbnail.mp4" type="video/mp4" />
<!-- Remove the webm line unless you add it later -->
<source src="thumbnail.webm" type="video/webm" />
</video>
</section>
<section id="about" class="section">
<h2>What is AVIP?</h2>
<p class="muted">A tiny, controlled benchmark with triplet videos per clip: <span class="pill">A</span> audio-only, <span class="pill">V</span> video-only, and <span class="pill">AV</span> audio+video. Tasks: <em>object</em>, <em>material</em>, <em>outcome</em>. We check top‑1 predictions vs. ground truth and look for cross‑modal gains.</p>
<ul class="features">
<li class="card">📦 <strong>Minimal, reproducible clips</strong><br/>Short single‑impact scenes recorded in a controlled setup.</li>
<li class="card">🔊 <strong>Modality toggles</strong><br/>Each clip exists as A, V, and AV to test true audio usage.</li>
<li class="card">📈 <strong>Metrics</strong><br/>Top‑1 accuracy per task and an <em>AV − max(A,V)</em> cross‑modal gain.</li>
<li class="card">🧪 <strong>Probe‑style prompts</strong><br/>Strict label sets &amp; JSON outputs to avoid prompt drift.</li>
</ul>
<details class="card" style="margin-top:1rem">
<summary><strong>Method (short)</strong></summary>
<ol>
<li>For each clip, run models on A, V, and AV variants with the same instruction-style prompt.</li>
<li>Decode model outputs into <code>{object, material, outcome}</code> and compare against labels.</li>
<li>Compute per-task Top-1 and Top-5 accuracy and cross-modal gain per clip and in aggregate; additionally report calibration/confidence metrics (ECE, Brier, margin, entropy, Top-1 probability) and probing-based audio reliance via fixed cue selection and A/V/AV consistency; all metrics computed on the paired clip set (A&cap;V&cap;AV) with 95% confidence intervals.</li>
</ol>
</details>
</section>
<section id="leaderboard" class="section" aria-labelledby="lb-title">
<h2 id="lb-title">Leaderboard</h2>
<p class="muted">Per‑Modality (A / V / AV)</p>
<div class="table-wrap">
<table aria-describedby="lb-title">
<thead>
<tr>
<th scope="col">Model</th>
<th scope="col">Modality</th>
<th scope="col">N</th>
<th scope="col">Top‑1 Acc (%)</th>
<th scope="col">Updated</th>
</tr>
</thead>
<tbody id="leaderboard-body"><!-- JS renders here --></tbody>
</table>
</div>
<!-- Deine Daten (inline JSON) -->
<script id="leaderboard-data" type="application/json">{
"rows": [
{"model":"Gemini‑2.5 Flash (no think)", "modality":"A", "Top1AccuracyinPercent":20.0, "N":993},
{"model":"Gemini‑2.5 Flash (no think)", "modality":"AV", "Top1AccuracyinPercent":53.4, "N":993},
{"model":"Gemini‑2.5 Flash (no think)", "modality":"V", "Top1AccuracyinPercent":48.2, "N":993},
{"model":"Gemini‑2.5 Flash (think)", "modality":"A", "Top1AccuracyinPercent":24.1, "N":990},
{"model":"Gemini‑2.5 Flash (think)", "modality":"AV", "Top1AccuracyinPercent":58.5, "N":993},
{"model":"Gemini‑2.5 Flash (think)", "modality":"V", "Top1AccuracyinPercent":50.9, "N":993},
{"model":"Gemini‑2.5 Pro (think)", "modality":"A", "Top1AccuracyinPercent":17.3, "N":819},
{"model":"Gemini‑2.5 Pro (think)", "modality":"AV", "Top1AccuracyinPercent":61.8, "N":807},
{"model":"Gemini‑2.5 Pro (think)", "modality":"V", "Top1AccuracyinPercent":56.3, "N":807},
{"model":"Qwen2.5‑Omni 7B (local)", "modality":"A", "Top1AccuracyinPercent":10.9, "N":993},
{"model":"Qwen2.5‑Omni 7B (local)", "modality":"AV", "Top1AccuracyinPercent":38.7, "N":993},
{"model":"Qwen2.5‑Omni 7B (local)", "modality":"V", "Top1AccuracyinPercent":38.5, "N":993}
]
}</script>
</section>
<section id="media" class="section" aria-labelledby="ex-title">
<h2 id="ex-title">Example clips and Plots</h2>
<div class="media-grid">
<figure class="media-card">
<video id="sampleVideo" controls preload="metadata" playsinline poster="assets/sample_poster.jpg">
<source src="paperbox_high_1.MP4" type="video/mp4" />
Your browser doesn’t support HTML5 video.
</video>
<div class="toolbar" role="toolbar" aria-label="Version selector">
<button type="button" class="ver" data-src="paperbox_high_1_A.mp4" aria-pressed="false" aria-label="Audio only (A)">A</button>
<button type="button" class="ver" data-src="paperbox_high_1_V.mp4" aria-pressed="false" aria-label="Video only (V)">V</button>
<button type="button" class="ver" data-src="paperbox_high_1.MP4" aria-pressed="true" aria-label="Audio + Video (AV)">AV</button>
<span class="muted" id="verStatus" aria-live="polite" style="margin-left:.4rem">Now showing: AV</span>
</div>
<figcaption class="muted">Task labels (demo): <strong>object</strong>=<code>paperbox</code>, <strong>material</strong>=<code>cardboard</code>, <strong>outcome</strong>=<code>bounce</code></figcaption>
</figure>
<!-- Group 0: Cross-Modal Gain heatmap -->
<div class="results-group">
<h3>Cross-Modal Gain (CMG)</h3>
<figure class="figure-card">
<img src="xmod_cis.png" alt="Cross-Modal Gain heatmap" loading="lazy">
<figcaption>CMG in percentage points per engine; horizontal bars are 95% paired-bootstrap CIs on the paired clip set.</figcaption>
<div class="hint">Look for positive values: these mean AV was better than either audio or video alone. Gains usually appear for outcome prediction, but rarely for object or material recognition.</div>
</figure>
</div>
<!-- Group 1: Average modality attribution (Audio vs Video) -->
<div class="results-group">
<h3>Average modality attribution (AV)</h3>
<div class="grid-2">
<figure class="figure-card">
<img src="Heatmap_Audio.png" alt="Average audio weight across models" loading="lazy">
<figcaption>Audio weight by model.</figcaption>
<div class="hint">What to look for: Red = model relies more on audio, Blue = model relies less.
Engines that “listen” more may gain on outcome prediction, but not always.</div>
</figure>
<figure class="figure-card">
<img src="Heatmap_Video.png" alt="Average video weight across models" loading="lazy">
<figcaption>Video weight by model.</figcaption>
<div class="hint">What to look for: Red = model relies more on video, Blue = model relies less.
Engines that “look” more often ignore sound, which can explain weak cross-modal gains.</div>
</figure>
</div>
</div>
<!-- Group 2: Top-1 Accuracy: single combined figure -->
<div class="media-card full-span">
<div class="results-group">
<h3>Top-1 accuracy by task</h3>
<figure class="figure-card">
<img src="accuracy_micro_macro_cis.png" alt="Top-1 accuracy per model across object, material, and outcome for A, V, AV" loading="lazy">
<figcaption>Top-1 accuracy with 95% CIs (A, V, AV) across tasks and models.</figcaption>
<div class="hint">What to look for: V is usually highest; AV improves over A and sometimes nudges past V on outcome.
Big gaps A→AV mean sound is helpful; AV≈V means little extra benefit.</div>
</figure>
</div>
</div>
</section>
<section id="contact" class="section">
<h2>Contact</h2>
<p>Questions? <a href="mailto:bramo.g@protonmail.com">bramo.g@protonmail.com</a></p>
<p class="muted"><a href="https://huggingface.co/Grets/AVIP">huggingface.co/Grets/AVIP</a></p>
</section>
</main>
<footer class="site-footer">
<small>&copy; <span id="year"></span> Grets. Rendered by Hugging Face Spaces.</small>
</footer>
<!-- 1) Basis-Skripte (Nav & Jahr) -->
<script>
// Mobile nav toggle
const navToggle = document.querySelector('.nav-toggle');
const nav = document.getElementById('site-nav');
if (navToggle && nav) {
navToggle.addEventListener('click', () => {
const open = nav.classList.toggle('open');
navToggle.setAttribute('aria-expanded', String(open));
});
}
// Year in footer
const y = document.getElementById('year');
if (y) y.textContent = new Date().getFullYear();
</script>
<!-- 2) Leaderboard-Renderer (nur deine inline-JSON rows) -->
<script>
(function renderLeaderboard(){
const el = document.getElementById('leaderboard-body');
const dataEl = document.getElementById('leaderboard-data');
if (!el || !dataEl) return;
// JSON laden
let rows = [];
try {
const parsed = JSON.parse(dataEl.textContent.trim());
rows = Array.isArray(parsed.rows) ? parsed.rows : [];
} catch (_) {}
// Sortierung: Model (A–Z) -> Modality in Reihenfolge AV, V, A
const order = ['AV', 'V', 'A'];
rows.sort((a,b) =>
String(a.model).localeCompare(String(b.model)) ||
order.indexOf(a.modality) - order.indexOf(b.modality)
);
const today = new Date().toISOString().slice(0,10);
const fmtPct = v => (v==null || v==='') ? '—' : (Number(v).toFixed(1) + '%');
// Spalten: Model | Modality | N | Top-1 | Top-5 | Updated
el.innerHTML = rows.map(r => `
<tr>
<td>${r.model}</td>
<td>${r.modality}</td>
<td>${r.N ?? '—'}</td>
<td>${fmtPct(r.Top1AccuracyinPercent)}</td>
<td>${r.updated ?? today}</td>
</tr>
`).join('');
})();
</script>
<!-- 3) Example video toggles -->
<script>
(function(){
const video = document.getElementById('sampleVideo');
const verButtons = document.querySelectorAll('.ver');
const verStatus = document.getElementById('verStatus');
verButtons.forEach(btn => {
btn.addEventListener('click', () => {
verButtons.forEach(b => b.setAttribute('aria-pressed','false'));
btn.setAttribute('aria-pressed','true');
const src = btn.getAttribute('data-src');
const label = btn.textContent.trim();
if (src && video) {
const wasPlaying = !video.paused && !video.ended;
video.pause();
video.querySelector('source').src = src;
video.load();
if (wasPlaying) video.play().catch(()=>{});
if (verStatus) verStatus.textContent = `Now showing: ${label}`;
}
});
});
})();
</script>
<!-- Image modal (click-to-zoom) -->
<div class="img-modal" id="imgModal" aria-hidden="true">
<button class="close" aria-label="Close">×</button>
<img id="imgModalImg" alt="">
</div>
<script>
(function(){
const modal = document.getElementById('imgModal');
const modalImg = document.getElementById('imgModalImg');
if (!modal || !modalImg) return;
document.addEventListener('click', (e)=>{
const img = e.target.closest('.figure-card img');
if (!img) return;
const full = img.getAttribute('data-full');
modalImg.src = full || img.src;
modalImg.alt = img.alt || '';
modal.classList.add('open');
document.body.classList.add('modal-open');
modal.setAttribute('aria-hidden','false');
});
modal.addEventListener('click', (e)=>{
if (e.target === modal || e.target.classList.contains('close')) closeModal();
});
document.addEventListener('keydown', (e)=>{
if (e.key === 'Escape' && modal.classList.contains('open')) closeModal();
});
function closeModal(){
modal.classList.remove('open');
document.body.classList.remove('modal-open');
modal.setAttribute('aria-hidden','true');
modalImg.src = '';
}
})();
</script>
<script>
// Where your PDF will live
const PDF_PATH = "AVIP_gbramow_lbreitkopf_iberger.pdf";
async function enablePdfButton(){
try{
const res = await fetch(PDF_PATH, { method:"HEAD", cache:"no-store" });
if(!res.ok) return; // keep disabled state
const a = document.getElementById("paperLink");
if(!a) return;
a.href = PDF_PATH;
a.target = "_blank";
a.rel = "noopener";
a.textContent = "📄 View PDF";
a.classList.remove("btn-disabled");
a.setAttribute("aria-disabled","false");
}catch(e){ /* keep disabled */ }
}
document.addEventListener("DOMContentLoaded", enablePdfButton);
</script>
</body>
</html>