Spaces:

Grets
/

AVIP

Running

App Files Files Community

AVIP / index.html

Grets

Update index.html

7b93b09 verified 8 months ago

raw

history blame contribute delete

22.3 kB

	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1" />
	<title>AVIP – Audio–Vision Interaction Probe (Benchmark)</title>
	<meta name="description" content="AVIP is a lightweight benchmark to check whether multimodal models truly use audio in videos." />

	<!-- Social cards (add your own image at assets/cover.jpg) -->
	<meta property="og:title" content="AVIP Benchmark" />
	<meta property="og:description" content="Do multimodal models actually use audio in videos?" />
	<meta property="og:type" content="website" />
	<meta property="og:video" content="thumbnail" />
	<meta name="twitter:card" content="summary_large_image" />

	<link rel="icon" href="assets/favicon.png" />
	<style>
	/* =====================
	THEME & BASICS
	===================== */
	:root{
	--bg:#0b0b0c; --surface:#111216; --text:#e9e9ee; --muted:#9aa0a6; --accent:#6ee7ff; --ring:rgba(110,231,255,.35);
	--card: color-mix(in oklab, var(--surface), transparent 8%);
	--border: 1px solid rgba(255,255,255,.08);
	}
	@media (prefers-color-scheme: light){
	:root{ --bg:#fafafa; --surface:#ffffff; --text:#101114; --muted:#5f6368; --accent:#0078ff; --ring:rgba(0,120,255,.2);
	--card: color-mix(in oklab, var(--surface), transparent 4%);
	--border: 1px solid rgba(0,0,0,.08);
	}
	}
	*,*::before,*::after{ box-sizing:border-box }
	html,body{ height:100% }
	body{
	margin:0; font:16px/1.6 system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif; color:var(--text);
	background:
	radial-gradient(1200px 800px at 10% -5%, rgba(110,231,255,.08), transparent 40%),
	radial-gradient(900px 700px at 110% 10%, rgba(110,231,255,.06), transparent 40%),
	var(--bg);
	}
	.site-header{ position:sticky; top:0; z-index:20; display:flex; align-items:center; gap:1rem; justify-content:space-between;
	padding:.8rem 1rem; border-bottom:var(--border);
	background:color-mix(in oklab, var(--surface), transparent 35%);
	backdrop-filter:saturate(1.2) blur(8px);
	}
	.logo{ font-weight:800; letter-spacing:.2px; text-decoration:none; color:var(--text); display:flex; align-items:center; gap:.55rem }
	.logo .dot{ width:.7rem; height:.7rem; border-radius:999px; background:var(--accent); box-shadow:0 0 16px var(--ring) }
	.logo span{ color:var(--accent) }
	.site-nav{ display:flex; gap:.75rem }
	.site-nav a{ color:var(--text); text-decoration:none; padding:.4rem .6rem; border-radius:.5rem }
	.site-nav a:hover{ outline:2px solid var(--ring); outline-offset:2px }
	.nav-toggle{ display:none; background:transparent; border:1px solid rgba(255,255,255,.2); color:var(--text); border-radius:.5rem; padding:.4rem .6rem }
	@media (max-width: 720px){
	.nav-toggle{ display:inline-block }
	.site-nav{ position:absolute; right:1rem; top:3.4rem; flex-direction:column; padding:.6rem; background:var(--surface); border:var(--border); border-radius:.6rem; display:none }
	.site-nav.open{ display:flex }
	}

	.hero{ text-align:center; padding:4.5rem 1rem 2rem; max-width:1060px; margin:0 auto }
	.hero h1{ font-size:clamp(2rem,3.6vw,3rem); margin:0 0 .5rem }
	.hero p{ margin:0 0 1.5rem; color:var(--muted) }
	.btn{ display:inline-block; padding:.72rem 1rem; border-radius:.6rem; background:var(--accent); color:#06141b; text-decoration:none; font-weight:700 }
	.cover{ margin:1.2rem auto 0; max-width:980px; aspect-ratio: 16/9; border-radius:.8rem; overflow:hidden; border:var(--border); background:var(--card) }
	.cover img{ width:100%; height:100%; object-fit:cover; display:block }

	.section{ padding:2.6rem 1rem; max-width:1060px; margin:0 auto }
	.section h2{ font-size:clamp(1.4rem,2.2vw,1.8rem); margin:0 0 .4rem }
	.muted{ color:var(--muted) }

	.features{ display:grid; gap:.8rem; grid-template-columns:repeat(auto-fill,minmax(230px,1fr)); margin:1rem 0 0 }
	.card{ background:var(--card); border:var(--border); border-radius:.8rem; padding:1rem }

	/* Leaderboard */
	.table-wrap{ overflow-x:auto; background:var(--card); border-radius:.8rem; border:var(--border) }
	table{ width:100%; border-collapse:collapse }
	th,td{ padding:.7rem .8rem; border-bottom:1px solid rgba(255,255,255,.06); text-align:left }
	th{ font-weight:700 }
	.pill{ display:inline-block; padding:.2rem .5rem; border-radius:.5rem; background:rgba(110,231,255,.18); color:var(--text); font-size:.8rem }

	/* Media showcase */
	.media-grid{ display:grid; grid-template-columns:2fr 1fr; gap:1rem }
	@media (max-width: 920px){ .media-grid{ grid-template-columns:1fr } }
	figure{ margin:0 }
	.media-card{ background:var(--card); border:var(--border); border-radius:.8rem; padding:.8rem }
	video{ width:100%; max-height:520px; background:#000; border-radius:.6rem }
	.toolbar{ display:flex; gap:.5rem; flex-wrap:wrap; margin:.6rem 0 }
	.toolbar button{ background:transparent; color:var(--text); border:1px solid rgba(255,255,255,.25); padding:.38rem .6rem; border-radius:.5rem; cursor:pointer }
	.toolbar button[aria-pressed="true"]{ outline:2px solid var(--ring) }

	/* Footer */
	.site-footer{ padding:2rem 1rem 4rem; text-align:center; color:var(--muted) }

	/* A11y: reduce motion */
	@media (prefers-reduced-motion: reduce){ *{ transition:none!important; animation:none!important } }

	/* --- Results grids (for figures) --- */
	.results-group{ margin-top:.8rem }
	.results-group h3{ margin:.2rem 0 .4rem; font-size:1rem; color:var(--muted) }
	.grid-2{
	display:grid; grid-template-columns:repeat(2,1fr); gap:1rem;
	}
	.grid-3{
	display:grid; grid-template-columns:repeat(3,1fr); gap:1rem;
	}
	@media (max-width: 900px){
	.grid-2, .grid-3{ grid-template-columns:1fr }
	}
	.figure-card img{
	width:100%; height:auto; border-radius:.6rem; display:block;
	box-shadow:0 2px 12px rgba(0,0,0,.08);
	}
	.figure-card figcaption{
	font-size:.9rem; color:var(--muted); margin-top:.35rem;
	}

	/* make a card span both columns of .media-grid */
	.full-span { grid-column: 1 / -1; }
	@media (max-width: 920px){ .full-span { grid-column: auto; } }

	/* ===== Image modal (click-to-zoom) ===== */
	.img-modal {
	display:none; position:fixed; inset:0; z-index:1000;
	background:rgba(0,0,0,.75);
	align-items:center; justify-content:center;
	padding:2rem;
	}
	.img-modal.open { display:flex; }
	.img-modal img {
	max-width:92vw; max-height:92vh;
	border-radius:.6rem; box-shadow:0 6px 28px rgba(0,0,0,.4);
	}
	.img-modal .close {
	position:absolute; top:12px; right:16px;
	font-size:1.6rem; color:#fff; background:transparent; border:0; cursor:pointer;
	}
	.figure-card img { cursor: zoom-in; }
	body.modal-open { overflow:hidden; } /* prevent background scroll */

	.btn-row{ display:flex; gap:.6rem; justify-content:center; flex-wrap:wrap; margin:.6rem 0 .2rem; }
	.btn.btn-disabled{ opacity:.6; cursor:not-allowed; pointer-events:none; }

	/* push the hero video down so the button never overlaps */
	.hero > video{ margin-top:.8rem; display:block; max-width:980px; width:100%; border-radius:.8rem; border:var(--border); background:var(--card); }

	.hint{
	margin-top:.35rem; font-size:.88rem; color:var(--muted);
	border-left:3px solid color-mix(in oklab, var(--accent), transparent 65%);
	padding:.2rem .6rem; line-height:1.4;
	}



	</style>
	</head>
	<body>
	<!--
	ASSETS CHECKLIST (drop into your Space):
	- assets/cover.jpg (hero title image)
	- assets/sample_AV.mp4 (example clip – audio+video)
	- assets/sample_A.mp4 (audio-only version)
	- assets/sample_V.mp4 (video-only version)
	- assets/sample_poster.jpg (poster frame for the video)
	- assets/heatmap.png (overall heatmap graphic)
	- assets/confusion.png (confusion matrix or similar)
	- assets/favicon.png (16–64px)
	-->

	<header class="site-header">
	<a href="#home" class="logo" aria-label="AVIP Benchmark home"><span class="dot" aria-hidden="true"></span> AVIP<span>Benchmark</span></a>
	<button type="button" class="nav-toggle" aria-label="Toggle navigation" aria-expanded="false" aria-controls="site-nav">☰</button>
	<nav id="site-nav" class="site-nav" aria-label="Main navigation">
	<a href="#home">Home</a>
	<a href="#about">About</a>
	<a href="#leaderboard">Leaderboard</a>
	<a href="#media">Examples</a>
	<a href="#contact">Contact</a>
	</nav>
	</header>

	<main id="home">
	<section class="hero" aria-labelledby="tagline">
	<h1 id="tagline">Do you hear it? Meet AVIP-Bench</h1>
	<p style="font-style:italic;">
	A controlled benchmark for evaluating intuitive physics from video &amp; sound.
	</p>
	<p>
	Objects crash, bounce, and shatter - our benchmark of audiovisual object drops
	probes whether models benefit from adding <strong>sound</strong> when reasoning about physics.
	</p>
	<!-- Button row: examples + PDF -->
	<div class="btn-row">
	<a class="btn" href="#media">See example Videos and Results</a>
	<a id="paperLink" href="#" class="btn btn-disabled" aria-disabled="true">📄 PDF coming soon</a>
	</div>

	<!-- Hero video (pushed down a bit) -->
	<video autoplay muted loop playsinline>
	<source src="thumbnail.mp4" type="video/mp4" />
	<!-- Remove the webm line unless you add it later -->
	<source src="thumbnail.webm" type="video/webm" />
	</video>
	</section>

	<section id="about" class="section">
	<h2>What is AVIP?</h2>
	<p class="muted">A tiny, controlled benchmark with triplet videos per clip: <span class="pill">A</span> audio-only, <span class="pill">V</span> video-only, and <span class="pill">AV</span> audio+video. Tasks: <em>object</em>, <em>material</em>, <em>outcome</em>. We check top‑1 predictions vs. ground truth and look for cross‑modal gains.</p>
	<ul class="features">
	<li class="card">📦 <strong>Minimal, reproducible clips</strong><br/>Short single‑impact scenes recorded in a controlled setup.</li>
	<li class="card">🔊 <strong>Modality toggles</strong><br/>Each clip exists as A, V, and AV to test true audio usage.</li>
	<li class="card">📈 <strong>Metrics</strong><br/>Top‑1 accuracy per task and an <em>AV − max(A,V)</em> cross‑modal gain.</li>
	<li class="card">🧪 <strong>Probe‑style prompts</strong><br/>Strict label sets &amp; JSON outputs to avoid prompt drift.</li>
	</ul>
	<details class="card" style="margin-top:1rem">
	<summary><strong>Method (short)</strong></summary>
	<ol>
	<li>For each clip, run models on A, V, and AV variants with the same instruction-style prompt.</li>
	<li>Decode model outputs into <code>{object, material, outcome}</code> and compare against labels.</li>
	<li>Compute per-task Top-1 and Top-5 accuracy and cross-modal gain per clip and in aggregate; additionally report calibration/confidence metrics (ECE, Brier, margin, entropy, Top-1 probability) and probing-based audio reliance via fixed cue selection and A/V/AV consistency; all metrics computed on the paired clip set (A∩V∩AV) with 95% confidence intervals.</li>
	</ol>
	</details>
	</section>

	<section id="leaderboard" class="section" aria-labelledby="lb-title">
	<h2 id="lb-title">Leaderboard</h2>
	<p class="muted">Per‑Modality (A / V / AV)</p>
	<div class="table-wrap">
	<table aria-describedby="lb-title">
	<thead>
	<tr>
	<th scope="col">Model</th>
	<th scope="col">Modality</th>
	<th scope="col">N</th>
	<th scope="col">Top‑1 Acc (%)</th>
	<th scope="col">Updated</th>
	</tr>
	</thead>
	<tbody id="leaderboard-body"><!-- JS renders here --></tbody>
	</table>
	</div>

	<!-- Your data (inline JSON) -->
	<script id="leaderboard-data" type="application/json">{
	"rows": [
	{"model":"Gemini‑2.5 Flash (no think)", "modality":"A", "Top1AccuracyinPercent":20.0, "N":993},
	{"model":"Gemini‑2.5 Flash (no think)", "modality":"AV", "Top1AccuracyinPercent":53.4, "N":993},
	{"model":"Gemini‑2.5 Flash (no think)", "modality":"V", "Top1AccuracyinPercent":48.2, "N":993},

	{"model":"Gemini‑2.5 Flash (think)", "modality":"A", "Top1AccuracyinPercent":24.1, "N":990},
	{"model":"Gemini‑2.5 Flash (think)", "modality":"AV", "Top1AccuracyinPercent":58.5, "N":993},
	{"model":"Gemini‑2.5 Flash (think)", "modality":"V", "Top1AccuracyinPercent":50.9, "N":993},

	{"model":"Gemini‑2.5 Pro (think)", "modality":"A", "Top1AccuracyinPercent":17.3, "N":819},
	{"model":"Gemini‑2.5 Pro (think)", "modality":"AV", "Top1AccuracyinPercent":61.8, "N":807},
	{"model":"Gemini‑2.5 Pro (think)", "modality":"V", "Top1AccuracyinPercent":56.3, "N":807},

	{"model":"Qwen2.5‑Omni 7B (local)", "modality":"A", "Top1AccuracyinPercent":10.9, "N":993},
	{"model":"Qwen2.5‑Omni 7B (local)", "modality":"AV", "Top1AccuracyinPercent":38.7, "N":993},
	{"model":"Qwen2.5‑Omni 7B (local)", "modality":"V", "Top1AccuracyinPercent":38.5, "N":993}
	]
	}</script>
	</section>

	<section id="media" class="section" aria-labelledby="ex-title">
	<h2 id="ex-title">Example clips and Plots</h2>
	<div class="media-grid">
	<figure class="media-card">
	<video id="sampleVideo" controls preload="metadata" playsinline poster="assets/sample_poster.jpg">
	<source src="paperbox_high_1.MP4" type="video/mp4" />
	Your browser doesn’t support HTML5 video.
	</video>
	<div class="toolbar" role="toolbar" aria-label="Version selector">
	<button type="button" class="ver" data-src="paperbox_high_1_A.mp4" aria-pressed="false" aria-label="Audio only (A)">A</button>
	<button type="button" class="ver" data-src="paperbox_high_1_V.mp4" aria-pressed="false" aria-label="Video only (V)">V</button>
	<button type="button" class="ver" data-src="paperbox_high_1.MP4" aria-pressed="true" aria-label="Audio + Video (AV)">AV</button>
	<span class="muted" id="verStatus" aria-live="polite" style="margin-left:.4rem">Now showing: AV</span>
	</div>
	<figcaption class="muted">Task labels (demo): <strong>object</strong>=<code>paperbox</code>, <strong>material</strong>=<code>cardboard</code>, <strong>outcome</strong>=<code>bounce</code></figcaption>
	</figure>

	<!-- Group 0: Cross-Modal Gain heatmap -->
	<div class="results-group">
	<h3>Cross-Modal Gain (CMG)</h3>
	<figure class="figure-card">
	<img src="xmod_cis.png" alt="Cross-Modal Gain heatmap" loading="lazy">
	<figcaption>CMG in percentage points per engine; horizontal bars are 95% paired-bootstrap CIs on the paired clip set.</figcaption>
	<div class="hint">Look for positive values: these mean AV was better than either audio or video alone. Gains usually appear for outcome prediction, but rarely for object or material recognition.</div>
	</figure>
	</div>

	<!-- Group 1: Average modality attribution (Audio vs Video) -->
	<div class="results-group">
	<h3>Average modality attribution (AV)</h3>
	<div class="grid-2">
	<figure class="figure-card">
	<img src="Heatmap_Audio.png" alt="Average audio weight across models" loading="lazy">
	<figcaption>Audio weight by model.</figcaption>
	<div class="hint">What to look for: Red = model relies more on audio, Blue = model relies less.
	Engines that “listen” more may gain on outcome prediction, but not always.</div>
	</figure>
	<figure class="figure-card">
	<img src="Heatmap_Video.png" alt="Average video weight across models" loading="lazy">
	<figcaption>Video weight by model.</figcaption>
	<div class="hint">What to look for: Red = model relies more on video, Blue = model relies less.
	Engines that “look” more often ignore sound, which can explain weak cross-modal gains.</div>
	</figure>
	</div>
	</div>

	<!-- Group 2: Top-1 Accuracy: single combined figure -->
	<div class="media-card full-span">
	<div class="results-group">
	<h3>Top-1 accuracy by task</h3>
	<figure class="figure-card">
	<img src="accuracy_micro_macro_cis.png" alt="Top-1 accuracy per model across object, material, and outcome for A, V, AV" loading="lazy">
	<figcaption>Top-1 accuracy with 95% CIs (A, V, AV) across tasks and models.</figcaption>
	<div class="hint">What to look for: V is usually highest; AV improves over A and sometimes nudges past V on outcome.
	Big gaps A→AV mean sound is helpful; AV≈V means little extra benefit.</div>
	</figure>
	</div>
	</div>


	</section>

	<section id="contact" class="section">
	<h2>Contact</h2>
	<p>Questions? <a href="mailto:bramo.g@protonmail.com">bramo.g@protonmail.com</a></p>
	<p class="muted"><a href="https://huggingface.co/Grets/AVIP">huggingface.co/Grets/AVIP</a></p>
	</section>
	</main>

	<footer class="site-footer">
	<small>© <span id="year"></span> Grets. Rendered by Hugging Face Spaces.</small>
	</footer>

	<!-- 1) Base scripts (nav & year) -->
	<script>
	// --- Mobile navigation ---
	// The hamburger button shows/hides the link list by toggling the "open"
	// class on the <nav>, and mirrors that state in aria-expanded.
	const navToggle = document.querySelector('.nav-toggle');
	const nav = document.getElementById('site-nav');
	if (navToggle && nav) {
	  navToggle.addEventListener('click', function () {
	    // classList.toggle() returns true when the class is present afterwards
	    const isOpen = nav.classList.toggle('open');
	    navToggle.setAttribute('aria-expanded', isOpen ? 'true' : 'false');
	  });
	}

	// --- Footer ---
	// Fill the copyright year placeholder with the current year.
	const y = document.getElementById('year');
	if (y) {
	  y.textContent = new Date().getFullYear();
	}
	</script>

	<!-- 2) Leaderboard renderer (uses only the inline JSON rows above) -->
	<script>
	/**
	 * Fills the leaderboard table body from the inline JSON block.
	 *
	 * Reads the text of #leaderboard-data (an object with a "rows" array whose
	 * entries carry model, modality, N, Top1AccuracyinPercent and optionally
	 * updated), sorts the rows by model name and then by modality in the order
	 * AV, V, A, and writes one <tr> per row into #leaderboard-body.
	 * Does nothing when either element is missing; malformed JSON yields an
	 * empty table and a console warning.
	 */
	(function renderLeaderboard(){
	  const el = document.getElementById('leaderboard-body');
	  const dataEl = document.getElementById('leaderboard-data');
	  if (!el || !dataEl) return;

	  // Load the JSON. Keep the page usable on bad data, but do not hide the problem.
	  let rows = [];
	  try {
	    const parsed = JSON.parse(dataEl.textContent.trim());
	    if (parsed && Array.isArray(parsed.rows)) {
	      // Skip null / primitive entries so a single bad row cannot break rendering
	      rows = parsed.rows.filter(r => r !== null && typeof r === 'object');
	    }
	  } catch (err) {
	    console.warn('Leaderboard data could not be parsed:', err);
	  }

	  // Sort order: model (A–Z), then modality in the order AV, V, A
	  const order = ['AV', 'V', 'A'];
	  rows.sort((a,b) =>
	    String(a.model).localeCompare(String(b.model)) ||
	    order.indexOf(a.modality) - order.indexOf(b.modality)
	  );

	  // Values are written through innerHTML, so escape anything that could be read as markup
	  const HTML_ESCAPES = { '&':'&amp;', '<':'&lt;', '>':'&gt;', '"':'&quot;', "'":'&#39;' };
	  const esc = v => String(v).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);

	  // NOTE(review): rows without an "updated" field show today's date, so the
	  // column always looks freshly updated — confirm this is intended.
	  const today = new Date().toISOString().slice(0,10);

	  // One decimal place plus a percent sign; em dash for missing or non-numeric values
	  const fmtPct = v => {
	    if (v == null || v === '') return '—';
	    const num = Number(v);
	    return Number.isFinite(num) ? num.toFixed(1) + '%' : '—';
	  };

	  // Columns: Model | Modality | N | Top-1 | Updated
	  el.innerHTML = rows.map(r => `
	    <tr>
	      <td>${esc(r.model)}</td>
	      <td>${esc(r.modality)}</td>
	      <td>${esc(r.N ?? '—')}</td>
	      <td>${fmtPct(r.Top1AccuracyinPercent)}</td>
	      <td>${esc(r.updated ?? today)}</td>
	    </tr>
	  `).join('');
	})();
	</script>

	<!-- 3) Example video toggles -->
	<script>
	/**
	 * Wires the A / V / AV toolbar buttons to the example video.
	 *
	 * Clicking a button marks it as the only pressed one, swaps the first
	 * <source> of #sampleVideo to the button's data-src, reloads the player
	 * (resuming playback if it was playing) and updates the #verStatus text.
	 */
	(function(){
	  const player = document.getElementById('sampleVideo');
	  const versionButtons = document.querySelectorAll('.ver');
	  const statusEl = document.getElementById('verStatus');

	  function selectVersion(chosen){
	    // Exactly one button is pressed at a time
	    for (const other of versionButtons) {
	      other.setAttribute('aria-pressed','false');
	    }
	    chosen.setAttribute('aria-pressed','true');

	    const nextSrc = chosen.getAttribute('data-src');
	    const caption = chosen.textContent.trim();
	    if (!nextSrc || !player) return;

	    // Remember the playback state so switching versions does not stop a running clip
	    const resume = !(player.paused || player.ended);
	    player.pause();
	    player.querySelector('source').src = nextSrc;
	    player.load();
	    if (resume) {
	      // play() can reject (e.g. autoplay policy); ignore that on purpose
	      player.play().catch(()=>{});
	    }
	    if (statusEl) {
	      statusEl.textContent = `Now showing: ${caption}`;
	    }
	  }

	  for (const button of versionButtons) {
	    button.addEventListener('click', () => selectVersion(button));
	  }
	})();
	</script>

	<!-- Image modal (click-to-zoom) -->
	<div class="img-modal" id="imgModal" aria-hidden="true">
	<button class="close" aria-label="Close">×</button>
	<img id="imgModalImg" alt="">
	</div>

	<script>
	/**
	 * Click-to-zoom for result figures.
	 *
	 * A click on any image inside .figure-card opens #imgModal showing that image
	 * (or the file named in its data-full attribute, if present). The modal
	 * closes on the close button, on a click on the backdrop, or on Escape.
	 * While it is open, body carries the "modal-open" class. Does nothing when
	 * the modal elements are missing.
	 */
	(function(){
	  const modal = document.getElementById('imgModal');
	  const modalImg = document.getElementById('imgModalImg');
	  if (!modal || !modalImg) return;

	  const closeBtn = modal.querySelector('.close');
	  let returnFocusTo = null; // element that had focus before the modal opened

	  function openModal(img){
	    // data-full lets a figure point at a larger file; otherwise reuse the displayed one
	    modalImg.src = img.getAttribute('data-full') || img.src;
	    modalImg.alt = img.alt || '';
	    returnFocusTo = document.activeElement;
	    modal.classList.add('open');
	    document.body.classList.add('modal-open');
	    modal.setAttribute('aria-hidden','false');
	    // Move focus into the modal so keyboard users land on the close control
	    if (closeBtn) closeBtn.focus();
	  }

	  function closeModal(){
	    modal.classList.remove('open');
	    document.body.classList.remove('modal-open');
	    modal.setAttribute('aria-hidden','true');
	    // Remove the attribute rather than assigning '': an empty src is not a valid image URL
	    modalImg.removeAttribute('src');
	    if (returnFocusTo && typeof returnFocusTo.focus === 'function') {
	      returnFocusTo.focus();
	    }
	    returnFocusTo = null;
	  }

	  // Delegated listener, so figures added later are covered as well
	  document.addEventListener('click', (e)=>{
	    if (!(e.target instanceof Element)) return;
	    const img = e.target.closest('.figure-card img');
	    if (img) openModal(img);
	  });

	  // Close on the backdrop itself or on the close button; clicks on the enlarged image are ignored
	  modal.addEventListener('click', (e)=>{
	    if (e.target === modal || e.target.classList.contains('close')) closeModal();
	  });

	  document.addEventListener('keydown', (e)=>{
	    if (e.key === 'Escape' && modal.classList.contains('open')) closeModal();
	  });
	})();
	</script>

	<script>
	// Relative path under which the paper PDF is expected to be served
	const PDF_PATH = "AVIP_gbramow_lbreitkopf_iberger.pdf";

	/**
	 * Probes PDF_PATH with a HEAD request and, when the response status is OK,
	 * turns the #paperLink placeholder into a "View PDF" link that opens in a
	 * new tab. On any failure (request error, non-OK status, missing element)
	 * the link is left exactly as it is in the markup.
	 */
	async function enablePdfButton(){
	  try {
	    const probe = await fetch(PDF_PATH, { method:"HEAD", cache:"no-store" });
	    const link = probe.ok ? document.getElementById("paperLink") : null;
	    if (link) {
	      link.href = PDF_PATH;
	      link.target = "_blank";
	      link.rel = "noopener";
	      link.textContent = "📄 View PDF";
	      link.classList.remove("btn-disabled");
	      link.setAttribute("aria-disabled","false");
	    }
	  } catch (_) {
	    // Best effort: leave the placeholder link untouched
	  }
	}
	document.addEventListener("DOMContentLoaded", enablePdfButton);
	</script>


	</body>
	</html>