Spaces:

WukLab
/

SourceBench-Leaderboard

Running

App Files Files Community

SourceBench-Leaderboard / index.html

Kristinx0351

Remove overview and highlights section

1444582 2 days ago

raw

history blame contribute delete

41.9 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>SourceBench Leaderboard</title>
	<style>
	:root {
	--bg: #f6efe4;
	--paper: rgba(255, 251, 244, 0.92);
	--paper-solid: #fff8ef;
	--ink: #172033;
	--muted: #6d7382;
	--line: rgba(23, 32, 51, 0.12);
	--accent: #b24a1f;
	--accent-deep: #7c2d12;
	--gold: #e9a23b;
	--green: #206a5d;
	--blue: #255a8f;
	--shadow: 0 18px 40px rgba(23, 32, 51, 0.09);
	--radius: 22px;
	}

	* {
	box-sizing: border-box;
	}

	/* Smooth in-page scrolling for the nav anchors, applied only when the user
	   has not asked the system to reduce motion. */
	@media (prefers-reduced-motion: no-preference) {
	  html {
	    scroll-behavior: smooth;
	  }
	}

	body {
	margin: 0;
	color: var(--ink);
	background:
	radial-gradient(circle at 0% 0%, rgba(233, 162, 59, 0.18), transparent 28%),
	radial-gradient(circle at 100% 10%, rgba(178, 74, 31, 0.14), transparent 22%),
	radial-gradient(circle at 80% 100%, rgba(37, 90, 143, 0.09), transparent 22%),
	linear-gradient(180deg, #f8f2e8 0%, #f4ebdf 100%);
	font-family: "Avenir Next", "Segoe UI", "Helvetica Neue", sans-serif;
	font-size: 17px;
	}

	.shell {
	max-width: 1200px;
	margin: 0 auto;
	padding: 26px 48px 72px;
	}

	.topbar {
	display: flex;
	justify-content: space-between;
	align-items: center;
	gap: 16px;
	margin-bottom: 18px;
	flex-wrap: wrap;
	}

	.brand {
	display: flex;
	align-items: center;
	gap: 12px;
	}

	.brand-mark {
	width: 84px;
	height: 66px;
	border-radius: 0;
	display: flex;
	align-items: center;
	justify-content: center;
	background: transparent;
	color: var(--ink);
	font-weight: 700;
	overflow: hidden;
	border: none;
	}

	.brand-mark img {
	width: 84px;
	height: auto;
	display: block;
	}

	.brand-copy strong {
	display: block;
	font-size: 1rem;
	letter-spacing: 0.06em;
	text-transform: uppercase;
	}

	.brand-copy span {
	color: var(--muted);
	font-size: 0.92rem;
	}

	.nav {
	display: flex;
	flex-wrap: wrap;
	gap: 10px;
	}

	.nav a,
	button {
	appearance: none;
	border: 1px solid rgba(23, 32, 51, 0.18);
	background: rgba(255, 255, 255, 0.65);
	color: var(--ink);
	padding: 10px 14px;
	border-radius: 999px;
	text-decoration: none;
	font: inherit;
	cursor: pointer;
	transition: transform 120ms ease, background 120ms ease, border-color 120ms ease;
	}

	.nav a:hover,
	button:hover {
	transform: translateY(-1px);
	background: white;
	border-color: rgba(23, 32, 51, 0.28);
	}

	.hero {
	display: block;
	margin-bottom: 18px;
	}

	.panel {
	background: var(--paper);
	border: 1px solid var(--line);
	border-radius: var(--radius);
	box-shadow: var(--shadow);
	backdrop-filter: blur(8px);
	}

	.plain-section {
	padding: 8px 8px 2px;
	margin-bottom: 22px;
	}

	.hero-main {
	padding: 34px 30px 30px;
	min-height: 300px;
	border: none;
	border-radius: 0;
	box-shadow: none;
	backdrop-filter: none;
	background:
	linear-gradient(150deg, rgba(255,255,255,0.42), rgba(255,248,239,0.2)),
	linear-gradient(135deg, rgba(178,74,31,0.035), rgba(37,90,143,0.02));
	}

	h1, h2, h3 {
	margin: 0;
	line-height: 1.05;
	}

	.hero-label {
	display: block;
	margin-bottom: 12px;
	color: var(--accent);
	font-size: 1.52rem;
	font-weight: 700;
	letter-spacing: 0.01em;
	text-align: center;
	}

	h1 {
	font-family: Georgia, "Times New Roman", serif;
	font-size: clamp(2rem, 3.1vw, 3.2rem);
	letter-spacing: -0.05em;
	margin-bottom: 18px;
	max-width: none;
	white-space: nowrap;
	text-align: center;
	}

	.hero-text {
	max-width: 1040px;
	font-size: 1.1rem;
	line-height: 1.72;
	color: #374151;
	margin-bottom: 24px;
	margin-left: auto;
	margin-right: auto;
	}

	.resource-links {
	display: flex;
	gap: 10px;
	flex-wrap: wrap;
	margin-bottom: 20px;
	justify-content: center;
	}

	.resource-links a {
	display: inline-flex;
	align-items: center;
	gap: 7px;
	color: var(--blue);
	text-decoration: none;
	font-size: 0.96rem;
	border-bottom: 1px solid rgba(37, 90, 143, 0.25);
	padding-bottom: 1px;
	}

	.resource-links a:hover {
	color: var(--accent);
	border-bottom-color: rgba(178, 74, 31, 0.35);
	}

	.resource-links svg {
	width: 15px;
	height: 15px;
	display: block;
	flex: 0 0 auto;
	fill: currentColor;
	}

	.hero-points {
	display: grid;
	grid-template-columns: repeat(3, minmax(0, 1fr));
	gap: 14px;
	padding-top: 10px;
	border-top: 1px solid rgba(23, 32, 51, 0.08);
	}

	.study-section {
	margin: 24px 0 28px;
	}

	.study-copy {
	max-width: 860px;
	color: var(--muted);
	line-height: 1.65;
	margin: 0 0 14px;
	}

	.study-list {
	display: grid;
	gap: 10px;
	margin: 0 0 16px;
	}

	.study-item {
	padding-left: 18px;
	position: relative;
	color: var(--muted);
	line-height: 1.6;
	}

	.study-item::before {
	content: "";
	position: absolute;
	left: 0;
	top: 0.55rem;
	width: 8px;
	height: 8px;
	border-radius: 999px;
	background: var(--accent);
	}

	.study-item strong {
	color: var(--ink);
	}

	.hero-point {
	padding: 10px 6px 0 0;
	}

	.hero-point strong {
	display: block;
	margin-bottom: 6px;
	font-size: 1rem;
	}

	.hero-point span {
	color: var(--muted);
	font-size: 0.98rem;
	line-height: 1.55;
	}

	.section-kicker {
	font-size: 0.88rem;
	text-transform: uppercase;
	letter-spacing: 0.08em;
	color: var(--accent);
	margin-bottom: 8px;
	}

	.micro {
	color: var(--muted);
	font-size: 0.98rem;
	line-height: 1.58;
	}

	.stats {
	display: grid;
	grid-template-columns: repeat(4, minmax(0, 1fr));
	gap: 12px;
	margin: 18px 0;
	}

	.stat {
	padding: 20px;
	}

	.stat-label {
	display: block;
	color: var(--muted);
	font-size: 0.84rem;
	text-transform: uppercase;
	letter-spacing: 0.08em;
	margin-bottom: 8px;
	}

	.stat-value {
	font-family: Georgia, "Times New Roman", serif;
	font-size: 2rem;
	line-height: 1;
	margin-bottom: 6px;
	}

	.stat-sub {
	color: var(--muted);
	font-size: 0.96rem;
	}

	.grid-2 {
	display: grid;
	grid-template-columns: 1fr 1fr;
	gap: 18px;
	margin-bottom: 18px;
	}

	.section {
	padding: 24px;
	}

	.section-head {
	display: flex;
	justify-content: space-between;
	align-items: baseline;
	gap: 12px;
	margin-bottom: 16px;
	flex-wrap: wrap;
	}

	.section-head p {
	margin: 0;
	color: var(--muted);
	max-width: 720px;
	line-height: 1.55;
	}

	.metric-intro {
	margin: 0 0 12px;
	color: var(--muted);
	max-width: none;
	line-height: 1.6;
	}

	.metric-core {
	display: grid;
	grid-template-columns: repeat(3, minmax(0, 1fr));
	gap: 10px 18px;
	margin: 0 0 12px;
	padding: 0;
	list-style: none;
	}

	.metric-core li {
	padding-left: 16px;
	position: relative;
	color: var(--muted);
	line-height: 1.55;
	font-size: 0.97rem;
	}

	.metric-core li::before {
	content: "";
	position: absolute;
	left: 0;
	top: 0.58rem;
	width: 7px;
	height: 7px;
	border-radius: 999px;
	background: var(--blue);
	}

	.metric-core strong {
	color: var(--ink);
	}

	.metric-note {
	margin: 0 0 18px;
	color: var(--muted);
	font-size: 0.97rem;
	line-height: 1.58;
	}

	.findings,
	.plain-list {
	display: grid;
	gap: 12px;
	}

	.finding,
	.plain-item {
	padding: 0 0 0 18px;
	position: relative;
	}

	.finding::before,
	.plain-item::before {
	content: "";
	position: absolute;
	left: 0;
	top: 0.5rem;
	width: 8px;
	height: 8px;
	border-radius: 999px;
	background: var(--accent);
	}

	.finding strong,
	.plain-item strong {
	display: block;
	margin-bottom: 6px;
	font-size: 1.06rem;
	}

	.finding span,
	.plain-item span {
	color: var(--muted);
	line-height: 1.55;
	font-size: 0.98rem;
	}

	.mini-table {
	overflow-x: auto;
	border-radius: 18px;
	border: 1px solid var(--line);
	background: rgba(255,255,255,0.72);
	}

	table {
	width: 100%;
	border-collapse: collapse;
	min-width: 760px;
	}

	th, td {
	text-align: left;
	padding: 12px 14px;
	border-bottom: 1px solid rgba(23, 32, 51, 0.08);
	white-space: nowrap;
	font-size: 0.93rem;
	}

	th {
	background: rgba(255,248,239,0.97);
	color: var(--accent);
	font-size: 0.8rem;
	text-transform: uppercase;
	letter-spacing: 0.08em;
	}

	.stage-grid {
	display: grid;
	grid-template-columns: repeat(4, minmax(0, 1fr));
	gap: 12px;
	}

	.stage {
	padding: 16px;
	border-radius: 18px;
	background: rgba(255,255,255,0.72);
	border: 1px solid var(--line);
	position: relative;
	overflow: hidden;
	}

	.stage::before {
	content: "";
	position: absolute;
	inset: 0 auto 0 0;
	width: 5px;
	background: linear-gradient(180deg, var(--gold), var(--accent));
	}

	.stage small {
	display: block;
	color: var(--accent);
	text-transform: uppercase;
	letter-spacing: 0.08em;
	margin-bottom: 8px;
	font-size: 0.78rem;
	}

	.stage h3 {
	font-size: 1.18rem;
	margin-bottom: 8px;
	}

	.stage p {
	margin: 0 0 10px;
	color: var(--muted);
	line-height: 1.58;
	font-size: 1rem;
	}

	.stage code {
	display: block;
	white-space: pre-wrap;
	font-size: 0.84rem;
	color: #2b3240;
	background: rgba(23,32,51,0.04);
	padding: 10px;
	border-radius: 12px;
	}

	.two-col {
	display: grid;
	grid-template-columns: 1.1fr 0.9fr;
	gap: 18px;
	margin-bottom: 18px;
	}

	.policy-list,
	.guide-list {
	display: grid;
	gap: 12px;
	}

	.policy-item,
	.guide-item {
	padding-left: 18px;
	position: relative;
	}

	.policy-item strong,
	.guide-item strong {
	display: block;
	margin-bottom: 6px;
	}

	.policy-item::before,
	.guide-item::before {
	content: "";
	position: absolute;
	left: 0;
	top: 0.5rem;
	width: 8px;
	height: 8px;
	border-radius: 999px;
	background: var(--accent);
	}

	.policy-item span,
	.guide-item span {
	color: var(--muted);
	line-height: 1.55;
	font-size: 0.98rem;
	}

	.controls {
	display: grid;
	gap: 12px;
	grid-template-columns: repeat(4, minmax(0, 1fr));
	padding: 16px;
	margin-bottom: 14px;
	}

	.query-filter-wrap {
	display: none;
	}

	.query-filter-wrap.visible {
	display: block;
	}

	.toggle-row {
	display: flex;
	justify-content: space-between;
	align-items: center;
	gap: 12px;
	margin: 4px 0 14px;
	flex-wrap: wrap;
	}

	.toggle-row.hidden {
	display: none;
	}

	.toggle {
	display: inline-flex;
	align-items: center;
	gap: 10px;
	color: var(--muted);
	font-size: 0.97rem;
	}

	.toggle input {
	width: 18px;
	height: 18px;
	margin: 0;
	accent-color: var(--ink);
	}

	label {
	display: grid;
	gap: 6px;
	color: var(--muted);
	font-size: 0.98rem;
	}

	select,
	input[type="search"] {
	width: 100%;
	padding: 11px 12px;
	border-radius: 12px;
	border: 1px solid var(--line);
	background: rgba(255,255,255,0.94);
	color: var(--ink);
	font: inherit;
	}

	.tabs {
	display: inline-flex;
	gap: 8px;
	margin-bottom: 14px;
	flex-wrap: wrap;
	}

	.tab.active {
	background: var(--ink);
	color: white;
	border-color: var(--ink);
	}

	.table-wrap {
	overflow-x: auto;
	border: 1px solid var(--line);
	border-radius: 20px;
	background: rgba(255,255,255,0.72);
	}

	.table-wrap table {
	min-width: 980px;
	}

	tbody tr:hover {
	background: rgba(233, 162, 59, 0.09);
	}

	code {
	font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
	}

	.footer-note {
	color: var(--muted);
	font-size: 1rem;
	line-height: 1.55;
	margin-top: 14px;
	}

	.load-status {
	color: var(--muted);
	font-size: 0.95rem;
	line-height: 1.5;
	margin-top: 10px;
	}

	.table-meta {
	color: var(--muted);
	font-size: 0.95rem;
	line-height: 1.5;
	}

	.dataset-legend {
	display: none;
	gap: 10px;
	flex-wrap: wrap;
	margin: 0 0 14px;
	}

	.dataset-legend.visible {
	display: flex;
	}

	.dataset-chip {
	display: inline-flex;
	align-items: center;
	gap: 8px;
	padding: 8px 12px;
	border-radius: 999px;
	background: rgba(255,255,255,0.78);
	border: 1px solid rgba(23, 32, 51, 0.1);
	color: var(--muted);
	font-size: 0.93rem;
	line-height: 1.4;
	}

	.dataset-chip strong {
	color: var(--ink);
	font-size: 0.94rem;
	}

	.dataset-chip.active {
	background: var(--ink);
	border-color: var(--ink);
	color: rgba(255,255,255,0.85);
	}

	.dataset-chip.active strong {
	color: white;
	}

	.sort-option-overall-only.hidden {
	display: none;
	}

	.group-row td {
	background: rgba(23, 32, 51, 0.05);
	color: var(--ink);
	font-weight: 700;
	font-size: 0.9rem;
	letter-spacing: 0.02em;
	border-bottom: 1px solid rgba(23, 32, 51, 0.08);
	}

	.markdown-note {
	color: var(--muted);
	font-size: 1rem;
	line-height: 1.7;
	}

	.markdown-note p {
	margin: 0 0 10px;
	}

	.markdown-note ul {
	margin: 0;
	padding-left: 20px;
	}

	.markdown-note li {
	margin: 0 0 8px;
	}

	.markdown-note pre {
	margin: 12px 0 16px;
	padding: 14px 16px;
	overflow-x: auto;
	border-radius: 14px;
	border: 1px solid rgba(23, 32, 51, 0.08);
	background: rgba(255, 255, 255, 0.68);
	color: #223049;
	font-size: 0.92rem;
	line-height: 1.55;
	}

	.markdown-note code {
	font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
	}

	@media (max-width: 1120px) {
	.hero,
	.grid-2,
	.two-col,
	.stats,
	.controls,
	.stage-grid,
	.hero-points,
	.metric-core {
	grid-template-columns: 1fr;
	}

	h1 {
	white-space: normal;
	}
	}
	</style>
	</head>
	<body>
	<main class="shell">
	<header class="topbar">
	<div class="brand">
	<!-- The logo carries the only mention of WukLab in the header, so it stays
	     exposed to assistive technology through its alt text. -->
	<div class="brand-mark">
	  <img src="./wuklab_logo.png" alt="WukLab logo">
	</div>
	<div class="brand-copy">
	<strong>SourceBench</strong>
	<span>Leaderboard for cited source quality in generative engines</span>
	</div>
	</div>
	<nav class="nav">
	<a href="#leaderboard">Leaderboard</a>
	<a href="#findings">Findings</a>
	<a href="#deepseek-study">Study</a>
	<a href="#policy">Policy</a>
	</nav>
	</header>

	<section class="hero">
	<article class="panel hero-main">
	<div class="hero-label">SourceBench</div>
	<h1>Can AI answers cite high-quality web sources?</h1>
	<div class="resource-links">
	<a href="https://arxiv.org/abs/2602.16942" target="_blank" rel="noopener noreferrer">
	<svg viewBox="0 0 24 24" aria-hidden="true"><path d="M12 2 1.8 7.5v9L12 22l10.2-5.5v-9L12 2Zm0 2.3 7.8 4.2-2.4 1.3L12 6.8 6.6 9.8 4.2 8.5 12 4.3Zm-6 5.5 4.8 2.6v5.5L6 15.3V9.8Zm6 9.1v-5.5l4.8-2.6v5.5L12 18.9Z"/></svg>
	<span>arXiv</span>
	</a>
	<a href="https://github.com/WukLab/SourceBench" target="_blank" rel="noopener noreferrer">
	<svg viewBox="0 0 24 24" aria-hidden="true"><path d="M12 2C6.48 2 2 6.6 2 12.26c0 4.52 2.87 8.36 6.84 9.71.5.1.68-.22.68-.49 0-.24-.01-1.05-.01-1.9-2.78.62-3.37-1.22-3.37-1.22-.45-1.19-1.11-1.5-1.11-1.5-.91-.64.07-.63.07-.63 1 .08 1.53 1.06 1.53 1.06.9 1.57 2.35 1.12 2.92.86.09-.67.35-1.12.64-1.38-2.22-.26-4.55-1.14-4.55-5.08 0-1.12.39-2.04 1.03-2.76-.1-.26-.45-1.3.1-2.71 0 0 .84-.28 2.75 1.05A9.3 9.3 0 0 1 12 7.64c.85 0 1.71.12 2.51.35 1.91-1.33 2.75-1.05 2.75-1.05.55 1.41.2 2.45.1 2.71.64.72 1.03 1.64 1.03 2.76 0 3.95-2.33 4.82-4.56 5.07.36.32.68.95.68 1.92 0 1.39-.01 2.5-.01 2.84 0 .27.18.6.69.49A10.27 10.27 0 0 0 22 12.26C22 6.6 17.52 2 12 2Z"/></svg>
	<span>GitHub</span>
	</a>
	<a href="https://mlsys.wuklab.io/posts/sourcebench/" target="_blank" rel="noopener noreferrer">
	<svg viewBox="0 0 24 24" aria-hidden="true"><path d="M4 4h10l6 6v10H4V4Zm10 1.5V10h4.5L14 5.5ZM7 13h10v1.5H7V13Zm0 3h10v1.5H7V16Zm0-6h5v1.5H7V10Z"/></svg>
	<span>Blog</span>
	</a>
	</div>
	<p class="hero-text">
	SourceBench focuses on a different evaluation target from standard answer-quality benchmarks.
	Instead of only asking whether a model answered well, it asks whether the model cited sources that are
	relevant, accurate, fresh, transparent, authoritative, and usable. This Space hosts the public-facing
	leaderboard frontend. Official leaderboard entries are validated and judged by the SourceBench team.
	</p>
	<div class="hero-points">
	<div class="hero-point">
	<strong>Benchmark target</strong>
	<span>Quality of cited sources, not just final answer correctness.</span>
	</div>
	<div class="hero-point">
	<strong>Current scope</strong>
	<span>Generative engines with built-in web search, plus official validation for leaderboard inclusion.</span>
	</div>
	<div class="hero-point">
	<strong>Official policy</strong>
	<span>Official leaderboard entries are validated and judged by the SourceBench team using fixed hidden evaluation settings.</span>
	</div>
	</div>
	</article>
	</section>

	<section class="stats" id="summary">
	<article class="panel stat">
	<span class="stat-label">Models in current board</span>
	<div class="stat-value" id="stat-models">0</div>
	<div class="stat-sub">Current leaderboard snapshot</div>
	</article>
	<article class="panel stat">
	<span class="stat-label">Query types</span>
	<div class="stat-value" id="stat-query-types">0</div>
	<div class="stat-sub">Benchmark query slices</div>
	</article>
	<article class="panel stat">
	<span class="stat-label">Top model</span>
	<div class="stat-value" id="stat-top-model">-</div>
	<div class="stat-sub">Highest weighted content score</div>
	</article>
	<article class="panel stat">
	<span class="stat-label">Top weighted score</span>
	<div class="stat-value" id="stat-top-score">-</div>
	<div class="stat-sub">Weighted source-quality metric</div>
	</article>
	</section>

	<section class="panel section" id="leaderboard">
	<div class="section-head">
	<div>
	<div class="section-kicker">Leaderboard</div>
	<h2 style="font-size:1.6rem;">Ranking Table</h2>
	<div class="table-meta" id="table-generated">Generated at -</div>
	</div>
	</div>

	<p class="metric-intro">
	SourceBench ranks systems by judged source quality rather than answer fluency alone. The main leaderboard target is the weighted overall score.
	</p>
	<ul class="metric-core">
	<li><strong>Weighted Score.</strong> The main leaderboard score, combining the judged dimensions into one overall source-quality metric.</li>
	<li><strong>Unweighted Mean.</strong> The simple average across the judged dimension scores, without weighting.</li>
	<li><strong>% In SE.</strong> Percentage of model-cited sources that appear in the first five pages of Google Search results.</li>
	</ul>
	<p class="metric-note">
	Turn on <strong>Show dimension scores</strong> in Overall view to inspect the eight judged dimensions: semantic relevance, factual accuracy, freshness, objectivity, layout/ad density, accountability, transparency, and authority.
	</p>

	<div class="tabs">
	<button class="tab active" data-view="overall" type="button">Overall</button>
	<button class="tab" data-view="by_query_type" type="button">By Query Type</button>
	</div>

	<section class="panel controls">
	<label>
	View
	<select id="view-select">
	<option value="overall">Overall</option>
	<option value="by_query_type">By query type</option>
	</select>
	</label>
	<label>
	Sort by
	<select id="sort-key">
	<option value="weighted_total_content_score">Weighted score</option>
	<option value="unweighted_mean_score">Unweighted mean</option>
	<option class="sort-option-overall-only" value="percentage_ge_sources_in_se_sources">% In SE</option>
	</select>
	</label>
	<label>
	Filter model
	<input id="search-input" type="search" placeholder="Search model name">
	</label>
	</section>

	<div class="query-filter-wrap" id="query-filter-wrap">
	<div class="dataset-legend" id="dataset-legend"></div>
	</div>

	<div class="toggle-row" id="toggle-row">
	<label class="toggle">
	<input id="show-dimensions" type="checkbox">
	<span>Show dimension scores</span>
	</label>
	</div>

	<div class="section-head" style="margin-top:10px;">
	<h3 id="table-title" style="font-size:1.1rem;">Overall ranking</h3>
	<div class="micro" id="table-count">0 rows</div>
	</div>

	<div class="table-wrap">
	  <!-- Named by the heading above, whose text the script updates per view. -->
	  <table id="leaderboard-table" aria-labelledby="table-title">
	    <thead></thead>
	    <tbody></tbody>
	  </table>
	</div>

	<div class="footer-note" id="board-note">
	Official leaderboard entries are validated and judged by the SourceBench team. The benchmark codebase and public evaluation pipeline are maintained separately in the SourceBench repository.
	</div>
	<!-- Polite live region: the script rewrites this text when loading finishes or fails. -->
	<div class="load-status" id="load-status" role="status">Loading leaderboard data...</div>
	</section>

	<section class="plain-section study-section" id="deepseek-study">
	<div class="section-head">
	<div>
	<div class="section-kicker">DeepSeek Tool Study</div>
	<h2 style="font-size:1.5rem;">DeepSeek variants with different search backends</h2>
	</div>
	</div>
	<p class="study-copy">
	SourceBench also includes a focused comparison of DeepSeek variants paired with different retrieval setups.
	This is a separate study rather than part of the main model family ranking: the purpose is to isolate how
	search backend choice and reasoning mode change citation quality, overlap with search results, and the final weighted source score.
	</p>
	<div class="study-list" id="deepseek-findings">
	<div class="study-item"><strong>Load leaderboard data to see the DeepSeek tool study.</strong></div>
	</div>
	<div class="table-wrap">
	  <table id="deepseek-table" aria-label="DeepSeek tool study results">
	    <thead></thead>
	    <tbody></tbody>
	  </table>
	</div>
	</section>

	<section class="plain-section" id="policy">
	<div class="section-head">
	<div>
	<div class="section-kicker">Official Policy</div>
	<h2 style="font-size:1.7rem;">How official leaderboard evaluation works</h2>
	</div>
	</div>
	<div class="markdown-note">
	<p><strong>Local self-check</strong> can be run with the public SourceBench benchmark code and the fixed public query split.</p>
	<p><strong>Official leaderboard entries</strong> are not accepted from participant-computed final scores alone. Instead, entries are validated and judged by the SourceBench team.</p>
	<p>For official evaluation, SourceBench uses hidden holdout queries, the fixed judging setup, and the fixed metric computation pipeline so that leaderboard rows remain comparable across systems.</p>
	</div>
	</section>

	<section class="plain-section">
	<div class="section-head">
	<div>
	<div class="section-kicker">Submission</div>
	<h2 style="font-size:1.65rem;">What participants should submit</h2>
	</div>
	</div>
	<div class="markdown-note">
	<p><strong>Preferred submission: endpoint access.</strong> Submit the model endpoint, API key, model name, API format, and optional generation settings. The SourceBench team will run hidden queries, source collection, scraping, judging, and metric computation on our side.</p>
	<p><strong>Fallback submission: answer + cited URL bundle.</strong> If endpoint access cannot be shared, submit per-query answer text together with cited URLs. The SourceBench team will run scraping, post-processing, judging, and metric computation server-side.</p>
	<p><strong>Why these boundaries?</strong> They keep the standardized parts of the benchmark under SourceBench control. Official leaderboard entries are validated and judged by the SourceBench team, rather than accepted from participant-provided final scores.</p>
	<p><strong>Benchmark repository and submission examples:</strong></p>
	<ul>
	<li><a href="https://github.com/WukLab/SourceBench" target="_blank" rel="noopener noreferrer">SourceBench benchmark repository</a></li>
	<li><code>leaderboard/examples/endpoint_submission.example.json</code> in the benchmark repo</li>
	<li><code>leaderboard/examples/answer_url_bundle.example.json</code> in the benchmark repo</li>
	</ul>
	</div>
	</section>

	</main>

	<script src="./leaderboard_data.js"></script>
	<script>
	// Mutable UI state read by the render helpers in this script.
	// - payload: leaderboard data object; null until it has been assigned.
	// - currentView: compared against "overall" and "by_query_type".
	// - showDimensions: when true, the overall table also shows dimensionColumns.
	// - selectedQueryType: starts as "all"; populateQueryTypeFilter replaces it
	//   with the first available query type.
	const state = {
	payload: null,
	currentView: "overall",
	showDimensions: false,
	selectedQueryType: "all",
	};

	// Row keys used as table columns. renderTable drops "query_type" from this
	// list when building the overall view.
	const baseColumns = [
	"model_name",
	"query_type",
	"weighted_total_content_score",
	"unweighted_mean_score",
	"percentage_ge_sources_in_se_sources",
	];

	// Per-dimension score keys, appended to the overall table only while
	// state.showDimensions is true.
	const dimensionColumns = [
	"semantic_relevance",
	"factual_accuracy",
	"freshness",
	"objectivity_tone",
	"layout_ad_density",
	"accountability",
	"transparency",
	"authority",
	];

	// Column key -> header text; table headers fall back to the raw key when a
	// key has no entry here.
	const labels = {
	model_name: "Model",
	query_type: "Query Type",
	weighted_total_content_score: "Weighted Score",
	unweighted_mean_score: "Unweighted Mean",
	percentage_ge_sources_in_se_sources: "% In SE",
	semantic_relevance: "Semantic Relevance",
	factual_accuracy: "Factual Accuracy",
	freshness: "Freshness",
	objectivity_tone: "Objectivity / Tone",
	layout_ad_density: "Layout / Ad Density",
	accountability: "Accountability",
	transparency: "Transparency",
	authority: "Authority",
	};

	// Raw model names that isDeepSeekStudyModel recognises; isMainBoardModel
	// excludes exactly these names from the main leaderboard rows.
	const DEEPSEEK_STUDY_MODELS = new Set([
	"deepseek-chat-gensee",
	"deepseek-reasoning-tavily",
	"deepseek-reasoning-gensee",
	"deepseek-chat-tavily",
	]);

	// Query type -> short tag. formatQueryTypeLabel renders it as
	// "QueryType (Tag)", and the key order here sets the order of the filter
	// chips built by populateQueryTypeFilter.
	const QUERY_TYPE_META = {
	"DebateQA": "Argument",
	"HotpotQA": "Informational",
	"Pinocchios": "Factual",
	"QuoraQuestions": "Social",
	"VA-COS NLQ": "Shopping",
	};

	// Raw model name -> display name; formatModelName returns the raw name
	// unchanged when it has no entry here.
	const DISPLAY_MODEL_NAMES = {
	"gpt-5": "GPT-5",
	"gpt-4o": "GPT-4o",
	"grok-4.1-fast-non-reasoning": "Grok-4.1-Fast-Non-Reasoning",
	"claude": "Claude",
	"gensee": "Gensee",
	"exa": "Exa",
	"tavily": "Tavily",
	"google-search": "Google Search",
	"Gemini-3-Pro-Preview": "Gemini 3 Pro Preview",
	"Gemini-3-Flash-Preview": "Gemini 3 Flash Preview",
	"Gemini-2.5-Flash-Preview": "Gemini 2.5 Flash Preview",
	"Perplexity-Sonar-Pro": "Perplexity Sonar Pro",
	"deepseek-chat-gensee": "DeepSeek Chat + Gensee",
	"deepseek-reasoning-tavily": "DeepSeek Reasoning + Tavily",
	"deepseek-reasoning-gensee": "DeepSeek Reasoning + Gensee",
	"deepseek-chat-tavily": "DeepSeek Chat + Tavily",
	};

	// Element references looked up once, when this script runs.
	const viewSelect = document.getElementById("view-select");
	const sortKey = document.getElementById("sort-key");
	const searchInput = document.getElementById("search-input");
	const showDimensionsInput = document.getElementById("show-dimensions");
	const toggleRow = document.getElementById("toggle-row");
	const queryFilterWrap = document.getElementById("query-filter-wrap");
	const datasetLegend = document.getElementById("dataset-legend");
	const tableTitle = document.getElementById("table-title");
	const tableCount = document.getElementById("table-count");
	const tableHead = document.querySelector("#leaderboard-table thead");
	const tableBody = document.querySelector("#leaderboard-table tbody");
	const deepseekFindings = document.getElementById("deepseek-findings");
	const deepseekTableHead = document.querySelector("#deepseek-table thead");
	const deepseekTableBody = document.querySelector("#deepseek-table tbody");
	const boardNote = document.getElementById("board-note");
	const loadStatus = document.getElementById("load-status");
	const tableGenerated = document.getElementById("table-generated");

	/**
	 * Format a score for display.
	 *
	 * @param {*} value - Number, numeric string, or anything else.
	 * @param {number} [digits=2] - Decimal places passed to toFixed.
	 * @returns {string} "-" for missing values (null, undefined, blank strings,
	 *   NaN or infinite numbers), a fixed-point string for finite numeric input,
	 *   otherwise String(value).
	 */
	function formatNumber(value, digits = 2) {
	  if (value === null || value === undefined) return "-";
	  // Number("   ") is 0, so blank strings must be caught before coercion.
	  if (typeof value === "string" && value.trim() === "") return "-";
	  const parsed = typeof value === "number" ? value : Number(value);
	  if (Number.isFinite(parsed)) return parsed.toFixed(digits);
	  // A non-finite number has nothing meaningful to show; other values
	  // (e.g. "n/a") are passed through as text.
	  return typeof value === "number" ? "-" : String(value);
	}

	/**
	 * Render a timestamp in the America/Los_Angeles time zone using en-US
	 * formatting (short month, numeric day/year, hour, two-digit minute and a
	 * short zone name).
	 *
	 * @param {*} value - Anything accepted by the Date constructor.
	 * @returns {*} "-" when value is falsy, the input unchanged when it does
	 *   not parse to a valid date, otherwise the formatted string.
	 */
	function formatPacificTimestamp(value) {
	  if (!value) {
	    return "-";
	  }

	  const parsedDate = new Date(value);
	  const isUnparseable = Number.isNaN(parsedDate.getTime());
	  if (isUnparseable) {
	    return value;
	  }

	  const pacificFormatter = new Intl.DateTimeFormat("en-US", {
	    timeZone: "America/Los_Angeles",
	    year: "numeric",
	    month: "short",
	    day: "numeric",
	    hour: "numeric",
	    minute: "2-digit",
	    timeZoneName: "short",
	  });
	  return pacificFormatter.format(parsedDate);
	}

	/**
	 * Show a message in the load-status element.
	 *
	 * @param {string} message - Text to display.
	 * @param {boolean} [isError=false] - Use the error colour instead of the muted one.
	 */
	function setLoadStatus(message, isError = false) {
	  let statusColor = "var(--muted)";
	  if (isError) {
	    statusColor = "#b91c1c";
	  }
	  loadStatus.style.color = statusColor;
	  loadStatus.textContent = message;
	}

	/**
	 * Report whether a raw model name is one of the DEEPSEEK_STUDY_MODELS.
	 *
	 * @param {*} name - Raw model name; falsy values are treated as "".
	 * @returns {boolean}
	 */
	function isDeepSeekStudyModel(name) {
	  return DEEPSEEK_STUDY_MODELS.has(String(name || ""));
	}

	/**
	 * Report whether a raw model name belongs on the main leaderboard, i.e. it
	 * is not one of the DeepSeek study models.
	 *
	 * @param {*} name - Raw model name.
	 * @returns {boolean}
	 */
	function isMainBoardModel(name) {
	  // isDeepSeekStudyModel already normalises falsy names to "".
	  return !isDeepSeekStudyModel(name);
	}

	/**
	 * Build the display label for a query type.
	 *
	 * @param {*} queryType - Raw query type from the payload.
	 * @returns {string} "QueryType (Tag)" when QUERY_TYPE_META has a tag for it,
	 *   otherwise the raw query type, or "-" when that is falsy.
	 */
	function formatQueryTypeLabel(queryType) {
	  // Own-property check so names like "constructor" do not pick up values
	  // inherited from Object.prototype.
	  const hasTag = Object.prototype.hasOwnProperty.call(QUERY_TYPE_META, queryType);
	  const tag = hasTag ? QUERY_TYPE_META[queryType] : "";
	  return tag ? `${queryType} (${tag})` : String(queryType || "-");
	}

	/**
	 * Map a raw model name to its display name.
	 *
	 * @param {*} name - Raw model name; falsy values are treated as "".
	 * @returns {string} The DISPLAY_MODEL_NAMES entry when one exists,
	 *   otherwise the raw name as a string.
	 */
	function formatModelName(name) {
	  const key = String(name || "");
	  // Own-property check so names like "constructor" do not resolve to
	  // members inherited from Object.prototype.
	  const isKnown = Object.prototype.hasOwnProperty.call(DISPLAY_MODEL_NAMES, key);
	  return (isKnown && DISPLAY_MODEL_NAMES[key]) || key;
	}

	/**
	 * Sync the controls with state.currentView.
	 *
	 * In the "by_query_type" view the query-type chips are shown, the
	 * dimension toggle row is hidden and reset, and the overall-only sort
	 * options are made unavailable (falling back to the weighted score if one
	 * of them was selected). In any other view the reverse applies.
	 */
	function updateQueryTypeControls() {
	  const byQueryType = state.currentView === "by_query_type";
	  queryFilterWrap.classList.toggle("visible", byQueryType);
	  datasetLegend.classList.toggle("visible", byQueryType);
	  toggleRow.classList.toggle("hidden", byQueryType);

	  for (const option of document.querySelectorAll(".sort-option-overall-only")) {
	    option.classList.toggle("hidden", byQueryType);
	    // Styling an <option> with display:none is not honoured by every
	    // browser's native select UI, so also mark it hidden and disabled.
	    option.hidden = byQueryType;
	    option.disabled = byQueryType;
	  }

	  if (byQueryType && sortKey.value === "percentage_ge_sources_in_se_sources") {
	    sortKey.value = "weighted_total_content_score";
	  }
	  if (byQueryType) {
	    state.showDimensions = false;
	    showDimensionsInput.checked = false;
	  }
	}

	/**
	 * Rebuild the query-type chip row from the payload.
	 *
	 * Query types listed in QUERY_TYPE_META come first, in that object's key
	 * order, followed by any other types found in the data. If the current
	 * selection is "all" or no longer present, the first chip becomes the
	 * selection. Clicking a chip updates state.selectedQueryType, rebuilds the
	 * chips and re-renders the table.
	 *
	 * @param {object} payload - Leaderboard payload; only by_query_type is read.
	 */
	function populateQueryTypeFilter(payload) {
	  const hasMeta = (key) => Object.prototype.hasOwnProperty.call(QUERY_TYPE_META, key);
	  const typeRows = payload?.by_query_type || [];
	  const values = Array.from(new Set(typeRows.map((row) => row.query_type).filter(Boolean)));
	  const ordered = Object.keys(QUERY_TYPE_META)
	    .filter((key) => values.includes(key))
	    .concat(values.filter((key) => !hasMeta(key)));

	  if (!ordered.length) {
	    datasetLegend.replaceChildren();
	    return;
	  }
	  if (state.selectedQueryType === "all" || !ordered.includes(state.selectedQueryType)) {
	    state.selectedQueryType = ordered[0];
	  }

	  // Build the chips with DOM APIs rather than an HTML string so that data
	  // values can never be parsed as markup or break out of an attribute.
	  const chips = ordered.map((value) => {
	    const isActive = value === state.selectedQueryType;

	    const chip = document.createElement("button");
	    chip.type = "button";
	    chip.className = isActive ? "dataset-chip active" : "dataset-chip";
	    chip.dataset.queryType = value;
	    chip.setAttribute("aria-pressed", String(isActive));

	    const nameEl = document.createElement("strong");
	    nameEl.textContent = value;
	    const tagEl = document.createElement("span");
	    tagEl.textContent = hasMeta(value) ? QUERY_TYPE_META[value] : "";
	    chip.append(nameEl, tagEl);

	    chip.addEventListener("click", () => {
	      // Keep the original value (not its dataset string form) so the
	      // strict comparison against row.query_type in getRows still matches.
	      state.selectedQueryType = value;
	      populateQueryTypeFilter(state.payload);
	      renderTable();
	      // The rebuild replaced the clicked element; move focus to its successor.
	      datasetLegend.querySelector('[aria-pressed="true"]')?.focus();
	    });
	    return chip;
	  });

	  datasetLegend.replaceChildren(...chips);
	}

	/**
	 * Fill the summary stat cards and the "Generated at" line.
	 *
	 * Only main-board models are counted. The top model is the row with the
	 * highest weighted score (first one wins on ties), independent of the
	 * order in which the payload lists its rows.
	 *
	 * @param {object} payload - Leaderboard payload (overall, by_query_type, metadata).
	 */
	function updateTopStats(payload) {
	  const overall = (payload.overall || []).filter((row) => isMainBoardModel(row.model_name));
	  const byType = (payload.by_query_type || []).filter((row) => isMainBoardModel(row.model_name));

	  // Rows without a usable score rank below every scored row.
	  const scoreOf = (row) => {
	    const score = Number(row.weighted_total_content_score);
	    return Number.isNaN(score) ? -Infinity : score;
	  };
	  const top = overall.reduce(
	    (best, row) => (best === null || scoreOf(row) > scoreOf(best) ? row : best),
	    null
	  );

	  document.getElementById("stat-models").textContent = overall.length;
	  document.getElementById("stat-query-types").textContent = new Set(byType.map((row) => row.query_type)).size;
	  document.getElementById("stat-top-model").textContent = top ? formatModelName(top.model_name) : "-";
	  document.getElementById("stat-top-score").textContent = top ? formatNumber(top.weighted_total_content_score) : "-";
	  tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
	}

	/**
	 * Render the DeepSeek tool-study findings and table.
	 *
	 * Rows are the payload.overall entries whose model name is in
	 * DEEPSEEK_STUDY_MODELS, the same set the main board excludes, so each
	 * model appears in exactly one of the two tables. Rows are sorted by
	 * weighted score, highest first. Group averages ignore rows that have no
	 * numeric score instead of counting them as zero.
	 *
	 * @param {object} payload - Leaderboard payload; only overall is read.
	 */
	function renderDeepSeekStudy(payload) {
	  const SCORE_KEY = "weighted_total_content_score";
	  const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
	  // Everything interpolated into innerHTML below goes through this.
	  const escapeHtml = (text) => String(text).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

	  // NaN marks a missing or non-numeric score.
	  const scoreOf = (row) => {
	    const raw = row[SCORE_KEY];
	    if (raw === null || raw === undefined || raw === "") return NaN;
	    return Number(raw);
	  };
	  const averageScore = (items) => {
	    const scores = items.map(scoreOf).filter((score) => Number.isFinite(score));
	    return scores.length ? scores.reduce((sum, score) => sum + score, 0) / scores.length : null;
	  };

	  const rows = (payload.overall || [])
	    .filter((row) => typeof row.model_name === "string" && isDeepSeekStudyModel(row.model_name));

	  if (!rows.length) {
	    deepseekFindings.innerHTML = '<div class="study-item"><strong>No DeepSeek study rows found in the current artifact.</strong></div>';
	    deepseekTableHead.innerHTML = "";
	    deepseekTableBody.innerHTML = "";
	    return;
	  }

	  const sorted = [...rows].sort((a, b) => {
	    const av = scoreOf(a);
	    const bv = scoreOf(b);
	    const aa = Number.isNaN(av) ? -Infinity : av;
	    const bb = Number.isNaN(bv) ? -Infinity : bv;
	    // Equal keys (including two missing scores) must compare as 0, not NaN.
	    return bb === aa ? 0 : bb - aa;
	  });
	  const best = sorted[0];
	  const genseeAvg = averageScore(sorted.filter((row) => row.model_name.includes("gensee")));
	  const tavilyAvg = averageScore(sorted.filter((row) => row.model_name.includes("tavily")));
	  const reasoningAvg = averageScore(sorted.filter((row) => row.model_name.includes("reasoning")));
	  const chatAvg = averageScore(sorted.filter((row) => row.model_name.includes("chat-")));

	  deepseekFindings.innerHTML = `
	<div class="study-item"><strong>Best DeepSeek variant in the current artifact: ${escapeHtml(formatModelName(best.model_name))}</strong> with a weighted score of ${escapeHtml(formatNumber(best.weighted_total_content_score))} and % In SE of ${escapeHtml(formatNumber(best.percentage_ge_sources_in_se_sources))}.</div>
	<div class="study-item"><strong>Backend choice changes citation quality materially.</strong> The Gensee-backed variants average ${escapeHtml(formatNumber(genseeAvg))} weighted score, while the Tavily-backed variants average ${escapeHtml(formatNumber(tavilyAvg))}.</div>
	<div class="study-item"><strong>Reasoning mode does not dominate by itself.</strong> In this artifact, reasoning variants average ${escapeHtml(formatNumber(reasoningAvg))} weighted score versus ${escapeHtml(formatNumber(chatAvg))} for chat variants, suggesting that retrieval setup and source selection quality still matter directly.</div>
	`;

	  const cols = ["model_name", "weighted_total_content_score", "unweighted_mean_score", "percentage_ge_sources_in_se_sources"];
	  deepseekTableHead.innerHTML = `<tr>${cols.map((key) => `<th scope="col">${escapeHtml(labels[key] || key)}</th>`).join("")}</tr>`;
	  deepseekTableBody.innerHTML = sorted.map((row) => {
	    const cells = cols.map((key) => {
	      // formatNumber yields "-" for missing values and passes other text through.
	      const rendered = key === "model_name" ? formatModelName(row[key]) : formatNumber(row[key]);
	      return `<td>${escapeHtml(rendered)}</td>`;
	    });
	    return `<tr>${cells.join("")}</tr>`;
	  }).join("");
	}

	/**
	 * Collect the rows shown in the main leaderboard table for the current UI state.
	 *
	 * Starts from `state.payload.overall` or `state.payload.by_query_type`
	 * (depending on `state.currentView`), keeps only main-board models, narrows
	 * to the selected query type and the search text, then sorts descending by
	 * the metric chosen in `sortKey`, breaking ties by display name.
	 *
	 * @returns {object[]} A new, sorted array; the payload arrays are not reordered.
	 */
	function getRows() {
	  if (!state.payload) return [];

	  const source = state.currentView === "overall"
	    ? state.payload.overall
	    : state.payload.by_query_type;
	  // filter() returns a fresh array, so the in-place sort below never
	  // reorders the payload itself. A missing or malformed section yields no rows.
	  let rows = (Array.isArray(source) ? source : [])
	    .filter((row) => isMainBoardModel(row.model_name));

	  if (state.currentView === "by_query_type") {
	    rows = rows.filter((row) => row.query_type === state.selectedQueryType);
	  }

	  const q = searchInput.value.trim().toLowerCase();
	  if (q) {
	    // Match against both the raw identifier and the label the user actually sees.
	    rows = rows.filter((row) => {
	      const raw = String(row.model_name || "").toLowerCase();
	      const display = formatModelName(row.model_name).toLowerCase();
	      return raw.includes(q) || display.includes(q);
	    });
	  }

	  const metric = sortKey.value;
	  // Number(null) and Number("") are both 0, which would rank a missing value
	  // as a real score of 0; treat them like any other non-numeric value so
	  // rows without the metric always sink to the bottom.
	  const toSortable = (value) => {
	    if (value === null || value === undefined || value === "") return -Infinity;
	    const numeric = Number(value);
	    return Number.isNaN(numeric) ? -Infinity : numeric;
	  };

	  rows.sort((a, b) => {
	    const aa = toSortable(a[metric]);
	    const bb = toSortable(b[metric]);
	    // The equality guard also avoids computing -Infinity - -Infinity (NaN).
	    if (bb !== aa) return bb - aa;
	    return formatModelName(a.model_name).localeCompare(formatModelName(b.model_name));
	  });
	  return rows;
	}

	/**
	 * Redraw the main leaderboard table (title, row counter, header and body)
	 * from the rows returned by getRows().
	 *
	 * The overall view shows every base column except `query_type`, plus the
	 * dimension columns when `state.showDimensions` is set; the per-query-type
	 * view shows only the model name and the two aggregate scores.
	 */
	function renderTable() {
	  const rows = getRows();
	  const isOverall = state.currentView === "overall";
	  const cols = isOverall
	    ? [
	      ...baseColumns.filter((key) => key !== "query_type"),
	      ...(state.showDimensions ? dimensionColumns : []),
	    ]
	    : [
	      "model_name",
	      "weighted_total_content_score",
	      "unweighted_mean_score",
	    ];

	  tableTitle.textContent = isOverall
	    ? "Overall ranking"
	    : `Ranking for ${formatQueryTypeLabel(state.selectedQueryType)}`;
	  // Avoid the ungrammatical "1 rows".
	  tableCount.textContent = `${rows.length} ${rows.length === 1 ? "row" : "rows"}`;

	  // scope="col" tells assistive technology that each header labels its column.
	  const headerCells = cols.map((key) => `<th scope="col">${labels[key] || key}</th>`);
	  tableHead.innerHTML = `<tr>${headerCells.join("")}</tr>`;

	  // NOTE(review): cell values are interpolated into innerHTML unescaped, as
	  // before; this assumes the payload is trusted - confirm.
	  const renderCell = (row, key) => {
	    const val = row[key];
	    if (key === "model_name") return `<td>${formatModelName(val)}</td>`;
	    // Missing (null/undefined) values are shown as a dash.
	    const rendered = typeof val === "number" ? formatNumber(val) : (val ?? "-");
	    return `<td>${rendered}</td>`;
	  };

	  tableBody.innerHTML = rows
	    .map((row) => `<tr>${cols.map((key) => renderCell(row, key)).join("")}</tr>`)
	    .join("");
	}

	/**
	 * Store a freshly loaded leaderboard payload and refresh every part of the
	 * page that is rendered from it.
	 *
	 * @param {object} payload - Leaderboard data. getRows() later reads its
	 *   `overall` and `by_query_type` entries through `state.payload`.
	 */
	function applyPayload(payload) {
	// Must be set before renderTable(), which reads state.payload via getRows().
	state.payload = payload;
	populateQueryTypeFilter(payload);
	updateQueryTypeControls();
	updateTopStats(payload);
	renderDeepSeekStudy(payload);
	renderTable();
	// Reached only when none of the calls above threw.
	setLoadStatus("Leaderboard data loaded.");
	}

	/**
	 * Load the leaderboard data and render it.
	 *
	 * Prefers a payload embedded on the page as `window.SOURCEBENCH_PAYLOAD`;
	 * otherwise fetches `./leaderboard_data.json` (bypassing the HTTP cache).
	 * Any failure - network error, non-2xx response, invalid JSON, or an
	 * exception while rendering the payload - is logged and reported through
	 * setLoadStatus instead of escaping as an unhandled promise rejection.
	 *
	 * @returns {Promise<void>} Resolves once loading has succeeded or the error
	 *   status has been shown.
	 */
	async function autoLoad() {
	  try {
	    let payload = window.SOURCEBENCH_PAYLOAD;
	    if (!payload) {
	      const response = await fetch("./leaderboard_data.json", { cache: "no-store" });
	      if (!response.ok) {
	        throw new Error(`Could not fetch leaderboard_data.json next to index.html (HTTP ${response.status})`);
	      }
	      payload = await response.json();
	    }
	    // Kept inside the try so a rendering error on an embedded payload is
	    // surfaced to the user exactly like a failed fetch.
	    applyPayload(payload);
	  } catch (error) {
	    console.error(error);
	    setLoadStatus("Could not load leaderboard data.", true);
	  }
	}

	/**
	 * Switch the leaderboard to the given view and refresh everything that
	 * depends on that choice. Shared by the tab elements and the view selector
	 * so the two controls always stay in sync.
	 *
	 * @param {string} view - View identifier; the code distinguishes
	 *   "by_query_type" from everything else.
	 */
	function activateView(view) {
	  state.currentView = view;
	  // Assigning .value does not dispatch "change", so this cannot re-enter
	  // the selector listener below.
	  viewSelect.value = view;
	  if (view === "by_query_type" && state.payload) {
	    populateQueryTypeFilter(state.payload);
	  }
	  updateQueryTypeControls();
	  document.querySelectorAll(".tab").forEach((node) => {
	    node.classList.toggle("active", node.dataset.view === view);
	  });
	  renderTable();
	}

	// Tabs carry their target view in data-view.
	for (const tab of document.querySelectorAll(".tab")) {
	  tab.addEventListener("click", () => activateView(tab.dataset.view));
	}

	viewSelect.addEventListener("change", (event) => activateView(event.target.value));

	// Sorting and searching only affect which rows are shown, so a re-render suffices.
	sortKey.addEventListener("change", renderTable);
	searchInput.addEventListener("input", renderTable);
	showDimensionsInput.addEventListener("change", (event) => {
	  state.showDimensions = event.target.checked;
	  renderTable();
	});

	// Kick off the initial data load once all listeners are attached.
	autoLoad();
	</script>
	</body>
	</html>