Spaces:
Running
Running
Commit ·
9e30538
1
Parent(s): 1d7a096
Clean up leaderboard copy and model display names
Browse files- index.html +55 -54
index.html
CHANGED
|
@@ -355,17 +355,17 @@
|
|
| 355 |
}
|
| 356 |
|
| 357 |
.metric-intro {
|
| 358 |
-
margin: 0 0
|
| 359 |
color: var(--muted);
|
| 360 |
max-width: none;
|
| 361 |
-
line-height: 1.
|
| 362 |
}
|
| 363 |
|
| 364 |
.metric-core {
|
| 365 |
display: grid;
|
| 366 |
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 367 |
gap: 10px 18px;
|
| 368 |
-
margin: 0 0
|
| 369 |
padding: 0;
|
| 370 |
list-style: none;
|
| 371 |
}
|
|
@@ -393,36 +393,11 @@
|
|
| 393 |
color: var(--ink);
|
| 394 |
}
|
| 395 |
|
| 396 |
-
.metric-defs {
|
| 397 |
-
display: grid;
|
| 398 |
-
grid-template-columns: repeat(4, minmax(0, 1fr));
|
| 399 |
-
gap: 12px 18px;
|
| 400 |
margin: 0 0 18px;
|
| 401 |
-
padding: 0;
|
| 402 |
-
list-style: none;
|
| 403 |
-
}
|
| 404 |
-
|
| 405 |
-
.metric-defs li {
|
| 406 |
-
padding-left: 16px;
|
| 407 |
-
position: relative;
|
| 408 |
color: var(--muted);
|
| 409 |
-
line-height: 1.55;
|
| 410 |
font-size: 0.97rem;
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
.metric-defs li::before {
|
| 414 |
-
content: "";
|
| 415 |
-
position: absolute;
|
| 416 |
-
left: 0;
|
| 417 |
-
top: 0.58rem;
|
| 418 |
-
width: 7px;
|
| 419 |
-
height: 7px;
|
| 420 |
-
border-radius: 999px;
|
| 421 |
-
background: var(--accent);
|
| 422 |
-
}
|
| 423 |
-
|
| 424 |
-
.metric-defs strong {
|
| 425 |
-
color: var(--ink);
|
| 426 |
}
|
| 427 |
|
| 428 |
.findings,
|
|
@@ -799,8 +774,7 @@
|
|
| 799 |
.controls,
|
| 800 |
.stage-grid,
|
| 801 |
.hero-points,
|
| 802 |
-
.metric-core,
|
| 803 |
-
.metric-defs {
|
| 804 |
grid-template-columns: 1fr;
|
| 805 |
}
|
| 806 |
|
|
@@ -944,25 +918,16 @@
|
|
| 944 |
</div>
|
| 945 |
|
| 946 |
<p class="metric-intro">
|
| 947 |
-
SourceBench
|
| 948 |
-
The overall weighted score is the main ranking target; the dimension columns below make it easier to see
|
| 949 |
-
whether a system is strong because it cites more relevant, more accurate, fresher, more transparent, or more authoritative sources.
|
| 950 |
</p>
|
| 951 |
<ul class="metric-core">
|
| 952 |
<li><strong>Weighted Score.</strong> The main leaderboard score, combining the judged dimensions into one overall source-quality metric.</li>
|
| 953 |
<li><strong>Unweighted Mean.</strong> The simple average across the judged dimension scores, without weighting.</li>
|
| 954 |
<li><strong>% In SE.</strong> Percentage of model cited sources appearing in the first five pages of Google Search.</li>
|
| 955 |
</ul>
|
| 956 |
-
<ul class="metric-defs">
|
| 957 |
-
<
|
| 958 |
-
|
| 959 |
-
<li><strong>Freshness.</strong> Whether the source is timely for the topic being answered.</li>
|
| 960 |
-
<li><strong>Objectivity / Tone.</strong> Whether the source is balanced rather than sensational or biased.</li>
|
| 961 |
-
<li><strong>Layout / Ad Density.</strong> Whether the page is usable and not overwhelmed by ads or clutter.</li>
|
| 962 |
-
<li><strong>Accountability.</strong> Whether the source clearly indicates ownership, responsibility, or editorial control.</li>
|
| 963 |
-
<li><strong>Transparency.</strong> Whether the source clearly presents provenance, disclosure, or supporting context.</li>
|
| 964 |
-
<li><strong>Authority.</strong> Whether the source appears credible and institutionally trustworthy for the topic.</li>
|
| 965 |
-
</ul>
|
| 966 |
|
| 967 |
<div class="tabs">
|
| 968 |
<button class="tab active" data-view="overall" type="button">Overall</button>
|
|
@@ -1138,6 +1103,25 @@
|
|
| 1138 |
"VA-COS NLQ": "Shopping",
|
| 1139 |
};
|
| 1140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1141 |
const viewSelect = document.getElementById("view-select");
|
| 1142 |
const sortKey = document.getElementById("sort-key");
|
| 1143 |
const searchInput = document.getElementById("search-input");
|
|
@@ -1199,6 +1183,11 @@
|
|
| 1199 |
return tag ? `${queryType} (${tag})` : String(queryType || "-");
|
| 1200 |
}
|
| 1201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1202 |
function updateQueryTypeControls() {
|
| 1203 |
const visible = state.currentView === "by_query_type";
|
| 1204 |
queryFilterWrap.classList.toggle("visible", visible);
|
|
@@ -1247,7 +1236,7 @@
|
|
| 1247 |
const top = overall[0];
|
| 1248 |
document.getElementById("stat-models").textContent = overall.length;
|
| 1249 |
document.getElementById("stat-query-types").textContent = new Set(byType.map((row) => row.query_type)).size;
|
| 1250 |
-
document.getElementById("stat-top-model").textContent = top ? top.model_name : "-";
|
| 1251 |
document.getElementById("stat-top-score").textContent = top ? formatNumber(top.weighted_total_content_score) : "-";
|
| 1252 |
tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
|
| 1253 |
}
|
|
@@ -1290,7 +1279,7 @@
|
|
| 1290 |
return [
|
| 1291 |
{
|
| 1292 |
title: "Overall source quality remains meaningfully separated across systems",
|
| 1293 |
-
body: `${top.model_name} is the current overall leader with a weighted score of ${formatNumber(top.weighted_total_content_score)}. The spread across the current board suggests that citation quality is not saturated: systems still differ substantially once source relevance, accuracy, transparency, and authority are scored directly.`
|
| 1294 |
},
|
| 1295 |
{
|
| 1296 |
title: "Question type matters, and multi-hop fact synthesis is still the hardest slice",
|
|
@@ -1299,12 +1288,12 @@
|
|
| 1299 |
{
|
| 1300 |
title: "High search overlap is not the same thing as high source quality",
|
| 1301 |
body: bestOverlap
|
| 1302 |
-
? `${bestOverlap.model_name} has the highest visible search overlap at ${formatNumber(bestOverlap.percentage_ge_sources_in_se_sources)}% In SE, but the best overall weighted score still belongs to ${top.model_name}. This mirrors the paper's emphasis that leaderboard quality should not be reduced to overlap with search results alone.`
|
| 1303 |
: "The current artifact includes quality metrics beyond simple overlap with search-engine results, which is one of the main design points of SourceBench."
|
| 1304 |
},
|
| 1305 |
{
|
| 1306 |
title: "Dimension scores reveal different strengths behind similar overall rankings",
|
| 1307 |
-
body: `${bestFreshness.model_name} currently leads freshness at ${formatNumber(bestFreshness.freshness)}, while ${qualityLeaders.transparency.model_name}, ${qualityLeaders.authority.model_name}, and ${qualityLeaders.accountability.model_name} lead key trust-related dimensions such as transparency, authority, and accountability. These per-dimension columns make it easier to see why two systems with similar overall scores can still have very different citation profiles.`
|
| 1308 |
}
|
| 1309 |
];
|
| 1310 |
}
|
|
@@ -1344,7 +1333,7 @@
|
|
| 1344 |
const chatAvg = avg(chatRows, "weighted_total_content_score");
|
| 1345 |
|
| 1346 |
deepseekFindings.innerHTML = `
|
| 1347 |
-
<div class="study-item"><strong>Best DeepSeek variant in the current artifact: ${best.model_name}</strong> with a weighted score of ${formatNumber(best.weighted_total_content_score)} and % In SE of ${formatNumber(best.percentage_ge_sources_in_se_sources)}.</div>
|
| 1348 |
<div class="study-item"><strong>Backend choice changes citation quality materially.</strong> The Gensee-backed variants average ${formatNumber(genseeAvg)} weighted score, while the Tavily-backed variants average ${formatNumber(tavilyAvg)}.</div>
|
| 1349 |
<div class="study-item"><strong>Reasoning mode does not dominate by itself.</strong> In this artifact, reasoning variants average ${formatNumber(reasoningAvg)} weighted score versus ${formatNumber(chatAvg)} for chat variants, suggesting that retrieval setup and source selection quality still matter directly.</div>
|
| 1350 |
`;
|
|
@@ -1353,7 +1342,13 @@
|
|
| 1353 |
deepseekTableHead.innerHTML = `<tr>${cols.map((key) => `<th>${labels[key] || key}</th>`).join("")}</tr>`;
|
| 1354 |
deepseekTableBody.innerHTML = sorted.map((row) => `
|
| 1355 |
<tr>
|
| 1356 |
-
${cols.map((key) =>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1357 |
</tr>
|
| 1358 |
`).join("");
|
| 1359 |
}
|
|
@@ -1372,7 +1367,11 @@
|
|
| 1372 |
|
| 1373 |
const q = searchInput.value.trim().toLowerCase();
|
| 1374 |
if (q) {
|
| 1375 |
-
rows = rows.filter((row) =>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1376 |
}
|
| 1377 |
|
| 1378 |
const metric = sortKey.value;
|
|
@@ -1382,7 +1381,7 @@
|
|
| 1382 |
const aa = Number.isNaN(av) ? -Infinity : av;
|
| 1383 |
const bb = Number.isNaN(bv) ? -Infinity : bv;
|
| 1384 |
if (bb !== aa) return bb - aa;
|
| 1385 |
-
return
|
| 1386 |
});
|
| 1387 |
return rows;
|
| 1388 |
}
|
|
@@ -1413,7 +1412,9 @@
|
|
| 1413 |
<tr>
|
| 1414 |
${cols.map((key) => {
|
| 1415 |
const val = row[key];
|
| 1416 |
-
const rendered =
|
|
|
|
|
|
|
| 1417 |
return `<td>${rendered}</td>`;
|
| 1418 |
}).join("")}
|
| 1419 |
</tr>
|
|
|
|
| 355 |
}
|
| 356 |
|
| 357 |
.metric-intro {
|
| 358 |
+
margin: 0 0 12px;
|
| 359 |
color: var(--muted);
|
| 360 |
max-width: none;
|
| 361 |
+
line-height: 1.6;
|
| 362 |
}
|
| 363 |
|
| 364 |
.metric-core {
|
| 365 |
display: grid;
|
| 366 |
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 367 |
gap: 10px 18px;
|
| 368 |
+
margin: 0 0 12px;
|
| 369 |
padding: 0;
|
| 370 |
list-style: none;
|
| 371 |
}
|
|
|
|
| 393 |
color: var(--ink);
|
| 394 |
}
|
| 395 |
|
| 396 |
+
.metric-note {
|
|
|
|
|
|
|
|
|
|
| 397 |
margin: 0 0 18px;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
color: var(--muted);
|
|
|
|
| 399 |
font-size: 0.97rem;
|
| 400 |
+
line-height: 1.58;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
}
|
| 402 |
|
| 403 |
.findings,
|
|
|
|
| 774 |
.controls,
|
| 775 |
.stage-grid,
|
| 776 |
.hero-points,
|
| 777 |
+
.metric-core {
|
|
|
|
| 778 |
grid-template-columns: 1fr;
|
| 779 |
}
|
| 780 |
|
|
|
|
| 918 |
</div>
|
| 919 |
|
| 920 |
<p class="metric-intro">
|
| 921 |
+
SourceBench ranks systems by judged source quality rather than answer fluency alone. The main leaderboard target is the weighted overall score.
|
|
|
|
|
|
|
| 922 |
</p>
|
| 923 |
<ul class="metric-core">
|
| 924 |
<li><strong>Weighted Score.</strong> The main leaderboard score, combining the judged dimensions into one overall source-quality metric.</li>
|
| 925 |
<li><strong>Unweighted Mean.</strong> The simple average across the judged dimension scores, without weighting.</li>
|
| 926 |
<li><strong>% In SE.</strong> Percentage of model cited sources appearing in the first five pages of Google Search.</li>
|
| 927 |
</ul>
|
| 928 |
+
<p class="metric-note">
|
| 929 |
+
Turn on <strong>Show dimension scores</strong> in Overall view to inspect the eight judged dimensions: semantic relevance, factual accuracy, freshness, objectivity, layout/ad density, accountability, transparency, and authority.
|
| 930 |
+
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
|
| 932 |
<div class="tabs">
|
| 933 |
<button class="tab active" data-view="overall" type="button">Overall</button>
|
|
|
|
| 1103 |
"VA-COS NLQ": "Shopping",
|
| 1104 |
};
|
| 1105 |
|
| 1106 |
+
// Lookup table from raw `model_name` identifiers to the labels shown in the UI.
// Keys are matched by exact string, so the casing here must equal the casing
// of the identifier being looked up. Frozen because this is a shared,
// module-level constant that should never be mutated at runtime.
const DISPLAY_MODEL_NAMES = Object.freeze({
  "gpt-5": "GPT-5",
  "gpt-4o": "GPT-4o",
  "grok-4.1-fast-non-reasoning": "Grok-4.1-Fast-Non-Reasoning",
  "claude": "Claude",
  "gensee": "Gensee",
  "exa": "Exa",
  "tavily": "Tavily",
  "google-search": "Google Search",
  "Gemini-3-Pro-Preview": "Gemini 3 Pro Preview",
  "Gemini-3-Flash-Preview": "Gemini 3 Flash Preview",
  "Gemini-2.5-Flash-Preview": "Gemini 2.5 Flash Preview",
  "Perplexity-Sonar-Pro": "Perplexity Sonar Pro",
  "deepseek-chat-gensee": "DeepSeek Chat + Gensee",
  "deepseek-reasoning-tavily": "DeepSeek Reasoning + Tavily",
  "deepseek-reasoning-gensee": "DeepSeek Reasoning + Gensee",
  "deepseek-chat-tavily": "DeepSeek Chat + Tavily",
});
|
| 1124 |
+
|
| 1125 |
const viewSelect = document.getElementById("view-select");
|
| 1126 |
const sortKey = document.getElementById("sort-key");
|
| 1127 |
const searchInput = document.getElementById("search-input");
|
|
|
|
| 1183 |
return tag ? `${queryType} (${tag})` : String(queryType || "-");
|
| 1184 |
}
|
| 1185 |
|
| 1186 |
+
/**
 * Returns the display label for a raw model identifier.
 *
 * @param {*} name - Raw `model_name` value. Falsy values (null, undefined, "")
 *   are treated as the empty string.
 * @returns {string} The label from DISPLAY_MODEL_NAMES when the identifier has
 *   a non-empty entry there; otherwise the identifier itself as a string.
 */
function formatModelName(name) {
  const key = String(name || "");
  // Consult own properties only: a bare `DISPLAY_MODEL_NAMES[key]` on a plain
  // object also resolves inherited members such as "constructor" or
  // "toString", which would hand a function back to callers expecting a string.
  const hasLabel = Object.prototype.hasOwnProperty.call(DISPLAY_MODEL_NAMES, key);
  const label = hasLabel ? DISPLAY_MODEL_NAMES[key] : "";
  // An empty label falls back to the raw identifier, as before.
  return label || key;
}
|
| 1190 |
+
|
| 1191 |
function updateQueryTypeControls() {
|
| 1192 |
const visible = state.currentView === "by_query_type";
|
| 1193 |
queryFilterWrap.classList.toggle("visible", visible);
|
|
|
|
| 1236 |
const top = overall[0];
|
| 1237 |
document.getElementById("stat-models").textContent = overall.length;
|
| 1238 |
document.getElementById("stat-query-types").textContent = new Set(byType.map((row) => row.query_type)).size;
|
| 1239 |
+
document.getElementById("stat-top-model").textContent = top ? formatModelName(top.model_name) : "-";
|
| 1240 |
document.getElementById("stat-top-score").textContent = top ? formatNumber(top.weighted_total_content_score) : "-";
|
| 1241 |
tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
|
| 1242 |
}
|
|
|
|
| 1279 |
return [
|
| 1280 |
{
|
| 1281 |
title: "Overall source quality remains meaningfully separated across systems",
|
| 1282 |
+
body: `${formatModelName(top.model_name)} is the current overall leader with a weighted score of ${formatNumber(top.weighted_total_content_score)}. The spread across the current board suggests that citation quality is not saturated: systems still differ substantially once source relevance, accuracy, transparency, and authority are scored directly.`
|
| 1283 |
},
|
| 1284 |
{
|
| 1285 |
title: "Question type matters, and multi-hop fact synthesis is still the hardest slice",
|
|
|
|
| 1288 |
{
|
| 1289 |
title: "High search overlap is not the same thing as high source quality",
|
| 1290 |
body: bestOverlap
|
| 1291 |
+
? `${formatModelName(bestOverlap.model_name)} has the highest visible search overlap at ${formatNumber(bestOverlap.percentage_ge_sources_in_se_sources)}% In SE, but the best overall weighted score still belongs to ${formatModelName(top.model_name)}. This mirrors the paper's emphasis that leaderboard quality should not be reduced to overlap with search results alone.`
|
| 1292 |
: "The current artifact includes quality metrics beyond simple overlap with search-engine results, which is one of the main design points of SourceBench."
|
| 1293 |
},
|
| 1294 |
{
|
| 1295 |
title: "Dimension scores reveal different strengths behind similar overall rankings",
|
| 1296 |
+
body: `${formatModelName(bestFreshness.model_name)} currently leads freshness at ${formatNumber(bestFreshness.freshness)}, while ${formatModelName(qualityLeaders.transparency.model_name)}, ${formatModelName(qualityLeaders.authority.model_name)}, and ${formatModelName(qualityLeaders.accountability.model_name)} lead key trust-related dimensions such as transparency, authority, and accountability. These per-dimension columns make it easier to see why two systems with similar overall scores can still have very different citation profiles.`
|
| 1297 |
}
|
| 1298 |
];
|
| 1299 |
}
|
|
|
|
| 1333 |
const chatAvg = avg(chatRows, "weighted_total_content_score");
|
| 1334 |
|
| 1335 |
deepseekFindings.innerHTML = `
|
| 1336 |
+
<div class="study-item"><strong>Best DeepSeek variant in the current artifact: ${formatModelName(best.model_name)}</strong> with a weighted score of ${formatNumber(best.weighted_total_content_score)} and % In SE of ${formatNumber(best.percentage_ge_sources_in_se_sources)}.</div>
|
| 1337 |
<div class="study-item"><strong>Backend choice changes citation quality materially.</strong> The Gensee-backed variants average ${formatNumber(genseeAvg)} weighted score, while the Tavily-backed variants average ${formatNumber(tavilyAvg)}.</div>
|
| 1338 |
<div class="study-item"><strong>Reasoning mode does not dominate by itself.</strong> In this artifact, reasoning variants average ${formatNumber(reasoningAvg)} weighted score versus ${formatNumber(chatAvg)} for chat variants, suggesting that retrieval setup and source selection quality still matter directly.</div>
|
| 1339 |
`;
|
|
|
|
| 1342 |
deepseekTableHead.innerHTML = `<tr>${cols.map((key) => `<th>${labels[key] || key}</th>`).join("")}</tr>`;
|
| 1343 |
deepseekTableBody.innerHTML = sorted.map((row) => `
|
| 1344 |
<tr>
|
| 1345 |
+
${cols.map((key) => {
|
| 1346 |
+
const value = row[key];
|
| 1347 |
+
const rendered = key === "model_name"
|
| 1348 |
+
? formatModelName(value)
|
| 1349 |
+
: (typeof value === "number" ? formatNumber(value) : (value ?? "-"));
|
| 1350 |
+
return `<td>${rendered}</td>`;
|
| 1351 |
+
}).join("")}
|
| 1352 |
</tr>
|
| 1353 |
`).join("");
|
| 1354 |
}
|
|
|
|
| 1367 |
|
| 1368 |
const q = searchInput.value.trim().toLowerCase();
|
| 1369 |
if (q) {
|
| 1370 |
+
rows = rows.filter((row) => {
|
| 1371 |
+
const raw = String(row.model_name || "").toLowerCase();
|
| 1372 |
+
const display = formatModelName(row.model_name).toLowerCase();
|
| 1373 |
+
return raw.includes(q) || display.includes(q);
|
| 1374 |
+
});
|
| 1375 |
}
|
| 1376 |
|
| 1377 |
const metric = sortKey.value;
|
|
|
|
| 1381 |
const aa = Number.isNaN(av) ? -Infinity : av;
|
| 1382 |
const bb = Number.isNaN(bv) ? -Infinity : bv;
|
| 1383 |
if (bb !== aa) return bb - aa;
|
| 1384 |
+
return formatModelName(a.model_name).localeCompare(formatModelName(b.model_name));
|
| 1385 |
});
|
| 1386 |
return rows;
|
| 1387 |
}
|
|
|
|
| 1412 |
<tr>
|
| 1413 |
${cols.map((key) => {
|
| 1414 |
const val = row[key];
|
| 1415 |
+
const rendered = key === "model_name"
|
| 1416 |
+
? formatModelName(val)
|
| 1417 |
+
: (typeof val === "number" ? formatNumber(val) : (val ?? "-"));
|
| 1418 |
return `<td>${rendered}</td>`;
|
| 1419 |
}).join("")}
|
| 1420 |
</tr>
|