Spaces:
Running
Running
Commit ·
1444582
1
Parent(s): 9e30538
Remove overview and highlights section
Browse files- index.html +0 -110
index.html
CHANGED
|
@@ -868,46 +868,6 @@
|
|
| 868 |
</article>
|
| 869 |
</section>
|
| 870 |
|
| 871 |
-
<section class="grid-2 plain-section" id="findings">
|
| 872 |
-
<article>
|
| 873 |
-
<div class="section-head">
|
| 874 |
-
<div>
|
| 875 |
-
<div class="section-kicker">Work overview</div>
|
| 876 |
-
<h2 style="font-size:1.45rem;">What this work measures</h2>
|
| 877 |
-
</div>
|
| 878 |
-
</div>
|
| 879 |
-
<div class="plain-list">
|
| 880 |
-
<div class="plain-item">
|
| 881 |
-
<strong>Source quality dimensions</strong>
|
| 882 |
-
<span>Each cited source is judged on semantic relevance, factual accuracy, freshness, objectivity, layout/ad density, accountability, transparency, and authority.</span>
|
| 883 |
-
</div>
|
| 884 |
-
<div class="plain-item">
|
| 885 |
-
<strong>Main leaderboard target</strong>
|
| 886 |
-
<span>The leaderboard emphasizes quality of referenced web sources, because a model can produce fluent answers while still relying on weak or misleading citations.</span>
|
| 887 |
-
</div>
|
| 888 |
-
<div class="plain-item">
|
| 889 |
-
<strong>Why this matters</strong>
|
| 890 |
-
<span>For web-grounded assistants, citation quality is a first-class property. A stronger source profile should make answers more trustworthy and auditable.</span>
|
| 891 |
-
</div>
|
| 892 |
-
</div>
|
| 893 |
-
</article>
|
| 894 |
-
|
| 895 |
-
<article>
|
| 896 |
-
<div class="section-head">
|
| 897 |
-
<div>
|
| 898 |
-
<div class="section-kicker">Simple conclusions</div>
|
| 899 |
-
<h2 style="font-size:1.45rem;">Current board highlights</h2>
|
| 900 |
-
</div>
|
| 901 |
-
</div>
|
| 902 |
-
<div class="findings" id="findings-list">
|
| 903 |
-
<div class="finding">
|
| 904 |
-
<strong>Load leaderboard data to see findings</strong>
|
| 905 |
-
<span>The page will summarize the top models, hardest query types, and search-overlap patterns from the current artifact.</span>
|
| 906 |
-
</div>
|
| 907 |
-
</div>
|
| 908 |
-
</article>
|
| 909 |
-
</section>
|
| 910 |
-
|
| 911 |
<section class="panel section" id="leaderboard">
|
| 912 |
<div class="section-head">
|
| 913 |
<div>
|
|
@@ -1133,7 +1093,6 @@
|
|
| 1133 |
const tableCount = document.getElementById("table-count");
|
| 1134 |
const tableHead = document.querySelector("#leaderboard-table thead");
|
| 1135 |
const tableBody = document.querySelector("#leaderboard-table tbody");
|
| 1136 |
-
const findingsList = document.getElementById("findings-list");
|
| 1137 |
const deepseekFindings = document.getElementById("deepseek-findings");
|
| 1138 |
const deepseekTableHead = document.querySelector("#deepseek-table thead");
|
| 1139 |
const deepseekTableBody = document.querySelector("#deepseek-table tbody");
|
|
@@ -1241,74 +1200,6 @@
|
|
| 1241 |
tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
|
| 1242 |
}
|
| 1243 |
|
| 1244 |
-
function computeFindings(payload) {
|
| 1245 |
-
const overall = (payload.overall || []).filter((row) => isMainBoardModel(row.model_name));
|
| 1246 |
-
const byType = (payload.by_query_type || []).filter((row) => isMainBoardModel(row.model_name));
|
| 1247 |
-
if (!overall.length || !byType.length) {
|
| 1248 |
-
return [{
|
| 1249 |
-
title: "No findings available",
|
| 1250 |
-
body: "Load a leaderboard artifact to generate summary findings."
|
| 1251 |
-
}];
|
| 1252 |
-
}
|
| 1253 |
-
|
| 1254 |
-
const top = overall[0];
|
| 1255 |
-
const bestFreshness = [...overall].sort((a, b) => (b.freshness || 0) - (a.freshness || 0))[0];
|
| 1256 |
-
const bestOverlap = [...overall]
|
| 1257 |
-
.filter((row) => row.percentage_ge_sources_in_se_sources !== null && row.percentage_ge_sources_in_se_sources !== undefined)
|
| 1258 |
-
.sort((a, b) => b.percentage_ge_sources_in_se_sources - a.percentage_ge_sources_in_se_sources)[0];
|
| 1259 |
-
const qualityLeaders = {
|
| 1260 |
-
transparency: [...overall].sort((a, b) => (b.transparency || 0) - (a.transparency || 0))[0],
|
| 1261 |
-
authority: [...overall].sort((a, b) => (b.authority || 0) - (a.authority || 0))[0],
|
| 1262 |
-
accountability: [...overall].sort((a, b) => (b.accountability || 0) - (a.accountability || 0))[0],
|
| 1263 |
-
};
|
| 1264 |
-
|
| 1265 |
-
const groupedByType = new Map();
|
| 1266 |
-
for (const row of byType) {
|
| 1267 |
-
if (!groupedByType.has(row.query_type)) groupedByType.set(row.query_type, []);
|
| 1268 |
-
groupedByType.get(row.query_type).push(row);
|
| 1269 |
-
}
|
| 1270 |
-
|
| 1271 |
-
const typeAverages = Array.from(groupedByType.entries()).map(([queryType, rows]) => {
|
| 1272 |
-
const avg = rows.reduce((sum, row) => sum + (row.weighted_total_content_score || 0), 0) / rows.length;
|
| 1273 |
-
return { queryType, avg };
|
| 1274 |
-
}).sort((a, b) => a.avg - b.avg);
|
| 1275 |
-
|
| 1276 |
-
const hardest = typeAverages[0];
|
| 1277 |
-
const easiest = typeAverages[typeAverages.length - 1];
|
| 1278 |
-
|
| 1279 |
-
return [
|
| 1280 |
-
{
|
| 1281 |
-
title: "Overall source quality remains meaningfully separated across systems",
|
| 1282 |
-
body: `${formatModelName(top.model_name)} is the current overall leader with a weighted score of ${formatNumber(top.weighted_total_content_score)}. The spread across the current board suggests that citation quality is not saturated: systems still differ substantially once source relevance, accuracy, transparency, and authority are scored directly.`
|
| 1283 |
-
},
|
| 1284 |
-
{
|
| 1285 |
-
title: "Question type matters, and multi-hop fact synthesis is still the hardest slice",
|
| 1286 |
-
body: `Across the current artifact, ${hardest.queryType} has the lowest average weighted score (${formatNumber(hardest.avg)}), while ${easiest.queryType} is the easiest (${formatNumber(easiest.avg)}). This matches the broader SourceBench framing that harder query types expose source-selection weaknesses even when answers may still look fluent.`
|
| 1287 |
-
},
|
| 1288 |
-
{
|
| 1289 |
-
title: "High search overlap is not the same thing as high source quality",
|
| 1290 |
-
body: bestOverlap
|
| 1291 |
-
? `${formatModelName(bestOverlap.model_name)} has the highest visible search overlap at ${formatNumber(bestOverlap.percentage_ge_sources_in_se_sources)}% In SE, but the best overall weighted score still belongs to ${formatModelName(top.model_name)}. This mirrors the paper's emphasis that leaderboard quality should not be reduced to overlap with search results alone.`
|
| 1292 |
-
: "The current artifact includes quality metrics beyond simple overlap with search-engine results, which is one of the main design points of SourceBench."
|
| 1293 |
-
},
|
| 1294 |
-
{
|
| 1295 |
-
title: "Dimension scores reveal different strengths behind similar overall rankings",
|
| 1296 |
-
body: `${formatModelName(bestFreshness.model_name)} currently leads freshness at ${formatNumber(bestFreshness.freshness)}, while ${formatModelName(qualityLeaders.transparency.model_name)}, ${formatModelName(qualityLeaders.authority.model_name)}, and ${formatModelName(qualityLeaders.accountability.model_name)} lead key trust-related dimensions such as transparency, authority, and accountability. These per-dimension columns make it easier to see why two systems with similar overall scores can still have very different citation profiles.`
|
| 1297 |
-
}
|
| 1298 |
-
];
|
| 1299 |
-
}
|
| 1300 |
-
|
| 1301 |
-
function renderFindings(payload) {
|
| 1302 |
-
const findings = computeFindings(payload);
|
| 1303 |
-
findingsList.innerHTML = "";
|
| 1304 |
-
for (const finding of findings) {
|
| 1305 |
-
const div = document.createElement("div");
|
| 1306 |
-
div.className = "finding";
|
| 1307 |
-
div.innerHTML = `<strong>${finding.title}</strong><span>${finding.body}</span>`;
|
| 1308 |
-
findingsList.appendChild(div);
|
| 1309 |
-
}
|
| 1310 |
-
}
|
| 1311 |
-
|
| 1312 |
function renderDeepSeekStudy(payload) {
|
| 1313 |
const rows = (payload.overall || [])
|
| 1314 |
.filter((row) => typeof row.model_name === "string" && row.model_name.startsWith("deepseek"));
|
|
@@ -1428,7 +1319,6 @@
|
|
| 1428 |
populateQueryTypeFilter(payload);
|
| 1429 |
updateQueryTypeControls();
|
| 1430 |
updateTopStats(payload);
|
| 1431 |
-
renderFindings(payload);
|
| 1432 |
renderDeepSeekStudy(payload);
|
| 1433 |
renderTable();
|
| 1434 |
setLoadStatus("Leaderboard data loaded.");
|
|
|
|
| 868 |
</article>
|
| 869 |
</section>
|
| 870 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
<section class="panel section" id="leaderboard">
|
| 872 |
<div class="section-head">
|
| 873 |
<div>
|
|
|
|
| 1093 |
const tableCount = document.getElementById("table-count");
|
| 1094 |
const tableHead = document.querySelector("#leaderboard-table thead");
|
| 1095 |
const tableBody = document.querySelector("#leaderboard-table tbody");
|
|
|
|
| 1096 |
const deepseekFindings = document.getElementById("deepseek-findings");
|
| 1097 |
const deepseekTableHead = document.querySelector("#deepseek-table thead");
|
| 1098 |
const deepseekTableBody = document.querySelector("#deepseek-table tbody");
|
|
|
|
| 1200 |
tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
|
| 1201 |
}
|
| 1202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1203 |
function renderDeepSeekStudy(payload) {
|
| 1204 |
const rows = (payload.overall || [])
|
| 1205 |
.filter((row) => typeof row.model_name === "string" && row.model_name.startsWith("deepseek"));
|
|
|
|
| 1319 |
populateQueryTypeFilter(payload);
|
| 1320 |
updateQueryTypeControls();
|
| 1321 |
updateTopStats(payload);
|
|
|
|
| 1322 |
renderDeepSeekStudy(payload);
|
| 1323 |
renderTable();
|
| 1324 |
setLoadStatus("Leaderboard data loaded.");
|