Kristinx0351 committed on
Commit
9e30538
·
1 Parent(s): 1d7a096

Clean up leaderboard copy and model display names

Browse files
Files changed (1) hide show
  1. index.html +55 -54
index.html CHANGED
@@ -355,17 +355,17 @@
355
  }
356
 
357
  .metric-intro {
358
- margin: 0 0 16px;
359
  color: var(--muted);
360
  max-width: none;
361
- line-height: 1.65;
362
  }
363
 
364
  .metric-core {
365
  display: grid;
366
  grid-template-columns: repeat(3, minmax(0, 1fr));
367
  gap: 10px 18px;
368
- margin: 0 0 14px;
369
  padding: 0;
370
  list-style: none;
371
  }
@@ -393,36 +393,11 @@
393
  color: var(--ink);
394
  }
395
 
396
- .metric-defs {
397
- display: grid;
398
- grid-template-columns: repeat(4, minmax(0, 1fr));
399
- gap: 12px 18px;
400
  margin: 0 0 18px;
401
- padding: 0;
402
- list-style: none;
403
- }
404
-
405
- .metric-defs li {
406
- padding-left: 16px;
407
- position: relative;
408
  color: var(--muted);
409
- line-height: 1.55;
410
  font-size: 0.97rem;
411
- }
412
-
413
- .metric-defs li::before {
414
- content: "";
415
- position: absolute;
416
- left: 0;
417
- top: 0.58rem;
418
- width: 7px;
419
- height: 7px;
420
- border-radius: 999px;
421
- background: var(--accent);
422
- }
423
-
424
- .metric-defs strong {
425
- color: var(--ink);
426
  }
427
 
428
  .findings,
@@ -799,8 +774,7 @@
799
  .controls,
800
  .stage-grid,
801
  .hero-points,
802
- .metric-core,
803
- .metric-defs {
804
  grid-template-columns: 1fr;
805
  }
806
 
@@ -944,25 +918,16 @@
944
  </div>
945
 
946
  <p class="metric-intro">
947
- SourceBench reports one overall score together with eight source-quality dimensions used by the judge model.
948
- The overall weighted score is the main ranking target; the dimension columns below make it easier to see
949
- whether a system is strong because it cites more relevant, more accurate, fresher, more transparent, or more authoritative sources.
950
  </p>
951
  <ul class="metric-core">
952
  <li><strong>Weighted Score.</strong> The main leaderboard score, combining the judged dimensions into one overall source-quality metric.</li>
953
  <li><strong>Unweighted Mean.</strong> The simple average across the judged dimension scores, without weighting.</li>
954
  <li><strong>% In SE.</strong> Percentage of model cited sources appearing in the first five pages of Google Search.</li>
955
  </ul>
956
- <ul class="metric-defs">
957
- <li><strong>Semantic Relevance.</strong> Whether the cited source is directly relevant to the query and answer.</li>
958
- <li><strong>Factual Accuracy.</strong> Whether the cited source appears reliable and factually correct.</li>
959
- <li><strong>Freshness.</strong> Whether the source is timely for the topic being answered.</li>
960
- <li><strong>Objectivity / Tone.</strong> Whether the source is balanced rather than sensational or biased.</li>
961
- <li><strong>Layout / Ad Density.</strong> Whether the page is usable and not overwhelmed by ads or clutter.</li>
962
- <li><strong>Accountability.</strong> Whether the source clearly indicates ownership, responsibility, or editorial control.</li>
963
- <li><strong>Transparency.</strong> Whether the source clearly presents provenance, disclosure, or supporting context.</li>
964
- <li><strong>Authority.</strong> Whether the source appears credible and institutionally trustworthy for the topic.</li>
965
- </ul>
966
 
967
  <div class="tabs">
968
  <button class="tab active" data-view="overall" type="button">Overall</button>
@@ -1138,6 +1103,25 @@
1138
  "VA-COS NLQ": "Shopping",
1139
  };
1140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1141
  const viewSelect = document.getElementById("view-select");
1142
  const sortKey = document.getElementById("sort-key");
1143
  const searchInput = document.getElementById("search-input");
@@ -1199,6 +1183,11 @@
1199
  return tag ? `${queryType} (${tag})` : String(queryType || "-");
1200
  }
1201
 
 
 
 
 
 
1202
  function updateQueryTypeControls() {
1203
  const visible = state.currentView === "by_query_type";
1204
  queryFilterWrap.classList.toggle("visible", visible);
@@ -1247,7 +1236,7 @@
1247
  const top = overall[0];
1248
  document.getElementById("stat-models").textContent = overall.length;
1249
  document.getElementById("stat-query-types").textContent = new Set(byType.map((row) => row.query_type)).size;
1250
- document.getElementById("stat-top-model").textContent = top ? top.model_name : "-";
1251
  document.getElementById("stat-top-score").textContent = top ? formatNumber(top.weighted_total_content_score) : "-";
1252
  tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
1253
  }
@@ -1290,7 +1279,7 @@
1290
  return [
1291
  {
1292
  title: "Overall source quality remains meaningfully separated across systems",
1293
- body: `${top.model_name} is the current overall leader with a weighted score of ${formatNumber(top.weighted_total_content_score)}. The spread across the current board suggests that citation quality is not saturated: systems still differ substantially once source relevance, accuracy, transparency, and authority are scored directly.`
1294
  },
1295
  {
1296
  title: "Question type matters, and multi-hop fact synthesis is still the hardest slice",
@@ -1299,12 +1288,12 @@
1299
  {
1300
  title: "High search overlap is not the same thing as high source quality",
1301
  body: bestOverlap
1302
- ? `${bestOverlap.model_name} has the highest visible search overlap at ${formatNumber(bestOverlap.percentage_ge_sources_in_se_sources)}% In SE, but the best overall weighted score still belongs to ${top.model_name}. This mirrors the paper's emphasis that leaderboard quality should not be reduced to overlap with search results alone.`
1303
  : "The current artifact includes quality metrics beyond simple overlap with search-engine results, which is one of the main design points of SourceBench."
1304
  },
1305
  {
1306
  title: "Dimension scores reveal different strengths behind similar overall rankings",
1307
- body: `${bestFreshness.model_name} currently leads freshness at ${formatNumber(bestFreshness.freshness)}, while ${qualityLeaders.transparency.model_name}, ${qualityLeaders.authority.model_name}, and ${qualityLeaders.accountability.model_name} lead key trust-related dimensions such as transparency, authority, and accountability. These per-dimension columns make it easier to see why two systems with similar overall scores can still have very different citation profiles.`
1308
  }
1309
  ];
1310
  }
@@ -1344,7 +1333,7 @@
1344
  const chatAvg = avg(chatRows, "weighted_total_content_score");
1345
 
1346
  deepseekFindings.innerHTML = `
1347
- <div class="study-item"><strong>Best DeepSeek variant in the current artifact: ${best.model_name}</strong> with a weighted score of ${formatNumber(best.weighted_total_content_score)} and % In SE of ${formatNumber(best.percentage_ge_sources_in_se_sources)}.</div>
1348
  <div class="study-item"><strong>Backend choice changes citation quality materially.</strong> The Gensee-backed variants average ${formatNumber(genseeAvg)} weighted score, while the Tavily-backed variants average ${formatNumber(tavilyAvg)}.</div>
1349
  <div class="study-item"><strong>Reasoning mode does not dominate by itself.</strong> In this artifact, reasoning variants average ${formatNumber(reasoningAvg)} weighted score versus ${formatNumber(chatAvg)} for chat variants, suggesting that retrieval setup and source selection quality still matter directly.</div>
1350
  `;
@@ -1353,7 +1342,13 @@
1353
  deepseekTableHead.innerHTML = `<tr>${cols.map((key) => `<th>${labels[key] || key}</th>`).join("")}</tr>`;
1354
  deepseekTableBody.innerHTML = sorted.map((row) => `
1355
  <tr>
1356
- ${cols.map((key) => `<td>${typeof row[key] === "number" ? formatNumber(row[key]) : (row[key] ?? "-")}</td>`).join("")}
 
 
 
 
 
 
1357
  </tr>
1358
  `).join("");
1359
  }
@@ -1372,7 +1367,11 @@
1372
 
1373
  const q = searchInput.value.trim().toLowerCase();
1374
  if (q) {
1375
- rows = rows.filter((row) => String(row.model_name || "").toLowerCase().includes(q));
 
 
 
 
1376
  }
1377
 
1378
  const metric = sortKey.value;
@@ -1382,7 +1381,7 @@
1382
  const aa = Number.isNaN(av) ? -Infinity : av;
1383
  const bb = Number.isNaN(bv) ? -Infinity : bv;
1384
  if (bb !== aa) return bb - aa;
1385
- return String(a.model_name || "").localeCompare(String(b.model_name || ""));
1386
  });
1387
  return rows;
1388
  }
@@ -1413,7 +1412,9 @@
1413
  <tr>
1414
  ${cols.map((key) => {
1415
  const val = row[key];
1416
- const rendered = typeof val === "number" ? formatNumber(val) : (val ?? "-");
 
 
1417
  return `<td>${rendered}</td>`;
1418
  }).join("")}
1419
  </tr>
 
355
  }
356
 
357
  .metric-intro {
358
+ margin: 0 0 12px;
359
  color: var(--muted);
360
  max-width: none;
361
+ line-height: 1.6;
362
  }
363
 
364
  .metric-core {
365
  display: grid;
366
  grid-template-columns: repeat(3, minmax(0, 1fr));
367
  gap: 10px 18px;
368
+ margin: 0 0 12px;
369
  padding: 0;
370
  list-style: none;
371
  }
 
393
  color: var(--ink);
394
  }
395
 
396
+ .metric-note {
 
 
 
397
  margin: 0 0 18px;
 
 
 
 
 
 
 
398
  color: var(--muted);
 
399
  font-size: 0.97rem;
400
+ line-height: 1.58;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  }
402
 
403
  .findings,
 
774
  .controls,
775
  .stage-grid,
776
  .hero-points,
777
+ .metric-core {
 
778
  grid-template-columns: 1fr;
779
  }
780
 
 
918
  </div>
919
 
920
  <p class="metric-intro">
921
+ SourceBench ranks systems by judged source quality rather than answer fluency alone. The main leaderboard target is the weighted overall score.
 
 
922
  </p>
923
  <ul class="metric-core">
924
  <li><strong>Weighted Score.</strong> The main leaderboard score, combining the judged dimensions into one overall source-quality metric.</li>
925
  <li><strong>Unweighted Mean.</strong> The simple average across the judged dimension scores, without weighting.</li>
926
  <li><strong>% In SE.</strong> Percentage of model cited sources appearing in the first five pages of Google Search.</li>
927
  </ul>
928
+ <p class="metric-note">
929
+ Turn on <strong>Show dimension scores</strong> in Overall view to inspect the eight judged dimensions: semantic relevance, factual accuracy, freshness, objectivity, layout/ad density, accountability, transparency, and authority.
930
+ </p>
 
 
 
 
 
 
 
931
 
932
  <div class="tabs">
933
  <button class="tab active" data-view="overall" type="button">Overall</button>
 
1103
  "VA-COS NLQ": "Shopping",
1104
  };
1105
 
1106
// Display labels for raw model identifiers. formatModelName() looks keys up
// verbatim (case-sensitive), so each key must match the identifier exactly as
// it appears in the data. Frozen so the shared table cannot be mutated at
// runtime by accident.
const DISPLAY_MODEL_NAMES = Object.freeze({
  "gpt-5": "GPT-5",
  "gpt-4o": "GPT-4o",
  "grok-4.1-fast-non-reasoning": "Grok-4.1-Fast-Non-Reasoning",
  "claude": "Claude",
  "gensee": "Gensee",
  "exa": "Exa",
  "tavily": "Tavily",
  "google-search": "Google Search",
  "Gemini-3-Pro-Preview": "Gemini 3 Pro Preview",
  "Gemini-3-Flash-Preview": "Gemini 3 Flash Preview",
  "Gemini-2.5-Flash-Preview": "Gemini 2.5 Flash Preview",
  "Perplexity-Sonar-Pro": "Perplexity Sonar Pro",
  "deepseek-chat-gensee": "DeepSeek Chat + Gensee",
  "deepseek-reasoning-tavily": "DeepSeek Reasoning + Tavily",
  "deepseek-reasoning-gensee": "DeepSeek Reasoning + Gensee",
  "deepseek-chat-tavily": "DeepSeek Chat + Tavily",
});
1124
+
1125
  const viewSelect = document.getElementById("view-select");
1126
  const sortKey = document.getElementById("sort-key");
1127
  const searchInput = document.getElementById("search-input");
 
1183
  return tag ? `${queryType} (${tag})` : String(queryType || "-");
1184
  }
1185
 
1186
/**
 * Map a raw model identifier to its display label.
 *
 * Only own entries of DISPLAY_MODEL_NAMES are consulted, so identifiers that
 * collide with inherited Object.prototype members (e.g. "constructor",
 * "toString") fall back to the raw string instead of returning a function.
 *
 * @param {*} name - Raw model identifier; falsy values are treated as "".
 * @returns {string} The registered label, or the stringified identifier when
 *   no non-empty label is registered for it.
 */
function formatModelName(name) {
  const key = String(name || "");
  const hasLabel = Object.prototype.hasOwnProperty.call(DISPLAY_MODEL_NAMES, key);
  // An empty registered label still falls back to the raw key, as before.
  return (hasLabel && DISPLAY_MODEL_NAMES[key]) || key;
}
1190
+
1191
  function updateQueryTypeControls() {
1192
  const visible = state.currentView === "by_query_type";
1193
  queryFilterWrap.classList.toggle("visible", visible);
 
1236
  const top = overall[0];
1237
  document.getElementById("stat-models").textContent = overall.length;
1238
  document.getElementById("stat-query-types").textContent = new Set(byType.map((row) => row.query_type)).size;
1239
+ document.getElementById("stat-top-model").textContent = top ? formatModelName(top.model_name) : "-";
1240
  document.getElementById("stat-top-score").textContent = top ? formatNumber(top.weighted_total_content_score) : "-";
1241
  tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
1242
  }
 
1279
  return [
1280
  {
1281
  title: "Overall source quality remains meaningfully separated across systems",
1282
+ body: `${formatModelName(top.model_name)} is the current overall leader with a weighted score of ${formatNumber(top.weighted_total_content_score)}. The spread across the current board suggests that citation quality is not saturated: systems still differ substantially once source relevance, accuracy, transparency, and authority are scored directly.`
1283
  },
1284
  {
1285
  title: "Question type matters, and multi-hop fact synthesis is still the hardest slice",
 
1288
  {
1289
  title: "High search overlap is not the same thing as high source quality",
1290
  body: bestOverlap
1291
+ ? `${formatModelName(bestOverlap.model_name)} has the highest visible search overlap at ${formatNumber(bestOverlap.percentage_ge_sources_in_se_sources)}% In SE, but the best overall weighted score still belongs to ${formatModelName(top.model_name)}. This mirrors the paper's emphasis that leaderboard quality should not be reduced to overlap with search results alone.`
1292
  : "The current artifact includes quality metrics beyond simple overlap with search-engine results, which is one of the main design points of SourceBench."
1293
  },
1294
  {
1295
  title: "Dimension scores reveal different strengths behind similar overall rankings",
1296
+ body: `${formatModelName(bestFreshness.model_name)} currently leads freshness at ${formatNumber(bestFreshness.freshness)}, while ${formatModelName(qualityLeaders.transparency.model_name)}, ${formatModelName(qualityLeaders.authority.model_name)}, and ${formatModelName(qualityLeaders.accountability.model_name)} lead key trust-related dimensions such as transparency, authority, and accountability. These per-dimension columns make it easier to see why two systems with similar overall scores can still have very different citation profiles.`
1297
  }
1298
  ];
1299
  }
 
1333
  const chatAvg = avg(chatRows, "weighted_total_content_score");
1334
 
1335
  deepseekFindings.innerHTML = `
1336
+ <div class="study-item"><strong>Best DeepSeek variant in the current artifact: ${formatModelName(best.model_name)}</strong> with a weighted score of ${formatNumber(best.weighted_total_content_score)} and % In SE of ${formatNumber(best.percentage_ge_sources_in_se_sources)}.</div>
1337
  <div class="study-item"><strong>Backend choice changes citation quality materially.</strong> The Gensee-backed variants average ${formatNumber(genseeAvg)} weighted score, while the Tavily-backed variants average ${formatNumber(tavilyAvg)}.</div>
1338
  <div class="study-item"><strong>Reasoning mode does not dominate by itself.</strong> In this artifact, reasoning variants average ${formatNumber(reasoningAvg)} weighted score versus ${formatNumber(chatAvg)} for chat variants, suggesting that retrieval setup and source selection quality still matter directly.</div>
1339
  `;
 
1342
  deepseekTableHead.innerHTML = `<tr>${cols.map((key) => `<th>${labels[key] || key}</th>`).join("")}</tr>`;
1343
  deepseekTableBody.innerHTML = sorted.map((row) => `
1344
  <tr>
1345
+ ${cols.map((key) => {
1346
+ const value = row[key];
1347
+ const rendered = key === "model_name"
1348
+ ? formatModelName(value)
1349
+ : (typeof value === "number" ? formatNumber(value) : (value ?? "-"));
1350
+ return `<td>${rendered}</td>`;
1351
+ }).join("")}
1352
  </tr>
1353
  `).join("");
1354
  }
 
1367
 
1368
  const q = searchInput.value.trim().toLowerCase();
1369
  if (q) {
1370
+ rows = rows.filter((row) => {
1371
+ const raw = String(row.model_name || "").toLowerCase();
1372
+ const display = formatModelName(row.model_name).toLowerCase();
1373
+ return raw.includes(q) || display.includes(q);
1374
+ });
1375
  }
1376
 
1377
  const metric = sortKey.value;
 
1381
  const aa = Number.isNaN(av) ? -Infinity : av;
1382
  const bb = Number.isNaN(bv) ? -Infinity : bv;
1383
  if (bb !== aa) return bb - aa;
1384
+ return formatModelName(a.model_name).localeCompare(formatModelName(b.model_name));
1385
  });
1386
  return rows;
1387
  }
 
1412
  <tr>
1413
  ${cols.map((key) => {
1414
  const val = row[key];
1415
+ const rendered = key === "model_name"
1416
+ ? formatModelName(val)
1417
+ : (typeof val === "number" ? formatNumber(val) : (val ?? "-"));
1418
  return `<td>${rendered}</td>`;
1419
  }).join("")}
1420
  </tr>