Spaces:

WukLab
/

SourceBench-Leaderboard

Running

App Files Community

Kristinx0351 committed 3 days ago

Commit

9e30538

1 Parent(s): 1d7a096

Clean up leaderboard copy and model display names

Browse files

Files changed (1) hide show

index.html +55 -54

index.html CHANGED Viewed

@@ -355,17 +355,17 @@
     }
     .metric-intro {
-      margin: 0 0 16px;
       color: var(--muted);
       max-width: none;
-      line-height: 1.65;
     }
     .metric-core {
       display: grid;
       grid-template-columns: repeat(3, minmax(0, 1fr));
       gap: 10px 18px;
-      margin: 0 0 14px;
       padding: 0;
       list-style: none;
     }
@@ -393,36 +393,11 @@
       color: var(--ink);
     }
-    .metric-defs {
-      display: grid;
-      grid-template-columns: repeat(4, minmax(0, 1fr));
-      gap: 12px 18px;
       margin: 0 0 18px;
-      padding: 0;
-      list-style: none;
-    }
-    .metric-defs li {
-      padding-left: 16px;
-      position: relative;
       color: var(--muted);
-      line-height: 1.55;
       font-size: 0.97rem;
-    }
-    .metric-defs li::before {
-      content: "";
-      position: absolute;
-      left: 0;
-      top: 0.58rem;
-      width: 7px;
-      height: 7px;
-      border-radius: 999px;
-      background: var(--accent);
-    }
-    .metric-defs strong {
-      color: var(--ink);
     }
     .findings,
@@ -799,8 +774,7 @@
       .controls,
       .stage-grid,
       .hero-points,
-      .metric-core,
-      .metric-defs {
         grid-template-columns: 1fr;
       }
@@ -944,25 +918,16 @@
       </div>
       <p class="metric-intro">
-        SourceBench reports one overall score together with eight source-quality dimensions used by the judge model.
-        The overall weighted score is the main ranking target; the dimension columns below make it easier to see
-        whether a system is strong because it cites more relevant, more accurate, fresher, more transparent, or more authoritative sources.
       </p>
       <ul class="metric-core">
         <li><strong>Weighted Score.</strong> The main leaderboard score, combining the judged dimensions into one overall source-quality metric.</li>
         <li><strong>Unweighted Mean.</strong> The simple average across the judged dimension scores, without weighting.</li>
         <li><strong>% In SE.</strong> Percentage of model cited sources appearing in the first five pages of Google Search.</li>
       </ul>
-      <ul class="metric-defs">
-        <li><strong>Semantic Relevance.</strong> Whether the cited source is directly relevant to the query and answer.</li>
-        <li><strong>Factual Accuracy.</strong> Whether the cited source appears reliable and factually correct.</li>
-        <li><strong>Freshness.</strong> Whether the source is timely for the topic being answered.</li>
-        <li><strong>Objectivity / Tone.</strong> Whether the source is balanced rather than sensational or biased.</li>
-        <li><strong>Layout / Ad Density.</strong> Whether the page is usable and not overwhelmed by ads or clutter.</li>
-        <li><strong>Accountability.</strong> Whether the source clearly indicates ownership, responsibility, or editorial control.</li>
-        <li><strong>Transparency.</strong> Whether the source clearly presents provenance, disclosure, or supporting context.</li>
-        <li><strong>Authority.</strong> Whether the source appears credible and institutionally trustworthy for the topic.</li>
-      </ul>
       <div class="tabs">
         <button class="tab active" data-view="overall" type="button">Overall</button>
@@ -1138,6 +1103,25 @@
       "VA-COS NLQ": "Shopping",
     };
     const viewSelect = document.getElementById("view-select");
     const sortKey = document.getElementById("sort-key");
     const searchInput = document.getElementById("search-input");
@@ -1199,6 +1183,11 @@
       return tag ? `${queryType} (${tag})` : String(queryType || "-");
     }
     function updateQueryTypeControls() {
       const visible = state.currentView === "by_query_type";
       queryFilterWrap.classList.toggle("visible", visible);
@@ -1247,7 +1236,7 @@
       const top = overall[0];
       document.getElementById("stat-models").textContent = overall.length;
       document.getElementById("stat-query-types").textContent = new Set(byType.map((row) => row.query_type)).size;
-      document.getElementById("stat-top-model").textContent = top ? top.model_name : "-";
       document.getElementById("stat-top-score").textContent = top ? formatNumber(top.weighted_total_content_score) : "-";
       tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
     }
@@ -1290,7 +1279,7 @@
       return [
         {
           title: "Overall source quality remains meaningfully separated across systems",
-          body: `${top.model_name} is the current overall leader with a weighted score of ${formatNumber(top.weighted_total_content_score)}. The spread across the current board suggests that citation quality is not saturated: systems still differ substantially once source relevance, accuracy, transparency, and authority are scored directly.`
         },
         {
           title: "Question type matters, and multi-hop fact synthesis is still the hardest slice",
@@ -1299,12 +1288,12 @@
         {
           title: "High search overlap is not the same thing as high source quality",
           body: bestOverlap
-            ? `${bestOverlap.model_name} has the highest visible search overlap at ${formatNumber(bestOverlap.percentage_ge_sources_in_se_sources)}% In SE, but the best overall weighted score still belongs to ${top.model_name}. This mirrors the paper's emphasis that leaderboard quality should not be reduced to overlap with search results alone.`
             : "The current artifact includes quality metrics beyond simple overlap with search-engine results, which is one of the main design points of SourceBench."
         },
         {
           title: "Dimension scores reveal different strengths behind similar overall rankings",
-          body: `${bestFreshness.model_name} currently leads freshness at ${formatNumber(bestFreshness.freshness)}, while ${qualityLeaders.transparency.model_name}, ${qualityLeaders.authority.model_name}, and ${qualityLeaders.accountability.model_name} lead key trust-related dimensions such as transparency, authority, and accountability. These per-dimension columns make it easier to see why two systems with similar overall scores can still have very different citation profiles.`
         }
       ];
     }
@@ -1344,7 +1333,7 @@
       const chatAvg = avg(chatRows, "weighted_total_content_score");
       deepseekFindings.innerHTML = `
-        <div class="study-item"><strong>Best DeepSeek variant in the current artifact: ${best.model_name}</strong> with a weighted score of ${formatNumber(best.weighted_total_content_score)} and % In SE of ${formatNumber(best.percentage_ge_sources_in_se_sources)}.</div>
         <div class="study-item"><strong>Backend choice changes citation quality materially.</strong> The Gensee-backed variants average ${formatNumber(genseeAvg)} weighted score, while the Tavily-backed variants average ${formatNumber(tavilyAvg)}.</div>
         <div class="study-item"><strong>Reasoning mode does not dominate by itself.</strong> In this artifact, reasoning variants average ${formatNumber(reasoningAvg)} weighted score versus ${formatNumber(chatAvg)} for chat variants, suggesting that retrieval setup and source selection quality still matter directly.</div>
       `;
@@ -1353,7 +1342,13 @@
       deepseekTableHead.innerHTML = `<tr>${cols.map((key) => `<th>${labels[key] || key}</th>`).join("")}</tr>`;
       deepseekTableBody.innerHTML = sorted.map((row) => `
         <tr>
-          ${cols.map((key) => `<td>${typeof row[key] === "number" ? formatNumber(row[key]) : (row[key] ?? "-")}</td>`).join("")}
         </tr>
       `).join("");
     }
@@ -1372,7 +1367,11 @@
       const q = searchInput.value.trim().toLowerCase();
       if (q) {
-        rows = rows.filter((row) => String(row.model_name || "").toLowerCase().includes(q));
       }
       const metric = sortKey.value;
@@ -1382,7 +1381,7 @@
         const aa = Number.isNaN(av) ? -Infinity : av;
         const bb = Number.isNaN(bv) ? -Infinity : bv;
         if (bb !== aa) return bb - aa;
-        return String(a.model_name || "").localeCompare(String(b.model_name || ""));
       });
       return rows;
     }
@@ -1413,7 +1412,9 @@
           <tr>
             ${cols.map((key) => {
               const val = row[key];
-              const rendered = typeof val === "number" ? formatNumber(val) : (val ?? "-");
               return `<td>${rendered}</td>`;
             }).join("")}
           </tr>

     }
     .metric-intro {
+      margin: 0 0 12px;
       color: var(--muted);
       max-width: none;
+      line-height: 1.6;
     }
     .metric-core {
       display: grid;
       grid-template-columns: repeat(3, minmax(0, 1fr));
       gap: 10px 18px;
+      margin: 0 0 12px;
       padding: 0;
       list-style: none;
     }
       color: var(--ink);
     }
+    .metric-note {
       margin: 0 0 18px;
       color: var(--muted);
       font-size: 0.97rem;
+      line-height: 1.58;
     }
     .findings,
       .controls,
       .stage-grid,
       .hero-points,
+      .metric-core {
         grid-template-columns: 1fr;
       }
       </div>
       <p class="metric-intro">
+        SourceBench ranks systems by judged source quality rather than answer fluency alone. The main leaderboard target is the weighted overall score.
       </p>
       <ul class="metric-core">
         <li><strong>Weighted Score.</strong> The main leaderboard score, combining the judged dimensions into one overall source-quality metric.</li>
         <li><strong>Unweighted Mean.</strong> The simple average across the judged dimension scores, without weighting.</li>
         <li><strong>% In SE.</strong> Percentage of model cited sources appearing in the first five pages of Google Search.</li>
       </ul>
+      <p class="metric-note">
+        Turn on <strong>Show dimension scores</strong> in Overall view to inspect the eight judged dimensions: semantic relevance, factual accuracy, freshness, objectivity, layout/ad density, accountability, transparency, and authority.
+      </p>
       <div class="tabs">
         <button class="tab active" data-view="overall" type="button">Overall</button>
       "VA-COS NLQ": "Shopping",
     };
+    const DISPLAY_MODEL_NAMES = {
+      "gpt-5": "GPT-5",
+      "gpt-4o": "GPT-4o",
+      "grok-4.1-fast-non-reasoning": "Grok-4.1-Fast-Non-Reasoning",
+      "claude": "Claude",
+      "gensee": "Gensee",
+      "exa": "Exa",
+      "tavily": "Tavily",
+      "google-search": "Google Search",
+      "Gemini-3-Pro-Preview": "Gemini 3 Pro Preview",
+      "Gemini-3-Flash-Preview": "Gemini 3 Flash Preview",
+      "Gemini-2.5-Flash-Preview": "Gemini 2.5 Flash Preview",
+      "Perplexity-Sonar-Pro": "Perplexity Sonar Pro",
+      "deepseek-chat-gensee": "DeepSeek Chat + Gensee",
+      "deepseek-reasoning-tavily": "DeepSeek Reasoning + Tavily",
+      "deepseek-reasoning-gensee": "DeepSeek Reasoning + Gensee",
+      "deepseek-chat-tavily": "DeepSeek Chat + Tavily",
+    };
     const viewSelect = document.getElementById("view-select");
     const sortKey = document.getElementById("sort-key");
     const searchInput = document.getElementById("search-input");
       return tag ? `${queryType} (${tag})` : String(queryType || "-");
     }
+    function formatModelName(name) {
+      const key = String(name || "");
+      return DISPLAY_MODEL_NAMES[key] || key;
+    }
     function updateQueryTypeControls() {
       const visible = state.currentView === "by_query_type";
       queryFilterWrap.classList.toggle("visible", visible);
       const top = overall[0];
       document.getElementById("stat-models").textContent = overall.length;
       document.getElementById("stat-query-types").textContent = new Set(byType.map((row) => row.query_type)).size;
+      document.getElementById("stat-top-model").textContent = top ? formatModelName(top.model_name) : "-";
       document.getElementById("stat-top-score").textContent = top ? formatNumber(top.weighted_total_content_score) : "-";
       tableGenerated.textContent = `Generated at ${formatPacificTimestamp(payload.metadata?.generated_at)}`;
     }
       return [
         {
           title: "Overall source quality remains meaningfully separated across systems",
+          body: `${formatModelName(top.model_name)} is the current overall leader with a weighted score of ${formatNumber(top.weighted_total_content_score)}. The spread across the current board suggests that citation quality is not saturated: systems still differ substantially once source relevance, accuracy, transparency, and authority are scored directly.`
         },
         {
           title: "Question type matters, and multi-hop fact synthesis is still the hardest slice",
         {
           title: "High search overlap is not the same thing as high source quality",
           body: bestOverlap
+            ? `${formatModelName(bestOverlap.model_name)} has the highest visible search overlap at ${formatNumber(bestOverlap.percentage_ge_sources_in_se_sources)}% In SE, but the best overall weighted score still belongs to ${formatModelName(top.model_name)}. This mirrors the paper's emphasis that leaderboard quality should not be reduced to overlap with search results alone.`
             : "The current artifact includes quality metrics beyond simple overlap with search-engine results, which is one of the main design points of SourceBench."
         },
         {
           title: "Dimension scores reveal different strengths behind similar overall rankings",
+          body: `${formatModelName(bestFreshness.model_name)} currently leads freshness at ${formatNumber(bestFreshness.freshness)}, while ${formatModelName(qualityLeaders.transparency.model_name)}, ${formatModelName(qualityLeaders.authority.model_name)}, and ${formatModelName(qualityLeaders.accountability.model_name)} lead key trust-related dimensions such as transparency, authority, and accountability. These per-dimension columns make it easier to see why two systems with similar overall scores can still have very different citation profiles.`
         }
       ];
     }
       const chatAvg = avg(chatRows, "weighted_total_content_score");
       deepseekFindings.innerHTML = `
+        <div class="study-item"><strong>Best DeepSeek variant in the current artifact: ${formatModelName(best.model_name)}</strong> with a weighted score of ${formatNumber(best.weighted_total_content_score)} and % In SE of ${formatNumber(best.percentage_ge_sources_in_se_sources)}.</div>
         <div class="study-item"><strong>Backend choice changes citation quality materially.</strong> The Gensee-backed variants average ${formatNumber(genseeAvg)} weighted score, while the Tavily-backed variants average ${formatNumber(tavilyAvg)}.</div>
         <div class="study-item"><strong>Reasoning mode does not dominate by itself.</strong> In this artifact, reasoning variants average ${formatNumber(reasoningAvg)} weighted score versus ${formatNumber(chatAvg)} for chat variants, suggesting that retrieval setup and source selection quality still matter directly.</div>
       `;
       deepseekTableHead.innerHTML = `<tr>${cols.map((key) => `<th>${labels[key] || key}</th>`).join("")}</tr>`;
       deepseekTableBody.innerHTML = sorted.map((row) => `
         <tr>
+          ${cols.map((key) => {
+            const value = row[key];
+            const rendered = key === "model_name"
+              ? formatModelName(value)
+              : (typeof value === "number" ? formatNumber(value) : (value ?? "-"));
+            return `<td>${rendered}</td>`;
+          }).join("")}
         </tr>
       `).join("");
     }
       const q = searchInput.value.trim().toLowerCase();
       if (q) {
+        rows = rows.filter((row) => {
+          const raw = String(row.model_name || "").toLowerCase();
+          const display = formatModelName(row.model_name).toLowerCase();
+          return raw.includes(q) || display.includes(q);
+        });
       }
       const metric = sortKey.value;
         const aa = Number.isNaN(av) ? -Infinity : av;
         const bb = Number.isNaN(bv) ? -Infinity : bv;
         if (bb !== aa) return bb - aa;
+        return formatModelName(a.model_name).localeCompare(formatModelName(b.model_name));
       });
       return rows;
     }
           <tr>
             ${cols.map((key) => {
               const val = row[key];
+              const rendered = key === "model_name"
+                ? formatModelName(val)
+                : (typeof val === "number" ? formatNumber(val) : (val ?? "-"));
               return `<td>${rendered}</td>`;
             }).join("")}
           </tr>