Spaces:

evaleval
/

general-eval-card

Running

evijit HF Staff Claude Sonnet 4.6 committed 22 days ago

Commit

c2e86ea

1 Parent(s): b5fa10d

Compute and apply cleaned benchmark counts per model

- clean-hierarchy: after cleaning, build evalId→benchmarkKey map and
cross it against comparison-index scores to produce a per-model
distinct benchmark count; attach as _modelCoverageMap on EvalHierarchy
- backend-artifacts: add optional _modelCoverageMap field to EvalHierarchy
- sidecars: expose fetchModelCoverage(); bump cache to v11 so payload
is rebuilt with the new field
- data-backend: applyModelCoverage() helper overrides benchmarks_count
on all getModelCards / getModelCardsLite results (v2 path only),
replacing the warehouse's pre-clean count
- benchmark-detail: hide filter chips entirely in Overlaps view

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (5) hide show

components/benchmark-detail.tsx +1 -1
lib/backend-artifacts.ts +5 -0
lib/clean-hierarchy.ts +61 -0
lib/data-backend.ts +20 -2
lib/sidecars.ts +9 -1

components/benchmark-detail.tsx CHANGED Viewed

@@ -4724,7 +4724,7 @@ export function BenchmarkDetail({
           </div>
         )}
-        {groupingMode !== "source" && availableCategories.length > 0 && (
           <div className="mb-5 flex flex-wrap items-center gap-2">
             <span className="kicker mr-2">Category</span>
             <button

           </div>
         )}
+        {groupingMode === "category" && availableCategories.length > 0 && (
           <div className="mb-5 flex flex-wrap items-center gap-2">
             <span className="kicker mr-2">Category</span>
             <button

lib/backend-artifacts.ts CHANGED Viewed

@@ -384,6 +384,11 @@ export interface EvalHierarchy {
   stats?: EvalHierarchyStats
   families: HierarchyFamily[]
   benchmark_index?: BenchmarkIndexEntry[]
 }
 // ---------------------------------------------------------------------------

   stats?: EvalHierarchyStats
   families: HierarchyFamily[]
   benchmark_index?: BenchmarkIndexEntry[]
+  /** Per-model cleaned benchmark count, keyed by model_route_id.
+   *  Injected by cleanHierarchy() and persisted in the disk cache so
+   *  data-backend can override the warehouse's pre-baked benchmarks_count
+   *  (which is computed before the cleaner folds split families). */
+  _modelCoverageMap?: Record<string, number>
 }
 // ---------------------------------------------------------------------------

lib/clean-hierarchy.ts CHANGED Viewed

@@ -201,10 +201,71 @@ export function cleanHierarchy(
     )
   }
   recomputeStats(h)
   h[CLEANED_MARKER] = true
   return h
 }
 /**
  * Recompute the headline counts on `stats` so the home / evals pages
  * reflect the post-consolidation hierarchy. Upstream's `stats` block is

     )
   }
   recomputeStats(h)
+  if (comparisonIndex) {
+    h._modelCoverageMap = buildModelCoverageMap(h, comparisonIndex)
+  }
   h[CLEANED_MARKER] = true
   return h
 }
+/**
+ * Derive, for every model, how many distinct benchmarks of the given
+ * hierarchy it has at least one finite score on.
+ *
+ * The computation runs in three passes:
+ *   1. Index each benchmark's `summary_eval_ids` under the benchmark's
+ *      `key`. When an eval id is listed by several benchmarks, the first
+ *      one visited keeps it.
+ *   2. Scan every metric row in `comparisonIndex.evals` and record, per
+ *      model, the eval ids that carry a finite, non-null score. A row is
+ *      attributed to `model_route_id`, falling back to `model_family_id`
+ *      when the route id is empty or missing.
+ *   3. Translate each model's eval ids to benchmark keys and count the
+ *      distinct ones. Models that reach no benchmark are left out.
+ *
+ * @param h - Hierarchy whose families, standalone benchmarks and
+ *   composites are walked.
+ * @param comparisonIndex - Source of the per-eval metric score rows.
+ * @returns Map of model id to distinct benchmark count (always >= 1).
+ */
+function buildModelCoverageMap(
+  h: CleanableHierarchy,
+  comparisonIndex: ComparisonIndexLike,
+): Record<string, number> {
+  // Pass 1: eval_summary_id → owning benchmark key (first writer wins).
+  const benchmarkByEval = new Map<string, string>()
+  const register = (bench: HierarchyBenchmark): void => {
+    const evalIds = bench.summary_eval_ids ?? []
+    for (const evalId of evalIds) {
+      if (benchmarkByEval.has(evalId)) continue
+      benchmarkByEval.set(evalId, bench.key)
+    }
+  }
+  h.families?.forEach((family) => {
+    family.benchmarks?.forEach((bench) => register(bench))
+    family.standalone_benchmarks?.forEach((bench) => register(bench))
+    family.composites?.forEach((composite) => {
+      composite.benchmarks?.forEach((bench) => register(bench))
+    })
+  })
+  // Pass 2: model id → eval ids on which the model has a finite score.
+  const evalsByModel = new Map<string, Set<string>>()
+  for (const [evalId, entry] of Object.entries(comparisonIndex.evals ?? {})) {
+    for (const metric of entry.metrics ?? []) {
+      for (const row of metric.scores ?? []) {
+        const modelId = row.model_route_id || row.model_family_id
+        const hasFiniteScore = row.score != null && Number.isFinite(row.score as number)
+        if (!modelId || !hasFiniteScore) continue
+        let seen = evalsByModel.get(modelId)
+        if (seen === undefined) {
+          seen = new Set<string>()
+          evalsByModel.set(modelId, seen)
+        }
+        seen.add(evalId)
+      }
+    }
+  }
+  // Pass 3: collapse eval ids to benchmark keys and count the distinct ones.
+  const counts: Record<string, number> = {}
+  evalsByModel.forEach((evalIds, modelId) => {
+    const reached = new Set<string>()
+    evalIds.forEach((evalId) => {
+      const benchKey = benchmarkByEval.get(evalId)
+      if (benchKey) reached.add(benchKey)
+    })
+    // Models whose evals map to no known benchmark get no entry at all.
+    if (reached.size > 0) counts[modelId] = reached.size
+  })
+  return counts
+}
 /**
  * Recompute the headline counts on `stats` so the home / evals pages
  * reflect the post-consolidation hierarchy. Upstream's `stats` block is

lib/data-backend.ts CHANGED Viewed

@@ -26,9 +26,26 @@ async function hfData() {
   return import("@/lib/hf-data")
 }
 export async function getModelCards() {
   if (useViewLayerBackend()) {
-    return (await viewBackend()).getModelCards()
   }
   return (await legacyBackend()).getModelCardsFromDuckDB()
@@ -36,7 +53,8 @@ export async function getModelCards() {
 export async function getModelCardsLite() {
   if (useViewLayerBackend()) {
-    return (await viewBackend()).getModelCardsLite()
   }
   return (await legacyBackend()).getModelCardsLiteFromDuckDB()

   return import("@/lib/hf-data")
 }
+/**
+ * Replace each card's `benchmarks_count` with the value from the model
+ * coverage map returned by the sidecars module's `fetchModelCoverage()`,
+ * when that map holds a numeric entry for the card's `route_id`.
+ *
+ * Best-effort: the cards are returned unchanged when the map is empty or
+ * loading it fails (the failure is logged, not rethrown). Cards without a
+ * numeric entry are passed through as-is. Neither the input array nor its
+ * cards are mutated; patched cards are shallow copies.
+ *
+ * @param cards - Model cards to patch.
+ * @returns The input array when nothing can be applied, otherwise a new
+ *   array of the same length and order.
+ */
+async function applyModelCoverage<T extends { route_id: string; benchmarks_count: number }>(
+  cards: T[],
+): Promise<T[]> {
+  try {
+    const coverage = await (await sidecars()).fetchModelCoverage()
+    if (Object.keys(coverage).length === 0) return cards
+    return cards.map((card) => {
+      const count: unknown = coverage[card.route_id]
+      // A plain `!= null` test would also accept inherited Object.prototype
+      // members (e.g. a route id of "constructor") and malformed cache
+      // entries; only a real number may replace the count.
+      return typeof count === "number" ? { ...card, benchmarks_count: count } : card
+    })
+  } catch (err: unknown) {
+    // Coverage is an optional refinement: keep serving the original
+    // counts, but leave a trace so a broken sidecar does not go unnoticed.
+    console.warn("applyModelCoverage: falling back to unmodified benchmarks_count", err)
+    return cards
+  }
+}
 export async function getModelCards() {
   if (useViewLayerBackend()) {
+    const cards = await (await viewBackend()).getModelCards()
+    return applyModelCoverage(cards)
   }
   return (await legacyBackend()).getModelCardsFromDuckDB()
 export async function getModelCardsLite() {
   if (useViewLayerBackend()) {
+    const cards = await (await viewBackend()).getModelCardsLite()
+    return applyModelCoverage(cards)
   }
   return (await legacyBackend()).getModelCardsLiteFromDuckDB()

lib/sidecars.ts CHANGED Viewed

@@ -149,7 +149,7 @@ export function fetchHeadline(): Promise<CorpusAggregates> {
 // blobs don't get served against new code. The disk path embeds this
 // suffix; old files are simply ignored (and re-created on the next
 // stale read).
-const CLEAN_HIERARCHY_VERSION = "v10"
 /**
  * Returns the cleaned hierarchy used by the rest of the app — sanitised
@@ -204,6 +204,14 @@ export function fetchHierarchy(): Promise<EvalHierarchy> {
   return (cache.hierarchy ??= fetchCleanedHierarchy())
 }
 export function fetchComparisonIndex(): Promise<ComparisonIndex> {
   return (cache.comparisonIndex ??= fetchJson<ComparisonIndex>("comparison-index.json").then(
     (index) => {

 // blobs don't get served against new code. The disk path embeds this
 // suffix; old files are simply ignored (and re-created on the next
 // stale read).
+const CLEAN_HIERARCHY_VERSION = "v11"
 /**
  * Returns the cleaned hierarchy used by the rest of the app — sanitised
   return (cache.hierarchy ??= fetchCleanedHierarchy())
 }
+/**
+ * Read the per-model benchmark counts carried on the hierarchy returned
+ * by `fetchHierarchy()`.
+ *
+ * @returns The hierarchy's `_modelCoverageMap`, or an empty object when
+ *   that field is absent — e.g. the hierarchy was loaded without a
+ *   comparison-index (such as old cached v10 blobs).
+ */
+export async function fetchModelCoverage(): Promise<Record<string, number>> {
+  const { _modelCoverageMap: coverage } = await fetchHierarchy()
+  return coverage ?? {}
+}
 export function fetchComparisonIndex(): Promise<ComparisonIndex> {
   return (cache.comparisonIndex ??= fetchJson<ComparisonIndex>("comparison-index.json").then(
     (index) => {