evijit HF Staff Claude Sonnet 4.6 committed on
Commit
c2e86ea
·
1 Parent(s): b5fa10d

Compute and apply cleaned benchmark counts per model

Browse files

- clean-hierarchy: after cleaning, build evalId→benchmarkKey map and
cross it against comparison-index scores to produce a per-model
distinct benchmark count; attach as _modelCoverageMap on EvalHierarchy
- backend-artifacts: add optional _modelCoverageMap field to EvalHierarchy
- sidecars: expose fetchModelCoverage(); bump cache to v11 so payload
is rebuilt with the new field
- data-backend: applyModelCoverage() helper overrides benchmarks_count
on all getModelCards / getModelCardsLite results (v2 path only),
replacing the warehouse's pre-clean count
- benchmark-detail: hide filter chips entirely in Overlaps view

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

components/benchmark-detail.tsx CHANGED
@@ -4724,7 +4724,7 @@ export function BenchmarkDetail({
4724
  </div>
4725
  )}
4726
 
4727
- {groupingMode !== "source" && availableCategories.length > 0 && (
4728
  <div className="mb-5 flex flex-wrap items-center gap-2">
4729
  <span className="kicker mr-2">Category</span>
4730
  <button
 
4724
  </div>
4725
  )}
4726
 
4727
+ {groupingMode === "category" && availableCategories.length > 0 && (
4728
  <div className="mb-5 flex flex-wrap items-center gap-2">
4729
  <span className="kicker mr-2">Category</span>
4730
  <button
lib/backend-artifacts.ts CHANGED
@@ -384,6 +384,11 @@ export interface EvalHierarchy {
384
  stats?: EvalHierarchyStats
385
  families: HierarchyFamily[]
386
  benchmark_index?: BenchmarkIndexEntry[]
 
 
 
 
 
387
  }
388
 
389
  // ---------------------------------------------------------------------------
 
384
  stats?: EvalHierarchyStats
385
  families: HierarchyFamily[]
386
  benchmark_index?: BenchmarkIndexEntry[]
387
+ /** Per-model cleaned benchmark count, keyed by model_route_id.
388
+ * Injected by cleanHierarchy() and persisted in the disk cache so
389
+ * data-backend can override the warehouse's pre-baked benchmarks_count
390
+ * (which is computed before the cleaner folds split families). */
391
+ _modelCoverageMap?: Record<string, number>
392
  }
393
 
394
  // ---------------------------------------------------------------------------
lib/clean-hierarchy.ts CHANGED
@@ -201,10 +201,71 @@ export function cleanHierarchy(
201
  )
202
  }
203
  recomputeStats(h)
 
 
 
204
  h[CLEANED_MARKER] = true
205
  return h
206
  }
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  /**
209
  * Recompute the headline counts on `stats` so the home / evals pages
210
  * reflect the post-consolidation hierarchy. Upstream's `stats` block is
 
201
  )
202
  }
203
  recomputeStats(h)
204
+ if (comparisonIndex) {
205
+ h._modelCoverageMap = buildModelCoverageMap(h, comparisonIndex)
206
+ }
207
  h[CLEANED_MARKER] = true
208
  return h
209
  }
210
 
211
+ /**
212
+ * Build a { model_route_id → distinct_benchmark_count } map from the
213
+ * cleaned hierarchy and the comparison-index scores.
214
+ *
215
+ * Steps:
216
+ * 1. Walk every surviving benchmark's summary_eval_ids to build
217
+ * eval_summary_id → benchmark_key.
218
+ * 2. Walk comparison-index scores to collect, per model, the set of
219
+ * eval_summary_ids it has a finite score for.
220
+ * 3. For each model, count the distinct benchmark_keys reachable
221
+ * from its covered eval ids.
222
+ */
223
+ function buildModelCoverageMap(
224
+ h: CleanableHierarchy,
225
+ comparisonIndex: ComparisonIndexLike,
226
+ ): Record<string, number> {
227
+ // Step 1: eval_summary_id → benchmark_key
228
+ const evalToBenchmark = new Map<string, string>()
229
+ const visitBench = (b: HierarchyBenchmark) => {
230
+ for (const id of b.summary_eval_ids ?? []) {
231
+ if (!evalToBenchmark.has(id)) evalToBenchmark.set(id, b.key)
232
+ }
233
+ }
234
+ for (const fam of h.families ?? []) {
235
+ for (const b of fam.benchmarks ?? []) visitBench(b)
236
+ for (const b of fam.standalone_benchmarks ?? []) visitBench(b)
237
+ for (const c of fam.composites ?? []) {
238
+ for (const b of c.benchmarks ?? []) visitBench(b)
239
+ }
240
+ }
241
+
242
+ // Step 2: model_route_id → Set<eval_summary_id with a finite score>
243
+ const modelEvals = new Map<string, Set<string>>()
244
+ for (const [evalId, entry] of Object.entries(comparisonIndex.evals ?? {})) {
245
+ for (const metric of entry.metrics ?? []) {
246
+ for (const row of metric.scores ?? []) {
247
+ const modelId = row.model_route_id || row.model_family_id
248
+ if (!modelId || row.score == null || !Number.isFinite(row.score as number)) continue
249
+ const set = modelEvals.get(modelId) ?? new Set<string>()
250
+ set.add(evalId)
251
+ modelEvals.set(modelId, set)
252
+ }
253
+ }
254
+ }
255
+
256
+ // Step 3: count distinct benchmark keys per model
257
+ const coverage: Record<string, number> = {}
258
+ for (const [modelId, evalIds] of modelEvals) {
259
+ const benchKeys = new Set<string>()
260
+ for (const id of evalIds) {
261
+ const bKey = evalToBenchmark.get(id)
262
+ if (bKey) benchKeys.add(bKey)
263
+ }
264
+ if (benchKeys.size > 0) coverage[modelId] = benchKeys.size
265
+ }
266
+ return coverage
267
+ }
268
+
269
  /**
270
  * Recompute the headline counts on `stats` so the home / evals pages
271
  * reflect the post-consolidation hierarchy. Upstream's `stats` block is
lib/data-backend.ts CHANGED
@@ -26,9 +26,26 @@ async function hfData() {
26
  return import("@/lib/hf-data")
27
  }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  export async function getModelCards() {
30
  if (useViewLayerBackend()) {
31
- return (await viewBackend()).getModelCards()
 
32
  }
33
 
34
  return (await legacyBackend()).getModelCardsFromDuckDB()
@@ -36,7 +53,8 @@ export async function getModelCards() {
36
 
37
  export async function getModelCardsLite() {
38
  if (useViewLayerBackend()) {
39
- return (await viewBackend()).getModelCardsLite()
 
40
  }
41
 
42
  return (await legacyBackend()).getModelCardsLiteFromDuckDB()
 
26
  return import("@/lib/hf-data")
27
  }
28
 
29
+ async function applyModelCoverage<T extends { route_id: string; benchmarks_count: number }>(
30
+ cards: T[],
31
+ ): Promise<T[]> {
32
+ try {
33
+ const coverage = await (await sidecars()).fetchModelCoverage()
34
+ if (Object.keys(coverage).length === 0) return cards
35
+ return cards.map((c) =>
36
+ coverage[c.route_id] != null
37
+ ? { ...c, benchmarks_count: coverage[c.route_id] }
38
+ : c,
39
+ )
40
+ } catch {
41
+ return cards
42
+ }
43
+ }
44
+
45
  export async function getModelCards() {
46
  if (useViewLayerBackend()) {
47
+ const cards = await (await viewBackend()).getModelCards()
48
+ return applyModelCoverage(cards)
49
  }
50
 
51
  return (await legacyBackend()).getModelCardsFromDuckDB()
 
53
 
54
  export async function getModelCardsLite() {
55
  if (useViewLayerBackend()) {
56
+ const cards = await (await viewBackend()).getModelCardsLite()
57
+ return applyModelCoverage(cards)
58
  }
59
 
60
  return (await legacyBackend()).getModelCardsLiteFromDuckDB()
lib/sidecars.ts CHANGED
@@ -149,7 +149,7 @@ export function fetchHeadline(): Promise<CorpusAggregates> {
149
  // blobs don't get served against new code. The disk path embeds this
150
  // suffix; old files are simply ignored (and re-created on the next
151
  // stale read).
152
- const CLEAN_HIERARCHY_VERSION = "v10"
153
 
154
  /**
155
  * Returns the cleaned hierarchy used by the rest of the app — sanitised
@@ -204,6 +204,14 @@ export function fetchHierarchy(): Promise<EvalHierarchy> {
204
  return (cache.hierarchy ??= fetchCleanedHierarchy())
205
  }
206
 
 
 
 
 
 
 
 
 
207
  export function fetchComparisonIndex(): Promise<ComparisonIndex> {
208
  return (cache.comparisonIndex ??= fetchJson<ComparisonIndex>("comparison-index.json").then(
209
  (index) => {
 
149
  // blobs don't get served against new code. The disk path embeds this
150
  // suffix; old files are simply ignored (and re-created on the next
151
  // stale read).
152
+ const CLEAN_HIERARCHY_VERSION = "v11"
153
 
154
  /**
155
  * Returns the cleaned hierarchy used by the rest of the app — sanitised
 
204
  return (cache.hierarchy ??= fetchCleanedHierarchy())
205
  }
206
 
207
+ /** Per-model cleaned benchmark count from the hierarchy payload.
208
+ * Returns an empty map when the hierarchy was loaded without a
209
+ * comparison-index (e.g. old cached v10 blobs). */
210
+ export async function fetchModelCoverage(): Promise<Record<string, number>> {
211
+ const h = await fetchHierarchy()
212
+ return h._modelCoverageMap ?? {}
213
+ }
214
+
215
  export function fetchComparisonIndex(): Promise<ComparisonIndex> {
216
  return (cache.comparisonIndex ??= fetchJson<ComparisonIndex>("comparison-index.json").then(
217
  (index) => {