Spaces:
Running
Running
Compute and apply cleaned benchmark counts per model
Browse files
- clean-hierarchy: after cleaning, build evalId→benchmarkKey map and
cross it against comparison-index scores to produce a per-model
distinct benchmark count; attach as _modelCoverageMap on EvalHierarchy
- backend-artifacts: add optional _modelCoverageMap field to EvalHierarchy
- sidecars: expose fetchModelCoverage(); bump cache to v11 so payload
is rebuilt with the new field
- data-backend: applyModelCoverage() helper overrides benchmarks_count
on all getModelCards / getModelCardsLite results (v2 path only),
replacing the warehouse's pre-clean count
- benchmark-detail: hide filter chips entirely in Overlaps view
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- components/benchmark-detail.tsx +1 -1
- lib/backend-artifacts.ts +5 -0
- lib/clean-hierarchy.ts +61 -0
- lib/data-backend.ts +20 -2
- lib/sidecars.ts +9 -1
components/benchmark-detail.tsx
CHANGED
|
@@ -4724,7 +4724,7 @@ export function BenchmarkDetail({
|
|
| 4724 |
</div>
|
| 4725 |
)}
|
| 4726 |
|
| 4727 |
-
{groupingMode
|
| 4728 |
<div className="mb-5 flex flex-wrap items-center gap-2">
|
| 4729 |
<span className="kicker mr-2">Category</span>
|
| 4730 |
<button
|
|
|
|
| 4724 |
</div>
|
| 4725 |
)}
|
| 4726 |
|
| 4727 |
+
{groupingMode === "category" && availableCategories.length > 0 && (
|
| 4728 |
<div className="mb-5 flex flex-wrap items-center gap-2">
|
| 4729 |
<span className="kicker mr-2">Category</span>
|
| 4730 |
<button
|
lib/backend-artifacts.ts
CHANGED
|
@@ -384,6 +384,11 @@ export interface EvalHierarchy {
|
|
| 384 |
stats?: EvalHierarchyStats
|
| 385 |
families: HierarchyFamily[]
|
| 386 |
benchmark_index?: BenchmarkIndexEntry[]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
}
|
| 388 |
|
| 389 |
// ---------------------------------------------------------------------------
|
|
|
|
| 384 |
stats?: EvalHierarchyStats
|
| 385 |
families: HierarchyFamily[]
|
| 386 |
benchmark_index?: BenchmarkIndexEntry[]
|
| 387 |
+
/** Per-model cleaned benchmark count, keyed by model_route_id.
|
| 388 |
+
* Injected by cleanHierarchy() and persisted in the disk cache so
|
| 389 |
+
* data-backend can override the warehouse's pre-baked benchmarks_count
|
| 390 |
+
* (which is computed before the cleaner folds split families). */
|
| 391 |
+
_modelCoverageMap?: Record<string, number>
|
| 392 |
}
|
| 393 |
|
| 394 |
// ---------------------------------------------------------------------------
|
lib/clean-hierarchy.ts
CHANGED
|
@@ -201,10 +201,71 @@ export function cleanHierarchy(
|
|
| 201 |
)
|
| 202 |
}
|
| 203 |
recomputeStats(h)
|
|
|
|
|
|
|
|
|
|
| 204 |
h[CLEANED_MARKER] = true
|
| 205 |
return h
|
| 206 |
}
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
/**
|
| 209 |
* Recompute the headline counts on `stats` so the home / evals pages
|
| 210 |
* reflect the post-consolidation hierarchy. Upstream's `stats` block is
|
|
|
|
| 201 |
)
|
| 202 |
}
|
| 203 |
recomputeStats(h)
|
| 204 |
+
if (comparisonIndex) {
|
| 205 |
+
h._modelCoverageMap = buildModelCoverageMap(h, comparisonIndex)
|
| 206 |
+
}
|
| 207 |
h[CLEANED_MARKER] = true
|
| 208 |
return h
|
| 209 |
}
|
| 210 |
|
| 211 |
+
/**
 * Compute, per model, the number of distinct benchmarks in the cleaned
 * hierarchy for which the comparison index holds at least one finite score.
 *
 * The work happens in three passes:
 *   1. Every benchmark under each family (`benchmarks`,
 *      `standalone_benchmarks`, and each composite's `benchmarks`) registers
 *      its `summary_eval_ids`, giving eval_summary_id → benchmark key. When
 *      an eval id is listed by several benchmarks, the first one visited wins.
 *   2. The comparison-index score rows are scanned to record, per model, the
 *      eval ids that carry a finite score. A row is attributed to its
 *      `model_route_id`, falling back to `model_family_id`.
 *   3. Each model's eval ids are translated to benchmark keys and the
 *      distinct keys are counted.
 *
 * @param h Hierarchy whose families are walked.
 * @param comparisonIndex Source of the per-eval, per-metric score rows.
 * @returns Model id → distinct benchmark count; models that reach no
 *   benchmark are left out of the result.
 */
function buildModelCoverageMap(
  h: CleanableHierarchy,
  comparisonIndex: ComparisonIndexLike,
): Record<string, number> {
  // Yields benchmarks in the order that decides "first wins" in pass 1:
  // per family, plain benchmarks, then standalone ones, then composites'.
  function* everyBenchmark(): Generator<HierarchyBenchmark> {
    for (const family of h.families ?? []) {
      yield* family.benchmarks ?? []
      yield* family.standalone_benchmarks ?? []
      for (const composite of family.composites ?? []) {
        yield* composite.benchmarks ?? []
      }
    }
  }

  // Pass 1: eval_summary_id → benchmark key
  const benchmarkKeyByEvalId = new Map<string, string>()
  for (const bench of everyBenchmark()) {
    for (const evalId of bench.summary_eval_ids ?? []) {
      if (benchmarkKeyByEvalId.has(evalId)) continue
      benchmarkKeyByEvalId.set(evalId, bench.key)
    }
  }

  // Pass 2: model id → eval_summary_ids that carry a finite score
  const scoredEvalsByModel = new Map<string, Set<string>>()
  for (const [evalId, entry] of Object.entries(comparisonIndex.evals ?? {})) {
    for (const metric of entry.metrics ?? []) {
      for (const row of metric.scores ?? []) {
        const modelId = row.model_route_id || row.model_family_id
        if (!modelId) continue
        if (row.score == null) continue
        if (!Number.isFinite(row.score as number)) continue

        let scored = scoredEvalsByModel.get(modelId)
        if (scored === undefined) {
          scored = new Set<string>()
          scoredEvalsByModel.set(modelId, scored)
        }
        scored.add(evalId)
      }
    }
  }

  // Pass 3: distinct benchmark keys per model
  const coverage: Record<string, number> = {}
  scoredEvalsByModel.forEach((evalIds, modelId) => {
    const distinctKeys = new Set<string>()
    evalIds.forEach((evalId) => {
      const benchmarkKey = benchmarkKeyByEvalId.get(evalId)
      if (benchmarkKey) distinctKeys.add(benchmarkKey)
    })
    // Models whose scored evals map to no surviving benchmark are omitted.
    if (distinctKeys.size > 0) coverage[modelId] = distinctKeys.size
  })
  return coverage
}
|
| 268 |
+
|
| 269 |
/**
|
| 270 |
* Recompute the headline counts on `stats` so the home / evals pages
|
| 271 |
* reflect the post-consolidation hierarchy. Upstream's `stats` block is
|
lib/data-backend.ts
CHANGED
|
@@ -26,9 +26,26 @@ async function hfData() {
|
|
| 26 |
return import("@/lib/hf-data")
|
| 27 |
}
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
export async function getModelCards() {
|
| 30 |
if (useViewLayerBackend()) {
|
| 31 |
-
|
|
|
|
| 32 |
}
|
| 33 |
|
| 34 |
return (await legacyBackend()).getModelCardsFromDuckDB()
|
|
@@ -36,7 +53,8 @@ export async function getModelCards() {
|
|
| 36 |
|
| 37 |
export async function getModelCardsLite() {
|
| 38 |
if (useViewLayerBackend()) {
|
| 39 |
-
|
|
|
|
| 40 |
}
|
| 41 |
|
| 42 |
return (await legacyBackend()).getModelCardsLiteFromDuckDB()
|
|
|
|
| 26 |
return import("@/lib/hf-data")
|
| 27 |
}
|
| 28 |
|
| 29 |
+
/**
 * Override each card's `benchmarks_count` with the per-model count taken
 * from the coverage map returned by the sidecars module's
 * `fetchModelCoverage()`.
 *
 * Best-effort: if the coverage map is empty or loading it throws, the cards
 * are returned unchanged. Cards whose `route_id` has no entry in the map
 * keep their existing count.
 *
 * @param cards Model cards to patch. Neither the array nor its elements are
 *   mutated; patched cards are shallow copies.
 * @returns The input array when no coverage is available, otherwise a new
 *   array with the overridden counts.
 */
async function applyModelCoverage<T extends { route_id: string; benchmarks_count: number }>(
  cards: T[],
): Promise<T[]> {
  try {
    const coverage = await (await sidecars()).fetchModelCoverage()
    if (Object.keys(coverage).length === 0) return cards

    return cards.map((card) => {
      // Own-property check: the map is a plain object, so a route id such as
      // "constructor" would otherwise resolve through Object.prototype and
      // replace the count with a non-numeric value.
      if (!Object.prototype.hasOwnProperty.call(coverage, card.route_id)) return card
      const cleanedCount = coverage[card.route_id]
      return typeof cleanedCount === "number"
        ? { ...card, benchmarks_count: cleanedCount }
        : card
    })
  } catch (error: unknown) {
    // Keep the fallback to the unpatched cards, but surface the failure so a
    // broken coverage payload does not go unnoticed.
    console.warn("applyModelCoverage: falling back to unpatched benchmarks_count", error)
    return cards
  }
}
|
| 44 |
+
|
| 45 |
export async function getModelCards() {
|
| 46 |
if (useViewLayerBackend()) {
|
| 47 |
+
const cards = await (await viewBackend()).getModelCards()
|
| 48 |
+
return applyModelCoverage(cards)
|
| 49 |
}
|
| 50 |
|
| 51 |
return (await legacyBackend()).getModelCardsFromDuckDB()
|
|
|
|
| 53 |
|
| 54 |
export async function getModelCardsLite() {
|
| 55 |
if (useViewLayerBackend()) {
|
| 56 |
+
const cards = await (await viewBackend()).getModelCardsLite()
|
| 57 |
+
return applyModelCoverage(cards)
|
| 58 |
}
|
| 59 |
|
| 60 |
return (await legacyBackend()).getModelCardsLiteFromDuckDB()
|
lib/sidecars.ts
CHANGED
|
@@ -149,7 +149,7 @@ export function fetchHeadline(): Promise<CorpusAggregates> {
|
|
| 149 |
// blobs don't get served against new code. The disk path embeds this
|
| 150 |
// suffix; old files are simply ignored (and re-created on the next
|
| 151 |
// stale read).
|
| 152 |
-
const CLEAN_HIERARCHY_VERSION = "
|
| 153 |
|
| 154 |
/**
|
| 155 |
* Returns the cleaned hierarchy used by the rest of the app — sanitised
|
|
@@ -204,6 +204,14 @@ export function fetchHierarchy(): Promise<EvalHierarchy> {
|
|
| 204 |
return (cache.hierarchy ??= fetchCleanedHierarchy())
|
| 205 |
}
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
export function fetchComparisonIndex(): Promise<ComparisonIndex> {
|
| 208 |
return (cache.comparisonIndex ??= fetchJson<ComparisonIndex>("comparison-index.json").then(
|
| 209 |
(index) => {
|
|
|
|
| 149 |
// blobs don't get served against new code. The disk path embeds this
|
| 150 |
// suffix; old files are simply ignored (and re-created on the next
|
| 151 |
// stale read).
|
| 152 |
+
const CLEAN_HIERARCHY_VERSION = "v11"
|
| 153 |
|
| 154 |
/**
|
| 155 |
* Returns the cleaned hierarchy used by the rest of the app — sanitised
|
|
|
|
| 204 |
return (cache.hierarchy ??= fetchCleanedHierarchy())
|
| 205 |
}
|
| 206 |
|
| 207 |
+
/**
 * Resolve the per-model cleaned benchmark counts carried on the hierarchy
 * returned by `fetchHierarchy()`.
 *
 * @returns The hierarchy's `_modelCoverageMap`, or an empty object when the
 *   hierarchy has no such field (e.g. it was loaded without a
 *   comparison-index, as with old cached v10 blobs).
 */
export async function fetchModelCoverage(): Promise<Record<string, number>> {
  const { _modelCoverageMap } = await fetchHierarchy()
  return _modelCoverageMap ?? {}
}
|
| 214 |
+
|
| 215 |
export function fetchComparisonIndex(): Promise<ComparisonIndex> {
|
| 216 |
return (cache.comparisonIndex ??= fetchJson<ComparisonIndex>("comparison-index.json").then(
|
| 217 |
(index) => {
|