Spaces:
Running
Running
Swap backend data (#3)
Browse files
- Integrate with test backend data (7635aee64606c5b9138e680b833ca1383b570887)
- Drop input_modalities/output_modalities from MODEL_CARD_COLUMNS (bfce8f214eed3054b820176601eaa0a23e31bee7)
- Merge remote-tracking branch 'origin/main' into feat/use-new-backend-data (25ba6d010ff24b92f252f8fe11a6624f68aa6690)
- Use model_key as the addressable identifier and wire comparison-index sidecar (0e529dce5eb243708739730f1fcec00d27202d71)
Co-authored-by: Jenny Chim <j-chim@users.noreply.huggingface.co>
- Dockerfile +17 -17
- app/page.tsx +1 -1
- components/signals/corpus-dashboard.tsx +61 -42
- components/signals/corpus-signals-strip.tsx +21 -11
- lib/backend-artifacts.ts +33 -21
- lib/benchmark-schema.ts +1 -0
- lib/data-backend.ts +133 -42
- lib/duckdb.ts +46 -0
- lib/hf-data.ts +42 -0
- lib/sidecars.ts +65 -0
- lib/view-data.ts +576 -0
- notes/backend-v2-migration.md +616 -0
- notes/merge-cheatsheet-backend-v2.md +193 -0
- scripts/cache-hf-data.mjs +7 -0
- tests/duckdb-data.test.ts +29 -29
- tests/view-data.test.ts +466 -0
Dockerfile
CHANGED
|
@@ -9,20 +9,18 @@ ARG PNPM_VERSION=10.25.0
|
|
| 9 |
|
| 10 |
# Build-time data-source configuration. HF Spaces "Variables" are NOT injected
|
| 11 |
# into Docker RUN steps automatically — only into the final runtime — so we
|
| 12 |
-
# bake the
|
| 13 |
-
#
|
| 14 |
-
#
|
| 15 |
-
ARG DATA_BACKEND=
|
| 16 |
ARG HF_DATASET_REPO=https://huggingface.co/datasets/evaleval/card_backend
|
| 17 |
-
|
| 18 |
-
# `
|
| 19 |
-
#
|
| 20 |
-
#
|
| 21 |
-
# fetchers (`lib/hf-data.ts`) from attempting `evaleval/card_backend`
|
| 22 |
-
# network reads with `revalidate: 0` (which Next 15 treats as dynamic
|
| 23 |
-
# and fails the static export of `/`).
|
| 24 |
ENV DATA_BACKEND=${DATA_BACKEND} \
|
| 25 |
HF_DATASET_REPO=${HF_DATASET_REPO} \
|
|
|
|
| 26 |
LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
|
| 27 |
HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
|
| 28 |
HF_DATA_OFFLINE=1
|
|
@@ -49,13 +47,15 @@ RUN pnpm run build
|
|
| 49 |
FROM node:18-bullseye-slim AS runner
|
| 50 |
WORKDIR /app
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
#
|
| 56 |
-
#
|
|
|
|
| 57 |
ENV NODE_ENV=production \
|
| 58 |
-
DATA_BACKEND=
|
|
|
|
| 59 |
LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
|
| 60 |
HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
|
| 61 |
HF_DATA_OFFLINE=1
|
|
|
|
| 9 |
|
| 10 |
# Build-time data-source configuration. HF Spaces "Variables" are NOT injected
|
| 11 |
# into Docker RUN steps automatically — only into the final runtime — so we
|
| 12 |
+
# bake the selected backend here. `DATA_BACKEND=v2` reads `SNAPSHOT_URL`
|
| 13 |
+
# directly; legacy DuckDB mode still clones `HF_DATASET_REPO` into the cache.
|
| 14 |
+
# Override at build time via `--build-arg ...`.
|
| 15 |
+
ARG DATA_BACKEND=v2
|
| 16 |
ARG HF_DATASET_REPO=https://huggingface.co/datasets/evaleval/card_backend
|
| 17 |
+
ARG SNAPSHOT_URL=https://huggingface.co/datasets/j-chim/temp_evalcard_backend/resolve/main/warehouse/2026-05-03T21-46-50Z
|
| 18 |
+
# Static prerender (`next build`) executes route handlers. In legacy mode the
|
| 19 |
+
# cache populated by `cache-hf-data.mjs` lives at `/app/.cache/hf-data`; in v2
|
| 20 |
+
# the cache step is skipped and the app reads the pinned Stage J snapshot.
|
|
|
|
|
|
|
|
|
|
| 21 |
ENV DATA_BACKEND=${DATA_BACKEND} \
|
| 22 |
HF_DATASET_REPO=${HF_DATASET_REPO} \
|
| 23 |
+
SNAPSHOT_URL=${SNAPSHOT_URL} \
|
| 24 |
LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
|
| 25 |
HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
|
| 26 |
HF_DATA_OFFLINE=1
|
|
|
|
| 47 |
FROM node:18-bullseye-slim AS runner
|
| 48 |
WORKDIR /app
|
| 49 |
|
| 50 |
+
ARG DATA_BACKEND=v2
|
| 51 |
+
ARG SNAPSHOT_URL=https://huggingface.co/datasets/j-chim/temp_evalcard_backend/resolve/main/warehouse/2026-05-03T21-46-50Z
|
| 52 |
+
|
| 53 |
+
# Runtime needs the same data-source envs that the builder used. Docker
|
| 54 |
+
# multi-stage doesn't carry ENVs across stages, so keep backend selection and
|
| 55 |
+
# snapshot/cache pointers explicit here too.
|
| 56 |
ENV NODE_ENV=production \
|
| 57 |
+
DATA_BACKEND=${DATA_BACKEND} \
|
| 58 |
+
SNAPSHOT_URL=${SNAPSHOT_URL} \
|
| 59 |
LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
|
| 60 |
HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
|
| 61 |
HF_DATA_OFFLINE=1
|
app/page.tsx
CHANGED
|
@@ -244,7 +244,7 @@ export default async function HomePage() {
|
|
| 244 |
<p className="mx-auto mt-2 max-w-2xl text-sm leading-6 text-[color:var(--fg-muted)]">
|
| 245 |
The current backend snapshot does not include{" "}
|
| 246 |
<code className="rounded-sm bg-[color:var(--bg-surface)] px-1.5 py-0.5 font-mono text-xs">
|
| 247 |
-
|
| 248 |
</code>
|
| 249 |
. When it does, this section will render the four corpus-level rollups.
|
| 250 |
</p>
|
|
|
|
| 244 |
<p className="mx-auto mt-2 max-w-2xl text-sm leading-6 text-[color:var(--fg-muted)]">
|
| 245 |
The current backend snapshot does not include{" "}
|
| 246 |
<code className="rounded-sm bg-[color:var(--bg-surface)] px-1.5 py-0.5 font-mono text-xs">
|
| 247 |
+
headline.json
|
| 248 |
</code>
|
| 249 |
. When it does, this section will render the four corpus-level rollups.
|
| 250 |
</p>
|
components/signals/corpus-dashboard.tsx
CHANGED
|
@@ -20,7 +20,7 @@ import {
|
|
| 20 |
formatPercent,
|
| 21 |
} from "./signal-utils"
|
| 22 |
|
| 23 |
-
const CATEGORY_ORDER = ["
|
| 24 |
|
| 25 |
const SOURCE_COLORS: Record<string, string> = {
|
| 26 |
first_party: "bg-amber-500",
|
|
@@ -51,13 +51,21 @@ export function CorpusDashboard({
|
|
| 51 |
}, [mode])
|
| 52 |
|
| 53 |
const categoryKeys = useMemo(
|
| 54 |
-
() =>
|
| 55 |
-
|
| 56 |
-
aggregates.reproducibility.by_category
|
| 57 |
-
aggregates.completeness.by_category
|
| 58 |
-
aggregates.provenance.by_category
|
| 59 |
-
aggregates.comparability.by_category
|
| 60 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
[aggregates]
|
| 62 |
)
|
| 63 |
|
|
@@ -190,25 +198,14 @@ function CompletenessSection({
|
|
| 190 |
icon={<ClipboardCheck className="h-5 w-5" />}
|
| 191 |
title="Reporting Completeness"
|
| 192 |
subtitle="How much benchmark documentation is populated."
|
| 193 |
-
headline={formatPercent(block.
|
| 194 |
-
headlineLabel={`
|
| 195 |
>
|
| 196 |
{scores.length > 0 && <Histogram scores={scores} />}
|
| 197 |
-
<div className="mt-4 grid gap-2">
|
| 198 |
-
{
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
<span className="font-medium">{formatFieldLabel(field)}</span>
|
| 202 |
-
<span className="shrink-0 tabular-nums text-muted-foreground">
|
| 203 |
-
{formatPercent(value.mean_score)}
|
| 204 |
-
</span>
|
| 205 |
-
</div>
|
| 206 |
-
<div className="mt-2 grid gap-1.5">
|
| 207 |
-
<MetricBar label="Any data" value={value.populated_rate} compact />
|
| 208 |
-
<MetricBar label="Fully populated" value={value.fully_populated_rate} compact />
|
| 209 |
-
</div>
|
| 210 |
-
</div>
|
| 211 |
-
))}
|
| 212 |
</div>
|
| 213 |
</DashboardSection>
|
| 214 |
)
|
|
@@ -217,14 +214,16 @@ function CompletenessSection({
|
|
| 217 |
function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
|
| 218 |
const distribution = block.source_type_distribution
|
| 219 |
const total = Object.values(distribution).reduce((sum, value) => sum + value, 0)
|
|
|
|
|
|
|
| 220 |
|
| 221 |
return (
|
| 222 |
<DashboardSection
|
| 223 |
icon={<BarChart3 className="h-5 w-5" />}
|
| 224 |
title="Provenance"
|
| 225 |
subtitle="Who reported the scores, and whether groups have multiple sources."
|
| 226 |
-
headline={formatPercent(
|
| 227 |
-
headlineLabel="of
|
| 228 |
>
|
| 229 |
<div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
|
| 230 |
<div className="flex h-4 w-full">
|
|
@@ -240,34 +239,40 @@ function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
|
|
| 240 |
</div>
|
| 241 |
|
| 242 |
<div className="mt-3 grid gap-2 sm:grid-cols-2">
|
| 243 |
-
<RatioTile label="Multi-source
|
| 244 |
-
<RatioTile label="First-party only
|
| 245 |
</div>
|
| 246 |
</DashboardSection>
|
| 247 |
)
|
| 248 |
}
|
| 249 |
|
| 250 |
function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
return (
|
| 252 |
<DashboardSection
|
| 253 |
icon={<GitCompareArrows className="h-5 w-5" />}
|
| 254 |
title="Comparability"
|
| 255 |
subtitle="Eligible groups where scores diverge across setups or reporting organizations."
|
| 256 |
-
headline={formatNullableRate(
|
| 257 |
-
headlineLabel={`${block.
|
| 258 |
>
|
| 259 |
<div className="grid gap-3 md:grid-cols-2">
|
| 260 |
<ComparabilityRateCard
|
| 261 |
title="Variant divergence"
|
| 262 |
-
rate={
|
| 263 |
-
eligible={block.
|
| 264 |
-
divergent={block.
|
| 265 |
/>
|
| 266 |
<ComparabilityRateCard
|
| 267 |
title="Cross-party divergence"
|
| 268 |
-
rate={
|
| 269 |
-
eligible={block.
|
| 270 |
-
divergent={block.
|
| 271 |
/>
|
| 272 |
</div>
|
| 273 |
</DashboardSection>
|
|
@@ -288,6 +293,15 @@ function CategoryPanel({
|
|
| 288 |
comparability?: ComparabilityCorpusBlock
|
| 289 |
}) {
|
| 290 |
const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
return (
|
| 293 |
<section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
|
|
@@ -297,11 +311,11 @@ function CategoryPanel({
|
|
| 297 |
</div>
|
| 298 |
<div className="grid gap-3 sm:grid-cols-2">
|
| 299 |
<MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
|
| 300 |
-
<MiniMetric label="Documentation mean" value={formatPercent(completeness?.
|
| 301 |
-
<MiniMetric label="Multi-source
|
| 302 |
-
<MiniMetric label="Variant divergence" value={formatNullableRate(
|
| 303 |
</div>
|
| 304 |
-
{
|
| 305 |
<div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
|
| 306 |
Cross-party divergence: N/A - not enough multi-org coverage.
|
| 307 |
</div>
|
|
@@ -411,7 +425,7 @@ function RatioTile({ label, value, count }: { label: string; value: number | nul
|
|
| 411 |
<div className="text-sm font-medium">{label}</div>
|
| 412 |
<div className="mt-1 flex items-baseline justify-between gap-2">
|
| 413 |
<span className="text-xl font-semibold tabular-nums">{formatPercent(value)}</span>
|
| 414 |
-
<span className="text-xs text-muted-foreground">{count.toLocaleString()}
|
| 415 |
</div>
|
| 416 |
</div>
|
| 417 |
)
|
|
@@ -463,6 +477,11 @@ function formatNullableRate(value: number | null | undefined) {
|
|
| 463 |
return value == null ? "N/A" : formatPercent(value)
|
| 464 |
}
|
| 465 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
function formatGeneratedDate(value: string) {
|
| 467 |
const date = new Date(value)
|
| 468 |
if (Number.isNaN(date.getTime())) {
|
|
|
|
| 20 |
formatPercent,
|
| 21 |
} from "./signal-utils"
|
| 22 |
|
| 23 |
+
const CATEGORY_ORDER = ["Agentic", "General", "Knowledge", "Reasoning", "Safety", "Other"]
|
| 24 |
|
| 25 |
const SOURCE_COLORS: Record<string, string> = {
|
| 26 |
first_party: "bg-amber-500",
|
|
|
|
| 51 |
}, [mode])
|
| 52 |
|
| 53 |
const categoryKeys = useMemo(
|
| 54 |
+
() => {
|
| 55 |
+
const available = new Set([
|
| 56 |
+
...Object.keys(aggregates.reproducibility.by_category),
|
| 57 |
+
...Object.keys(aggregates.completeness.by_category),
|
| 58 |
+
...Object.keys(aggregates.provenance.by_category),
|
| 59 |
+
...Object.keys(aggregates.comparability.by_category),
|
| 60 |
+
])
|
| 61 |
+
|
| 62 |
+
return [
|
| 63 |
+
...CATEGORY_ORDER.filter((category) => available.has(category)),
|
| 64 |
+
...Array.from(available)
|
| 65 |
+
.filter((category) => !CATEGORY_ORDER.includes(category))
|
| 66 |
+
.sort((a, b) => a.localeCompare(b)),
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
[aggregates]
|
| 70 |
)
|
| 71 |
|
|
|
|
| 198 |
icon={<ClipboardCheck className="h-5 w-5" />}
|
| 199 |
title="Reporting Completeness"
|
| 200 |
subtitle="How much benchmark documentation is populated."
|
| 201 |
+
headline={formatPercent(block.completeness_avg)}
|
| 202 |
+
headlineLabel={`Range ${formatPercent(block.completeness_min)} to ${formatPercent(block.completeness_max)} across ${block.total_triples.toLocaleString()} reported score triples`}
|
| 203 |
>
|
| 204 |
{scores.length > 0 && <Histogram scores={scores} />}
|
| 205 |
+
<div className="mt-4 grid gap-2 sm:grid-cols-3">
|
| 206 |
+
<MiniMetric label="Minimum" value={formatPercent(block.completeness_min)} />
|
| 207 |
+
<MiniMetric label="Average" value={formatPercent(block.completeness_avg)} />
|
| 208 |
+
<MiniMetric label="Maximum" value={formatPercent(block.completeness_max)} />
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
</div>
|
| 210 |
</DashboardSection>
|
| 211 |
)
|
|
|
|
| 214 |
function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
|
| 215 |
const distribution = block.source_type_distribution
|
| 216 |
const total = Object.values(distribution).reduce((sum, value) => sum + value, 0)
|
| 217 |
+
const multiSourceRate = rate(block.multi_source_triples, block.total_triples)
|
| 218 |
+
const firstPartyOnlyRate = rate(block.first_party_only_triples, block.total_triples)
|
| 219 |
|
| 220 |
return (
|
| 221 |
<DashboardSection
|
| 222 |
icon={<BarChart3 className="h-5 w-5" />}
|
| 223 |
title="Provenance"
|
| 224 |
subtitle="Who reported the scores, and whether groups have multiple sources."
|
| 225 |
+
headline={formatPercent(multiSourceRate)}
|
| 226 |
+
headlineLabel="of reported score triples have multiple reporting sources"
|
| 227 |
>
|
| 228 |
<div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
|
| 229 |
<div className="flex h-4 w-full">
|
|
|
|
| 239 |
</div>
|
| 240 |
|
| 241 |
<div className="mt-3 grid gap-2 sm:grid-cols-2">
|
| 242 |
+
<RatioTile label="Multi-source triples" value={multiSourceRate} count={block.multi_source_triples} />
|
| 243 |
+
<RatioTile label="First-party only triples" value={firstPartyOnlyRate} count={block.first_party_only_triples} />
|
| 244 |
</div>
|
| 245 |
</DashboardSection>
|
| 246 |
)
|
| 247 |
}
|
| 248 |
|
| 249 |
function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
|
| 250 |
+
const variantRate = rate(block.variant_divergent_count, block.groups_with_variant_check)
|
| 251 |
+
const crossPartyRate = rate(
|
| 252 |
+
block.cross_party_divergent_count,
|
| 253 |
+
block.groups_with_cross_party_check
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
return (
|
| 257 |
<DashboardSection
|
| 258 |
icon={<GitCompareArrows className="h-5 w-5" />}
|
| 259 |
title="Comparability"
|
| 260 |
subtitle="Eligible groups where scores diverge across setups or reporting organizations."
|
| 261 |
+
headline={formatNullableRate(variantRate)}
|
| 262 |
+
headlineLabel={`${block.variant_divergent_count.toLocaleString()} of ${block.groups_with_variant_check.toLocaleString()} setup-eligible groups diverge`}
|
| 263 |
>
|
| 264 |
<div className="grid gap-3 md:grid-cols-2">
|
| 265 |
<ComparabilityRateCard
|
| 266 |
title="Variant divergence"
|
| 267 |
+
rate={variantRate}
|
| 268 |
+
eligible={block.groups_with_variant_check}
|
| 269 |
+
divergent={block.variant_divergent_count}
|
| 270 |
/>
|
| 271 |
<ComparabilityRateCard
|
| 272 |
title="Cross-party divergence"
|
| 273 |
+
rate={crossPartyRate}
|
| 274 |
+
eligible={block.groups_with_cross_party_check}
|
| 275 |
+
divergent={block.cross_party_divergent_count}
|
| 276 |
/>
|
| 277 |
</div>
|
| 278 |
</DashboardSection>
|
|
|
|
| 293 |
comparability?: ComparabilityCorpusBlock
|
| 294 |
}) {
|
| 295 |
const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
|
| 296 |
+
const multiSourceRate = rate(provenance?.multi_source_triples, provenance?.total_triples)
|
| 297 |
+
const variantRate = rate(
|
| 298 |
+
comparability?.variant_divergent_count,
|
| 299 |
+
comparability?.groups_with_variant_check
|
| 300 |
+
)
|
| 301 |
+
const crossPartyRate = rate(
|
| 302 |
+
comparability?.cross_party_divergent_count,
|
| 303 |
+
comparability?.groups_with_cross_party_check
|
| 304 |
+
)
|
| 305 |
|
| 306 |
return (
|
| 307 |
<section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
|
|
|
|
| 311 |
</div>
|
| 312 |
<div className="grid gap-3 sm:grid-cols-2">
|
| 313 |
<MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
|
| 314 |
+
<MiniMetric label="Documentation mean" value={formatPercent(completeness?.completeness_avg)} />
|
| 315 |
+
<MiniMetric label="Multi-source triples" value={formatPercent(multiSourceRate)} />
|
| 316 |
+
<MiniMetric label="Variant divergence" value={formatNullableRate(variantRate)} />
|
| 317 |
</div>
|
| 318 |
+
{crossPartyRate == null && (
|
| 319 |
<div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
|
| 320 |
Cross-party divergence: N/A - not enough multi-org coverage.
|
| 321 |
</div>
|
|
|
|
| 425 |
<div className="text-sm font-medium">{label}</div>
|
| 426 |
<div className="mt-1 flex items-baseline justify-between gap-2">
|
| 427 |
<span className="text-xl font-semibold tabular-nums">{formatPercent(value)}</span>
|
| 428 |
+
<span className="text-xs text-muted-foreground">{count.toLocaleString()} triples</span>
|
| 429 |
</div>
|
| 430 |
</div>
|
| 431 |
)
|
|
|
|
| 477 |
return value == null ? "N/A" : formatPercent(value)
|
| 478 |
}
|
| 479 |
|
| 480 |
+
function rate(numerator: number | null | undefined, denominator: number | null | undefined) {
|
| 481 |
+
if (numerator == null || denominator == null || denominator <= 0) return null
|
| 482 |
+
return numerator / denominator
|
| 483 |
+
}
|
| 484 |
+
|
| 485 |
function formatGeneratedDate(value: string) {
|
| 486 |
const date = new Date(value)
|
| 487 |
if (Number.isNaN(date.getTime())) {
|
components/signals/corpus-signals-strip.tsx
CHANGED
|
@@ -39,8 +39,13 @@ export function CorpusSignalsStrip({
|
|
| 39 |
const tpShare = totalReports > 0 ? prov.source_type_distribution.third_party / totalReports : 0
|
| 40 |
const fpShare = totalReports > 0 ? prov.source_type_distribution.first_party / totalReports : 0
|
| 41 |
|
| 42 |
-
const
|
| 43 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
return (
|
| 46 |
<div className="signals-grid">
|
|
@@ -58,29 +63,29 @@ export function CorpusSignalsStrip({
|
|
| 58 |
/>
|
| 59 |
<SignalTile
|
| 60 |
id="completeness"
|
| 61 |
-
statValue={pctNum(comp.
|
| 62 |
statUnit="%"
|
| 63 |
-
headline={`mean across ${comp.
|
| 64 |
-
detail=
|
| 65 |
asks="Is the benchmark itself documented well enough to interpret a score on it?"
|
| 66 |
/>
|
| 67 |
<SignalTile
|
| 68 |
id="provenance"
|
| 69 |
-
statValue={pctNum(
|
| 70 |
statUnit="%"
|
| 71 |
-
headline="of
|
| 72 |
-
detail={`${formatPct(tpShare)} third-party, ${formatPct(fpShare)} first-party of ${totalReports.toLocaleString()}
|
| 73 |
asks="Who reported this score, and have others reproduced it?"
|
| 74 |
/>
|
| 75 |
<SignalTile
|
| 76 |
id="comparability"
|
| 77 |
statValue={pctNum(cmpRate)}
|
| 78 |
statUnit="%"
|
| 79 |
-
headline={`of setup-eligible groups diverge across variants (${cmp.
|
| 80 |
detail={
|
| 81 |
crossPartyAvailable
|
| 82 |
-
? `Cross-party divergence: ${formatPct(
|
| 83 |
-
: "Cross-party divergence not yet computable
|
| 84 |
}
|
| 85 |
asks="Are scores on the same benchmark actually measuring the same thing?"
|
| 86 |
/>
|
|
@@ -154,6 +159,11 @@ function formatPct(value: number | null | undefined): string {
|
|
| 154 |
return `${Math.round(value * 100)}%`
|
| 155 |
}
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
const FIELD_LABELS: Record<string, string> = {
|
| 158 |
temperature: "temperature",
|
| 159 |
max_tokens: "max tokens",
|
|
|
|
| 39 |
const tpShare = totalReports > 0 ? prov.source_type_distribution.third_party / totalReports : 0
|
| 40 |
const fpShare = totalReports > 0 ? prov.source_type_distribution.first_party / totalReports : 0
|
| 41 |
|
| 42 |
+
const multiSourceRate = rate(prov.multi_source_triples, prov.total_triples)
|
| 43 |
+
const cmpRate = rate(cmp.variant_divergent_count, cmp.groups_with_variant_check)
|
| 44 |
+
const crossPartyRate = rate(
|
| 45 |
+
cmp.cross_party_divergent_count,
|
| 46 |
+
cmp.groups_with_cross_party_check
|
| 47 |
+
)
|
| 48 |
+
const crossPartyAvailable = cmp.groups_with_cross_party_check > 0
|
| 49 |
|
| 50 |
return (
|
| 51 |
<div className="signals-grid">
|
|
|
|
| 63 |
/>
|
| 64 |
<SignalTile
|
| 65 |
id="completeness"
|
| 66 |
+
statValue={pctNum(comp.completeness_avg)}
|
| 67 |
statUnit="%"
|
| 68 |
+
headline={`mean across ${comp.total_triples.toLocaleString()} reported score triples.`}
|
| 69 |
+
detail={`Observed range: ${formatPct(comp.completeness_min)} to ${formatPct(comp.completeness_max)}.`}
|
| 70 |
asks="Is the benchmark itself documented well enough to interpret a score on it?"
|
| 71 |
/>
|
| 72 |
<SignalTile
|
| 73 |
id="provenance"
|
| 74 |
+
statValue={pctNum(multiSourceRate)}
|
| 75 |
statUnit="%"
|
| 76 |
+
headline="of reported score triples have reports from more than one party."
|
| 77 |
+
detail={`${formatPct(tpShare)} third-party, ${formatPct(fpShare)} first-party of ${totalReports.toLocaleString()} triples.`}
|
| 78 |
asks="Who reported this score, and have others reproduced it?"
|
| 79 |
/>
|
| 80 |
<SignalTile
|
| 81 |
id="comparability"
|
| 82 |
statValue={pctNum(cmpRate)}
|
| 83 |
statUnit="%"
|
| 84 |
+
headline={`of setup-eligible groups diverge across variants (${cmp.variant_divergent_count.toLocaleString()} of ${cmp.groups_with_variant_check.toLocaleString()}).`}
|
| 85 |
detail={
|
| 86 |
crossPartyAvailable
|
| 87 |
+
? `Cross-party divergence: ${formatPct(crossPartyRate)}.`
|
| 88 |
+
: "Cross-party divergence not yet computable: too few multi-org reports."
|
| 89 |
}
|
| 90 |
asks="Are scores on the same benchmark actually measuring the same thing?"
|
| 91 |
/>
|
|
|
|
| 159 |
return `${Math.round(value * 100)}%`
|
| 160 |
}
|
| 161 |
|
| 162 |
+
function rate(numerator: number | null | undefined, denominator: number | null | undefined) {
|
| 163 |
+
if (numerator == null || denominator == null || denominator <= 0) return null
|
| 164 |
+
return numerator / denominator
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
const FIELD_LABELS: Record<string, string> = {
|
| 168 |
temperature: "temperature",
|
| 169 |
max_tokens: "max tokens",
|
lib/backend-artifacts.ts
CHANGED
|
@@ -12,6 +12,7 @@ export interface BackendManifest {
|
|
| 12 |
skipped_config_count?: number
|
| 13 |
summary_artifacts?: {
|
| 14 |
corpus_aggregates?: string
|
|
|
|
| 15 |
[key: string]: string | undefined
|
| 16 |
}
|
| 17 |
}
|
|
@@ -177,6 +178,27 @@ export interface CorpusAggregates {
|
|
| 177 |
completeness: Stratified<CompletenessCorpusBlock>
|
| 178 |
provenance: Stratified<ProvenanceCorpusBlock>
|
| 179 |
comparability: Stratified<ComparabilityCorpusBlock>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
}
|
| 181 |
|
| 182 |
export interface Stratified<T> {
|
|
@@ -198,35 +220,25 @@ export interface ReproducibilityCorpusBlock {
|
|
| 198 |
}
|
| 199 |
|
| 200 |
export interface CompletenessCorpusBlock {
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
mean_score: number
|
| 206 |
-
populated_rate: number
|
| 207 |
-
fully_populated_rate: number
|
| 208 |
-
benchmark_count: number
|
| 209 |
-
}>
|
| 210 |
}
|
| 211 |
|
| 212 |
export interface ProvenanceCorpusBlock {
|
| 213 |
total_triples: number
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
multi_source_rate: number | null
|
| 217 |
-
first_party_only_groups: number
|
| 218 |
-
first_party_only_rate: number | null
|
| 219 |
source_type_distribution: Record<ProvenanceSourceType, number>
|
| 220 |
}
|
| 221 |
|
| 222 |
export interface ComparabilityCorpusBlock {
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
cross_party_divergent_groups: number
|
| 229 |
-
cross_party_divergence_rate: number | null
|
| 230 |
}
|
| 231 |
|
| 232 |
export interface HierarchyTags {
|
|
|
|
| 12 |
skipped_config_count?: number
|
| 13 |
summary_artifacts?: {
|
| 14 |
corpus_aggregates?: string
|
| 15 |
+
eval_hierarchy?: string
|
| 16 |
[key: string]: string | undefined
|
| 17 |
}
|
| 18 |
}
|
|
|
|
| 178 |
completeness: Stratified<CompletenessCorpusBlock>
|
| 179 |
provenance: Stratified<ProvenanceCorpusBlock>
|
| 180 |
comparability: Stratified<ComparabilityCorpusBlock>
|
| 181 |
+
developers?: DeveloperListEntry[]
|
| 182 |
+
families?: Array<{
|
| 183 |
+
family_key: string
|
| 184 |
+
display_name: string
|
| 185 |
+
model_count: number
|
| 186 |
+
eval_count: number
|
| 187 |
+
}>
|
| 188 |
+
categories?: Array<{
|
| 189 |
+
category: string
|
| 190 |
+
model_count: number
|
| 191 |
+
eval_count: number
|
| 192 |
+
}>
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
export interface DeveloperListEntry {
|
| 196 |
+
developer: string
|
| 197 |
+
route_id: string
|
| 198 |
+
model_count: number
|
| 199 |
+
benchmark_count: number
|
| 200 |
+
evaluation_count: number
|
| 201 |
+
popular_evals: Array<{ benchmark: string; model_count: number }>
|
| 202 |
}
|
| 203 |
|
| 204 |
export interface Stratified<T> {
|
|
|
|
| 220 |
}
|
| 221 |
|
| 222 |
export interface CompletenessCorpusBlock {
|
| 223 |
+
total_triples: number
|
| 224 |
+
completeness_avg: number | null
|
| 225 |
+
completeness_min: number | null
|
| 226 |
+
completeness_max: number | null
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
}
|
| 228 |
|
| 229 |
export interface ProvenanceCorpusBlock {
|
| 230 |
total_triples: number
|
| 231 |
+
multi_source_triples: number
|
| 232 |
+
first_party_only_triples: number
|
|
|
|
|
|
|
|
|
|
| 233 |
source_type_distribution: Record<ProvenanceSourceType, number>
|
| 234 |
}
|
| 235 |
|
| 236 |
export interface ComparabilityCorpusBlock {
|
| 237 |
+
total_triples: number
|
| 238 |
+
variant_divergent_count: number
|
| 239 |
+
cross_party_divergent_count: number
|
| 240 |
+
groups_with_variant_check: number
|
| 241 |
+
groups_with_cross_party_check: number
|
|
|
|
|
|
|
| 242 |
}
|
| 243 |
|
| 244 |
export interface HierarchyTags {
|
lib/benchmark-schema.ts
CHANGED
|
@@ -124,6 +124,7 @@ export interface ScoreDetails {
|
|
| 124 |
}
|
| 125 |
|
| 126 |
export interface GenerationConfig {
|
|
|
|
| 127 |
generation_args?: {
|
| 128 |
temperature?: number
|
| 129 |
top_p?: number
|
|
|
|
| 124 |
}
|
| 125 |
|
| 126 |
export interface GenerationConfig {
|
| 127 |
+
num_few_shot?: number
|
| 128 |
generation_args?: {
|
| 129 |
temperature?: number
|
| 130 |
top_p?: number
|
lib/data-backend.ts
CHANGED
|
@@ -1,49 +1,140 @@
|
|
| 1 |
import "server-only"
|
| 2 |
|
| 3 |
-
import {
|
| 4 |
-
getDashboardDataFromDuckDB,
|
| 5 |
-
getModelCardsFromDuckDB,
|
| 6 |
-
getModelCardsLiteFromDuckDB,
|
| 7 |
-
getEvalListDataFromDuckDB,
|
| 8 |
-
getEvalListLiteDataFromDuckDB,
|
| 9 |
-
getEvalListFromDuckDB,
|
| 10 |
-
getDeveloperListFromDuckDB,
|
| 11 |
-
getDeveloperSummaryByIdFromDuckDB,
|
| 12 |
-
getModelSummaryByIdFromDuckDB,
|
| 13 |
-
getEvalSummaryByIdFromDuckDB,
|
| 14 |
-
} from "@/lib/duckdb-data"
|
| 15 |
import { normalizeEvalSummary } from "@/lib/eval-processing"
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
/
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
export async function getEvalSummaryById(evalId: string) {
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
return summary ? normalizeEvalSummary(summary) : summary
|
| 42 |
}
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import "server-only"
|
| 2 |
|
| 3 |
+
import type { BackendManifestStatus } from "@/lib/backend-artifacts"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import { normalizeEvalSummary } from "@/lib/eval-processing"
|
| 5 |
+
|
| 6 |
+
const BACKEND_VERSION = process.env.DATA_BACKEND?.trim().toLowerCase() ?? "duckdb"
|
| 7 |
+
|
| 8 |
+
function useViewLayerBackend() {
|
| 9 |
+
return BACKEND_VERSION === "v2" || BACKEND_VERSION === "stage-j"
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
async function legacyBackend() {
|
| 13 |
+
return import("@/lib/duckdb-data")
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
async function viewBackend() {
|
| 17 |
+
return import("@/lib/view-data")
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
async function sidecars() {
|
| 21 |
+
return import("@/lib/sidecars")
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
async function hfData() {
|
| 25 |
+
return import("@/lib/hf-data")
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
export async function getModelCards() {
|
| 29 |
+
if (useViewLayerBackend()) {
|
| 30 |
+
return (await viewBackend()).getModelCards()
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
return (await legacyBackend()).getModelCardsFromDuckDB()
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
export async function getModelCardsLite() {
|
| 37 |
+
if (useViewLayerBackend()) {
|
| 38 |
+
return (await viewBackend()).getModelCardsLite()
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
return (await legacyBackend()).getModelCardsLiteFromDuckDB()
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
export async function getEvalListData() {
|
| 45 |
+
if (useViewLayerBackend()) {
|
| 46 |
+
return (await viewBackend()).getEvalListData()
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
return (await legacyBackend()).getEvalListDataFromDuckDB()
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
export async function getEvalListLiteData() {
|
| 53 |
+
if (useViewLayerBackend()) {
|
| 54 |
+
return (await viewBackend()).getEvalListLiteData()
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
return (await legacyBackend()).getEvalListLiteDataFromDuckDB()
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
export async function getEvalList() {
|
| 61 |
+
if (useViewLayerBackend()) {
|
| 62 |
+
return (await viewBackend()).getEvalList()
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
return (await legacyBackend()).getEvalListFromDuckDB()
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
export async function getDashboardData() {
|
| 69 |
+
if (useViewLayerBackend()) {
|
| 70 |
+
return (await viewBackend()).getDashboardData()
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
return (await legacyBackend()).getDashboardDataFromDuckDB()
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
export async function getDeveloperList() {
|
| 77 |
+
if (useViewLayerBackend()) {
|
| 78 |
+
return (await viewBackend()).getDeveloperList()
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
return (await legacyBackend()).getDeveloperListFromDuckDB()
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
export async function getDeveloperSummaryById(routeId: string) {
|
| 85 |
+
if (useViewLayerBackend()) {
|
| 86 |
+
return (await viewBackend()).getDeveloperSummaryById(routeId)
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
return (await legacyBackend()).getDeveloperSummaryByIdFromDuckDB(routeId)
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
export async function getModelSummaryById(modelId: string) {
|
| 93 |
+
if (useViewLayerBackend()) {
|
| 94 |
+
return (await viewBackend()).getModelSummaryById(modelId)
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
return (await legacyBackend()).getModelSummaryByIdFromDuckDB(modelId)
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
/**
 * Loads a single eval summary from the active backend.
 *
 * The view-layer result is returned as-is. A truthy legacy result is passed
 * through `normalizeEvalSummary`; a falsy legacy result is returned unchanged.
 *
 * @param evalId Identifier forwarded unchanged to the selected backend.
 */
export async function getEvalSummaryById(evalId: string) {
  if (useViewLayerBackend()) {
    const view = await viewBackend()
    return view.getEvalSummaryById(evalId)
  }

  const legacy = await legacyBackend()
  const legacySummary = await legacy.getEvalSummaryByIdFromDuckDB(evalId)
  if (!legacySummary) {
    return legacySummary
  }

  return normalizeEvalSummary(legacySummary)
}
|
| 108 |
|
| 109 |
+
/**
 * Loads the backend manifest.
 *
 * With the view-layer backend the manifest comes from the snapshot sidecars
 * module; otherwise it comes from the HF data module.
 */
export async function getBackendManifestData() {
  if (!useViewLayerBackend()) {
    const legacy = await hfData()
    return legacy.fetchBackendManifest()
  }

  const snapshot = await sidecars()
  return snapshot.fetchManifest()
}
|
| 116 |
+
|
| 117 |
+
/**
 * Loads the manifest status.
 *
 * With the view-layer backend there is a single manifest, so it is reported
 * as both "current" and "latest", `generated_at` serves as the signature for
 * both, and the refresh-related fields are fixed to their idle values.
 * Otherwise the status comes from the HF data module.
 */
export async function getBackendManifestStatusData(): Promise<BackendManifestStatus> {
  if (!useViewLayerBackend()) {
    const legacy = await hfData()
    return legacy.fetchBackendManifestStatus()
  }

  const snapshot = await sidecars()
  const manifest = await snapshot.fetchManifest()
  const signature = manifest.generated_at

  return {
    currentManifest: manifest,
    latestManifest: manifest,
    currentManifestSignature: signature,
    latestManifestSignature: signature,
    updateAvailable: false,
    refreshing: false,
    pendingRefreshCount: 0,
  }
}
|
| 133 |
+
|
| 134 |
+
/**
 * Loads the eval hierarchy.
 *
 * With the view-layer backend the hierarchy comes straight from the snapshot
 * sidecars module; otherwise it comes from the HF data module.
 *
 * NOTE(review): `fetchEvalHierarchy` in lib/hf-data.ts passes the sidecar
 * hierarchy through `adaptEvalHierarchy`, while this branch returns it
 * unadapted — confirm the difference is intended.
 */
export async function getEvalHierarchyData() {
  if (!useViewLayerBackend()) {
    const legacy = await hfData()
    return legacy.fetchEvalHierarchy()
  }

  const snapshot = await sidecars()
  return snapshot.fetchHierarchy()
}
|
lib/duckdb.ts
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import "server-only"
|
| 2 |
+
|
| 3 |
+
import { DuckDBConnection } from "@duckdb/node-api"
|
| 4 |
+
|
| 5 |
+
let connectionPromise: Promise<DuckDBConnection> | null = null
|
| 6 |
+
|
| 7 |
+
/**
 * Reads the snapshot base URL from the `SNAPSHOT_URL` environment variable.
 *
 * @returns The trimmed value with any trailing slashes removed.
 * @throws Error when the variable is unset or blank.
 */
function getSnapshotUrl() {
  const configured = (process.env.SNAPSHOT_URL ?? "").trim()

  if (configured.length === 0) {
    throw new Error("DATA_BACKEND=v2 requires SNAPSHOT_URL to point at a Stage J snapshot directory")
  }

  // Strip trailing slashes so callers can append "/<name>" without doubling them.
  return configured.replace(/\/+$/, "")
}
|
| 15 |
+
|
| 16 |
+
/** Builds the full location of a snapshot artifact by appending `name` to the snapshot base URL. */
function snapshotArtifact(name: string) {
  return [getSnapshotUrl(), name].join("/")
}
|
| 19 |
+
|
| 20 |
+
/**
 * Renders `value` as a single-quoted SQL string literal, doubling every
 * embedded single quote.
 */
function sqlString(value: string) {
  const escaped = value.split("'").join("''")
  return "'" + escaped + "'"
}
|
| 23 |
+
|
| 24 |
+
const VIEW_FILES = {
|
| 25 |
+
models_view: "models_view.parquet",
|
| 26 |
+
evals_view: "evals_view.parquet",
|
| 27 |
+
eval_results_view: "eval_results_view.parquet",
|
| 28 |
+
} as const
|
| 29 |
+
|
| 30 |
+
/**
 * Returns the shared DuckDB connection, creating it on first use.
 *
 * Initialization creates a connection and registers one view per entry in
 * `VIEW_FILES`, each reading the matching parquet artifact from the snapshot
 * location. The in-flight promise is memoized so concurrent callers share a
 * single initialization.
 *
 * If initialization fails, the memoized promise is discarded so the next call
 * retries instead of replaying the same rejection for the lifetime of the
 * process.
 *
 * @throws Whatever connection creation, `getSnapshotUrl()`, or view
 *   registration throws.
 */
export async function getConnection(): Promise<DuckDBConnection> {
  if (!connectionPromise) {
    const pending = (async () => {
      const connection = await DuckDBConnection.create()

      for (const [viewName, fileName] of Object.entries(VIEW_FILES)) {
        await connection.run(
          `CREATE OR REPLACE VIEW ${viewName} AS SELECT * FROM read_parquet(${sqlString(snapshotArtifact(fileName))})`
        )
      }

      return connection
    })()

    // Evict a failed initialization. The identity check avoids clobbering a
    // newer attempt that may already have replaced this one. Callers still
    // observe the rejection through the promise they were handed.
    pending.catch(() => {
      if (connectionPromise === pending) {
        connectionPromise = null
      }
    })

    connectionPromise = pending
  }

  return connectionPromise
}
|
lib/hf-data.ts
CHANGED
|
@@ -138,6 +138,15 @@ function getManifestSignature(manifest: BackendManifest | null | undefined) {
|
|
| 138 |
// reading the same on-disk artifacts cannot diverge mid-test via background
|
| 139 |
// refresh, and useful generally for offline development.
|
| 140 |
const OFFLINE = process.env.HF_DATA_OFFLINE === "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
async function fetchRemoteJson<T>(relativePath: string): Promise<T> {
|
| 143 |
if (OFFLINE) {
|
|
@@ -423,6 +432,19 @@ async function fetchHFJson<T>(relativePath: string): Promise<T> {
|
|
| 423 |
}
|
| 424 |
|
| 425 |
export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
const snapshot = await getManifestSnapshot()
|
| 427 |
const currentManifest = getCurrentManifestFromSnapshot(snapshot)
|
| 428 |
const currentManifestSignature = getManifestSignature(currentManifest)
|
|
@@ -864,14 +886,26 @@ export async function fetchDevelopersList(): Promise<HFDeveloperEntry[]> {
|
|
| 864 |
}
|
| 865 |
|
| 866 |
export async function fetchBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
return fetchHFJson<Record<string, BenchmarkCard>>("benchmark-metadata.json")
|
| 868 |
}
|
| 869 |
|
| 870 |
export async function fetchBackendManifest(): Promise<BackendManifest> {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
return fetchHFJson<BackendManifest>("manifest.json")
|
| 872 |
}
|
| 873 |
|
| 874 |
export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
const raw = await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
|
| 876 |
return adaptEvalHierarchy(raw)
|
| 877 |
}
|
|
@@ -971,10 +1005,18 @@ function adaptEvalHierarchy(raw: EvalHierarchy): EvalHierarchy {
|
|
| 971 |
}
|
| 972 |
|
| 973 |
export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 974 |
return fetchHFJson<ComparisonIndex>("comparison-index.json")
|
| 975 |
}
|
| 976 |
|
| 977 |
export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 978 |
return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
|
| 979 |
}
|
| 980 |
|
|
|
|
| 138 |
// reading the same on-disk artifacts cannot diverge mid-test via background
|
| 139 |
// refresh, and useful generally for offline development.
|
| 140 |
const OFFLINE = process.env.HF_DATA_OFFLINE === "1"
|
| 141 |
+
const DATA_BACKEND_VERSION = process.env.DATA_BACKEND?.trim().toLowerCase()
|
| 142 |
+
|
| 143 |
+
/** Reports whether `DATA_BACKEND_VERSION` selects the view-layer backend ("v2" or "stage-j"). */
function useViewLayerBackend() {
  switch (DATA_BACKEND_VERSION) {
    case "v2":
    case "stage-j":
      return true
    default:
      return false
  }
}
|
| 146 |
+
|
| 147 |
+
/** Dynamically imports the snapshot sidecars module and resolves to it. */
async function fetchSnapshotSidecars() {
  const sidecarsModule = await import("@/lib/sidecars")
  return sidecarsModule
}
|
| 150 |
|
| 151 |
async function fetchRemoteJson<T>(relativePath: string): Promise<T> {
|
| 152 |
if (OFFLINE) {
|
|
|
|
| 432 |
}
|
| 433 |
|
| 434 |
export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
|
| 435 |
+
if (useViewLayerBackend()) {
|
| 436 |
+
const manifest = await (await fetchSnapshotSidecars()).fetchManifest()
|
| 437 |
+
return {
|
| 438 |
+
currentManifest: manifest,
|
| 439 |
+
latestManifest: manifest,
|
| 440 |
+
currentManifestSignature: manifest.generated_at,
|
| 441 |
+
latestManifestSignature: manifest.generated_at,
|
| 442 |
+
updateAvailable: false,
|
| 443 |
+
refreshing: false,
|
| 444 |
+
pendingRefreshCount: 0,
|
| 445 |
+
}
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
const snapshot = await getManifestSnapshot()
|
| 449 |
const currentManifest = getCurrentManifestFromSnapshot(snapshot)
|
| 450 |
const currentManifestSignature = getManifestSignature(currentManifest)
|
|
|
|
| 886 |
}
|
| 887 |
|
| 888 |
/**
 * Loads the benchmark metadata map.
 *
 * With the view-layer backend it is produced by `getBenchmarkMetadataMap()`
 * from the dynamically imported view-data module; otherwise it is read from
 * `benchmark-metadata.json`.
 */
export async function fetchBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
  if (!useViewLayerBackend()) {
    return fetchHFJson<Record<string, BenchmarkCard>>("benchmark-metadata.json")
  }

  const viewData = await import("@/lib/view-data")
  return viewData.getBenchmarkMetadataMap()
}
|
| 895 |
|
| 896 |
/**
 * Loads the backend manifest: from the snapshot sidecars with the view-layer
 * backend, otherwise from `manifest.json`.
 */
export async function fetchBackendManifest(): Promise<BackendManifest> {
  if (!useViewLayerBackend()) {
    return fetchHFJson<BackendManifest>("manifest.json")
  }

  const snapshot = await fetchSnapshotSidecars()
  return snapshot.fetchManifest()
}
|
| 903 |
|
| 904 |
/**
 * Loads the eval hierarchy and runs it through `adaptEvalHierarchy`.
 *
 * The raw hierarchy comes from the snapshot sidecars with the view-layer
 * backend, otherwise from `eval-hierarchy.json`.
 */
export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
  const raw = useViewLayerBackend()
    ? await (await fetchSnapshotSidecars()).fetchHierarchy()
    : await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")

  return adaptEvalHierarchy(raw)
}
|
|
|
|
| 1005 |
}
|
| 1006 |
|
| 1007 |
/**
 * Loads the comparison index: from the snapshot sidecars with the view-layer
 * backend, otherwise from `comparison-index.json`.
 */
export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
  if (!useViewLayerBackend()) {
    return fetchHFJson<ComparisonIndex>("comparison-index.json")
  }

  const snapshot = await fetchSnapshotSidecars()
  return snapshot.fetchComparisonIndex()
}
|
| 1014 |
|
| 1015 |
/**
 * Loads the corpus aggregates: the snapshot "headline" sidecar with the
 * view-layer backend, otherwise `corpus-aggregates.json`.
 *
 * NOTE(review): the legacy branch goes through `fetchHFJsonSafe` (presumably
 * tolerant of a missing artifact, given the `| null` return type), whereas the
 * view-layer branch propagates any `fetchHeadline()` rejection — confirm the
 * difference is intended.
 */
export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
  if (!useViewLayerBackend()) {
    return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
  }

  const snapshot = await fetchSnapshotSidecars()
  return snapshot.fetchHeadline()
}
|
| 1022 |
|
lib/sidecars.ts
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import "server-only"
|
| 2 |
+
|
| 3 |
+
import type {
|
| 4 |
+
BackendManifest,
|
| 5 |
+
ComparisonIndex,
|
| 6 |
+
CorpusAggregates,
|
| 7 |
+
EvalHierarchy,
|
| 8 |
+
} from "@/lib/backend-artifacts"
|
| 9 |
+
|
| 10 |
+
let cache: {
|
| 11 |
+
manifest?: Promise<BackendManifest>
|
| 12 |
+
headline?: Promise<CorpusAggregates>
|
| 13 |
+
hierarchy?: Promise<EvalHierarchy>
|
| 14 |
+
comparisonIndex?: Promise<ComparisonIndex>
|
| 15 |
+
} = {}
|
| 16 |
+
|
| 17 |
+
/**
 * Reads the snapshot base URL from the `SNAPSHOT_URL` environment variable.
 *
 * @returns The trimmed value with any trailing slashes removed.
 * @throws Error when the variable is unset or blank.
 */
function getSnapshotUrl() {
  const configured = (process.env.SNAPSHOT_URL ?? "").trim()

  if (configured.length === 0) {
    throw new Error("DATA_BACKEND=v2 requires SNAPSHOT_URL to point at a Stage J snapshot directory")
  }

  // Strip trailing slashes so callers can append "/<name>" without doubling them.
  return configured.replace(/\/+$/, "")
}
|
| 25 |
+
|
| 26 |
+
/** Builds the full location of a sidecar file by appending `name` to the snapshot base URL. */
function sidecarUrl(name: string) {
  return [getSnapshotUrl(), name].join("/")
}
|
| 29 |
+
|
| 30 |
+
/**
 * Loads and parses one JSON sidecar from the snapshot location.
 *
 * `file://` locations are read from disk; anything else is fetched over HTTP
 * with a one-hour `revalidate` hint.
 *
 * @param name Sidecar file name, relative to the snapshot base URL.
 * @throws Error when the HTTP response is not OK; read and parse errors propagate.
 */
async function fetchJson<T>(name: string): Promise<T> {
  const target = sidecarUrl(name)
  const isLocalFile = target.startsWith("file://")

  if (!isLocalFile) {
    const response = await fetch(target, { next: { revalidate: 3600 } })
    if (response.ok) {
      return (await response.json()) as T
    }

    throw new Error(`Snapshot sidecar fetch failed: ${response.status} ${response.statusText} for ${target}`)
  }

  const { readFile } = await import("fs/promises")
  const contents = await readFile(new URL(target), "utf8")
  return JSON.parse(contents) as T
}
|
| 46 |
+
|
| 47 |
+
/**
 * Returns the snapshot manifest (`manifest.json`), memoizing the in-flight or
 * resolved promise so the sidecar is loaded at most once.
 *
 * A failed load is evicted from the cache so the next call retries instead of
 * replaying the same rejection for the lifetime of the process.
 */
export function fetchManifest(): Promise<BackendManifest> {
  let pending = cache.manifest

  if (!pending) {
    const request = fetchJson<BackendManifest>("manifest.json")

    // The identity check avoids evicting a newer request (or one stored after
    // a cache reset). Callers still observe the rejection via `request`.
    request.catch(() => {
      if (cache.manifest === request) {
        delete cache.manifest
      }
    })

    cache.manifest = request
    pending = request
  }

  return pending
}
|
| 50 |
+
|
| 51 |
+
/**
 * Returns the snapshot headline aggregates (`headline.json`), memoizing the
 * in-flight or resolved promise so the sidecar is loaded at most once.
 *
 * A failed load is evicted from the cache so the next call retries instead of
 * replaying the same rejection for the lifetime of the process.
 */
export function fetchHeadline(): Promise<CorpusAggregates> {
  let pending = cache.headline

  if (!pending) {
    const request = fetchJson<CorpusAggregates>("headline.json")

    // The identity check avoids evicting a newer request (or one stored after
    // a cache reset). Callers still observe the rejection via `request`.
    request.catch(() => {
      if (cache.headline === request) {
        delete cache.headline
      }
    })

    cache.headline = request
    pending = request
  }

  return pending
}
|
| 54 |
+
|
| 55 |
+
/**
 * Returns the snapshot eval hierarchy (`hierarchy.json`), memoizing the
 * in-flight or resolved promise so the sidecar is loaded at most once.
 *
 * A failed load is evicted from the cache so the next call retries instead of
 * replaying the same rejection for the lifetime of the process.
 */
export function fetchHierarchy(): Promise<EvalHierarchy> {
  let pending = cache.hierarchy

  if (!pending) {
    const request = fetchJson<EvalHierarchy>("hierarchy.json")

    // The identity check avoids evicting a newer request (or one stored after
    // a cache reset). Callers still observe the rejection via `request`.
    request.catch(() => {
      if (cache.hierarchy === request) {
        delete cache.hierarchy
      }
    })

    cache.hierarchy = request
    pending = request
  }

  return pending
}
|
| 58 |
+
|
| 59 |
+
/**
 * Returns the snapshot comparison index (`comparison-index.json`), memoizing
 * the in-flight or resolved promise so the sidecar is loaded at most once.
 *
 * A failed load is evicted from the cache so the next call retries instead of
 * replaying the same rejection for the lifetime of the process.
 */
export function fetchComparisonIndex(): Promise<ComparisonIndex> {
  let pending = cache.comparisonIndex

  if (!pending) {
    const request = fetchJson<ComparisonIndex>("comparison-index.json")

    // The identity check avoids evicting a newer request (or one stored after
    // a cache reset). Callers still observe the rejection via `request`.
    request.catch(() => {
      if (cache.comparisonIndex === request) {
        delete cache.comparisonIndex
      }
    })

    cache.comparisonIndex = request
    pending = request
  }

  return pending
}
|
| 62 |
+
|
| 63 |
+
/**
 * Test hook: drops every memoized sidecar promise by replacing the module
 * cache with a fresh empty object, so the next fetch reloads its sidecar.
 */
export function resetSidecarCacheForTests() {
  cache = {}
}
|
lib/view-data.ts
ADDED
|
@@ -0,0 +1,576 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import "server-only"
|
| 2 |
+
|
| 3 |
+
import { getConnection } from "@/lib/duckdb"
|
| 4 |
+
import { fetchHeadline } from "@/lib/sidecars"
|
| 5 |
+
import {
|
| 6 |
+
EVALUATION_CATEGORIES,
|
| 7 |
+
type BenchmarkCard,
|
| 8 |
+
type BenchmarkEvaluation,
|
| 9 |
+
type CategoryType,
|
| 10 |
+
type EvaluationCardData,
|
| 11 |
+
type EvaluationResult,
|
| 12 |
+
type GenerationConfig,
|
| 13 |
+
type MetricConfig,
|
| 14 |
+
type ModelInfo,
|
| 15 |
+
type ModelEvaluationSummary,
|
| 16 |
+
type ModelVariantSummary,
|
| 17 |
+
type ScoreDetails,
|
| 18 |
+
type SourceData,
|
| 19 |
+
type SourceMetadata,
|
| 20 |
+
} from "@/lib/benchmark-schema"
|
| 21 |
+
import type { DeveloperListEntry } from "@/lib/backend-artifacts"
|
| 22 |
+
import type {
|
| 23 |
+
BenchmarkEvalListItem,
|
| 24 |
+
BenchmarkEvalSummary,
|
| 25 |
+
ModelResultForBenchmark,
|
| 26 |
+
} from "@/lib/eval-processing"
|
| 27 |
+
|
| 28 |
+
type Row = Record<string, any>
|
| 29 |
+
|
| 30 |
+
const MODEL_CARD_COLUMNS = `
|
| 31 |
+
id, model_key, route_id, model_name, model_id, canonical_model_name, developer,
|
| 32 |
+
evaluations_count, benchmarks_count, variant_count,
|
| 33 |
+
categories, category_stats, latest_timestamp,
|
| 34 |
+
evaluator_count, evaluator_names, source_type_count, source_types,
|
| 35 |
+
evidence_count, missing_generation_config_count,
|
| 36 |
+
third_party_eval_count, independent_verification_ratio,
|
| 37 |
+
reproducibility_status, eval_libraries, latest_source_name,
|
| 38 |
+
params_billions, benchmark_names, score_summary,
|
| 39 |
+
reproducibility_summary, provenance_summary, comparability_summary,
|
| 40 |
+
top_scores, source_urls, detail_urls,
|
| 41 |
+
model_url, release_date,
|
| 42 |
+
architecture, params, inference_engine, inference_platform
|
| 43 |
+
`
|
| 44 |
+
|
| 45 |
+
const EVAL_LIST_COLUMNS = `
|
| 46 |
+
evaluation_id, evaluation_name, canonical_display_name,
|
| 47 |
+
composite_benchmark_key, composite_benchmark_name,
|
| 48 |
+
benchmark_family_key, benchmark_leaf_key, category,
|
| 49 |
+
metric_config, models_count, evaluator_names, source_types,
|
| 50 |
+
latest_source_name, third_party_ratio,
|
| 51 |
+
missing_generation_config_count, best_model, worst_model,
|
| 52 |
+
avg_score, avg_score_norm, has_card, benchmark_card,
|
| 53 |
+
is_aggregated, aggregate_sources, tags,
|
| 54 |
+
metrics_count, metric_names, instance_data, top_score,
|
| 55 |
+
subtasks_count, is_summary_score, summary_eval_ids,
|
| 56 |
+
root_metrics, subtasks, leaderboard_metrics,
|
| 57 |
+
reproducibility_summary, provenance_summary, comparability_summary,
|
| 58 |
+
source_data
|
| 59 |
+
`
|
| 60 |
+
|
| 61 |
+
const CELL_JOIN_COLUMNS = `
|
| 62 |
+
r.*,
|
| 63 |
+
e.evaluation_name AS eval_evaluation_name,
|
| 64 |
+
e.canonical_display_name AS eval_canonical_display_name,
|
| 65 |
+
e.composite_benchmark_key AS eval_composite_benchmark_key,
|
| 66 |
+
e.composite_benchmark_name AS eval_composite_benchmark_name,
|
| 67 |
+
e.benchmark_family_key AS eval_benchmark_family_key,
|
| 68 |
+
e.benchmark_leaf_key AS eval_benchmark_leaf_key,
|
| 69 |
+
e.category AS eval_category,
|
| 70 |
+
e.metric_config AS eval_metric_config,
|
| 71 |
+
e.source_data AS eval_source_data,
|
| 72 |
+
e.benchmark_card AS eval_benchmark_card,
|
| 73 |
+
e.tags AS eval_tags,
|
| 74 |
+
e.is_summary_score AS eval_is_summary_score,
|
| 75 |
+
e.summary_eval_ids AS eval_summary_eval_ids
|
| 76 |
+
`
|
| 77 |
+
|
| 78 |
+
/**
 * Recursively converts a value read from DuckDB into plain JSON-friendly data.
 *
 * - `bigint` becomes `number`.
 * - `Date` becomes an ISO string.
 * - `Map` and DuckDB map wrappers become plain objects with string keys.
 * - Arrays and DuckDB list/array wrappers become arrays of normalized items.
 * - DuckDB struct wrappers become normalized plain objects.
 * - DuckDB decimal wrappers become `number`; any other `DuckDB*` wrapper with
 *   a `toString` becomes that string.
 * - Any other object is rebuilt with its own enumerable entries normalized.
 * - Every remaining value (primitives, `null`, `undefined`, functions) is
 *   returned unchanged.
 *
 * DuckDB wrappers are recognised by constructor name rather than `instanceof`.
 */
function normalizeDuckDBValue(value: unknown): unknown {
  if (typeof value === "bigint") {
    return Number(value)
  }

  // Anything that is not a non-null object needs no further conversion.
  if (value === null || typeof value !== "object") {
    return value
  }

  if (value instanceof Date) {
    return value.toISOString()
  }

  if (value instanceof Map) {
    const pairs = [...value.entries()].map(([key, inner]) => [String(key), normalizeDuckDBValue(inner)])
    return Object.fromEntries(pairs)
  }

  if (Array.isArray(value)) {
    return value.map((item) => normalizeDuckDBValue(item))
  }

  const wrapper = value as {
    constructor?: { name?: string }
    entries?: unknown
    items?: unknown
    scale?: unknown
    value?: unknown
    toString?: () => string
  }
  const typeName = wrapper.constructor?.name ?? ""

  if (typeName === "DuckDBStructValue" && wrapper.entries && typeof wrapper.entries === "object") {
    return normalizeDuckDBValue(wrapper.entries)
  }

  const isListLike = typeName === "DuckDBListValue" || typeName === "DuckDBArrayValue"
  if (isListLike && Array.isArray(wrapper.items)) {
    return wrapper.items.map((item) => normalizeDuckDBValue(item))
  }

  if (typeName === "DuckDBMapValue" && Array.isArray(wrapper.entries)) {
    const pairs = wrapper.entries.map((entry) => {
      const pair = entry as { key: unknown; value: unknown }
      return [String(pair.key), normalizeDuckDBValue(pair.value)]
    })
    return Object.fromEntries(pairs)
  }

  if (typeName === "DuckDBDecimalValue" && typeof wrapper.toString === "function") {
    return Number(wrapper.toString())
  }

  // Catch-all for the remaining DuckDB wrapper classes: use their string form.
  if (typeName.startsWith("DuckDB") && typeof wrapper.toString === "function") {
    return wrapper.toString()
  }

  const plainPairs = Object.entries(value).map(([key, inner]) => [key, normalizeDuckDBValue(inner)])
  return Object.fromEntries(plainPairs)
}
|
| 143 |
+
|
| 144 |
+
/**
 * Runs `sql` on the shared connection and returns every row as a normalized
 * plain object.
 *
 * @param sql Statement to execute.
 * @param params Optional bound values; only passed to the driver when non-empty.
 */
async function readRows<T = Row>(sql: string, params: unknown[] = []): Promise<T[]> {
  const connection = await getConnection()
  const hasParams = params.length > 0

  const reader = hasParams
    ? await connection.runAndReadAll(sql, params as any[])
    : await connection.runAndReadAll(sql)

  const rows: T[] = []
  for (const row of reader.getRowObjects()) {
    rows.push(normalizeDuckDBValue(row) as T)
  }
  return rows
}
|
| 151 |
+
|
| 152 |
+
/**
 * Coerces `value` to a finite number.
 *
 * Finite numbers pass through, bigints are converted, and non-blank strings
 * are parsed with `Number()`. Anything else — including non-finite numbers
 * and strings that do not parse to a finite number — yields `fallback`.
 */
function asNumber(value: unknown, fallback = 0) {
  switch (typeof value) {
    case "number":
      return Number.isFinite(value) ? value : fallback
    case "bigint":
      return Number(value)
    case "string": {
      if (value.trim() === "") {
        return fallback
      }
      const parsed = Number(value)
      return Number.isFinite(parsed) ? parsed : fallback
    }
    default:
      return fallback
  }
}
|
| 161 |
+
|
| 162 |
+
/**
 * Like `asNumber`, but returns `undefined` instead of a fallback when `value`
 * is null/undefined or cannot be coerced to a finite number.
 */
function optionalNumber(value: unknown) {
  if (value === null || value === undefined) {
    return undefined
  }

  // NaN as the fallback marks "could not coerce"; it is filtered out below.
  const parsed = asNumber(value, Number.NaN)
  if (!Number.isFinite(parsed)) {
    return undefined
  }
  return parsed
}
|
| 167 |
+
|
| 168 |
+
/** Returns `value` when it is a string (including the empty string), otherwise `fallback`. */
function asString(value: unknown, fallback = "") {
  if (typeof value !== "string") {
    return fallback
  }
  return value
}
|
| 171 |
+
|
| 172 |
+
/** Returns `value` when it is a non-empty string, otherwise `undefined`. */
function optionalString(value: unknown) {
  if (typeof value !== "string" || value.length === 0) {
    return undefined
  }
  return value
}
|
| 175 |
+
|
| 176 |
+
/**
 * Returns `value` itself when it is an array, otherwise a new empty array.
 * Element types are asserted, not checked.
 */
function asArray<T>(value: unknown): T[] {
  if (!Array.isArray(value)) {
    return []
  }
  return value as T[]
}
|
| 179 |
+
|
| 180 |
+
/** Returns `value` when it is one of `EVALUATION_CATEGORIES`, otherwise `"General"`. */
function normalizeCategory(value: unknown): CategoryType {
  const candidate = value as CategoryType
  if (EVALUATION_CATEGORIES.includes(candidate)) {
    return candidate
  }
  return "General"
}
|
| 185 |
+
|
| 186 |
+
/** Builds a record with one fresh empty evaluation list per entry of `EVALUATION_CATEGORIES`. */
function emptyEvaluationsByCategory(): Record<CategoryType, BenchmarkEvaluation[]> {
  const buckets = {} as Record<CategoryType, BenchmarkEvaluation[]>
  for (const category of EVALUATION_CATEGORIES) {
    buckets[category] = []
  }
  return buckets
}
|
| 192 |
+
|
| 193 |
+
/**
 * Returns the row's `source_metadata` object when present; otherwise
 * synthesizes a placeholder whose organization name is taken from
 * `latest_source_name` (or "Unknown").
 */
function sourceMetadataFromRow(row: Row): SourceMetadata {
  const existing = row.source_metadata

  if (!existing || typeof existing !== "object") {
    return {
      source_type: "documentation",
      source_organization_name: asString(row.latest_source_name, "Unknown"),
      evaluator_relationship: "other",
    }
  }

  return existing as SourceMetadata
}
|
| 204 |
+
|
| 205 |
+
/**
 * Returns the row's `source_data` (falling back to the joined
 * `eval_source_data`). When neither is set, synthesizes a minimal entry whose
 * dataset name is taken from the evaluation name or benchmark id.
 */
function sourceDataFromRow(row: Row): BenchmarkEvaluation["source_data"] {
  const existing = row.source_data ?? row.eval_source_data

  if (!existing) {
    const fallbackName = row.eval_evaluation_name ?? row.evaluation_name ?? row.benchmark_id
    return {
      dataset_name: asString(fallbackName, "Unknown dataset"),
    } satisfies SourceData
  }

  return existing as BenchmarkEvaluation["source_data"]
}
|
| 215 |
+
|
| 216 |
+
/**
 * Builds score details for a row: the row's `score_details` object (if any)
 * with `score` coerced to a number, preferring `score_details.score` over the
 * row-level `score` column.
 */
function scoreDetailsFromRow(row: Row): ScoreDetails {
  const raw = row.score_details
  const details: Partial<ScoreDetails> = raw && typeof raw === "object" ? raw : {}
  const score = asNumber(details.score ?? row.score)

  return { ...details, score } as ScoreDetails
}
|
| 227 |
+
|
| 228 |
+
/**
 * Builds a `MetricConfig` for a row from its `metric_config` (or the joined
 * `eval_metric_config`) plus row-level metric columns.
 *
 * - The description falls back through the metric description, the metric
 *   display name and the evaluation names.
 * - `score_type` is kept only when it is "binary" or "discrete"; anything
 *   else becomes "continuous".
 * - Row-level `lower_is_better` and `metric_unit` take precedence over the
 *   config, while config `min_score`/`max_score` take precedence over the row.
 */
function metricConfigFromRow(row: Row): MetricConfig {
  const config = (row.metric_config ?? row.eval_metric_config ?? {}) as Partial<MetricConfig>

  const description =
    config.evaluation_description ??
    row.metric_description ??
    row.metric_display_name ??
    row.eval_evaluation_name ??
    row.evaluation_name
  const scoreType = config.score_type === "binary" || config.score_type === "discrete"
    ? config.score_type
    : "continuous"
  const lowerIsBetter = row.lower_is_better ?? config.lower_is_better ?? false

  return {
    evaluation_description: asString(description, ""),
    lower_is_better: Boolean(lowerIsBetter),
    score_type: scoreType,
    min_score: optionalNumber(config.min_score ?? row.min_score),
    max_score: optionalNumber(config.max_score ?? row.max_score),
    unit: optionalString(row.metric_unit ?? config.unit),
  }
}
|
| 250 |
+
|
| 251 |
+
/**
 * Builds a `ModelInfo` from a model row.
 *
 * The display name falls back through model name, family name, model id and
 * model key. The id prefers `model_key`, then `model_id`, `id` and `route_id`.
 */
function modelInfoFromModelRow(row: Row): ModelInfo {
  const displayName = row.model_name ?? row.model_family_name ?? row.model_id ?? row.model_key
  const identifier = row.model_key ?? row.model_id ?? row.id ?? row.route_id

  return {
    name: asString(displayName, "Unknown model"),
    id: asString(identifier, "unknown-model"),
    developer: optionalString(row.developer),
    inference_platform: optionalString(row.inference_platform),
    inference_engine: optionalString(row.inference_engine),
    architecture: optionalString(row.architecture),
    parameter_count: optionalString(row.params),
    release_date: optionalString(row.release_date),
    model_url: optionalString(row.model_url),
    additional_details: {
      params_billions: row.params_billions,
    },
    modalities: {
      input: asArray<string>(row.input_modalities),
      output: asArray<string>(row.output_modalities),
    },
  }
}
|
| 271 |
+
|
| 272 |
+
/**
 * Converts one result cell row into an `EvaluationResult`.
 *
 * The result name prefers the metric display name, then the joined evaluation
 * name, then the metric id ("Score" if none is a string). The `evalcards`
 * wrapper is attached only when the row carries annotations.
 */
function resultFromCell(row: Row): EvaluationResult {
  const scoreDetails = scoreDetailsFromRow(row)
  const generationConfig = row.generation_config as GenerationConfig | undefined
  const annotations = row.evalcards_annotations

  const metricLabel = optionalString(row.metric_display_name)
  const resultName = row.metric_display_name ?? row.eval_evaluation_name ?? row.metric_id

  return {
    evaluation_name: asString(resultName, "Score"),
    display_name: metricLabel,
    canonical_display_name: metricLabel,
    metric_summary_id: optionalString(row.metric_summary_id),
    metric_key: optionalString(row.metric_id),
    evaluation_timestamp: asString(row.evaluation_timestamp, ""),
    source_data: sourceDataFromRow(row),
    metric_config: metricConfigFromRow(row),
    score_details: scoreDetails,
    generation_config: generationConfig,
    detailed_evaluation_results_url: optionalString(row.instance_file_path),
    evalcards: annotations ? { annotations } : undefined,
  }
}
|
| 292 |
+
|
| 293 |
+
/**
 * Converts one result cell row into a `ModelResultForBenchmark`.
 *
 * Uses the row's `model_info` when present and otherwise derives it from the
 * row's model columns.
 */
function reshapeCellToModelResult(row: Row): ModelResultForBenchmark {
  type AggregateComponent = NonNullable<ModelResultForBenchmark["aggregate_components"]>[number]

  const scoreDetails = scoreDetailsFromRow(row)
  const modelInfo = (row.model_info ?? modelInfoFromModelRow(row)) as ModelInfo

  return {
    model_info: modelInfo,
    model_route_id: optionalString(row.model_route_id),
    score: scoreDetails.score,
    score_details: scoreDetails,
    evaluation_timestamp: asString(row.evaluation_timestamp, ""),
    source_metadata: sourceMetadataFromRow(row),
    source_data: sourceDataFromRow(row),
    source_record_url: optionalString(row.source_record_url),
    aggregate_components: asArray<AggregateComponent>(row.aggregate_components),
    result: resultFromCell(row),
  }
}
|
| 311 |
+
|
| 312 |
+
/**
 * Converts one result cell row (joined with its eval columns, prefixed
 * `eval_`) into a single-result `BenchmarkEvaluation`.
 *
 * The composite benchmark name is used for both the family name and the
 * parent name. The evaluation name doubles as the display name and leaf name.
 */
function reshapeCellToBenchmarkEvaluation(row: Row): BenchmarkEvaluation {
  const result = resultFromCell(row)
  const modelInfo = (row.model_info ?? modelInfoFromModelRow(row)) as ModelInfo

  const evalName = optionalString(row.eval_evaluation_name)
  const compositeName = optionalString(row.eval_composite_benchmark_name)
  const timestamp = asString(row.evaluation_timestamp, "")

  return {
    schema_version: "1.0",
    eval_summary_id: optionalString(row.evaluation_id),
    evaluation_id: asString(row.evaluation_id ?? row.benchmark_id, "unknown-evaluation"),
    retrieved_timestamp: timestamp,
    benchmark: optionalString(row.eval_evaluation_name ?? row.benchmark_id),
    display_name: evalName,
    canonical_display_name: optionalString(row.eval_canonical_display_name),
    category: normalizeCategory(row.eval_category ?? row.category),
    benchmark_family_key: optionalString(row.eval_benchmark_family_key),
    benchmark_family_name: compositeName,
    benchmark_parent_key: optionalString(row.eval_composite_benchmark_key),
    benchmark_parent_name: compositeName,
    benchmark_leaf_key: optionalString(row.eval_benchmark_leaf_key),
    benchmark_leaf_name: evalName,
    is_summary_score: Boolean(row.eval_is_summary_score ?? row.is_summary_score),
    source_data: sourceDataFromRow(row),
    source_metadata: sourceMetadataFromRow(row),
    eval_library: row.eval_library,
    model_info: modelInfo,
    generation_config: row.generation_config,
    evaluation_results: [result],
  }
}
|
| 340 |
+
|
| 341 |
+
/**
 * Assembles a `ModelEvaluationSummary` from one `models_view` row and the
 * model's per-cell result rows.
 *
 * @param modelRow - Row carrying the model metadata, counts, signal summaries and variants.
 * @param cellRows - Result rows; each one is reshaped into a benchmark evaluation and
 *   filed under its normalised category.
 * @returns The summary: shared core fields, addressable identifiers and one entry per variant.
 */
function modelSummaryFromRows(modelRow: Row, cellRows: Row[]): ModelEvaluationSummary {
  // Bucket every reshaped cell under its category.
  const evaluationsByCategory = emptyEvaluationsByCategory()
  cellRows.forEach((cellRow) => {
    const evaluation = reshapeCellToBenchmarkEvaluation(cellRow)
    evaluationsByCategory[normalizeCategory(evaluation.category)].push(evaluation)
  })

  // Categories declared on the row win; otherwise derive them from the buckets that have entries.
  const declaredCategories = asArray<CategoryType>(modelRow.categories).filter((category) =>
    EVALUATION_CATEGORIES.includes(category)
  )
  const coveredCategories =
    declaredCategories.length > 0
      ? declaredCategories
      : EVALUATION_CATEGORIES.filter((category) => evaluationsByCategory[category].length > 0)

  const modelInfo = (modelRow.model_info ?? modelInfoFromModelRow(modelRow)) as ModelInfo
  const totalEvaluations = asNumber(modelRow.total_evaluations ?? modelRow.evaluations_count)
  const lastUpdated = asString(modelRow.last_updated ?? modelRow.latest_timestamp, "")
  const rawModelIds = asArray<string>(modelRow.raw_model_ids)

  // Fields shared by the family-level summary and every variant.
  const core = {
    model_info: modelInfo,
    evaluations_by_category: evaluationsByCategory,
    total_evaluations: totalEvaluations,
    last_updated: lastUpdated,
    categories_covered: coveredCategories,
    reproducibility_summary: modelRow.reproducibility_summary,
    provenance_summary: modelRow.provenance_summary,
    comparability_summary: modelRow.comparability_summary,
  }

  const variants = asArray<Row>(modelRow.variants).map((variant, index) => {
    const positionalKey = `variant-${index}`
    const variantCategories = asArray<CategoryType>(variant.categories_covered)
    const preferredName = variant.variant_display_name ?? variant.variant_label

    // Key order matters: core first, raw variant columns next, normalised fields last.
    return {
      ...core,
      ...variant,
      variant_id: asString(variant.variant_id ?? variant.variant_key, positionalKey),
      variant_key: asString(variant.variant_key, positionalKey),
      variant_label: asString(variant.variant_label ?? variant.variant_display_name, "Default"),
      variant_display_name: asString(preferredName ?? modelRow.model_name, modelRow.model_name),
      raw_model_ids: asArray<string>(variant.raw_model_ids),
      family_id: asString(variant.family_id ?? modelRow.model_family_id, modelRow.model_family_id),
      family_name: asString(variant.family_name ?? modelRow.model_family_name, modelRow.model_family_name),
      total_evaluations: asNumber(variant.total_evaluations ?? totalEvaluations),
      last_updated: asString(variant.last_updated ?? lastUpdated, lastUpdated),
      categories_covered: variantCategories.length > 0 ? variantCategories : core.categories_covered,
      model_info: {
        ...modelInfo,
        name: asString(preferredName ?? modelInfo.name, modelInfo.name),
      },
    }
  }) as ModelVariantSummary[]

  const addressableId = modelRow.model_key ?? modelRow.model_id

  return {
    ...core,
    model_family_id: asString(modelRow.model_family_id ?? addressableId, addressableId),
    model_route_id: asString(modelRow.model_route_id ?? modelRow.route_id, modelRow.route_id),
    model_family_name: asString(modelRow.model_family_name ?? modelRow.model_name, modelRow.model_name),
    raw_model_ids: rawModelIds.length > 0 ? rawModelIds : [asString(addressableId, "")].filter(Boolean),
    variants,
  }
}
|
| 400 |
+
|
| 401 |
+
/**
 * Reads every scored result cell for one model, joined with its eval metadata.
 *
 * The lookup is by `model_key`, the producer's addressable identifier. It is
 * non-null for both resolved and unresolved models (unresolved ones fall back
 * to the raw source name), whereas filtering on `model_id` alone would
 * silently drop unresolved models.
 *
 * @param modelKey - The model's `model_key` value.
 * @returns Joined rows ordered by category, then percentile descending (NULLs last).
 */
async function getModelEvaluationRows(modelKey: string): Promise<Row[]> {
  const sql = `SELECT ${CELL_JOIN_COLUMNS}
     FROM eval_results_view r
     LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
     WHERE r.model_key = ?
       AND r.score IS NOT NULL
     ORDER BY r.category, r.percentile DESC NULLS LAST`
  const rows = await readRows<Row>(sql, [modelKey])
  return rows
}
|
| 415 |
+
|
| 416 |
+
/**
 * Lists every model card, most recently updated first.
 *
 * @returns One `EvaluationCardData` per `models_view` row, ordered by
 *   `latest_timestamp` descending with NULL timestamps last.
 */
export async function getModelCards(): Promise<EvaluationCardData[]> {
  const sql = `SELECT ${MODEL_CARD_COLUMNS}
     FROM models_view
     ORDER BY latest_timestamp DESC NULLS LAST`
  const cards = await readRows<EvaluationCardData>(sql)
  return cards
}
|
| 423 |
+
|
| 424 |
+
/**
 * Lists every model card ranked by coverage rather than recency.
 *
 * Projects the same `MODEL_CARD_COLUMNS` as `getModelCards`; only the
 * ordering differs.
 *
 * @returns Cards ordered by benchmark count, then evaluation count (both
 *   descending, NULLs last), then model name ascending.
 */
export async function getModelCardsLite(): Promise<EvaluationCardData[]> {
  const sql = `SELECT ${MODEL_CARD_COLUMNS}
     FROM models_view
     ORDER BY benchmarks_count DESC NULLS LAST, evaluations_count DESC NULLS LAST, model_name ASC`
  const cards = await readRows<EvaluationCardData>(sql)
  return cards
}
|
| 431 |
+
|
| 432 |
+
/**
 * Loads the eval list together with the total number of models.
 *
 * Both queries are started before either is awaited, so they run concurrently.
 *
 * @returns `evals` ordered by `evaluation_name` ascending, and `totalModels`,
 *   the row count of `models_view` passed through `asNumber`.
 */
export async function getEvalListData(): Promise<{
  evals: BenchmarkEvalListItem[]
  totalModels: number
}> {
  const evalsQuery = readRows<BenchmarkEvalListItem>(
    `SELECT ${EVAL_LIST_COLUMNS}
     FROM evals_view
     ORDER BY evaluation_name ASC`
  )
  const modelCountQuery = readRows<{ n: number }>("SELECT COUNT(*) AS n FROM models_view")

  const [evals, countRows] = await Promise.all([evalsQuery, modelCountQuery])
  const totalModels = asNumber(countRows[0]?.n)

  return { evals, totalModels }
}
|
| 450 |
+
|
| 451 |
+
/**
 * "Lite" variant of the eval list. It delegates straight to `getEvalListData`
 * and returns exactly the same payload.
 *
 * @returns The same `{ evals, totalModels }` object as `getEvalListData`.
 */
export async function getEvalListLiteData(): Promise<{
  evals: BenchmarkEvalListItem[]
  totalModels: number
}> {
  const listData = await getEvalListData()
  return listData
}
|
| 457 |
+
|
| 458 |
+
/**
 * Convenience accessor returning only the eval list items.
 *
 * @returns The `evals` array produced by `getEvalListData`.
 */
export async function getEvalList() {
  const listData = await getEvalListData()
  return listData.evals
}
|
| 462 |
+
|
| 463 |
+
/**
 * Loads what the dashboard needs: all model cards and all eval list items.
 *
 * The two reads are independent, so both are started before awaiting.
 *
 * @returns `{ models, evals }`.
 */
export async function getDashboardData() {
  const modelsRequest = getModelCards()
  const evalsRequest = getEvalList()
  const [models, evals] = await Promise.all([modelsRequest, evalsRequest])
  return { models, evals }
}
|
| 470 |
+
|
| 471 |
+
/**
 * Resolves a route identifier to a full model summary.
 *
 * Lookups use the addressable identifiers (`model_key`, `route_id`,
 * `model_route_id`, `model_family_id`) so unresolved models, whose `model_id`
 * is NULL, are still findable. `model_id` stays in the OR chain as a
 * back-compat fallback for old links.
 *
 * Because the identifier is compared against five columns, more than one row
 * can match. The `ORDER BY` ranks matches by column in the order listed above
 * (with `model_id` last), so `LIMIT 1` picks a predictable row rather than
 * whichever the scan returns first.
 *
 * @param routeId - Identifier taken from the URL.
 * @returns The assembled summary, or `null` when no row matches.
 */
export async function getModelSummaryById(routeId: string): Promise<ModelEvaluationSummary | null> {
  // Five placeholders in WHERE plus four in the CASE ranking, all bound to the same value.
  const placeholderCount = 9
  const rows = await readRows<Row>(
    `SELECT *
     FROM models_view
     WHERE model_key = ? OR route_id = ? OR model_route_id = ? OR model_family_id = ? OR model_id = ?
     ORDER BY CASE
         WHEN model_key = ? THEN 0
         WHEN route_id = ? THEN 1
         WHEN model_route_id = ? THEN 2
         WHEN model_family_id = ? THEN 3
         ELSE 4
       END,
       model_key
     LIMIT 1`,
    Array.from({ length: placeholderCount }, () => routeId)
  )
  const modelRow = rows[0]
  if (!modelRow) return null

  // Cells are keyed by model_key; fall back to model_id, then to the requested id.
  const cellRows = await getModelEvaluationRows(asString(modelRow.model_key ?? modelRow.model_id, routeId))
  return modelSummaryFromRows(modelRow, cellRows)
}
|
| 489 |
+
|
| 490 |
+
/**
 * Loads one eval's metadata row plus its per-model results.
 *
 * Results are restricted to the eval's primary metric when it has one. If
 * that yields nothing (no primary metric, or no scored cells for it), every
 * scored cell of the eval is returned instead.
 *
 * The primary metric id is read from the eval row already fetched rather
 * than through a scalar subquery. That saves a lookup, skips a query that
 * could never match when the primary metric is NULL, and avoids the
 * multi-row scalar-subquery error some DuckDB versions raise if
 * `evals_view` ever holds duplicate `evaluation_id` rows.
 *
 * @param evalId - The `evaluation_id` to load.
 * @returns The eval row with `model_results` attached, or `null` when the eval does not exist.
 */
export async function getEvalSummaryById(evalId: string): Promise<BenchmarkEvalSummary | null> {
  const evalRows = await readRows<Row>(
    "SELECT * FROM evals_view WHERE evaluation_id = ? LIMIT 1",
    [evalId]
  )
  const evalRow = evalRows[0]
  if (!evalRow) return null

  // Single query template; the metric filter is only added when a metric id is supplied.
  const readScoredCells = (metricId?: string): Promise<Row[]> =>
    readRows<Row>(
      `SELECT ${CELL_JOIN_COLUMNS}
       FROM eval_results_view r
       LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
       WHERE r.evaluation_id = ?
         ${metricId === undefined ? "" : "AND r.metric_id = ?"}
         AND r.score IS NOT NULL
       ORDER BY r.position ASC NULLS LAST`,
      metricId === undefined ? [evalId] : [evalId, metricId]
    )

  const primaryMetricId = asString(evalRow.primary_metric_id, "")
  let cellRows: Row[] = primaryMetricId ? await readScoredCells(primaryMetricId) : []

  // Fall back to all scored cells when the primary metric produced no rows.
  if (cellRows.length === 0) {
    cellRows = await readScoredCells()
  }

  return {
    ...evalRow,
    model_results: cellRows.map(reshapeCellToModelResult),
  } as BenchmarkEvalSummary
}
|
| 526 |
+
|
| 527 |
+
/**
 * Lists developers from the headline sidecar, sorted alphabetically.
 *
 * The headline's array is copied before sorting so it is not mutated.
 *
 * @returns Developer entries ordered by `developer` using `localeCompare`;
 *   an empty array when the headline has no `developers`.
 */
export async function getDeveloperList(): Promise<DeveloperListEntry[]> {
  const headline = await fetchHeadline()
  const sortedDevelopers = Array.from(headline.developers ?? [])
  sortedDevelopers.sort((left, right) => left.developer.localeCompare(right.developer))
  return sortedDevelopers
}
|
| 531 |
+
|
| 532 |
+
/**
 * Loads a developer's headline entry together with that developer's model cards.
 *
 * The developer is found by its `route_id` in the developer list, and the
 * entry's `developer` string is what filters `models_view`.
 *
 * @param routeId - The developer's `route_id`.
 * @returns The headline entry extended with `models`, or `null` when no entry has that route id.
 */
export async function getDeveloperSummaryById(routeId: string) {
  const allDevelopers = await getDeveloperList()
  const developer = allDevelopers.find((candidate) => candidate.route_id === routeId)
  if (!developer) return null

  const sql = `SELECT ${MODEL_CARD_COLUMNS}
     FROM models_view
     WHERE developer = ?
     ORDER BY benchmarks_count DESC NULLS LAST, evaluations_count DESC NULLS LAST, model_name ASC`
  const models = await readRows<EvaluationCardData>(sql, [developer.developer])

  return { ...developer, models }
}
|
| 550 |
+
|
| 551 |
+
/**
 * Builds a lookup from benchmark identifiers to their benchmark card.
 *
 * Each eval row with a card is registered under up to four aliases: its
 * `evaluation_id`, `evaluation_name`, `composite_benchmark_key` and the
 * card's own `benchmark_details.name`. Aliases that are not non-empty strings
 * are skipped. When two rows share an alias, the row read later wins.
 *
 * @returns A record keyed by alias.
 */
export async function getBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
  const rows = await readRows<Row>(
    `SELECT evaluation_id, evaluation_name, composite_benchmark_key, benchmark_card
     FROM evals_view
     WHERE benchmark_card IS NOT NULL`
  )

  const cardsByAlias: Record<string, BenchmarkCard> = {}

  rows.forEach((row) => {
    const card = row.benchmark_card as BenchmarkCard | null | undefined
    if (!card) return

    const aliases = [
      row.evaluation_id,
      row.evaluation_name,
      row.composite_benchmark_key,
      card.benchmark_details?.name,
    ]
    aliases.forEach((alias) => {
      if (typeof alias === "string" && alias.length > 0) {
        cardsByAlias[alias] = card
      }
    })
  })

  return cardsByAlias
}
|
notes/backend-v2-migration.md
ADDED
|
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Frontend migration to backend v2 (Stage J view layer)
|
| 2 |
+
|
| 3 |
+
> **Status:** spec, drafted 2026-05-03 against `eval_card_backend`'s
|
| 4 |
+
> Stage J view-layer contract.
|
| 5 |
+
>
|
| 6 |
+
> **Sources:**
|
| 7 |
+
> - Backend spec (the contract this consumes):
|
| 8 |
+
> `../eval_card_backend/notes/08-frontend-view-layer.md`
|
| 9 |
+
> - Canonical schema (audit/debug only; not in hot path):
|
| 10 |
+
> `../eval_card_backend/notes/01-schema-from-frontend.md`
|
| 11 |
+
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
## Context
|
| 15 |
+
|
| 16 |
+
The legacy producer (`eval_cards_backend_pipeline`) emitted ten
|
| 17 |
+
parquets where each row carried a `payload_json` VARCHAR with the
|
| 18 |
+
post-TS-adapter shape baked in. The frontend's "DuckDB backend"
|
| 19 |
+
(`lib/duckdb-data.ts`) read these blobs and `JSON.parse`d them — column
|
| 20 |
+
projection, filter pushdown, and type contracts were all forfeited.
|
| 21 |
+
|
| 22 |
+
The new producer (`eval_card_backend`) emits a typed view layer over
|
| 23 |
+
its canonical normalised tables. Three Parquet files cover every page
|
| 24 |
+
shape; three small JSON sidecars cover corpus-level scalars and the
|
| 25 |
+
hierarchy tree. Column names match the frontend's TS interfaces
|
| 26 |
+
field-for-field, so the row→object cast is a typed spread for most
|
| 27 |
+
accessors. Two interfaces (`ModelResultForBenchmark` and the
|
| 28 |
+
`evaluations_by_category` body of `ModelEvaluationSummary`) require a
|
| 29 |
+
small mechanical reshape over the row, since one nests fields that the
|
| 30 |
+
view stores flat — see the per-accessor sections below. No
|
| 31 |
+
HF-record-to-display adapter logic survives.
|
| 32 |
+
|
| 33 |
+
This document specifies what changes in `general-eval-card` once
|
| 34 |
+
backend v2 is faithfully implemented. **The visual frontend, page
|
| 35 |
+
renderers, and TS interface shapes do not change.** Only the I/O
|
| 36 |
+
boundary moves.
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## What changes (overview)
|
| 41 |
+
|
| 42 |
+
| layer | before (v1) | after (v2) |
|
| 43 |
+
|---|---|---|
|
| 44 |
+
| Distribution | `LOCAL_PIPELINE_OUTPUT` env var pointing at a producer output dir; `duckdb/v1/` subpath; implicit "warehouse/latest/" coupling | `SNAPSHOT_URL` env var (file:// or HF dataset URL); one snapshot pinned per deploy |
|
| 45 |
+
| Storage shape | 10 parquets each with one `payload_json` column | 3 typed-column view parquets + 3 JSON sidecars |
|
| 46 |
+
| Read pattern | `SELECT payload_json FROM read_parquet(?) WHERE id = ?`, then `JSON.parse` | `SELECT col1, col2, ... FROM <view> WHERE id = ?`, typed row spread |
|
| 47 |
+
| List vs detail | Separate `*_lite.parquet` files | Column projection on the same parquet |
|
| 48 |
+
| Suite/aggregate dispatch | Eval id prefix (`aggregate__`, `matrix__`) → different parquet | `is_summary_score` flag + `parent_benchmark_id` on `evals_view` |
|
| 49 |
+
| Slug rule | Custom `replace('/', '__')` escapes; per-page slug helpers | Producer-owned RFC 3986 percent-encoded `route_id` / `evaluation_id` / `metric_summary_id`; frontend decodes only on `<Link>` href |
|
| 50 |
+
| Corpus aggregates | `corpus-aggregates.json` over HF JSON loader | `headline.json` sidecar in the snapshot dir |
|
| 51 |
+
| Hierarchy | Synthesised in the producer's `eval_hierarchy` JSON | `hierarchy.json` sidecar |
|
| 52 |
+
| Backend manifest | `manifest.json` fetched from upstream HF dataset root via `lib/hf-data.ts` | `manifest.json` sidecar inside the snapshot dir, read via `SNAPSHOT_URL` |
|
| 53 |
+
|
| 54 |
+
The TS interfaces (`EvaluationCardData`, `BenchmarkEvalSummary`,
|
| 55 |
+
`ModelEvaluationSummary`, `ModelResultForBenchmark`, `CorpusAggregates`,
|
| 56 |
+
`EvalHierarchy`, `BackendManifest`) stay as-is — the producer agreed to
|
| 57 |
+
emit columns under those exact names.
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## What does not change
|
| 62 |
+
|
| 63 |
+
- All page components under `app/`. The renderer trees are unchanged.
|
| 64 |
+
- TS interface declarations in `lib/benchmark-schema.ts`,
|
| 65 |
+
`lib/eval-processing.ts`, `lib/backend-artifacts.ts`. These are now
|
| 66 |
+
the contract surface — column names match field names by agreement
|
| 67 |
+
with the producer.
|
| 68 |
+
- Component files under `components/`.
|
| 69 |
+
- `lib/glossary.ts`, `lib/known-issues.ts`, `lib/utils.ts`,
|
| 70 |
+
`lib/na-utils.ts` — these are pure presentation helpers.
|
| 71 |
+
- `app/api/*/route.ts` handlers stay as thin pass-throughs to
|
| 72 |
+
`lib/data-backend.ts`.
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## Distribution: `SNAPSHOT_URL`
|
| 77 |
+
|
| 78 |
+
Frontend reads `SNAPSHOT_URL` from env at process start. One deploy =
|
| 79 |
+
one snapshot. The URL points at a directory containing the six
|
| 80 |
+
artifacts the frontend reads:
|
| 81 |
+
|
| 82 |
+
```
|
| 83 |
+
$SNAPSHOT_URL/
|
| 84 |
+
├── models_view.parquet
|
| 85 |
+
├── evals_view.parquet
|
| 86 |
+
├── eval_results_view.parquet
|
| 87 |
+
├── headline.json
|
| 88 |
+
├── hierarchy.json
|
| 89 |
+
└── manifest.json
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
Examples:
|
| 93 |
+
|
| 94 |
+
- Local dev: `SNAPSHOT_URL=file:///path/to/eval_card_backend/warehouse/2026-05-03T15-48-59Z`
|
| 95 |
+
- Production (pinned snapshot): `SNAPSHOT_URL=https://huggingface.co/datasets/evaleval/eval-cards-data/resolve/<rev>/warehouse/<snapshot_id>`
|
| 96 |
+
- Production (rolling): `SNAPSHOT_URL=https://huggingface.co/datasets/evaleval/eval-cards-data/resolve/main/warehouse/latest`
|
| 97 |
+
|
| 98 |
+
`LOCAL_PIPELINE_OUTPUT` is removed. The `duckdb/v1/` subpath is
|
| 99 |
+
removed. The producer maintains a `warehouse/latest/` alias that
|
| 100 |
+
points at the most recent snapshot, so deploys can pin either to a
|
| 101 |
+
timestamped snapshot (immutable, redeploy required to roll forward)
|
| 102 |
+
or to `latest` (auto-rolls forward on the next Space rebuild). Within
|
| 103 |
+
a running process the snapshot is still effectively constant — sidecar
|
| 104 |
+
caches in `lib/sidecars.ts` are first-write-wins per process.
|
| 105 |
+
|
| 106 |
+
---
|
| 107 |
+
|
| 108 |
+
## DuckDB connection lifecycle
|
| 109 |
+
|
| 110 |
+
`lib/duckdb.ts` (new file; replaces the connection-management portion
|
| 111 |
+
of `lib/duckdb-data.ts`):
|
| 112 |
+
|
| 113 |
+
```ts
|
| 114 |
+
import "server-only"
|
| 115 |
+
import { DuckDBConnection } from "@duckdb/node-api"
|
| 116 |
+
|
| 117 |
+
let connectionPromise: Promise<DuckDBConnection> | null = null
|
| 118 |
+
|
| 119 |
+
const SNAPSHOT_URL = process.env.SNAPSHOT_URL
|
| 120 |
+
if (!SNAPSHOT_URL) {
|
| 121 |
+
throw new Error("SNAPSHOT_URL must be set; see notes/backend-v2-migration.md")
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
const VIEWS = {
|
| 125 |
+
models_view: `${SNAPSHOT_URL}/models_view.parquet`,
|
| 126 |
+
evals_view: `${SNAPSHOT_URL}/evals_view.parquet`,
|
| 127 |
+
eval_results_view: `${SNAPSHOT_URL}/eval_results_view.parquet`,
|
| 128 |
+
} as const
|
| 129 |
+
|
| 130 |
+
export async function getConnection(): Promise<DuckDBConnection> {
|
| 131 |
+
if (!connectionPromise) {
|
| 132 |
+
connectionPromise = (async () => {
|
| 133 |
+
const conn = await DuckDBConnection.create()
|
| 134 |
+
// httpfs is built into duckdb-node-api; no INSTALL needed.
|
| 135 |
+
// Register each parquet as a view so callers write `FROM models_view`,
|
| 136 |
+
// not the full URL.
|
| 137 |
+
for (const [name, path] of Object.entries(VIEWS)) {
|
| 138 |
+
await conn.run(
|
| 139 |
+
`CREATE OR REPLACE VIEW ${name} AS SELECT * FROM read_parquet(?)`,
|
| 140 |
+
[path]
|
| 141 |
+
)
|
| 142 |
+
}
|
| 143 |
+
return conn
|
| 144 |
+
})()
|
| 145 |
+
}
|
| 146 |
+
return connectionPromise
|
| 147 |
+
}
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
One connection per Node process. Views are registered once at
|
| 151 |
+
startup; subsequent queries write `FROM models_view` rather than
|
| 152 |
+
re-passing the parquet URL. DuckDB's column projection means the cost
|
| 153 |
+
of `SELECT route_id, model_name FROM models_view` is independent of
|
| 154 |
+
how wide `models_view` is.
|
| 155 |
+
|
| 156 |
+
---
|
| 157 |
+
|
| 158 |
+
## Per-accessor mapping
|
| 159 |
+
|
| 160 |
+
`lib/data-backend.ts` keeps its current export names. `lib/duckdb-data.ts`
|
| 161 |
+
gets gutted; each function becomes a thin typed `SELECT`. The mapping
|
| 162 |
+
below uses the column names spec'd in
|
| 163 |
+
`../eval_card_backend/notes/08-frontend-view-layer.md` — the row
|
| 164 |
+
returned by DuckDB casts directly to the TS interface.
|
| 165 |
+
|
| 166 |
+
### Models
|
| 167 |
+
|
| 168 |
+
```ts
|
| 169 |
+
// getModelCards / getModelCardsLite — list pages
|
| 170 |
+
export async function getModelCards(): Promise<EvaluationCardData[]> {
|
| 171 |
+
const conn = await getConnection()
|
| 172 |
+
const reader = await conn.runAndReadAll(`
|
| 173 |
+
SELECT id, route_id, model_name, model_id, canonical_model_name, developer,
|
| 174 |
+
evaluations_count, benchmarks_count, variant_count,
|
| 175 |
+
categories, category_stats, latest_timestamp,
|
| 176 |
+
evaluator_count, evaluator_names, source_type_count, source_types,
|
| 177 |
+
evidence_count, missing_generation_config_count,
|
| 178 |
+
third_party_eval_count, independent_verification_ratio,
|
| 179 |
+
reproducibility_status, eval_libraries, latest_source_name,
|
| 180 |
+
params_billions, benchmark_names, score_summary,
|
| 181 |
+
reproducibility_summary, provenance_summary, comparability_summary,
|
| 182 |
+
top_scores, source_urls, detail_urls,
|
| 183 |
+
model_url, release_date, input_modalities, output_modalities,
|
| 184 |
+
architecture, params, inference_engine, inference_platform
|
| 185 |
+
FROM models_view
|
| 186 |
+
ORDER BY latest_timestamp DESC
|
| 187 |
+
`)
|
| 188 |
+
return reader.getRowObjects() as EvaluationCardData[]
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
// "Lite" is just narrower projection — same parquet, fewer columns.
|
| 192 |
+
export async function getModelCardsLite(): Promise<EvaluationCardData[]> {
|
| 193 |
+
const conn = await getConnection()
|
| 194 |
+
const reader = await conn.runAndReadAll(`
|
| 195 |
+
SELECT id, route_id, model_name, model_id, developer,
|
| 196 |
+
evaluations_count, benchmarks_count, categories,
|
| 197 |
+
latest_timestamp, third_party_eval_count,
|
| 198 |
+
independent_verification_ratio, reproducibility_status,
|
| 199 |
+
latest_source_name, params_billions
|
| 200 |
+
FROM models_view
|
| 201 |
+
ORDER BY benchmarks_count DESC, evaluations_count DESC, model_name ASC
|
| 202 |
+
`)
|
| 203 |
+
return reader.getRowObjects() as EvaluationCardData[]
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
// getModelSummaryById — detail page.
|
| 207 |
+
//
|
| 208 |
+
// The row carries the metadata shell (variants[], categories,
|
| 209 |
+
// category_stats, signal summaries, model_family_id, raw_model_ids,
|
| 210 |
+
// total_evaluations, last_updated). The full `ModelEvaluationSummary`
|
| 211 |
+
// also requires `evaluations_by_category: Record<CategoryType,
|
| 212 |
+
// BenchmarkEvaluation[]>`, which is a heavyweight per-cell breakdown —
|
| 213 |
+
// produced by a separate join over `eval_results_view`, see
|
| 214 |
+
// `getModelEvaluationCells` below.
|
| 215 |
+
//
|
| 216 |
+
// Returning a `ModelSummaryShell` (Omit-ed type, defined alongside the
|
| 217 |
+
// existing TS interface) makes the contract explicit and stops the cast
|
| 218 |
+
// from lying. The model-detail page composes the full
|
| 219 |
+
// `ModelEvaluationSummary` from `shell` + `cells`.
|
| 220 |
+
export type ModelSummaryShell = Omit<
|
| 221 |
+
ModelEvaluationSummary,
|
| 222 |
+
"evaluations_by_category"
|
| 223 |
+
>
|
| 224 |
+
|
| 225 |
+
export async function getModelSummaryById(routeId: string): Promise<ModelSummaryShell | null> {
|
| 226 |
+
const conn = await getConnection()
|
| 227 |
+
const reader = await conn.runAndReadAll(
|
| 228 |
+
`SELECT * FROM models_view WHERE route_id = ? OR model_family_id = ? LIMIT 1`,
|
| 229 |
+
[routeId, routeId]
|
| 230 |
+
)
|
| 231 |
+
const rows = reader.getRowObjects()
|
| 232 |
+
if (rows.length === 0) return null
|
| 233 |
+
return rows[0] as unknown as ModelSummaryShell
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
// Per-cell reshape helper. `eval_results_view` rows carry the per-cell
|
| 237 |
+
// fields scattered (model_info, score_details, evaluation_timestamp,
|
| 238 |
+
// source_metadata, source_data, metric_*, etc.) rather than under a
|
| 239 |
+
// nested `result: EvaluationResult` STRUCT. Reshape into the
|
| 240 |
+
// `ModelResultForBenchmark` shape the leaderboard / model-detail
|
| 241 |
+
// renderers expect. Single helper; reused by getEvalSummaryById and
|
| 242 |
+
// getModelEvaluationCells. No HF-record-to-display logic survives.
|
| 243 |
+
function reshapeCellToModelResult(row: Record<string, any>): ModelResultForBenchmark {
|
| 244 |
+
return {
|
| 245 |
+
model_info: row.model_info,
|
| 246 |
+
model_route_id: row.model_route_id,
|
| 247 |
+
score: row.score,
|
| 248 |
+
score_details: row.score_details,
|
| 249 |
+
evaluation_timestamp: row.evaluation_timestamp,
|
| 250 |
+
source_metadata: row.source_metadata,
|
| 251 |
+
source_data: row.source_data,
|
| 252 |
+
source_record_url: row.source_record_url,
|
| 253 |
+
aggregate_components: row.aggregate_components,
|
| 254 |
+
result: {
|
| 255 |
+
evaluation_name: row.metric_display_name,
|
| 256 |
+
metric_summary_id: row.metric_summary_id,
|
| 257 |
+
metric_key: row.metric_id,
|
| 258 |
+
evaluation_timestamp: row.evaluation_timestamp,
|
| 259 |
+
metric_config: { lower_is_better: row.lower_is_better, unit: row.metric_unit, /* …denormalised meta… */ },
|
| 260 |
+
score_details: row.score_details,
|
| 261 |
+
evalcards: row.evalcards_annotations ? { annotations: row.evalcards_annotations } : undefined,
|
| 262 |
+
},
|
| 263 |
+
}
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
// Helper for the model-detail page's evaluations_by_category body.
|
| 267 |
+
// The page groups by `category` in TS after this returns.
|
| 268 |
+
export async function getModelEvaluationCells(modelId: string): Promise<ModelResultForBenchmark[]> {
|
| 269 |
+
const conn = await getConnection()
|
| 270 |
+
const reader = await conn.runAndReadAll(
|
| 271 |
+
`SELECT * FROM eval_results_view WHERE model_id = ? ORDER BY category, percentile DESC`,
|
| 272 |
+
[modelId]
|
| 273 |
+
)
|
| 274 |
+
return reader.getRowObjects().map(reshapeCellToModelResult)
|
| 275 |
+
}
|
| 276 |
+
```
|
| 277 |
+
|
| 278 |
+
### Evals
|
| 279 |
+
|
| 280 |
+
```ts
|
| 281 |
+
// getEvalListData / getEvalListLiteData — list pages
|
| 282 |
+
export async function getEvalListData(): Promise<{
|
| 283 |
+
evals: BenchmarkEvalListItem[]
|
| 284 |
+
totalModels: number
|
| 285 |
+
}> {
|
| 286 |
+
const conn = await getConnection()
|
| 287 |
+
const [evalsReader, modelsReader] = await Promise.all([
|
| 288 |
+
conn.runAndReadAll(`
|
| 289 |
+
SELECT evaluation_id, evaluation_name, canonical_display_name,
|
| 290 |
+
composite_benchmark_key, composite_benchmark_name,
|
| 291 |
+
benchmark_family_key, benchmark_leaf_key, category,
|
| 292 |
+
metric_config, models_count, evaluator_names, source_types,
|
| 293 |
+
latest_source_name, third_party_ratio,
|
| 294 |
+
missing_generation_config_count, best_model, worst_model,
|
| 295 |
+
avg_score, avg_score_norm, has_card,
|
| 296 |
+
is_aggregated, aggregate_sources, tags,
|
| 297 |
+
metrics_count, metric_names, instance_data, top_score,
|
| 298 |
+
subtasks_count, is_summary_score, summary_eval_ids,
|
| 299 |
+
root_metrics, subtasks, leaderboard_metrics,
|
| 300 |
+
reproducibility_summary, provenance_summary, comparability_summary,
|
| 301 |
+
source_data
|
| 302 |
+
FROM evals_view
|
| 303 |
+
ORDER BY evaluation_name ASC
|
| 304 |
+
`),
|
| 305 |
+
conn.runAndReadAll(`SELECT COUNT(*) AS n FROM models_view`),
|
| 306 |
+
])
|
| 307 |
+
return {
|
| 308 |
+
evals: evalsReader.getRowObjects() as BenchmarkEvalListItem[],
|
| 309 |
+
totalModels: Number(modelsReader.getRowObjects()[0].n),
|
| 310 |
+
}
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
// getEvalSummaryById — detail page.
|
| 314 |
+
//
|
| 315 |
+
// No more aggregate__/matrix__ id-prefix dispatch — `evals_view` is the
|
| 316 |
+
// single source for all eval shapes. Suite-vs-leaf is a column
|
| 317 |
+
// (`is_summary_score`, `is_aggregated`) on the same parquet.
|
| 318 |
+
//
|
| 319 |
+
// `model_results[]` rows go through the same reshape helper as
|
| 320 |
+
// `getModelEvaluationCells` (defined above) — they share the
|
| 321 |
+
// ModelResultForBenchmark target shape, so the eval/metric/cell
|
| 322 |
+
// → ModelResultForBenchmark reshape is one helper, two callers.
|
| 323 |
+
export async function getEvalSummaryById(evalId: string): Promise<BenchmarkEvalSummary | null> {
|
| 324 |
+
const conn = await getConnection()
|
| 325 |
+
const [evalReader, cellsReader] = await Promise.all([
|
| 326 |
+
conn.runAndReadAll(
|
| 327 |
+
`SELECT * FROM evals_view WHERE evaluation_id = ? LIMIT 1`,
|
| 328 |
+
[evalId]
|
| 329 |
+
),
|
| 330 |
+
conn.runAndReadAll(
|
| 331 |
+
`SELECT * FROM eval_results_view
|
| 332 |
+
WHERE evaluation_id = ?
|
| 333 |
+
AND metric_id = (SELECT primary_metric_id FROM evals_view WHERE evaluation_id = ?)
|
| 334 |
+
ORDER BY position ASC`,
|
| 335 |
+
[evalId, evalId]
|
| 336 |
+
),
|
| 337 |
+
])
|
| 338 |
+
const evalRows = evalReader.getRowObjects()
|
| 339 |
+
if (evalRows.length === 0) return null
|
| 340 |
+
return {
|
| 341 |
+
...(evalRows[0] as Omit<BenchmarkEvalSummary, "model_results">),
|
| 342 |
+
model_results: cellsReader.getRowObjects().map(reshapeCellToModelResult),
|
| 343 |
+
} as BenchmarkEvalSummary
|
| 344 |
+
}
|
| 345 |
+
```
|
| 346 |
+
|
| 347 |
+
### Developers
|
| 348 |
+
|
| 349 |
+
```ts
|
| 350 |
+
// getDeveloperList — list page; reads from headline.json (precomputed,
|
| 351 |
+
// including producer-owned route_id, model/benchmark/evaluation counts,
|
| 352 |
+
// and popular_evals). DeveloperListEntry is satisfied directly by the
|
| 353 |
+
// headline entry shape.
|
| 354 |
+
export async function getDeveloperList(): Promise<DeveloperListEntry[]> {
|
| 355 |
+
const headline = await fetchHeadline()
|
| 356 |
+
return headline.developers as DeveloperListEntry[]
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
// getDeveloperSummaryById — detail page; reads models_view filtered by developer.
|
| 360 |
+
// The route_id on headline.developers[] is the canonical lookup key — we don't
|
| 361 |
+
// re-derive `developer` from the URL slug, since percent-decoding may not
|
| 362 |
+
// round-trip exactly to the producer's source string.
|
| 363 |
+
export async function getDeveloperSummaryById(routeId: string) {
|
| 364 |
+
const headline = await fetchHeadline()
|
| 365 |
+
const headlineEntry = headline.developers.find((d) => d.route_id === routeId)
|
| 366 |
+
if (!headlineEntry) return null
|
| 367 |
+
const conn = await getConnection()
|
| 368 |
+
const reader = await conn.runAndReadAll(
|
| 369 |
+
`SELECT * FROM models_view WHERE developer = ?`,
|
| 370 |
+
[headlineEntry.developer]
|
| 371 |
+
)
|
| 372 |
+
const models = reader.getRowObjects() as EvaluationCardData[]
|
| 373 |
+
return { ...headlineEntry, models }
|
| 374 |
+
}
|
| 375 |
+
```
|
| 376 |
+
|
| 377 |
+
### Dashboard convenience accessor
|
| 378 |
+
|
| 379 |
+
```ts
|
| 380 |
+
// Was: { models, evals } over both legacy parquets; same shape, new sources.
|
| 381 |
+
export async function getDashboardData() {
|
| 382 |
+
const [models, evalListData] = await Promise.all([
|
| 383 |
+
getModelCards(),
|
| 384 |
+
getEvalListData(),
|
| 385 |
+
])
|
| 386 |
+
return { models, evals: evalListData.evals }
|
| 387 |
+
}
|
| 388 |
+
```
|
| 389 |
+
|
| 390 |
+
---
|
| 391 |
+
|
| 392 |
+
## Sidecar fetchers (replace `lib/hf-data.ts` corpus calls)
|
| 393 |
+
|
| 394 |
+
Three small JSON files live in the snapshot dir alongside the
|
| 395 |
+
parquets. New module `lib/sidecars.ts` exposes typed fetchers.
|
| 396 |
+
`lib/hf-data.ts`'s `fetchCorpusAggregates`, `fetchEvalHierarchy`,
|
| 397 |
+
`fetchBackendManifest`, and `fetchBackendManifestStatus` get their
|
| 398 |
+
implementations replaced — same export names, new sources.
|
| 399 |
+
|
| 400 |
+
```ts
|
| 401 |
+
// lib/sidecars.ts
|
| 402 |
+
import "server-only"
|
| 403 |
+
import type {
|
| 404 |
+
CorpusAggregates,
|
| 405 |
+
EvalHierarchy,
|
| 406 |
+
BackendManifest,
|
| 407 |
+
} from "@/lib/backend-artifacts"
|
| 408 |
+
|
| 409 |
+
const SNAPSHOT_URL = process.env.SNAPSHOT_URL!
|
| 410 |
+
|
| 411 |
+
let cache: {
|
| 412 |
+
manifest?: Promise<BackendManifest>
|
| 413 |
+
headline?: Promise<CorpusAggregates>
|
| 414 |
+
hierarchy?: Promise<EvalHierarchy>
|
| 415 |
+
} = {}
|
| 416 |
+
|
| 417 |
+
async function fetchJson<T>(name: string): Promise<T> {
|
| 418 |
+
const url = `${SNAPSHOT_URL}/${name}`
|
| 419 |
+
const res = url.startsWith("file://")
|
| 420 |
+
? await import("fs/promises").then((fs) => fs.readFile(new URL(url), "utf8"))
|
| 421 |
+
: await fetch(url, { next: { revalidate: 3600 } }).then((r) => r.text())
|
| 422 |
+
return JSON.parse(typeof res === "string" ? res : res.toString()) as T
|
| 423 |
+
}
|
| 424 |
+
|
| 425 |
+
export function fetchManifest(): Promise<BackendManifest> {
|
| 426 |
+
return (cache.manifest ??= fetchJson<BackendManifest>("manifest.json"))
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
export function fetchHeadline(): Promise<CorpusAggregates> {
|
| 430 |
+
return (cache.headline ??= fetchJson<CorpusAggregates>("headline.json"))
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
export function fetchHierarchy(): Promise<EvalHierarchy> {
|
| 434 |
+
return (cache.hierarchy ??= fetchJson<EvalHierarchy>("hierarchy.json"))
|
| 435 |
+
}
|
| 436 |
+
```
|
| 437 |
+
|
| 438 |
+
Then in `lib/hf-data.ts`:
|
| 439 |
+
|
| 440 |
+
```ts
|
| 441 |
+
// fetchBackendManifest: was a fetchHFJsonSafe call; now reads the snapshot sidecar.
|
| 442 |
+
export const fetchBackendManifest = fetchManifest
|
| 443 |
+
export const fetchCorpusAggregates = fetchHeadline
|
| 444 |
+
export const fetchEvalHierarchy = fetchHierarchy
|
| 445 |
+
|
| 446 |
+
// fetchBackendManifestStatus: simplified — single snapshot pin, no "latest" comparison.
|
| 447 |
+
export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
|
| 448 |
+
const m = await fetchManifest()
|
| 449 |
+
return {
|
| 450 |
+
currentManifest: m,
|
| 451 |
+
latestManifest: m, // no separate "latest" — snapshot is pinned
|
| 452 |
+
currentManifestSignature: m.generated_at,
|
| 453 |
+
latestManifestSignature: m.generated_at,
|
| 454 |
+
updateAvailable: false,
|
| 455 |
+
refreshing: false,
|
| 456 |
+
pendingRefreshCount: 0,
|
| 457 |
+
}
|
| 458 |
+
}
|
| 459 |
+
```
|
| 460 |
+
|
| 461 |
+
---
|
| 462 |
+
|
| 463 |
+
## What deletes
|
| 464 |
+
|
| 465 |
+
After v2 is live, the following code is dead and can be removed in a
|
| 466 |
+
follow-up cleanup:
|
| 467 |
+
|
| 468 |
+
- `lib/duckdb-data.ts` — replaced by typed SELECTs split between
|
| 469 |
+
`lib/duckdb.ts` (connection) and `lib/data-backend.ts` (queries).
|
| 470 |
+
- The `payload_json` parser helpers (`parsePayload`, `readPayloads`,
|
| 471 |
+
`readPayloadById`, `assertDeveloperListShape`) — no JSON blobs to
|
| 472 |
+
parse.
|
| 473 |
+
- The `aggregate__` / `matrix__` eval-id prefix dispatch in
|
| 474 |
+
`getEvalSummaryByIdFromDuckDB` — the typed view is the only path.
|
| 475 |
+
- `lib/model-data.ts` — most of its functions exist to convert HF
|
| 476 |
+
JSON records into `BenchmarkEvaluation` / `EvaluationCardData`. Once
|
| 477 |
+
the producer emits those shapes directly, the adapter logic can be deleted.
|
| 478 |
+
Keep only the helpers that don't touch HF records (slug parsing,
|
| 479 |
+
display formatters).
|
| 480 |
+
- `lib/eval-processing.ts` — the `groupEvaluationsByModel`,
|
| 481 |
+
`createModelSummary`, `createBenchmarkEvalSummary`, and
|
| 482 |
+
`inferCategoryFromBenchmark` adapter functions are no longer called
|
| 483 |
+
in the data path. The exported types stay.
|
| 484 |
+
- `scripts/audit-adapters.mjs`, `scripts/dump-adapter-outputs.mts`,
|
| 485 |
+
`scripts/compare-data-backends.mjs`, `scripts/refresh-fixtures.mjs`,
|
| 486 |
+
`scripts/cache-hf-data.mjs` — adapter / parity-check tooling for the
|
| 487 |
+
legacy pipeline. Delete once v1 is retired.
|
| 488 |
+
- `data/models/`, `data/developers/`, `data/benchmarks.json`,
|
| 489 |
+
`data/models.json`, `data/developers.json` — bundled snapshots of
|
| 490 |
+
v1 output for fixture tests. Replace with v2 fixtures if needed.
|
| 491 |
+
- `LOCAL_PIPELINE_OUTPUT` env var, `duckdb/v1/` subpath conventions,
|
| 492 |
+
and the parity-emitter expectations documented in
|
| 493 |
+
`lib/duckdb-data.ts`'s preamble.
|
| 494 |
+
- `inferCategoryFromBenchmark` regex chain in
|
| 495 |
+
`lib/benchmark-schema.ts` — producer is the source of truth for
|
| 496 |
+
category. Keep the `EVALUATION_CATEGORIES` const + `CategoryType`
|
| 497 |
+
type; delete the inference function and `BENCHMARK_PRIORITY_RULES`.
|
| 498 |
+
|
| 499 |
+
---
|
| 500 |
+
|
| 501 |
+
## Slug rule
|
| 502 |
+
|
| 503 |
+
Producer emits all URL-bearing identifiers in
|
| 504 |
+
RFC 3986 percent-encoded form (`route_id`, `evaluation_id`,
|
| 505 |
+
`metric_summary_id`). Frontend treats them as opaque except for
|
| 506 |
+
`<Link>` href construction:
|
| 507 |
+
|
| 508 |
+
```tsx
|
| 509 |
+
// Old: href={`/models/${model.route_id}`} // route_id was already escaped via __ rule
|
| 510 |
+
// New: href={`/models/${model.route_id}`} // same code; route_id is now percent-encoded
|
| 511 |
+
```
|
| 512 |
+
|
| 513 |
+
No decoding happens in the route handler; the encoded slug is passed straight to the lookup:
|
| 514 |
+
|
| 515 |
+
```ts
|
| 516 |
+
// app/models/[id]/page.tsx
|
| 517 |
+
export default async function ModelDetailPage({ params }: { params: { id: string } }) {
|
| 518 |
+
const summary = await getModelSummaryById(params.id) // pass encoded form straight through
|
| 519 |
+
...
|
| 520 |
+
}
|
| 521 |
+
```
|
| 522 |
+
|
| 523 |
+
`getModelSummaryById` looks up by `route_id = ?` directly without
|
| 524 |
+
decoding — the producer's `route_id` column matches the URL path
|
| 525 |
+
segment byte-for-byte. The legacy `replace('/', '__')` and
|
| 526 |
+
`replace(/\//g, ...)` helpers in `lib/utils.ts` and `lib/model-family.ts`
|
| 527 |
+
become dead code; remove them in the cleanup pass.
|
| 528 |
+
|
| 529 |
+
---
|
| 530 |
+
|
| 531 |
+
## Migration strategy
|
| 532 |
+
|
| 533 |
+
A feature flag gates v1 vs v2 during the transition:
|
| 534 |
+
|
| 535 |
+
```ts
|
| 536 |
+
// lib/data-backend.ts
|
| 537 |
+
const BACKEND_VERSION = process.env.DATA_BACKEND ?? "v1"
|
| 538 |
+
|
| 539 |
+
export const getModelCards =
|
| 540 |
+
BACKEND_VERSION === "v2"
|
| 541 |
+
? (await import("@/lib/duckdb")).getModelCards
|
| 542 |
+
: (await import("@/lib/duckdb-data")).getModelCardsFromDuckDB
|
| 543 |
+
// ... same pattern for other accessors
|
| 544 |
+
```
|
| 545 |
+
|
| 546 |
+
Phase plan:
|
| 547 |
+
|
| 548 |
+
1. **Producer ships Stage J.** `eval_card_backend` emits the six
|
| 549 |
+
v2 artifacts in `warehouse/<snapshot_id>/`. Existing canonical
|
| 550 |
+
parquets stay alongside.
|
| 551 |
+
2. **Frontend lands `lib/duckdb.ts` + `lib/sidecars.ts`** behind the
|
| 552 |
+
`DATA_BACKEND=v2` flag. CI builds both backends; default stays v1.
|
| 553 |
+
3. **Smoke test in dev with `DATA_BACKEND=v2`,
|
| 554 |
+
`SNAPSHOT_URL=file://...`.** Verify each page renders identical
|
| 555 |
+
bytes (modulo source-of-data labels). Where they diverge, file
|
| 556 |
+
producer issues — do not patch the frontend to paper over them.
|
| 557 |
+
4. **Flip the production default to v2.** Keep v1 path compilable but
|
| 558 |
+
unreachable. Monitor for a release.
|
| 559 |
+
5. **Delete v1 path** (the "What deletes" list above).
|
| 560 |
+
|
| 561 |
+
The flag is intentionally process-wide, not per-accessor. Mixing
|
| 562 |
+
backends within one render produces inconsistent snapshots.
|
| 563 |
+
|
| 564 |
+
---
|
| 565 |
+
|
| 566 |
+
## What doesn't move
|
| 567 |
+
|
| 568 |
+
- **Instance-level data fetching** (`fetchInstanceLevelData` in
|
| 569 |
+
`lib/hf-data.ts`). Instance JSONL is referenced by URL in
|
| 570 |
+
`eval_results_view.instance_file_path`; the lazy-load stays. Pointer
|
| 571 |
+
shape on the row is unchanged from v1.
|
| 572 |
+
- **Benchmark card metadata** lives inside `evals_view.benchmark_card`
|
| 573 |
+
STRUCT now, not a separate `benchmark_card_*.json` per file. The
|
| 574 |
+
page reads it from the eval row directly. Adapter-style readers
|
| 575 |
+
(`fetchBenchmarkMetadataMap`) become a `SELECT benchmark_id, benchmark_card
|
| 576 |
+
FROM evals_view` aggregation if anything still calls them — most
|
| 577 |
+
callers should fold into `getEvalSummaryById`.
|
| 578 |
+
- **EvalCards annotations** (`evalcards.annotations`) live on
|
| 579 |
+
`eval_results_view.evalcards_annotations` per-row. The eval-detail
|
| 580 |
+
page reads them inline; no separate fetcher.
|
| 581 |
+
|
| 582 |
+
---
|
| 583 |
+
|
| 584 |
+
## Open questions / risks
|
| 585 |
+
|
| 586 |
+
- **httpfs cold-start latency.** First query against an HF-hosted
|
| 587 |
+
parquet pays a round trip per file. Mitigate by pre-registering all
|
| 588 |
+
three views at process start (above), so the first user query hits
|
| 589 |
+
warm metadata. Measure on the production HF Space; if too slow,
|
| 590 |
+
consider downloading the snapshot to local disk at container start
|
| 591 |
+
(on the order of megabytes per snapshot).
|
| 592 |
+
- **Connection lifetime in serverless.** Vercel's serverless
|
| 593 |
+
runtime can tear down the Node process between requests, so the
|
| 594 |
+
`connectionPromise` cache doesn't help. The HF Space deployment
|
| 595 |
+
(Docker, long-lived) is unaffected. If we ever target serverless,
|
| 596 |
+
switch to `duckdb-wasm` in the browser or a separate serving
|
| 597 |
+
process.
|
| 598 |
+
- **`aggregate_components[]` on `eval_results_view`.** This array is
|
| 599 |
+
the per-suite-component breakdown for rollup rows. For non-rollup
|
| 600 |
+
rows it's always empty. If suite rollups grow common, the storage
|
| 601 |
+
cost of trailing-empty arrays is non-trivial; consider splitting
|
| 602 |
+
into a dedicated parquet at that point.
|
| 603 |
+
- **Category drift.** Producer's `category_mapping.json` will lag real
|
| 604 |
+
benchmark tag changes. The mapping is producer-owned, so the
|
| 605 |
+
frontend can't patch around drift — this is a feature, not a bug,
|
| 606 |
+
but it requires operator discipline. Surface "uncategorised
|
| 607 |
+
benchmark count" in the producer's run summary and the home-page
|
| 608 |
+
manifest banner.
|
| 609 |
+
- **Type widening for `score_summary` etc.** The producer emits these
|
| 610 |
+
as DuckDB STRUCTs; the TS interface declares them as nested
|
| 611 |
+
`{ count, min, max, average }`. `runAndReadAll` returns nested
|
| 612 |
+
STRUCTs as plain JS objects, so the cast works — but if duckdb-node
|
| 613 |
+
changes its STRUCT serialisation, audit the `as` casts here. Add a
|
| 614 |
+
dev-only validator that runs `EvaluationCardData`'s shape check at
|
| 615 |
+
the row level on the first `getModelCards()` call after process
|
| 616 |
+
start.
|
notes/merge-cheatsheet-backend-v2.md
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Merge cheatsheet: pulling `main` into `feat/use-new-backend-data`
|
| 2 |
+
|
| 3 |
+
> Drafted 2026-05-04, before pulling. Companion to `backend-v2-migration.md`
|
| 4 |
+
> (which is the design doc). This file is just a per-file conflict guide.
|
| 5 |
+
>
|
| 6 |
+
> Branch: `feat/use-new-backend-data` (2 commits ahead of `main`:
|
| 7 |
+
> `7635aee` Integrate with test backend data, `bfce8f2` Drop
|
| 8 |
+
> input/output_modalities from MODEL_CARD_COLUMNS).
|
| 9 |
+
|
| 10 |
+
## Triage at a glance
|
| 11 |
+
|
| 12 |
+
| File | Risk | Strategy |
|
| 13 |
+
|---|---|---|
|
| 14 |
+
| `lib/data-backend.ts` | **High** | Keep ours wholesale; re-port any new accessors main added |
|
| 15 |
+
| `lib/backend-artifacts.ts` | **High** | Keep our schema renames; reconcile any *new* main-side fields against producer output |
|
| 16 |
+
| `components/signals/corpus-dashboard.tsx` | **Med** | Keep main's UI structure; rewire data fields to v2 names |
|
| 17 |
+
| `components/signals/corpus-signals-strip.tsx` | **Med** | Same as above |
|
| 18 |
+
| `lib/hf-data.ts` | **Med** | Keep `useViewLayerBackend()` short-circuits at top of 5 fetchers |
|
| 19 |
+
| `Dockerfile` | **Med** | Keep our `DATA_BACKEND=v2` + `SNAPSHOT_URL` wiring; layer main's other changes on top |
|
| 20 |
+
| `lib/benchmark-schema.ts` | **Low** | Trivial 1-line addition (`num_few_shot?`) |
|
| 21 |
+
| `app/page.tsx` | **Low** | One-line copy change (`corpus-aggregates.json` → `headline.json`) |
|
| 22 |
+
|
| 23 |
+
New files (no conflict possible): `lib/view-data.ts`, `lib/duckdb.ts`,
|
| 24 |
+
`lib/sidecars.ts`, `tests/view-data.test.ts`,
|
| 25 |
+
`notes/backend-v2-migration.md`.
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## `lib/data-backend.ts` — High
|
| 30 |
+
|
| 31 |
+
**What we did:** Replaced static re-exports from `lib/duckdb-data` with
|
| 32 |
+
a `BACKEND_VERSION` env-flag dispatcher. Each accessor now branches on
|
| 33 |
+
`useViewLayerBackend()` (true when `DATA_BACKEND=v2` or `stage-j`) and
|
| 34 |
+
lazy-imports either `@/lib/view-data` or `@/lib/duckdb-data`.
|
| 35 |
+
Manifest/hierarchy accessors branch between `@/lib/sidecars` and
|
| 36 |
+
`@/lib/hf-data`.
|
| 37 |
+
|
| 38 |
+
**Reconcile:**
|
| 39 |
+
- Conflict almost certain if main touched any export wiring here.
|
| 40 |
+
- **Keep our file as-is.** The dispatcher pattern is load-bearing.
|
| 41 |
+
- If main added a new accessor (e.g. `getFooBar`), add a new dispatcher
|
| 42 |
+
function following the same pattern — only the legacy branch needs
|
| 43 |
+
to be wired immediately; v2 branch can throw `Not implemented` until
|
| 44 |
+
`lib/view-data.ts` adds it.
|
| 45 |
+
|
| 46 |
+
---
|
| 47 |
+
|
| 48 |
+
## `lib/backend-artifacts.ts` — High
|
| 49 |
+
|
| 50 |
+
**What we did:** Renamed corpus-block fields to match what the v2
|
| 51 |
+
producer emits:
|
| 52 |
+
|
| 53 |
+
| Block | v1 (main) | v2 (ours) |
|
| 54 |
+
|---|---|---|
|
| 55 |
+
| Completeness | `total_benchmarks`, `completeness_score_mean`, `completeness_score_median`, `per_field_population{}` | `total_triples`, `completeness_avg`, `completeness_min`, `completeness_max` |
|
| 56 |
+
| Provenance | `multi_source_groups`, `multi_source_rate`, `first_party_only_groups`, `first_party_only_rate`, `total_groups` | `multi_source_triples`, `first_party_only_triples`, `total_triples` (rates dropped — derived in components via local `rate()` helper) |
|
| 57 |
+
| Comparability | `variant_eligible_groups`, `variant_divergent_groups`, `variant_divergence_rate`, `cross_party_eligible_groups`, `cross_party_divergent_groups`, `cross_party_divergence_rate`, `total_groups` | `total_triples`, `variant_divergent_count`, `cross_party_divergent_count`, `groups_with_variant_check`, `groups_with_cross_party_check` |
|
| 58 |
+
|
| 59 |
+
Also added: `DeveloperListEntry` interface, optional
|
| 60 |
+
`developers/families/categories` arrays on `CorpusAggregates`,
|
| 61 |
+
optional `eval_hierarchy` key in `BackendManifest.summary_artifacts`.
|
| 62 |
+
|
| 63 |
+
**Reconcile:**
|
| 64 |
+
- Producer is the source of truth for v2 field names — do **not** add
|
| 65 |
+
back v1 names to satisfy a main-side change. If main added a field
|
| 66 |
+
the v2 producer doesn't emit, either drop it or check
|
| 67 |
+
`eval_card_backend/notes/08-frontend-view-layer.md` first.
|
| 68 |
+
- Keep all three new optional sections on `CorpusAggregates`
|
| 69 |
+
(developers, families, categories) — they back the new
|
| 70 |
+
developer-list path.
|
| 71 |
+
- The `summary_artifacts.eval_hierarchy` key is additive; safe to keep
|
| 72 |
+
alongside whatever main added there.
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## `components/signals/corpus-dashboard.tsx` — Medium
|
| 77 |
+
|
| 78 |
+
**What we did:** Mechanical rewrite of every field reference in this
|
| 79 |
+
file to use the v2 names from `lib/backend-artifacts.ts` (above).
|
| 80 |
+
Removed the `per_field_population` per-field grid and replaced it with
|
| 81 |
+
a `min / avg / max` MiniMetric trio. Added a local `rate(num, denom)`
|
| 82 |
+
helper (returns null if either side is null/zero) since v2 stores
|
| 83 |
+
counts, not pre-computed rates. Title-cased `CATEGORY_ORDER`
|
| 84 |
+
(`"Agentic"`, `"General"`, …) and made the keys-to-render set extend
|
| 85 |
+
gracefully to unknown categories.
|
| 86 |
+
|
| 87 |
+
**Reconcile:**
|
| 88 |
+
- If main touched this file for design/UX reasons, **prefer main's
|
| 89 |
+
visual structure** — but keep our field accessors. The recipe is:
|
| 90 |
+
- Anywhere main reads `multi_source_rate`, replace with `rate(prov.multi_source_triples, prov.total_triples)`.
|
| 91 |
+
- Anywhere main reads `completeness_score_mean`, replace with `comp.completeness_avg`.
|
| 92 |
+
- Anywhere main reads `*_eligible_groups` / `*_divergent_groups`, swap to `groups_with_*_check` / `*_divergent_count`.
|
| 93 |
+
- Drop any new code that reads `per_field_population` — gone in v2.
|
| 94 |
+
- Keep the local `rate()` helper at the bottom of the file.
|
| 95 |
+
- Category lookup must use the new title-cased keys (or stay tolerant
|
| 96 |
+
via the `available` set logic we added).
|
| 97 |
+
|
| 98 |
+
---
|
| 99 |
+
|
| 100 |
+
## `components/signals/corpus-signals-strip.tsx` — Medium
|
| 101 |
+
|
| 102 |
+
**What we did:** Same field renames as above, same local `rate()`
|
| 103 |
+
helper added. Headline copy updated from "groups" → "triples" where
|
| 104 |
+
the underlying unit changed.
|
| 105 |
+
|
| 106 |
+
**Reconcile:** Apply the same recipe as `corpus-dashboard.tsx`. The
|
| 107 |
+
two files share field names and the `rate()` helper.
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
## `lib/hf-data.ts` — Medium
|
| 112 |
+
|
| 113 |
+
**What we did:** Added an early-return guard at the top of five
|
| 114 |
+
functions:
|
| 115 |
+
- `fetchBackendManifestStatus` — synthesizes a status from the v2 manifest sidecar
|
| 116 |
+
- `fetchBenchmarkMetadataMap` — delegates to `view-data.getBenchmarkMetadataMap`
|
| 117 |
+
- `fetchBackendManifest` — delegates to `sidecars.fetchManifest`
|
| 118 |
+
- `fetchEvalHierarchy` — delegates to `sidecars.fetchHierarchy` (still wraps in `adaptEvalHierarchy`)
|
| 119 |
+
- `fetchCorpusAggregates` — delegates to `sidecars.fetchHeadline`
|
| 120 |
+
|
| 121 |
+
Plus a module-level `useViewLayerBackend()` helper and a lazy
|
| 122 |
+
`fetchSnapshotSidecars()` importer near the top of the file.
|
| 123 |
+
|
| 124 |
+
**Reconcile:**
|
| 125 |
+
- These are all additive guards at the start of existing functions —
|
| 126 |
+
conflicts are likely only if main re-shaped the same function
|
| 127 |
+
bodies.
|
| 128 |
+
- Pattern: `if (useViewLayerBackend()) { return <v2 path> }` then fall
|
| 129 |
+
through to the existing v1 implementation untouched.
|
| 130 |
+
- If main renamed one of these functions, port the guard into the
|
| 131 |
+
renamed version. Don't drop the guard.
|
| 132 |
+
|
| 133 |
+
---
|
| 134 |
+
|
| 135 |
+
## `Dockerfile` — Medium
|
| 136 |
+
|
| 137 |
+
**What we did:**
|
| 138 |
+
- Default `ARG DATA_BACKEND` flipped from `duckdb` → `v2` in **both**
|
| 139 |
+
stages (builder and runner).
|
| 140 |
+
- Added `ARG SNAPSHOT_URL` + `ENV SNAPSHOT_URL` in both stages,
|
| 141 |
+
defaulting to a pinned `evaleval/eval-cards-data` warehouse path.
|
| 142 |
+
- Comment block rewritten to reflect v2 + legacy coexistence.
|
| 143 |
+
- Kept legacy `LOCAL_PIPELINE_OUTPUT`, `HF_DATA_LOCAL_DIR`,
|
| 144 |
+
`HF_DATA_OFFLINE=1` envs intact (legacy backend still compilable).
|
| 145 |
+
|
| 146 |
+
**Uncommitted tweak (working tree):** `SNAPSHOT_URL` default points at
|
| 147 |
+
`j-chim/temp_evalcard_backend` instead of `evaleval/eval-cards-data` —
|
| 148 |
+
this is the dev/test dataset for the temp HF Space deploy. Do **not**
|
| 149 |
+
commit this override; revert before merging to main, or keep it only
|
| 150 |
+
on your local working copy.
|
| 151 |
+
|
| 152 |
+
**Reconcile:**
|
| 153 |
+
- Keep our `DATA_BACKEND=v2` default and `SNAPSHOT_URL` plumbing.
|
| 154 |
+
- Layer main's non-data changes (base image bumps, `pnpm` version,
|
| 155 |
+
build commands) on top.
|
| 156 |
+
|
| 157 |
+
---
|
| 158 |
+
|
| 159 |
+
## `lib/benchmark-schema.ts` — Low
|
| 160 |
+
|
| 161 |
+
**What we did:** Added one optional field, `num_few_shot?: number`, on
|
| 162 |
+
`GenerationConfig`. That's it.
|
| 163 |
+
|
| 164 |
+
**Reconcile:** Trivially additive. Keep our line; merge tool should
|
| 165 |
+
handle it cleanly unless main touched the same struct.
|
| 166 |
+
|
| 167 |
+
---
|
| 168 |
+
|
| 169 |
+
## `app/page.tsx` — Low
|
| 170 |
+
|
| 171 |
+
**What we did:** One-line copy change in the empty-state banner —
|
| 172 |
+
`corpus-aggregates.json` → `headline.json` (the v2 sidecar name).
|
| 173 |
+
|
| 174 |
+
**Reconcile:** Trivial. Keep ours.
|
| 175 |
+
|
| 176 |
+
---
|
| 177 |
+
|
| 178 |
+
## Order of operations after `git pull`
|
| 179 |
+
|
| 180 |
+
1. Resolve `lib/backend-artifacts.ts` first — it's the schema source
|
| 181 |
+
of truth that the components depend on.
|
| 182 |
+
2. Resolve `lib/data-backend.ts` and `lib/hf-data.ts` — backend wiring.
|
| 183 |
+
3. Resolve the two `components/signals/*` files using the rename recipe.
|
| 184 |
+
4. Resolve `Dockerfile` — keep our v2 envs.
|
| 185 |
+
5. `app/page.tsx` and `lib/benchmark-schema.ts` — should auto-merge or
|
| 186 |
+
be trivial.
|
| 187 |
+
6. Run `pnpm tsc --noEmit` (or whatever the project's typecheck is) to
|
| 188 |
+
catch any v1 field references main introduced that didn't conflict
|
| 189 |
+
textually but break against our renamed types.
|
| 190 |
+
7. Run `pnpm test` — `tests/view-data.test.ts` and
|
| 191 |
+
`tests/duckdb-data.test.ts` should both still pass.
|
| 192 |
+
8. Smoke test with `DATA_BACKEND=v2 SNAPSHOT_URL=file://…` and again
|
| 193 |
+
without (legacy path) — both must render.
|
scripts/cache-hf-data.mjs
CHANGED
|
@@ -18,6 +18,13 @@ import { promisify } from "util"
|
|
| 18 |
const root = path.resolve(new URL(import.meta.url).pathname, "..", "..")
|
| 19 |
const cacheDir = path.join(root, ".cache", "hf-data")
|
| 20 |
const publicDir = path.join(root, "public")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim()
|
| 22 |
|| "https://huggingface.co/datasets/evaleval/card_backend"
|
| 23 |
const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`
|
|
|
|
| 18 |
const root = path.resolve(new URL(import.meta.url).pathname, "..", "..")
|
| 19 |
const cacheDir = path.join(root, ".cache", "hf-data")
|
| 20 |
const publicDir = path.join(root, "public")
|
| 21 |
+
const dataBackend = process.env.DATA_BACKEND?.trim().toLowerCase()
|
| 22 |
+
if (dataBackend === "v2" || dataBackend === "stage-j") {
|
| 23 |
+
await fs.mkdir(cacheDir, { recursive: true })
|
| 24 |
+
console.log("[cache-hf-data] DATA_BACKEND=v2: skipping legacy HF cache; runtime reads SNAPSHOT_URL")
|
| 25 |
+
process.exit(0)
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim()
|
| 29 |
|| "https://huggingface.co/datasets/evaleval/card_backend"
|
| 30 |
const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`
|
tests/duckdb-data.test.ts
CHANGED
|
@@ -12,27 +12,13 @@ function sqlString(value: string) {
|
|
| 12 |
}
|
| 13 |
|
| 14 |
async function writeParquetPayload(outputDir: string, fileName: string, payloads: unknown[]) {
|
| 15 |
-
const parquetDir = path.join(outputDir, "
|
| 16 |
await mkdir(parquetDir, { recursive: true })
|
| 17 |
|
| 18 |
const selects = payloads
|
| 19 |
-
.map((payload
|
| 20 |
-
const record = payload as Record<string, unknown>
|
| 21 |
const payloadJson = JSON.stringify(payload)
|
| 22 |
-
return
|
| 23 |
-
`SELECT 'model_card_lite' AS record_type`,
|
| 24 |
-
`${sqlString(String(record.model_route_id ?? index))} AS model_route_id`,
|
| 25 |
-
`${sqlString(String(record.model_family_id ?? ""))} AS model_family_id`,
|
| 26 |
-
`${sqlString(String(record.developer ?? ""))} AS developer`,
|
| 27 |
-
`NULL AS eval_summary_id`,
|
| 28 |
-
`NULL AS developer_route_id`,
|
| 29 |
-
`NULL AS category`,
|
| 30 |
-
`NULL AS benchmark_family_key`,
|
| 31 |
-
`${Number(record.benchmark_family_count ?? 0)} AS models_count`,
|
| 32 |
-
`${Number(record.total_evaluations ?? 0)} AS total_evaluations`,
|
| 33 |
-
`${sqlString(String(record.last_updated ?? ""))} AS last_updated`,
|
| 34 |
-
`${sqlString(payloadJson)} AS payload_json`,
|
| 35 |
-
].join(", ")
|
| 36 |
})
|
| 37 |
.join(" UNION ALL ")
|
| 38 |
|
|
@@ -49,22 +35,36 @@ describe("DuckDB local data backend", () => {
|
|
| 49 |
process.env.LOCAL_PIPELINE_OUTPUT = outputDir
|
| 50 |
await writeParquetPayload(outputDir, "model_cards_lite.parquet", [
|
| 51 |
{
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
params_billions: 100,
|
| 57 |
-
total_evaluations: 3,
|
| 58 |
-
benchmark_count: 2,
|
| 59 |
-
benchmark_family_count: 2,
|
| 60 |
-
categories_covered: ["reasoning"],
|
| 61 |
-
last_updated: "2026-01-01T00:00:00Z",
|
| 62 |
-
variants: [],
|
| 63 |
score_summary: { count: 1, min: 0.7, max: 0.9, average: 0.8 },
|
| 64 |
benchmark_names: ["mmlu"],
|
| 65 |
-
|
| 66 |
{ benchmark: "mmlu", score: 0.9, metric: "accuracy" },
|
| 67 |
],
|
|
|
|
|
|
|
| 68 |
},
|
| 69 |
])
|
| 70 |
|
|
@@ -93,7 +93,7 @@ describe("DuckDB local data backend", () => {
|
|
| 93 |
try {
|
| 94 |
process.env.LOCAL_PIPELINE_OUTPUT = outputDir
|
| 95 |
await expect(getModelCardsLiteFromDuckDB()).rejects.toThrow(
|
| 96 |
-
/
|
| 97 |
)
|
| 98 |
} finally {
|
| 99 |
if (previousOutput == null) {
|
|
|
|
| 12 |
}
|
| 13 |
|
| 14 |
async function writeParquetPayload(outputDir: string, fileName: string, payloads: unknown[]) {
|
| 15 |
+
const parquetDir = path.join(outputDir, "duckdb", "v1")
|
| 16 |
await mkdir(parquetDir, { recursive: true })
|
| 17 |
|
| 18 |
const selects = payloads
|
| 19 |
+
.map((payload) => {
|
|
|
|
| 20 |
const payloadJson = JSON.stringify(payload)
|
| 21 |
+
return `SELECT ${sqlString(payloadJson)} AS payload_json`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
})
|
| 23 |
.join(" UNION ALL ")
|
| 24 |
|
|
|
|
| 35 |
process.env.LOCAL_PIPELINE_OUTPUT = outputDir
|
| 36 |
await writeParquetPayload(outputDir, "model_cards_lite.parquet", [
|
| 37 |
{
|
| 38 |
+
id: "openai/gpt-5",
|
| 39 |
+
route_id: "openai__gpt-5",
|
| 40 |
+
model_name: "GPT 5",
|
| 41 |
+
model_id: "openai/gpt-5",
|
| 42 |
+
canonical_model_name: "GPT 5",
|
| 43 |
+
developer: "OpenAI",
|
| 44 |
+
evaluations_count: 3,
|
| 45 |
+
benchmarks_count: 2,
|
| 46 |
+
variant_count: 1,
|
| 47 |
+
categories: ["Reasoning"],
|
| 48 |
+
category_stats: { General: 0, Reasoning: 2, Agentic: 0, Safety: 0, Knowledge: 0 },
|
| 49 |
+
latest_timestamp: "2026-01-01T00:00:00Z",
|
| 50 |
+
evaluator_count: 1,
|
| 51 |
+
evaluator_names: ["OpenAI"],
|
| 52 |
+
source_type_count: 1,
|
| 53 |
+
source_types: ["documentation"],
|
| 54 |
+
evidence_count: 3,
|
| 55 |
+
missing_generation_config_count: 0,
|
| 56 |
+
third_party_eval_count: 0,
|
| 57 |
+
independent_verification_ratio: 0,
|
| 58 |
+
reproducibility_status: "complete",
|
| 59 |
+
eval_libraries: [],
|
| 60 |
params_billions: 100,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
score_summary: { count: 1, min: 0.7, max: 0.9, average: 0.8 },
|
| 62 |
benchmark_names: ["mmlu"],
|
| 63 |
+
top_scores: [
|
| 64 |
{ benchmark: "mmlu", score: 0.9, metric: "accuracy" },
|
| 65 |
],
|
| 66 |
+
source_urls: [],
|
| 67 |
+
detail_urls: [],
|
| 68 |
},
|
| 69 |
])
|
| 70 |
|
|
|
|
| 93 |
try {
|
| 94 |
process.env.LOCAL_PIPELINE_OUTPUT = outputDir
|
| 95 |
await expect(getModelCardsLiteFromDuckDB()).rejects.toThrow(
|
| 96 |
+
/duckdb\/v1\/model_cards_lite\.parquet/
|
| 97 |
)
|
| 98 |
} finally {
|
| 99 |
if (previousOutput == null) {
|
tests/view-data.test.ts
ADDED
|
@@ -0,0 +1,466 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { mkdir, mkdtemp, rm, writeFile } from "fs/promises"
|
| 2 |
+
import os from "os"
|
| 3 |
+
import path from "path"
|
| 4 |
+
|
| 5 |
+
import { DuckDBConnection } from "@duckdb/node-api"
|
| 6 |
+
import { describe, expect, it } from "vitest"
|
| 7 |
+
|
| 8 |
+
/**
 * Renders a JavaScript string as a single-quoted SQL string literal.
 *
 * Embedded single quotes are doubled (`'` becomes `''`) so the value can be
 * spliced into a SQL statement without terminating the literal early.
 *
 * @param value - Raw text to embed in SQL.
 * @returns The text wrapped in single quotes with inner quotes doubled.
 */
function sqlString(value: string): string {
  return `'${value.replace(/'/g, "''")}'`
}
|
| 11 |
+
|
| 12 |
+
/**
 * Runs `sql` and writes its result set to `outputPath` as a parquet file.
 *
 * The query is wrapped in a `COPY (...) TO '<path>' (FORMAT parquet)`
 * statement; the destination path is quoted with `sqlString` so paths that
 * contain single quotes do not break the statement.
 *
 * @param connection - Connection used to execute the COPY statement.
 * @param sql - SELECT statement whose rows become the parquet contents.
 * @param outputPath - Filesystem path of the parquet file to write.
 */
async function copyParquet(
  connection: DuckDBConnection,
  sql: string,
  outputPath: string
): Promise<void> {
  await connection.run(`COPY (${sql}) TO ${sqlString(outputPath)} (FORMAT parquet)`)
}
|
| 15 |
+
|
| 16 |
+
/**
 * Materialises a minimal, single-row synthetic snapshot in `snapshotDir`.
 *
 * Files written:
 * - `models_view.parquet`       one model row (`openai/gpt-5`)
 * - `evals_view.parquet`        one evaluation row (`mmlu`)
 * - `eval_results_view.parquet` one result row joining the two above
 * - `manifest.json`             counts plus the names of the two summary artifacts
 * - `headline.json`             corpus aggregates (referenced by the manifest)
 * - `hierarchy.json`            eval hierarchy stats (referenced by the manifest)
 *
 * @param snapshotDir - Directory to create (recursively) and populate.
 */
async function writeSyntheticStageJSnapshot(snapshotDir: string): Promise<void> {
  await mkdir(snapshotDir, { recursive: true })
  const connection = await DuckDBConnection.create()

  try {
    // Model table: a single model with one variant and one benchmark.
    await copyParquet(
      connection,
      `
        SELECT
          TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
          'openai/gpt-5' AS model_key,
          'openai/gpt-5' AS model_id,
          'openai/gpt-5' AS id,
          'openai%2Fgpt-5' AS route_id,
          'openai%2Fgpt-5' AS model_route_id,
          'openai/gpt-5' AS model_family_id,
          'GPT 5' AS model_name,
          'GPT 5' AS canonical_model_name,
          'GPT 5' AS model_family_name,
          'OpenAI' AS developer,
          DATE '2026-01-01' AS release_date,
          'https://example.test/model' AS model_url,
          'transformer' AS architecture,
          '100B' AS params,
          100.0 AS params_billions,
          ['text']::VARCHAR[] AS input_modalities,
          ['text']::VARCHAR[] AS output_modalities,
          'engine' AS inference_engine,
          'platform' AS inference_platform,
          1::BIGINT AS evaluations_count,
          1::BIGINT AS benchmarks_count,
          1::INTEGER AS variant_count,
          1::BIGINT AS evaluator_count,
          ['OpenAI']::VARCHAR[] AS evaluator_names,
          1::INTEGER AS source_type_count,
          ['documentation']::VARCHAR[] AS source_types,
          0::BIGINT AS third_party_eval_count,
          0.0 AS independent_verification_ratio,
          1::BIGINT AS evidence_count,
          0::INTEGER AS missing_generation_config_count,
          TIMESTAMP '2026-05-03 00:00:00' AS latest_timestamp,
          'OpenAI' AS latest_source_name,
          ['MMLU']::VARCHAR[] AS benchmark_names,
          ['Reasoning']::VARCHAR[] AS categories,
          struct_pack("General" := 0, "Reasoning" := 1, "Agentic" := 0, "Safety" := 0, "Knowledge" := 0) AS category_stats,
          'complete' AS reproducibility_status,
          struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
          struct_pack(
            total_results := 1,
            total_groups := 1,
            multi_source_groups := 0,
            first_party_only_groups := 1,
            source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
          ) AS provenance_summary,
          struct_pack(
            total_groups := 1,
            groups_with_variant_check := 0,
            groups_with_cross_party_check := 0,
            variant_divergent_count := 0,
            cross_party_divergent_count := 0
          ) AS comparability_summary,
          [struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR)] AS eval_libraries,
          struct_pack(count := 1, min := 0.8, max := 0.8, average := 0.8) AS score_summary,
          [struct_pack(benchmark := 'MMLU', benchmarkKey := 'mmlu', score := 0.8, metric := 'accuracy')] AS top_scores,
          ['https://example.test/source']::VARCHAR[] AS source_urls,
          []::VARCHAR[] AS detail_urls,
          [struct_pack(
            variant_id := 'default',
            variant_key := 'default',
            variant_label := 'Default',
            variant_display_name := 'GPT 5',
            raw_model_ids := ['openai/gpt-5']::VARCHAR[],
            family_id := 'openai/gpt-5',
            family_name := 'GPT 5',
            version_date := NULL::VARCHAR,
            version_qualifier := NULL::VARCHAR,
            total_evaluations := 1,
            last_updated := TIMESTAMP '2026-05-03 00:00:00',
            categories_covered := ['Reasoning']::VARCHAR[]
          )] AS variants,
          ['openai/gpt-5']::VARCHAR[] AS raw_model_ids
      `,
      path.join(snapshotDir, "models_view.parquet")
    )

    // Evaluation table: a single MMLU evaluation with one root metric.
    await copyParquet(
      connection,
      `
        SELECT
          TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
          'mmlu' AS evaluation_id,
          'mmlu' AS benchmark_id,
          'accuracy' AS primary_metric_id,
          'MMLU' AS evaluation_name,
          'MMLU' AS canonical_display_name,
          'mmlu' AS composite_benchmark_key,
          'MMLU' AS composite_benchmark_name,
          'mmlu' AS benchmark_family_key,
          'mmlu' AS benchmark_leaf_key,
          'Reasoning' AS category,
          struct_pack(
            evaluation_description := 'Accuracy on MMLU',
            lower_is_better := false,
            score_type := 'continuous',
            min_score := 0.0,
            max_score := 1.0,
            unit := 'proportion'
          ) AS metric_config,
          1::BIGINT AS models_count,
          ['OpenAI']::VARCHAR[] AS evaluator_names,
          ['documentation']::VARCHAR[] AS source_types,
          'OpenAI' AS latest_source_name,
          0.0 AS third_party_ratio,
          0::INTEGER AS missing_generation_config_count,
          struct_pack(name := 'GPT 5', score := 0.8) AS best_model,
          struct_pack(name := 'GPT 5', score := 0.8) AS worst_model,
          0.8 AS avg_score,
          0.8 AS avg_score_norm,
          0.8 AS top_score,
          false AS has_card,
          NULL AS benchmark_card,
          false AS is_aggregated,
          [] AS aggregate_sources,
          false AS is_summary_score,
          []::VARCHAR[] AS summary_eval_ids,
          struct_pack(domains := ['knowledge']::VARCHAR[], languages := ['en']::VARCHAR[], tasks := ['qa']::VARCHAR[]) AS tags,
          struct_pack(
            dataset_name := 'MMLU',
            source_type := 'documentation',
            hf_repo := NULL::VARCHAR,
            hf_split := NULL::VARCHAR,
            samples_number := 10,
            url := ['https://example.test/mmlu']::VARCHAR[],
            dataset_url := 'https://example.test/mmlu',
            dataset_version := 'v1'
          ) AS source_data,
          struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
          struct_pack(
            total_results := 1,
            total_groups := 1,
            multi_source_groups := 0,
            first_party_only_groups := 1,
            source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
          ) AS provenance_summary,
          struct_pack(
            total_groups := 1,
            groups_with_variant_check := 0,
            groups_with_cross_party_check := 0,
            variant_divergent_count := 0,
            cross_party_divergent_count := 0
          ) AS comparability_summary,
          struct_pack(available := false, url_count := 0::BIGINT, sample_urls := []::VARCHAR[], models_with_loaded_instances := 0) AS instance_data,
          1::INTEGER AS metrics_count,
          ['Accuracy']::VARCHAR[] AS metric_names,
          [struct_pack(
            column_key := 'root:accuracy',
            metric_summary_id := 'mmlu%3Aaccuracy',
            metric_id := 'accuracy',
            metric_name := 'accuracy',
            display_name := 'Accuracy',
            canonical_display_name := 'Accuracy',
            lower_is_better := false,
            unit := 'proportion',
            scope := 'root',
            subtask_key := NULL::VARCHAR,
            subtask_name := NULL::VARCHAR
          )] AS leaderboard_metrics,
          [] AS leaderboard_rows,
          [struct_pack(
            metric_summary_id := 'mmlu%3Aaccuracy',
            metric_name := 'accuracy',
            display_name := 'Accuracy',
            canonical_display_name := 'Accuracy',
            metric_key := 'accuracy',
            lower_is_better := false,
            models_count := 1,
            top_score := 0.8,
            unit := 'proportion'
          )] AS root_metrics,
          [] AS subtasks,
          0::INTEGER AS subtasks_count
      `,
      path.join(snapshotDir, "evals_view.parquet")
    )

    // Result table: the one (model, evaluation, metric) score linking the rows above.
    await copyParquet(
      connection,
      `
        SELECT
          TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
          'mmlu' AS evaluation_id,
          'mmlu%3Aaccuracy' AS metric_summary_id,
          'mmlu' AS benchmark_id,
          'accuracy' AS metric_id,
          'openai/gpt-5' AS model_key,
          'openai/gpt-5' AS model_id,
          'openai%2Fgpt-5' AS model_route_id,
          struct_pack(
            name := 'GPT 5',
            id := 'openai/gpt-5',
            developer := 'OpenAI',
            inference_platform := 'platform',
            inference_engine := 'engine',
            model_version := NULL::VARCHAR,
            architecture := 'transformer',
            parameter_count := '100B',
            release_date := '2026-01-01',
            model_url := 'https://example.test/model',
            modalities := struct_pack(input := ['text']::VARCHAR[], output := ['text']::VARCHAR[])
          ) AS model_info,
          'Accuracy' AS metric_display_name,
          'proportion' AS metric_unit,
          false AS lower_is_better,
          'Reasoning' AS category,
          0.8 AS score,
          struct_pack(
            score := 0.8,
            standard_error := 0.01,
            sample_size := 10,
            confidence_interval := struct_pack(lower := 0.7, upper := 0.9, confidence_level := 0.95)
          ) AS score_details,
          1::INTEGER AS fact_row_count,
          1::INTEGER AS position,
          1::INTEGER AS total,
          1.0 AS percentile,
          TIMESTAMP '2026-05-03 00:00:00' AS evaluation_timestamp,
          struct_pack(
            source_name := 'OpenAI report',
            source_type := 'documentation',
            source_organization_name := 'OpenAI',
            source_organization_url := 'https://example.test',
            evaluator_relationship := 'first_party',
            source_url := 'https://example.test/report',
            publication_date := DATE '2026-05-03'
          ) AS source_metadata,
          struct_pack(
            dataset_name := 'MMLU',
            source_type := 'documentation',
            hf_repo := NULL::VARCHAR,
            hf_split := NULL::VARCHAR,
            samples_number := 10,
            url := ['https://example.test/mmlu']::VARCHAR[],
            dataset_url := 'https://example.test/mmlu',
            dataset_version := 'v1'
          ) AS source_data,
          'https://example.test/record.json' AS source_record_url,
          struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR) AS eval_library,
          ['first_party']::VARCHAR[] AS evaluator_relationships,
          true AS has_first_party,
          false AS has_third_party,
          'self' AS coverage_cell,
          ['OpenAI']::VARCHAR[] AS reporting_orgs,
          map(['OpenAI'], [0.8]) AS scores_by_organization,
          false AS is_summary_score,
          NULL::VARCHAR AS summary_score_for,
          [] AS aggregate_components,
          false AS has_reproducibility_gap,
          1.0 AS completeness_score,
          false AS is_multi_source,
          true AS first_party_only,
          false AS has_variant_divergence,
          false AS has_cross_party_divergence,
          NULL AS evalcards_annotations,
          NULL::VARCHAR AS instance_file_path,
          NULL::VARCHAR AS instance_file_format,
          0::INTEGER AS instance_rows
      `,
      path.join(snapshotDir, "eval_results_view.parquet")
    )
  } finally {
    // Release the DuckDB connection even if one of the COPY statements throws.
    connection.closeSync()
  }

  // Manifest: counts plus the file names of the two summary artifacts written below.
  await writeFile(
    path.join(snapshotDir, "manifest.json"),
    JSON.stringify({
      generated_at: "2026-05-03T00:00:00Z",
      config_version: 2,
      skipped_configs: [],
      model_count: 1,
      eval_count: 1,
      metric_eval_count: 1,
      source_config_count: 1,
      skipped_config_count: 0,
      summary_artifacts: {
        corpus_aggregates: "headline.json",
        eval_hierarchy: "hierarchy.json",
      },
    })
  )

  // Each aggregate block is reused for both `overall` and the single `Reasoning` category.
  const reproducibilityBlock = {
    total_triples: 1,
    triples_with_reproducibility_gap: 0,
    reproducibility_gap_rate: 0,
    agentic_triples: 0,
    per_field_missingness: {
      temperature: {
        missing_count: 0,
        missing_rate: 0,
        denominator: "all_triples",
        denominator_count: 1,
      },
    },
  }
  const completenessBlock = {
    total_triples: 1,
    completeness_avg: 0.75,
    completeness_min: 0.75,
    completeness_max: 0.75,
  }
  const provenanceBlock = {
    total_triples: 1,
    multi_source_triples: 0,
    first_party_only_triples: 1,
    source_type_distribution: {
      first_party: 1,
      third_party: 0,
      collaborative: 0,
      unspecified: 0,
    },
  }
  const comparabilityBlock = {
    total_triples: 1,
    variant_divergent_count: 0,
    cross_party_divergent_count: 0,
    groups_with_variant_check: 1,
    groups_with_cross_party_check: 0,
  }

  // Corpus aggregates sidecar (the manifest's `corpus_aggregates` entry).
  await writeFile(
    path.join(snapshotDir, "headline.json"),
    JSON.stringify({
      generated_at: "2026-05-03T00:00:00Z",
      signal_version: "1.0",
      stratification_dimensions: ["category"],
      reproducibility: {
        overall: reproducibilityBlock,
        by_category: { Reasoning: reproducibilityBlock },
      },
      completeness: {
        overall: completenessBlock,
        by_category: { Reasoning: completenessBlock },
      },
      provenance: {
        overall: provenanceBlock,
        by_category: { Reasoning: provenanceBlock },
      },
      comparability: {
        overall: comparabilityBlock,
        by_category: { Reasoning: comparabilityBlock },
      },
      developers: [
        {
          developer: "OpenAI",
          route_id: "OpenAI",
          model_count: 1,
          benchmark_count: 1,
          evaluation_count: 1,
          popular_evals: [{ benchmark: "MMLU", model_count: 1 }],
        },
      ],
    })
  )

  // Eval hierarchy sidecar (the manifest's `eval_hierarchy` entry).
  await writeFile(
    path.join(snapshotDir, "hierarchy.json"),
    JSON.stringify({
      stats: {
        family_count: 1,
        composite_count: 0,
        standalone_benchmark_count: 1,
        single_benchmark_count: 1,
        slice_count: 0,
        metric_count: 1,
        metric_rows_scanned: 1,
      },
      families: [],
    })
  )
}
|
| 392 |
+
|
| 393 |
+
/**
 * Puts one environment variable back to the value captured before the test
 * touched it. A variable that was previously unset is deleted rather than
 * assigned, because assigning `undefined` to `process.env` stores the string
 * "undefined".
 */
function restoreEnvVar(name: string, previous: string | undefined): void {
  if (previous == null) {
    delete process.env[name]
  } else {
    process.env[name] = previous
  }
}

describe("Stage J view-layer backend", () => {
  it("reads a pinned snapshot through the v2 accessors", async () => {
    const snapshotDir = await mkdtemp(path.join(os.tmpdir(), "eval-card-stage-j-"))
    const previousBackend = process.env.DATA_BACKEND
    const previousSnapshotUrl = process.env.SNAPSHOT_URL

    try {
      await writeSyntheticStageJSnapshot(snapshotDir)
      process.env.DATA_BACKEND = "v2"
      process.env.SNAPSHOT_URL = `file://${snapshotDir}`

      // Imported only after the env vars are set, so the modules are loaded
      // with the v2 backend and the synthetic snapshot already configured.
      const dataBackend = await import("../lib/data-backend")
      const hfData = await import("../lib/hf-data")

      const [models, evalListData, modelSummary, evalSummary, developers, developerSummary, manifest, hierarchy, aggregates] =
        await Promise.all([
          dataBackend.getModelCardsLite(),
          dataBackend.getEvalListLiteData(),
          dataBackend.getModelSummaryById("openai%2Fgpt-5"),
          dataBackend.getEvalSummaryById("mmlu"),
          dataBackend.getDeveloperList(),
          dataBackend.getDeveloperSummaryById("OpenAI"),
          dataBackend.getBackendManifestData(),
          dataBackend.getEvalHierarchyData(),
          hfData.fetchCorpusAggregates(),
        ])

      // Model and evaluation listings come from the parquet views.
      expect(models[0]).toMatchObject({
        route_id: "openai%2Fgpt-5",
        model_name: "GPT 5",
        evaluations_count: 1,
      })
      expect(evalListData).toMatchObject({
        totalModels: 1,
        evals: [{ evaluation_id: "mmlu", evaluation_name: "MMLU", models_count: 1 }],
      })

      // Per-model and per-eval summaries join against the results view.
      expect(modelSummary?.evaluations_by_category.Reasoning).toHaveLength(1)
      expect(evalSummary?.model_results[0]).toMatchObject({
        model_route_id: "openai%2Fgpt-5",
        score: 0.8,
        result: { metric_summary_id: "mmlu%3Aaccuracy" },
      })

      // Developer data and the JSON sidecars (manifest, hierarchy, aggregates).
      expect(developers[0]).toMatchObject({ developer: "OpenAI", route_id: "OpenAI" })
      expect(developerSummary?.models).toHaveLength(1)
      expect(manifest.model_count).toBe(1)
      expect(hierarchy.stats?.metric_rows_scanned).toBe(1)
      expect(aggregates?.completeness.overall).toMatchObject({
        total_triples: 1,
        completeness_avg: 0.75,
      })
      expect(aggregates?.provenance.overall).toMatchObject({
        total_triples: 1,
        first_party_only_triples: 1,
      })
      expect(aggregates?.comparability.overall).toMatchObject({
        groups_with_variant_check: 1,
        variant_divergent_count: 0,
      })
      expect(aggregates?.comparability.by_category.Reasoning).toBeDefined()
    } finally {
      // Undo the env overrides and remove the temp snapshot whether or not the
      // assertions passed.
      restoreEnvVar("DATA_BACKEND", previousBackend)
      restoreEnvVar("SNAPSHOT_URL", previousSnapshotUrl)
      await rm(snapshotDir, { recursive: true, force: true })
    }
  })
})
|