Spaces:
Running
Running
File size: 37,678 Bytes
fe99ffa 553b175 fe99ffa bfb71af fe99ffa 7a54021 fe99ffa d249d5b fe99ffa d249d5b 2ed4959 11542d9 fe99ffa 11542d9 2ed4959 11542d9 d249d5b 0641374 fe99ffa 0641374 d249d5b 0641374 fe99ffa 7c13d55 fe99ffa 0641374 f8940f7 0641374 fe99ffa 11542d9 2ed4959 11542d9 f8940f7 0641374 d249d5b fe99ffa d49f850 fe99ffa d49f850 fe99ffa 9b2a4b8 718288a 9b2a4b8 225b586 718288a a9845fb 718288a 9b2a4b8 718288a fe99ffa beb4e3a fe99ffa d249d5b f8940f7 d249d5b f8940f7 d249d5b f8940f7 d249d5b fe99ffa d249d5b fe99ffa 7c13d55 fe99ffa 7c13d55 fe99ffa 7c13d55 fe99ffa 7c13d55 fe99ffa 915be2f bfb71af fe99ffa 915be2f fe99ffa 915be2f fe99ffa 915be2f fe99ffa 915be2f fe99ffa d249d5b 2ed4959 fe99ffa 2ed4959 fe99ffa 915be2f fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b fe99ffa d249d5b beb4e3a fe99ffa 6e90b4d d249d5b 0641374 beb4e3a 0641374 d249d5b fe99ffa beb4e3a fe99ffa 11542d9 fe99ffa 11542d9 fe99ffa 11542d9 fe99ffa 553b175 fe99ffa 11542d9 fe99ffa 11542d9 fe99ffa 553b175 fe99ffa d249d5b 0641374 beb4e3a 0641374 fe99ffa d249d5b 553b175 7a54021 553b175 7a54021 553b175 fe99ffa d249d5b fe99ffa d249d5b fe99ffa 11542d9 fe99ffa beb4e3a fe99ffa 11542d9 fe99ffa | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 
711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 | import "server-only"
import fs from "node:fs"
import path from "node:path"
import { getConnection } from "@/lib/duckdb"
import { fetchHeadline } from "@/lib/sidecars"
import {
type BenchmarkCard,
type BenchmarkEvaluation,
type EvaluationCardData,
type EvaluationResult,
type GenerationConfig,
type MetricConfig,
type ModelInfo,
type ModelEvaluationSummary,
type ModelVariantSummary,
type ScoreDetails,
type SourceData,
type SourceMetadata,
} from "@/lib/benchmark-schema"
import type { DeveloperListEntry, RowAnnotations } from "@/lib/backend-artifacts"
import type {
BenchmarkEvalListItem,
BenchmarkEvalSummary,
ModelResultForBenchmark,
} from "@/lib/eval-processing"
import { dedupeLeaderboardRowsByModelIdentity } from "@/lib/eval-processing"
// Loosely-typed result row (the default row type of `readRows`). Fields
// are narrowed by the as*/optional*/coerce* helpers in this file before use.
type Row = Record<string, any>
// Column projection for the model-card queries against `models_view`
// (see getModelCards / getModelCardsLite). Note that `derived_tags` is
// exposed under the `tags` alias.
const MODEL_CARD_COLUMNS = `
id, model_key, route_id, model_name, model_id, canonical_model_name, developer,
evaluations_count, benchmarks_count, variant_count,
derived_tags AS tags, tag_stats, latest_timestamp,
evaluator_count, evaluator_names, source_type_count, source_types,
evidence_count, missing_generation_config_count,
third_party_eval_count, independent_verification_ratio,
reproducibility_status, eval_libraries, latest_source_name,
params_billions, benchmark_names, score_summary,
reproducibility_summary, provenance_summary, comparability_summary,
top_scores, source_urls, detail_urls,
model_url, release_date,
architecture, params, inference_engine, inference_platform
`
// The composite/family/slice taxonomy replaced the legacy
// `composite_benchmark_key` /
// `composite_benchmark_name` columns with `composite_slug` /
// `composite_display_name`. The `family_id` / `family_display_name` /
// `is_slice` columns are the canonical identity surface; we still
// alias the composite_* legacy names for backward compat with
// consumers that haven't migrated yet. Mapping:
// composite_benchmark_key/name → composite_slug/display_name
// (the leaderboard, e.g. "wasp"/"WASP" — what the eval-detail
// "Composite" label shows)
// Column projection for queries against `evals_view`. Columns wrapped in
// CAST(to_json(...) AS VARCHAR) arrive as JSON text and are decoded by the
// callers with `parseMaybeJson`; `derived_tags` is selected as-is and
// decoded with `coerceTags`.
const EVAL_LIST_COLUMNS = `
evaluation_id, evaluation_name, canonical_display_name,
benchmark_id,
composite_slug, composite_display_name,
family_id, family_display_name, is_slice,
parent_benchmark_id,
composite_slug AS composite_benchmark_key,
composite_display_name AS composite_benchmark_name,
family_display_name AS benchmark_family_name,
derived_tags,
CAST(to_json(metric_config) AS VARCHAR) AS metric_config,
models_count, evaluator_names, source_types,
latest_source_name, third_party_ratio,
missing_generation_config_count, best_model, worst_model,
avg_score, avg_score_norm, has_card, CAST(to_json(benchmark_card) AS VARCHAR) AS benchmark_card,
is_aggregated, CAST(to_json(aggregate_sources) AS VARCHAR) AS aggregate_sources, CAST(to_json(tags) AS VARCHAR) AS tags,
metrics_count, metric_names, CAST(to_json(instance_data) AS VARCHAR) AS instance_data, top_score,
subtasks_count, is_summary_score,
CAST(to_json(root_metrics) AS VARCHAR) AS root_metrics,
CAST(to_json(subtasks) AS VARCHAR) AS subtasks,
CAST(to_json(leaderboard_metrics) AS VARCHAR) AS leaderboard_metrics,
CAST(to_json(reproducibility_summary) AS VARCHAR) AS reproducibility_summary,
CAST(to_json(provenance_summary) AS VARCHAR) AS provenance_summary,
CAST(to_json(comparability_summary) AS VARCHAR) AS comparability_summary,
CAST(to_json(source_data) AS VARCHAR) AS source_data
`
// The deployed Space returns 500s ("Invalid Error: don't know what
// type:") on every eval-results / model-summary query because the
// DuckDB Node binding on linux-x64 can't materialise certain complex
// column types in the upstream parquet (nested JSON inside
// structs, MAP, and STRUCT[]). Wrap every non-primitive column with
// `to_json(...)` so the binding only ever sees VARCHAR per row;
// `parseMaybeJson` undoes the wrap in JS before downstream code
// reads the shapes.
// Projection for the per-result ("cell") queries. It expects the query to
// alias `eval_results_view` as `r` and `evals_view` as `e`; every column
// taken from `e` is re-exposed with an `eval_` prefix so it cannot collide
// with the same-named column on `r`.
const CELL_JOIN_COLUMNS = `
r.snapshot_id,
r.evaluation_id,
r.metric_summary_id,
r.benchmark_id,
r.metric_id,
r.model_key,
r.model_id,
r.model_route_id,
CAST(to_json(r.model_info) AS VARCHAR) AS model_info,
r.metric_display_name,
r.metric_unit,
r.lower_is_better,
CAST(to_json(r.derived_tags) AS VARCHAR) AS derived_tags,
r.score,
CAST(to_json(r.score_details) AS VARCHAR) AS score_details,
r.fact_row_count,
r.position,
r.total,
r.percentile,
r.evaluation_timestamp,
CAST(to_json(r.source_metadata) AS VARCHAR) AS source_metadata,
CAST(to_json(r.source_data) AS VARCHAR) AS source_data,
r.source_record_url,
CAST(to_json(r.eval_library) AS VARCHAR) AS eval_library,
CAST(to_json(r.evaluator_relationships) AS VARCHAR) AS evaluator_relationships,
r.has_first_party,
r.has_third_party,
r.coverage_cell,
CAST(to_json(r.reporting_orgs) AS VARCHAR) AS reporting_orgs,
CAST(to_json(r.scores_by_organization) AS VARCHAR) AS scores_by_organization,
r.is_summary_score,
r.summary_score_for,
CAST(to_json(r.aggregate_components) AS VARCHAR) AS aggregate_components,
r.has_reproducibility_gap,
r.completeness_score,
r.is_multi_source,
r.first_party_only,
r.has_variant_divergence,
r.has_cross_party_divergence,
CAST(to_json(r.evalcards_annotations) AS VARCHAR) AS evalcards_annotations,
r.instance_file_path,
r.instance_file_format,
r.instance_rows,
e.evaluation_name AS eval_evaluation_name,
e.canonical_display_name AS eval_canonical_display_name,
e.benchmark_id AS eval_benchmark_id,
e.composite_slug AS eval_composite_slug,
e.composite_display_name AS eval_composite_display_name,
e.family_id AS eval_family_id,
e.family_display_name AS eval_family_display_name,
e.is_slice AS eval_is_slice,
e.parent_benchmark_id AS eval_parent_benchmark_id,
e.composite_slug AS eval_composite_benchmark_key,
e.composite_display_name AS eval_composite_benchmark_name,
e.family_display_name AS eval_benchmark_family_name,
CAST(to_json(e.derived_tags) AS VARCHAR) AS eval_derived_tags,
CAST(to_json(e.metric_config) AS VARCHAR) AS eval_metric_config,
CAST(to_json(e.source_data) AS VARCHAR) AS eval_source_data,
CAST(to_json(e.benchmark_card) AS VARCHAR) AS eval_benchmark_card,
CAST(to_json(e.tags) AS VARCHAR) AS eval_tags,
e.is_summary_score AS eval_is_summary_score
`
// Matches an ASCII signed integer (no decimals, no leading zeros aside from
// "0" itself). Used to detect BIGINT columns that `getRowObjectsJson()`
// serialises as strings — the JSON form does this inconsistently per
// value (numbers within int32 range stay numeric, larger ones become
// strings), so consumers see a mixed-type field and `sum + value`
// silently concatenates instead of adding.
const BIGINT_STRING = /^-?(?:0|[1-9]\d*)$/

/**
 * Recursively converts a value produced by the DuckDB binding into plain,
 * JSON-friendly JavaScript data.
 *
 * - `bigint` becomes `number`.
 * - Strings that look like canonical signed integers become `number`, but
 *   only when the result is a safe integer; anything else stays a string.
 * - `Date` becomes an ISO-8601 string.
 * - `Map` and DuckDB map wrappers become plain objects with string keys.
 * - Arrays and DuckDB list/array wrappers become arrays; DuckDB struct
 *   wrappers are unwrapped to their entries; DuckDB decimals become
 *   `number`; any other `DuckDB*` wrapper is stringified.
 * - Every other object is copied with each property value normalised.
 *
 * @param value Arbitrary cell, row, or nested value.
 * @returns The normalised value; primitives not listed above pass through.
 */
function normalizeDuckDBValue(value: unknown): unknown {
  switch (typeof value) {
    case "bigint":
      return Number(value)
    case "string": {
      if (!BIGINT_STRING.test(value)) return value
      // Only recover the number when it round-trips without precision loss.
      const recovered = Number(value)
      return Number.isSafeInteger(recovered) ? recovered : value
    }
    case "object":
      break
    default:
      return value
  }
  if (value === null) return value

  if (value instanceof Date) return value.toISOString()

  if (value instanceof Map) {
    const pairs: Array<[string, unknown]> = []
    for (const [key, inner] of value.entries()) {
      pairs.push([String(key), normalizeDuckDBValue(inner)])
    }
    return Object.fromEntries(pairs)
  }

  if (Array.isArray(value)) {
    return value.map((item) => normalizeDuckDBValue(item))
  }

  // The binding's wrapper classes are recognised by constructor name so
  // this module does not need to import them.
  const wrapper = value as {
    constructor?: { name?: string }
    entries?: unknown
    items?: unknown
    scale?: unknown
    value?: unknown
    toString?: () => string
  }
  const typeName = wrapper.constructor?.name ?? ""

  if (typeName === "DuckDBStructValue" && wrapper.entries && typeof wrapper.entries === "object") {
    return normalizeDuckDBValue(wrapper.entries)
  }

  const isListLike = typeName === "DuckDBListValue" || typeName === "DuckDBArrayValue"
  if (isListLike && Array.isArray(wrapper.items)) {
    return wrapper.items.map((item) => normalizeDuckDBValue(item))
  }

  if (typeName === "DuckDBMapValue" && Array.isArray(wrapper.entries)) {
    const pairs = wrapper.entries.map((entry) => {
      const { key, value: inner } = entry as { key: unknown; value: unknown }
      return [String(key), normalizeDuckDBValue(inner)]
    })
    return Object.fromEntries(pairs)
  }

  if (typeName === "DuckDBDecimalValue" && typeof wrapper.toString === "function") {
    return Number(wrapper.toString())
  }

  if (typeName.startsWith("DuckDB") && typeof wrapper.toString === "function") {
    return wrapper.toString()
  }

  // Plain object: copy it, normalising each property value.
  const normalisedPairs = Object.entries(value).map(
    ([key, inner]) => [key, normalizeDuckDBValue(inner)]
  )
  return Object.fromEntries(normalisedPairs)
}
/**
 * Runs `sql` on the shared DuckDB connection and returns every row as a
 * plain object, passed through `normalizeDuckDBValue`.
 *
 * The call is split into `runAndRead` and `readAll` so that, when fetching
 * the chunks fails, the reader is still available and its column metadata
 * can be included in the log line.
 *
 * @param sql SQL text; whitespace is collapsed and the text truncated to
 *   1200 characters when it is logged.
 * @param params Positional parameters; omitted from the call when empty.
 * @returns The normalised rows.
 * @throws Rethrows the binding's error after logging it with context.
 */
async function readRows<T = Row>(sql: string, params: unknown[] = []): Promise<T[]> {
  const connection = await getConnection()

  // Built lazily so the happy path pays nothing for log formatting.
  const describeFailure = (err: unknown) => ({
    sqlSnippet: sql.replace(/\s+/g, " ").slice(0, 1200),
    msg: err instanceof Error ? `${err.name}: ${err.message}` : String(err),
  })

  let reader
  try {
    if (params.length > 0) {
      reader = await connection.runAndRead(sql, params as any[])
    } else {
      reader = await connection.runAndRead(sql)
    }
  } catch (err) {
    const { sqlSnippet, msg } = describeFailure(err)
    console.error(`[view-data] runAndRead failed (${msg}) β SQL: ${sqlSnippet}`)
    throw err
  }

  try {
    await reader.readAll()
    // getRowObjectsJson() yields JSON-serialisable rows; normalizeDuckDBValue
    // then tidies whatever is left (e.g. integer-like strings).
    const jsonRows = reader.getRowObjectsJson()
    return jsonRows.map((row) => normalizeDuckDBValue(row) as T)
  } catch (err) {
    const { sqlSnippet, msg } = describeFailure(err)
    let columnSchema: string = "<unavailable>"
    try {
      columnSchema = JSON.stringify(reader.columnNameAndTypeObjectsJson())
    } catch (introspectErr) {
      const reason = introspectErr instanceof Error ? introspectErr.message : String(introspectErr)
      columnSchema = `<introspect-failed: ${reason}>`
    }
    const context = `columnCount=${reader.columnCount} columns=${columnSchema}`
    console.error(`[view-data] readAll/getRows failed (${msg}) β ${context} β SQL: ${sqlSnippet}`)
    throw err
  }
}
/**
 * Coerces a loosely-typed value to a finite number.
 *
 * @param value Finite numbers are returned as-is, bigints are converted
 *   with `Number()`, and non-blank strings are parsed with `Number()`.
 * @param fallback Returned for anything else, including non-finite numbers
 *   and unparseable strings.
 */
function asNumber(value: unknown, fallback = 0) {
  switch (typeof value) {
    case "number":
      return Number.isFinite(value) ? value : fallback
    case "bigint":
      return Number(value)
    case "string": {
      if (value.trim() === "") return fallback
      const parsed = Number(value)
      return Number.isFinite(parsed) ? parsed : fallback
    }
    default:
      return fallback
  }
}
/**
 * Like `asNumber`, but returns `undefined` instead of a fallback when the
 * value is null/undefined or cannot be read as a finite number.
 */
function optionalNumber(value: unknown) {
  if (value === null || value === undefined) return undefined
  // NaN as the fallback marks "could not coerce".
  const candidate = asNumber(value, Number.NaN)
  if (Number.isFinite(candidate)) return candidate
  return undefined
}
/**
 * Returns `value` when it is a string (including the empty string),
 * otherwise `fallback`.
 */
function asString(value: unknown, fallback = "") {
  if (typeof value === "string") return value
  return fallback
}
/**
 * Returns `value` when it is a non-empty string, otherwise `undefined`.
 */
function optionalString(value: unknown) {
  if (typeof value !== "string" || value.length === 0) return undefined
  return value
}
// Some parquet columns ship JSON-typed fields nested inside structs
// that the DuckDB Node binding can't materialise (crashes the entire
// query with "don't know what type:"). For those columns the SELECT
// wraps the value in `to_json(...)` so the binding sees a single
// VARCHAR; this helper undoes the wrap. If the value is already an
// object (legacy snapshots without the to_json wrap, or local dev
// where the binding handled the type), pass it through unchanged.
/**
 * Decodes a value that may be JSON text.
 *
 * @param value Non-strings are returned unchanged. The empty string and the
 *   literal text `null` yield `null`. Any other string is parsed as JSON.
 * @returns The decoded value, or the original string when it is not valid
 *   JSON.
 */
function parseMaybeJson(value: unknown): unknown {
  if (typeof value === "string") {
    if (value.length === 0 || value === "null") return null
    let decoded: unknown
    try {
      decoded = JSON.parse(value)
    } catch {
      // Not JSON after all: hand the raw text back.
      return value
    }
    return decoded
  }
  return value
}
/**
 * Returns `value` itself when it is an array (elements are not checked),
 * otherwise a new empty array.
 */
function asArray<T>(value: unknown): T[] {
  if (!Array.isArray(value)) return []
  return value as T[]
}
// derived_tags arrives as a native list (models_view: VARCHAR[]) or a
// JSON-encoded string (evals_view / eval_results_view: VARCHAR). Coerce
// either into a string[].
/**
 * Normalises a tag list to `string[]`.
 *
 * Accepts an array (non-string items are dropped) or JSON text encoding
 * one. At most three passes are made, each either accepting an array or
 * decoding one layer of JSON text. Anything else, including invalid JSON,
 * yields an empty array.
 */
function coerceTags(value: unknown): string[] {
  const maxPasses = 3
  let candidate = value
  let pass = 0
  while (pass < maxPasses) {
    if (Array.isArray(candidate)) {
      const tags: string[] = []
      for (const item of candidate) {
        if (typeof item === "string") tags.push(item)
      }
      return tags
    }
    if (typeof candidate !== "string" || candidate === "") return []
    try {
      candidate = JSON.parse(candidate)
    } catch {
      return []
    }
    pass += 1
  }
  return []
}
// tag_stats is a JSON column ({tag: count}); coerce string-or-object into
// a plain Record<string, number>.
/**
 * Normalises a tag-count mapping to `Record<string, number>`.
 *
 * Accepts either an object or non-empty JSON text encoding one. Counts are
 * converted with `Number()`, with non-numeric and zero results stored as 0.
 * Arrays, primitives, and invalid JSON yield an empty record.
 */
function coerceTagStats(value: unknown): Record<string, number> {
  let decoded = value
  if (typeof value === "string" && value !== "") {
    try {
      decoded = JSON.parse(value)
    } catch {
      return {}
    }
  }
  if (!decoded || typeof decoded !== "object" || Array.isArray(decoded)) {
    return {}
  }
  const source = decoded as Record<string, unknown>
  const stats: Record<string, number> = {}
  for (const [tag, count] of Object.entries(source)) {
    const numeric = Number(count)
    stats[tag] = numeric || 0
  }
  return stats
}
// Model-card rows carry `tags` (derived_tags AS tags) and `tag_stats`
// straight off the parquet; normalise their runtime shapes.
/**
 * Copies a model-card row, replacing `tags` with a `string[]` and
 * `tag_stats` with a `Record<string, number>`.
 */
function finalizeModelCard(row: Row): EvaluationCardData {
  const tags = coerceTags(row.tags)
  const tag_stats = coerceTagStats(row.tag_stats)
  return { ...row, tags, tag_stats } as EvaluationCardData
}
/**
 * Returns the row's `source_metadata`, decoded from JSON text when needed.
 * When it does not decode to an object, a placeholder record is built that
 * names the row's `latest_source_name` (or "Unknown") as the organisation.
 */
function sourceMetadataFromRow(row: Row): SourceMetadata {
  const decoded = parseMaybeJson(row.source_metadata)
  const isObject = typeof decoded === "object" && decoded !== null
  if (isObject) return decoded as SourceMetadata

  const organization = asString(row.latest_source_name, "Unknown")
  return {
    source_type: "documentation",
    source_organization_name: organization,
    evaluator_relationship: "other",
  }
}
/**
 * Returns the row's `source_data`, preferring the result-level column and
 * falling back to the eval-level `eval_source_data`. When neither yields a
 * truthy value, a minimal record is built whose `dataset_name` is the first
 * available of the evaluation name(s) or benchmark id.
 */
function sourceDataFromRow(row: Row): BenchmarkEvaluation["source_data"] {
  const resultLevel = parseMaybeJson(row.source_data)
  const decoded = resultLevel ?? parseMaybeJson(row.eval_source_data)
  if (!decoded) {
    const datasetName = row.eval_evaluation_name ?? row.evaluation_name ?? row.benchmark_id
    return {
      dataset_name: asString(datasetName, "Unknown dataset"),
    } satisfies SourceData
  }
  return decoded as BenchmarkEvaluation["source_data"]
}
/**
 * Builds the score-details record for a row: the decoded `score_details`
 * object (if any) with `score` forced to a number, taken from the details
 * when present and from the row's `score` column otherwise.
 */
function scoreDetailsFromRow(row: Row): ScoreDetails {
  const decoded = parseMaybeJson(row.score_details)
  let details: Partial<ScoreDetails> = {}
  if (decoded && typeof decoded === "object") {
    details = decoded as Partial<ScoreDetails>
  }
  const numericScore = asNumber(details.score ?? row.score)
  return { ...details, score: numericScore } as ScoreDetails
}
/**
 * Assembles a `MetricConfig` from a row.
 *
 * The JSON config is read from `metric_config`, then `eval_metric_config`.
 * Row-level `lower_is_better` and `metric_unit` take precedence over the
 * config; `min_score` and `max_score` prefer the config. Any `score_type`
 * other than "binary" or "discrete" is reported as "continuous".
 */
function metricConfigFromRow(row: Row): MetricConfig {
  const decoded = parseMaybeJson(row.metric_config) ?? parseMaybeJson(row.eval_metric_config) ?? {}
  const config = decoded as Partial<MetricConfig>

  // First available human-readable description, most specific first.
  const description =
    config.evaluation_description ??
    row.metric_description ??
    row.metric_display_name ??
    row.eval_evaluation_name ??
    row.evaluation_name
  const lowerIsBetter = row.lower_is_better ?? config.lower_is_better ?? false

  return {
    evaluation_description: asString(description, ""),
    lower_is_better: Boolean(lowerIsBetter),
    score_type:
      config.score_type === "binary" || config.score_type === "discrete"
        ? config.score_type
        : "continuous",
    min_score: optionalNumber(config.min_score ?? row.min_score),
    max_score: optionalNumber(config.max_score ?? row.max_score),
    unit: optionalString(row.metric_unit ?? config.unit),
  }
}
/**
 * Builds a `ModelInfo` from flat model columns. It is used when a row has
 * no usable `model_info` object of its own.
 *
 * The name falls back through model_name, model_family_name, model_id and
 * model_key; the id through model_key, model_id, id and route_id.
 */
function modelInfoFromModelRow(row: Row): ModelInfo {
  const displayName = row.model_name ?? row.model_family_name ?? row.model_id ?? row.model_key
  const identifier = row.model_key ?? row.model_id ?? row.id ?? row.route_id
  const modalities = {
    input: asArray<string>(row.input_modalities),
    output: asArray<string>(row.output_modalities),
  }
  return {
    name: asString(displayName, "Unknown model"),
    id: asString(identifier, "unknown-model"),
    developer: optionalString(row.developer),
    inference_platform: optionalString(row.inference_platform),
    inference_engine: optionalString(row.inference_engine),
    architecture: optionalString(row.architecture),
    parameter_count: optionalString(row.params),
    release_date: optionalString(row.release_date),
    model_url: optionalString(row.model_url),
    additional_details: { params_billions: row.params_billions },
    modalities,
  }
}
/**
 * Converts one result ("cell") row into an `EvaluationResult`.
 *
 * JSON-encoded columns are decoded with `parseMaybeJson`, which leaves
 * values that are already objects untouched.
 */
function resultFromCell(row: Row): EvaluationResult {
  // NOTE(review): CELL_JOIN_COLUMNS, as visible in this file, does not
  // project `generation_config`, so this is undefined for rows selected
  // through it. Confirm whether that is intended.
  const generationConfig = parseMaybeJson(row.generation_config) as GenerationConfig | undefined
  const annotations = parseMaybeJson(row.evalcards_annotations) as RowAnnotations | undefined
  const evalcards = annotations ? { annotations } : undefined
  const metricLabel = optionalString(row.metric_display_name)
  const evaluationName = row.metric_display_name ?? row.eval_evaluation_name ?? row.metric_id

  return {
    evaluation_name: asString(evaluationName, "Score"),
    display_name: metricLabel,
    canonical_display_name: metricLabel,
    metric_summary_id: optionalString(row.metric_summary_id),
    metric_key: optionalString(row.metric_id),
    evaluation_timestamp: asString(row.evaluation_timestamp, ""),
    source_data: sourceDataFromRow(row),
    metric_config: metricConfigFromRow(row),
    score_details: scoreDetailsFromRow(row),
    generation_config: generationConfig,
    detailed_evaluation_results_url: optionalString(row.instance_file_path),
    evalcards,
  }
}
/**
 * Converts one result ("cell") row into the per-model leaderboard shape
 * (`ModelResultForBenchmark`).
 *
 * `model_info` comes from the row's JSON column when it decodes to a
 * non-null value and is otherwise rebuilt from the flat model columns.
 */
function reshapeCellToModelResult(row: Row): ModelResultForBenchmark {
  type AggregateComponent = NonNullable<ModelResultForBenchmark["aggregate_components"]>[number]

  const details = scoreDetailsFromRow(row)
  const decodedModelInfo = parseMaybeJson(row.model_info)
  const decodedComponents = parseMaybeJson(row.aggregate_components)

  return {
    model_info: (decodedModelInfo ?? modelInfoFromModelRow(row)) as ModelInfo,
    model_route_id: optionalString(row.model_route_id),
    score: details.score,
    score_details: details,
    evaluation_timestamp: asString(row.evaluation_timestamp, ""),
    source_metadata: sourceMetadataFromRow(row),
    source_data: sourceDataFromRow(row),
    source_record_url: optionalString(row.source_record_url),
    aggregate_components: asArray<AggregateComponent>(decodedComponents),
    result: resultFromCell(row),
  }
}
/**
 * Wraps one result ("cell") row in a `BenchmarkEvaluation` envelope that
 * carries exactly one entry in `evaluation_results`.
 *
 * Eval-level identity fields are read from the `eval_`-prefixed columns;
 * JSON-encoded columns are decoded with `parseMaybeJson`.
 */
function reshapeCellToBenchmarkEvaluation(row: Row): BenchmarkEvaluation {
  const decodedModelInfo = parseMaybeJson(row.model_info)
  const decodedLibrary = parseMaybeJson(row.eval_library)
  const decodedGenerationConfig = parseMaybeJson(row.generation_config)
  const singleResult = resultFromCell(row)
  const evalName = optionalString(row.eval_evaluation_name)

  return {
    schema_version: "1.0",
    eval_summary_id: optionalString(row.evaluation_id),
    evaluation_id: asString(row.evaluation_id ?? row.benchmark_id, "unknown-evaluation"),
    retrieved_timestamp: asString(row.evaluation_timestamp, ""),
    benchmark: optionalString(row.eval_evaluation_name ?? row.benchmark_id),
    display_name: evalName,
    canonical_display_name: optionalString(row.eval_canonical_display_name),
    derived_tags: coerceTags(row.eval_derived_tags ?? row.derived_tags),
    family_id: optionalString(row.eval_family_id),
    benchmark_family_name: optionalString(row.eval_family_display_name),
    parent_benchmark_id: optionalString(row.eval_parent_benchmark_id),
    benchmark_parent_name: optionalString(row.eval_composite_benchmark_name),
    benchmark_leaf_name: evalName,
    is_slice: Boolean(row.eval_is_slice),
    is_summary_score: Boolean(row.eval_is_summary_score ?? row.is_summary_score),
    source_data: sourceDataFromRow(row),
    source_metadata: sourceMetadataFromRow(row),
    eval_library: decodedLibrary as BenchmarkEvaluation["eval_library"],
    model_info: (decodedModelInfo ?? modelInfoFromModelRow(row)) as ModelInfo,
    generation_config: decodedGenerationConfig as BenchmarkEvaluation["generation_config"],
    evaluation_results: [singleResult],
  }
}
/**
 * Combines a model row with its result rows into a `ModelEvaluationSummary`.
 *
 * @param modelRow The model's row; also the source of `variants`.
 * @param cellRows The model's result rows, one evaluation each.
 * @returns The summary. Each entry of `variants` starts from the shared
 *   core fields, is overlaid with the variant's own columns, and then has
 *   its identity, counters, and tags normalised.
 */
function modelSummaryFromRows(modelRow: Row, cellRows: Row[]): ModelEvaluationSummary {
  // Group evaluations by tag. An evaluation with several tags is listed
  // under each of them; one with no tags goes under "general".
  const evaluationsByTag: Record<string, BenchmarkEvaluation[]> = {}
  for (const cellRow of cellRows) {
    const evaluation = reshapeCellToBenchmarkEvaluation(cellRow)
    const ownTags = evaluation.derived_tags ?? []
    const groupKeys = ownTags.length > 0 ? ownTags : ["general"]
    for (const tag of groupKeys) {
      const bucket = evaluationsByTag[tag]
      if (bucket == null) {
        evaluationsByTag[tag] = [evaluation]
      } else {
        bucket.push(evaluation)
      }
    }
  }

  const modelTags = coerceTags(modelRow.tags ?? modelRow.derived_tags)
  const modelInfo = (modelRow.model_info ?? modelInfoFromModelRow(modelRow)) as ModelInfo
  const totalEvaluations = asNumber(modelRow.total_evaluations ?? modelRow.evaluations_count)
  const lastUpdated = asString(modelRow.last_updated ?? modelRow.latest_timestamp, "")
  const rawModelIds = asArray<string>(modelRow.raw_model_ids)

  // Fields shared by the model-level summary and every variant.
  const core = {
    model_info: modelInfo,
    evaluations_by_tag: evaluationsByTag,
    total_evaluations: totalEvaluations,
    last_updated: lastUpdated,
    tags_covered: modelTags.length > 0 ? modelTags : Object.keys(evaluationsByTag),
    reproducibility_summary: modelRow.reproducibility_summary,
    provenance_summary: modelRow.provenance_summary,
    comparability_summary: modelRow.comparability_summary,
  }

  const variants = asArray<Row>(modelRow.variants).map((variant, index) => {
    const positionalKey = `variant-${index}`
    const variantTags = coerceTags(variant.tags_covered ?? variant.derived_tags)
    const variantName = variant.variant_display_name ?? variant.variant_label
    return {
      // Order matters: core defaults, then raw variant columns, then the
      // normalised fields below, which win over both.
      ...core,
      ...variant,
      variant_id: asString(variant.variant_id ?? variant.variant_key, positionalKey),
      variant_key: asString(variant.variant_key, positionalKey),
      variant_label: asString(variant.variant_label ?? variant.variant_display_name, "Default"),
      variant_display_name: asString(variantName ?? modelRow.model_name, modelRow.model_name),
      raw_model_ids: asArray<string>(variant.raw_model_ids),
      family_id: asString(variant.family_id ?? modelRow.model_family_id, modelRow.model_family_id),
      family_name: asString(variant.family_name ?? modelRow.model_family_name, modelRow.model_family_name),
      total_evaluations: asNumber(variant.total_evaluations ?? totalEvaluations),
      last_updated: asString(variant.last_updated ?? lastUpdated, lastUpdated),
      tags_covered: variantTags.length > 0 ? variantTags : core.tags_covered,
      model_info: {
        ...modelInfo,
        name: asString(variantName ?? modelInfo.name, modelInfo.name),
      },
    }
  }) as ModelVariantSummary[]

  const primaryId = modelRow.model_key ?? modelRow.model_id
  const fallbackRawIds = [asString(primaryId, "")].filter(Boolean)
  return {
    ...core,
    model_family_id: asString(modelRow.model_family_id ?? primaryId, primaryId),
    model_route_id: asString(modelRow.model_route_id ?? modelRow.route_id, modelRow.route_id),
    model_family_name: asString(modelRow.model_family_name ?? modelRow.model_name, modelRow.model_name),
    raw_model_ids: rawModelIds.length > 0 ? rawModelIds : fallbackRawIds,
    variants,
  }
}
/**
 * Fetches every scored result row for one model, joined to its evaluation
 * and ordered by percentile (highest first, NULLs last).
 *
 * The lookup is by `model_key` rather than `model_id`: per the producer's
 * contract, `model_key` is non-null for both resolved and unresolved
 * models, so filtering on `model_id` alone would miss unresolved ones.
 */
async function getModelEvaluationRows(modelKey: string): Promise<Row[]> {
  const sql = `SELECT ${CELL_JOIN_COLUMNS}
FROM eval_results_view r
LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
WHERE r.model_key = ?
AND r.score IS NOT NULL
ORDER BY r.percentile DESC NULLS LAST`
  return readRows<Row>(sql, [modelKey])
}
/**
 * Returns every model card from `models_view`, most recently updated
 * first (NULL timestamps last), with tags and tag stats normalised.
 */
export async function getModelCards(): Promise<EvaluationCardData[]> {
  const sql = `SELECT ${MODEL_CARD_COLUMNS}
FROM models_view
ORDER BY latest_timestamp DESC NULLS LAST`
  const rows = await readRows<Row>(sql)
  return rows.map((row) => finalizeModelCard(row))
}
/**
 * Returns every model card from `models_view`, ordered by benchmark
 * count, then evaluation count (both descending, NULLs last), then model
 * name. Tags and tag stats are normalised.
 */
export async function getModelCardsLite(): Promise<EvaluationCardData[]> {
  const sql = `SELECT ${MODEL_CARD_COLUMNS}
FROM models_view
ORDER BY benchmarks_count DESC NULLS LAST, evaluations_count DESC NULLS LAST, model_name ASC`
  const rows = await readRows<Row>(sql)
  return rows.map((row) => finalizeModelCard(row))
}
/**
 * Loads the evaluation list together with the total number of models.
 *
 * The two queries run concurrently. Columns that the projection encodes as
 * JSON text are decoded before the rows are returned, and `derived_tags`
 * is coerced to `string[]`.
 */
export async function getEvalListData(): Promise<{
  evals: BenchmarkEvalListItem[]
  totalModels: number
}> {
  const evalSql = `SELECT ${EVAL_LIST_COLUMNS}
FROM evals_view
ORDER BY evaluation_name ASC`
  const [evalRows, countRows] = await Promise.all([
    readRows<BenchmarkEvalListItem & { benchmark_card?: unknown }>(evalSql),
    readRows<{ n: number }>("SELECT COUNT(*) AS n FROM models_view"),
  ])

  // Columns that arrive as JSON text and must be decoded for consumers.
  const jsonColumns = [
    "metric_config",
    "benchmark_card",
    "aggregate_sources",
    "tags",
    "instance_data",
    "root_metrics",
    "subtasks",
    "leaderboard_metrics",
    "reproducibility_summary",
    "provenance_summary",
    "comparability_summary",
    "source_data",
  ]
  const decoded = evalRows.map((row) => {
    const item: Record<string, unknown> = {
      ...row,
      derived_tags: coerceTags(row.derived_tags),
    }
    for (const column of jsonColumns) {
      item[column] = parseMaybeJson(item[column])
    }
    return item
  }) as unknown as BenchmarkEvalListItem[]

  const totalModels = asNumber(countRows[0]?.n)
  return { evals: decoded, totalModels }
}
/**
 * "Lite" entry point for the evaluation list. It currently returns exactly
 * what `getEvalListData` returns.
 */
export async function getEvalListLiteData(): Promise<{
  evals: BenchmarkEvalListItem[]
  totalModels: number
}> {
  const payload = await getEvalListData()
  return payload
}
/**
 * Returns only the evaluation rows from `getEvalListData`, dropping the
 * model count.
 */
export async function getEvalList() {
  const payload = await getEvalListData()
  return payload.evals
}
/**
 * Loads model cards and the evaluation list concurrently and returns both.
 */
export async function getDashboardData() {
  const modelsPending = getModelCards()
  const evalsPending = getEvalList()
  const [models, evals] = await Promise.all([modelsPending, evalsPending])
  return { models, evals }
}
/**
 * Resolves a route slug to a model and returns its evaluation summary.
 *
 * The slug is matched against `model_key`, `route_id`, `model_route_id`,
 * `model_family_id`, and `model_id`, so that models whose `model_id` is
 * NULL can still be found through their addressable identifiers.
 * `model_id` stays in the chain as a fallback for old links.
 *
 * Legacy slugs that use `__` as the separator (for example
 * `google__gemini-3-pro`) are also tried with every `__` replaced by `/`,
 * against `model_key` and `model_id`.
 *
 * @param routeId The slug as received by the route handler.
 * @returns The summary, or `null` when no model row matches.
 */
export async function getModelSummaryById(routeId: string): Promise<ModelEvaluationSummary | null> {
  // Equivalent to replacing every "__" with "/"; a slug without "__" is
  // left unchanged.
  const slashForm = routeId.split("__").join("/")
  const lookupSql = `SELECT *
FROM models_view
WHERE model_key = ? OR route_id = ? OR model_route_id = ? OR model_family_id = ? OR model_id = ?
OR model_key = ? OR model_id = ?
LIMIT 1`
  const lookupParams = [routeId, routeId, routeId, routeId, routeId, slashForm, slashForm]
  const [modelRow] = await readRows<Row>(lookupSql, lookupParams)
  if (!modelRow) return null

  const modelKey = asString(modelRow.model_key ?? modelRow.model_id, routeId)
  const cellRows = await getModelEvaluationRows(modelKey)
  return modelSummaryFromRows(modelRow, cellRows)
}
// Build-time precomputed multi-metric / per-slice matrix produced by
// `scripts/build-eval-matrices.mjs`. Read once on first request and
// cached in module scope — the file is image-baked so this is a single
// disk read per server start. When the file is missing (local dev where
// nobody ran `pnpm build-eval-matrices` yet), we fall through and the
// summary degrades to single-metric exactly like before.
type MatrixEntry = {
leaderboard_rows: Array<{ model_route_id: string; values: Record<string, number | null> }>
subtask_metrics: Array<Record<string, unknown>>
}
let evalMatrixCache: Record<string, MatrixEntry> | null | undefined
function loadEvalMatrices(): Record<string, MatrixEntry> | null {
if (evalMatrixCache !== undefined) return evalMatrixCache
try {
const matrixPath = path.join(process.cwd(), "data", "eval-matrices.json")
const text = fs.readFileSync(matrixPath, "utf8")
const parsed = JSON.parse(text) as { evals?: Record<string, MatrixEntry> }
evalMatrixCache = parsed.evals ?? {}
} catch {
evalMatrixCache = null
}
return evalMatrixCache
}
/**
 * Build the full summary for one evaluation.
 *
 * Steps, in order:
 *   1. Read the eval row from `evals_view`; bail out with `null` if absent.
 *   2. Read the per-model result cells for the eval's primary metric,
 *      falling back to all scored cells when that yields nothing.
 *   3. Decode the JSON-encoded columns and attach `model_results`.
 *   4. If a precomputed matrix exists for this eval, install its
 *      multi-metric `leaderboard_rows` and merge its subtask metrics into
 *      `leaderboard_metrics`.
 *   5. If there are still no `leaderboard_rows`, synthesise single-metric
 *      rows from `model_results`.
 *   6. De-duplicate `leaderboard_rows` by model identity.
 *
 * @param evalId - Value matched against `evaluation_id`.
 * @returns The assembled summary, or `null` when the evaluation is unknown.
 */
export async function getEvalSummaryById(evalId: string): Promise<BenchmarkEvalSummary | null> {
  // Use the same aliased projection as EVAL_LIST_COLUMNS so the legacy
  // `composite_benchmark_*` / `benchmark_family_*` consumer fields are
  // populated. A bare `SELECT *` returns the raw v2 column names which
  // leaves the legacy fields NULL on the deserialised summary.
  const evalRows = await readRows<Row>(
    `SELECT ${EVAL_LIST_COLUMNS}
FROM evals_view
WHERE evaluation_id = ?
LIMIT 1`,
    [evalId]
  )
  const evalRow = evalRows[0]
  if (!evalRow) return null

  // First attempt: only cells for the eval's primary metric.
  let cellRows = await readRows<Row>(
    `SELECT ${CELL_JOIN_COLUMNS}
FROM eval_results_view r
LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
WHERE r.evaluation_id = ?
AND r.metric_id = (SELECT primary_metric_id FROM evals_view WHERE evaluation_id = ?)
AND r.score IS NOT NULL
ORDER BY r.position ASC NULLS LAST`,
    [evalId, evalId]
  )
  if (cellRows.length === 0) {
    // Nothing matched the primary metric: retry without the metric filter
    // and take every scored cell for this evaluation.
    cellRows = await readRows<Row>(
      `SELECT ${CELL_JOIN_COLUMNS}
FROM eval_results_view r
LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
WHERE r.evaluation_id = ?
AND r.score IS NOT NULL
ORDER BY r.position ASC NULLS LAST`,
      [evalId]
    )
  }

  const summary = {
    ...evalRow,
    derived_tags: coerceTags(evalRow.derived_tags),
    metric_config: parseMaybeJson(evalRow.metric_config),
    // benchmark_card arrives JSON-encoded (the parquet schema nests a
    // JSON-typed field — see CELL_JOIN_COLUMNS / EVAL_LIST_COLUMNS).
    benchmark_card: parseMaybeJson(evalRow.benchmark_card),
    aggregate_sources: parseMaybeJson(evalRow.aggregate_sources),
    tags: parseMaybeJson(evalRow.tags),
    instance_data: parseMaybeJson(evalRow.instance_data),
    root_metrics: parseMaybeJson(evalRow.root_metrics),
    subtasks: parseMaybeJson(evalRow.subtasks),
    leaderboard_metrics: parseMaybeJson(evalRow.leaderboard_metrics),
    reproducibility_summary: parseMaybeJson(evalRow.reproducibility_summary),
    provenance_summary: parseMaybeJson(evalRow.provenance_summary),
    comparability_summary: parseMaybeJson(evalRow.comparability_summary),
    source_data: parseMaybeJson(evalRow.source_data),
    model_results: cellRows.map(reshapeCellToModelResult),
  } as unknown as BenchmarkEvalSummary

  // Splice in precomputed multi-metric leaderboard_rows and subtask
  // leaderboard_metrics from data/eval-matrices.json. A matrix row is only
  // kept when its `model_route_id` also appears in `model_results`, because
  // the descriptive fields (model_info, timestamp, source metadata/data)
  // are copied from that matching result.
  // NOTE(review): the previous wording of this comment claimed that
  // matrix-only models (zero coverage on the primary metric) are surfaced
  // too, but the code drops them — confirm which behaviour is intended.
  const matrices = loadEvalMatrices()
  const matrix = matrices?.[evalId]
  if (matrix) {
    const baseRowByRoute = new Map<string, ModelResultForBenchmark>()
    for (const result of summary.model_results) {
      if (result.model_route_id) {
        baseRowByRoute.set(result.model_route_id, result)
      }
    }
    const leaderboardRows = matrix.leaderboard_rows
      .map((row) => {
        const base = baseRowByRoute.get(row.model_route_id)
        if (!base) return null
        return {
          model_info: base.model_info,
          model_route_id: row.model_route_id,
          evaluation_timestamp: base.evaluation_timestamp,
          source_metadata: base.source_metadata,
          source_data: base.source_data,
          values: row.values,
          // Count only real, finite numbers; null / NaN cells do not count.
          metrics_present: Object.values(row.values).filter(
            (v): v is number => typeof v === "number" && Number.isFinite(v),
          ).length,
        }
      })
      .filter((row): row is NonNullable<typeof row> => row !== null)
    if (leaderboardRows.length > 0) {
      summary.leaderboard_rows = dedupeLeaderboardRowsByModelIdentity(leaderboardRows)
    }
    if (matrix.subtask_metrics.length > 0) {
      const existing = (summary.leaderboard_metrics ?? []) as Array<{ column_key: string }>
      const seen = new Set(existing.map((m) => m.column_key))
      const merged: Array<{ column_key: string }> = [...existing]
      for (const metric of matrix.subtask_metrics) {
        const key = metric.column_key
        if (typeof key !== "string" || seen.has(key)) continue
        // Remember the key so a repeat of it later in the matrix file is
        // skipped as well, not just keys that were already on the summary.
        seen.add(key)
        merged.push(metric as typeof metric & { column_key: string })
      }
      summary.leaderboard_metrics =
        merged as unknown as BenchmarkEvalSummary["leaderboard_metrics"]
    }
  }

  // Fallback for single-metric leaderboards with no precomputed matrix
  // entry (e.g. big-bench-hard): the matrix block above only populates
  // `leaderboard_rows` when a matrix exists, but consumers like the
  // embed leaderboard read exclusively from that field. Synthesize one
  // row per `model_results` entry using the primary metric's column_key
  // as the values key, so the data is present regardless of whether
  // build-time precomputation ran for this eval.
  const hasRows = (summary.leaderboard_rows?.length ?? 0) > 0
  if (!hasRows && (summary.model_results?.length ?? 0) > 0) {
    // First metric that has a string column_key and is not subtask-scoped.
    const primaryMetric = (summary.leaderboard_metrics ?? []).find(
      (m): m is typeof m & { column_key: string } =>
        typeof (m as { column_key?: unknown }).column_key === "string"
        && (m as { scope?: string }).scope !== "subtask",
    )
    const columnKey = primaryMetric?.column_key
      ?? (summary.leaderboard_metrics ?? [])[0]?.column_key
      ?? "score"
    summary.leaderboard_rows = summary.model_results
      .filter((mr) => Number.isFinite(mr.score) && mr.model_route_id)
      .map((mr) => ({
        model_info: mr.model_info,
        model_route_id: mr.model_route_id,
        evaluation_timestamp: mr.evaluation_timestamp,
        source_metadata: mr.source_metadata,
        source_data: mr.source_data,
        values: { [columnKey]: mr.score as number },
        metrics_present: 1,
      })) as BenchmarkEvalSummary["leaderboard_rows"]
  }

  // Belt-and-suspenders: when leaderboard_rows arrived from the parquet
  // pre-baked (no matrix) the same two-source duplication can appear, so
  // dedup whatever is set on the summary before returning.
  if (summary.leaderboard_rows && summary.leaderboard_rows.length > 1) {
    summary.leaderboard_rows = dedupeLeaderboardRowsByModelIdentity(summary.leaderboard_rows)
  }
  return summary
}
/**
 * List all developers from the headline payload, ordered by developer
 * name using `localeCompare`.
 *
 * The headline's own array is copied before sorting, so it is never
 * mutated; a missing `developers` field produces an empty list.
 *
 * @returns A new, sorted array of developer entries.
 */
export async function getDeveloperList(): Promise<DeveloperListEntry[]> {
  const { developers } = await fetchHeadline()
  const sorted = Array.from(developers ?? [])
  sorted.sort((left, right) => left.developer.localeCompare(right.developer))
  return sorted
}
/**
 * Look up one developer by `route_id` and attach that developer's model
 * cards.
 *
 * @param routeId - Compared for strict equality with each entry's
 *   `route_id`.
 * @returns The developer entry extended with a `models` array, or `null`
 *   when no developer has that `route_id`.
 */
export async function getDeveloperSummaryById(routeId: string) {
  const allDevelopers = await getDeveloperList()
  const match = allDevelopers.find(({ route_id }) => route_id === routeId)
  if (!match) {
    return null
  }

  // Models are selected by the developer's display name, ordered by
  // benchmark count, then evaluation count, then model name.
  const modelsSql = `SELECT ${MODEL_CARD_COLUMNS}
FROM models_view
WHERE developer = ?
ORDER BY benchmarks_count DESC NULLS LAST, evaluations_count DESC NULLS LAST, model_name ASC`
  const modelRows = await readRows<Row>(modelsSql, [match.developer])
  const models = modelRows.map(finalizeModelCard)
  return { ...match, models }
}
/**
 * Build a lookup from several benchmark identifiers to their benchmark
 * card.
 *
 * Every `evals_view` row with a non-NULL `benchmark_card` contributes its
 * decoded card under each of these keys, when the key is a non-empty
 * string: `evaluation_id`, `evaluation_name`, `family_id` (aliased as
 * `composite_benchmark_key`), `benchmark_id`, and the card's own
 * `benchmark_details.name`. When two rows produce the same key, the row
 * processed later overwrites the earlier one.
 *
 * @returns A plain object mapping identifier -> benchmark card.
 */
export async function getBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
  const cardRows = await readRows<Row>(
    `SELECT evaluation_id, evaluation_name,
family_id AS composite_benchmark_key,
benchmark_id,
benchmark_card
FROM evals_view
WHERE benchmark_card IS NOT NULL`
  )

  const cardsByKey: Record<string, BenchmarkCard> = {}
  cardRows.forEach((row) => {
    const card = parseMaybeJson(row.benchmark_card) as BenchmarkCard | null | undefined
    // Skip rows whose card decoded to nothing usable.
    if (!card) return

    const candidates: unknown[] = [
      row.evaluation_id,
      row.evaluation_name,
      row.composite_benchmark_key,
      row.benchmark_id,
      card.benchmark_details?.name,
    ]
    candidates.forEach((candidate) => {
      if (typeof candidate === "string" && candidate.length > 0) {
        cardsByKey[candidate] = card
      }
    })
  })
  return cardsByKey
}
|