general-eval-card / tests /view-data.test.ts
j-chim's picture
Refactor to align on benchmark hierarchy
2ed4959
import { mkdir, mkdtemp, rm, writeFile } from "fs/promises"
import os from "os"
import path from "path"
import { DuckDBConnection } from "@duckdb/node-api"
import { describe, expect, it } from "vitest"
/**
 * Renders `value` as a single-quoted SQL string literal.
 *
 * Every single quote inside the value is doubled so it cannot terminate the literal early.
 */
function sqlString(value: string) {
  const escaped = value.split("'").join("''")
  return "'" + escaped + "'"
}
/**
 * Runs a `COPY (<sql>) TO <outputPath> (FORMAT parquet)` statement on the given connection.
 *
 * @param connection DuckDB connection the statement is executed on.
 * @param sql Query text embedded verbatim as the COPY source.
 * @param outputPath Destination path; quoted through `sqlString` before being embedded.
 */
async function copyParquet(connection: DuckDBConnection, sql: string, outputPath: string) {
  const quotedTarget = sqlString(outputPath)
  const statement = `COPY (${sql}) TO ${quotedTarget} (FORMAT parquet)`
  await connection.run(statement)
}
/**
 * Writes a minimal, single-row synthetic snapshot into `snapshotDir`.
 *
 * The directory is created if needed and ends up containing:
 * - `models_view.parquet`, `evals_view.parquet`, `eval_results_view.parquet`
 *   (each produced by a one-row DuckDB `SELECT` copied to Parquet),
 * - `manifest.json`, whose `summary_artifacts` names the two JSON files below,
 * - `headline.json` (corpus aggregate blocks plus a developer list),
 * - `hierarchy.json` (hierarchy stats with an empty `families` list).
 *
 * The DuckDB connection used for the Parquet writes is closed before returning,
 * including when one of the COPY statements fails.
 *
 * @param snapshotDir Directory that receives the snapshot files.
 */
async function writeSyntheticStageJSnapshot(snapshotDir: string) {
  await mkdir(snapshotDir, { recursive: true })
  const connection = await DuckDBConnection.create()
  try {
    // The SQL bodies below are kept flush-left so the statement text sent to DuckDB is unchanged.

    // One model row: openai/gpt-5 with a single Reasoning evaluation.
    await copyParquet(
      connection,
      `
SELECT
TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
'openai/gpt-5' AS model_key,
'openai/gpt-5' AS model_id,
'openai/gpt-5' AS id,
'openai%2Fgpt-5' AS route_id,
'openai%2Fgpt-5' AS model_route_id,
'openai/gpt-5' AS model_family_id,
'GPT 5' AS model_name,
'GPT 5' AS canonical_model_name,
'GPT 5' AS model_family_name,
'OpenAI' AS developer,
DATE '2026-01-01' AS release_date,
'https://example.test/model' AS model_url,
'transformer' AS architecture,
'100B' AS params,
100.0 AS params_billions,
['text']::VARCHAR[] AS input_modalities,
['text']::VARCHAR[] AS output_modalities,
'engine' AS inference_engine,
'platform' AS inference_platform,
1::BIGINT AS evaluations_count,
1::BIGINT AS benchmarks_count,
1::INTEGER AS variant_count,
1::BIGINT AS evaluator_count,
['OpenAI']::VARCHAR[] AS evaluator_names,
1::INTEGER AS source_type_count,
['documentation']::VARCHAR[] AS source_types,
0::BIGINT AS third_party_eval_count,
0.0 AS independent_verification_ratio,
1::BIGINT AS evidence_count,
0::INTEGER AS missing_generation_config_count,
TIMESTAMP '2026-05-03 00:00:00' AS latest_timestamp,
'OpenAI' AS latest_source_name,
['MMLU']::VARCHAR[] AS benchmark_names,
['Reasoning']::VARCHAR[] AS categories,
struct_pack("General" := 0, "Reasoning" := 1, "Agentic" := 0, "Safety" := 0, "Knowledge" := 0) AS category_stats,
'complete' AS reproducibility_status,
struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
struct_pack(
total_results := 1,
total_groups := 1,
multi_source_groups := 0,
first_party_only_groups := 1,
source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
) AS provenance_summary,
struct_pack(
total_groups := 1,
groups_with_variant_check := 0,
groups_with_cross_party_check := 0,
variant_divergent_count := 0,
cross_party_divergent_count := 0
) AS comparability_summary,
[struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR)] AS eval_libraries,
struct_pack(count := 1, min := 0.8, max := 0.8, average := 0.8) AS score_summary,
[struct_pack(benchmark := 'MMLU', benchmarkKey := 'mmlu', score := 0.8, metric := 'accuracy')] AS top_scores,
['https://example.test/source']::VARCHAR[] AS source_urls,
[]::VARCHAR[] AS detail_urls,
[struct_pack(
variant_id := 'default',
variant_key := 'default',
variant_label := 'Default',
variant_display_name := 'GPT 5',
raw_model_ids := ['openai/gpt-5']::VARCHAR[],
family_id := 'openai/gpt-5',
family_name := 'GPT 5',
version_date := NULL::VARCHAR,
version_qualifier := NULL::VARCHAR,
total_evaluations := 1,
last_updated := TIMESTAMP '2026-05-03 00:00:00',
categories_covered := ['Reasoning']::VARCHAR[]
)] AS variants,
['openai/gpt-5']::VARCHAR[] AS raw_model_ids
`,
      path.join(snapshotDir, "models_view.parquet")
    )

    // One evaluation row: MMLU with a single root-level accuracy metric and no subtasks.
    await copyParquet(
      connection,
      `
SELECT
TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
'mmlu' AS evaluation_id,
'mmlu' AS benchmark_id,
'accuracy' AS primary_metric_id,
'MMLU' AS evaluation_name,
'MMLU' AS canonical_display_name,
'mmlu' AS composite_benchmark_key,
'MMLU' AS composite_benchmark_name,
'mmlu' AS composite_slug,
'MMLU' AS composite_display_name,
'mmlu' AS family_id,
'MMLU' AS family_display_name,
false AS is_slice,
NULL AS parent_benchmark_id,
'Reasoning' AS category,
struct_pack(
evaluation_description := 'Accuracy on MMLU',
lower_is_better := false,
score_type := 'continuous',
min_score := 0.0,
max_score := 1.0,
unit := 'proportion'
) AS metric_config,
1::BIGINT AS models_count,
['OpenAI']::VARCHAR[] AS evaluator_names,
['documentation']::VARCHAR[] AS source_types,
'OpenAI' AS latest_source_name,
0.0 AS third_party_ratio,
0::INTEGER AS missing_generation_config_count,
struct_pack(name := 'GPT 5', score := 0.8) AS best_model,
struct_pack(name := 'GPT 5', score := 0.8) AS worst_model,
0.8 AS avg_score,
0.8 AS avg_score_norm,
0.8 AS top_score,
false AS has_card,
NULL AS benchmark_card,
false AS is_aggregated,
[] AS aggregate_sources,
false AS is_summary_score,
[]::VARCHAR[] AS summary_eval_ids,
struct_pack(domains := ['knowledge']::VARCHAR[], languages := ['en']::VARCHAR[], tasks := ['qa']::VARCHAR[]) AS tags,
struct_pack(
dataset_name := 'MMLU',
source_type := 'documentation',
hf_repo := NULL::VARCHAR,
hf_split := NULL::VARCHAR,
samples_number := 10,
url := ['https://example.test/mmlu']::VARCHAR[],
dataset_url := 'https://example.test/mmlu',
dataset_version := 'v1'
) AS source_data,
struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
struct_pack(
total_results := 1,
total_groups := 1,
multi_source_groups := 0,
first_party_only_groups := 1,
source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
) AS provenance_summary,
struct_pack(
total_groups := 1,
groups_with_variant_check := 0,
groups_with_cross_party_check := 0,
variant_divergent_count := 0,
cross_party_divergent_count := 0
) AS comparability_summary,
struct_pack(available := false, url_count := 0::BIGINT, sample_urls := []::VARCHAR[], models_with_loaded_instances := 0) AS instance_data,
1::INTEGER AS metrics_count,
['Accuracy']::VARCHAR[] AS metric_names,
[struct_pack(
column_key := 'root:accuracy',
metric_summary_id := 'mmlu%3Aaccuracy',
metric_id := 'accuracy',
metric_name := 'accuracy',
display_name := 'Accuracy',
canonical_display_name := 'Accuracy',
lower_is_better := false,
unit := 'proportion',
scope := 'root',
subtask_key := NULL::VARCHAR,
subtask_name := NULL::VARCHAR
)] AS leaderboard_metrics,
[] AS leaderboard_rows,
[struct_pack(
metric_summary_id := 'mmlu%3Aaccuracy',
metric_name := 'accuracy',
display_name := 'Accuracy',
canonical_display_name := 'Accuracy',
metric_key := 'accuracy',
lower_is_better := false,
models_count := 1,
top_score := 0.8,
unit := 'proportion'
)] AS root_metrics,
[] AS subtasks,
0::INTEGER AS subtasks_count
`,
      path.join(snapshotDir, "evals_view.parquet")
    )

    // One result row tying openai/gpt-5 to the mmlu accuracy metric with score 0.8.
    await copyParquet(
      connection,
      `
SELECT
TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
'mmlu' AS evaluation_id,
'mmlu%3Aaccuracy' AS metric_summary_id,
'mmlu' AS benchmark_id,
'accuracy' AS metric_id,
'openai/gpt-5' AS model_key,
'openai/gpt-5' AS model_id,
'openai%2Fgpt-5' AS model_route_id,
struct_pack(
name := 'GPT 5',
id := 'openai/gpt-5',
developer := 'OpenAI',
inference_platform := 'platform',
inference_engine := 'engine',
model_version := NULL::VARCHAR,
architecture := 'transformer',
parameter_count := '100B',
release_date := '2026-01-01',
model_url := 'https://example.test/model',
modalities := struct_pack(input := ['text']::VARCHAR[], output := ['text']::VARCHAR[])
) AS model_info,
'Accuracy' AS metric_display_name,
'proportion' AS metric_unit,
false AS lower_is_better,
'Reasoning' AS category,
0.8 AS score,
struct_pack(
score := 0.8,
standard_error := 0.01,
sample_size := 10,
confidence_interval := struct_pack(lower := 0.7, upper := 0.9, confidence_level := 0.95)
) AS score_details,
1::INTEGER AS fact_row_count,
1::INTEGER AS position,
1::INTEGER AS total,
1.0 AS percentile,
TIMESTAMP '2026-05-03 00:00:00' AS evaluation_timestamp,
struct_pack(
source_name := 'OpenAI report',
source_type := 'documentation',
source_organization_name := 'OpenAI',
source_organization_url := 'https://example.test',
evaluator_relationship := 'first_party',
source_url := 'https://example.test/report',
publication_date := DATE '2026-05-03'
) AS source_metadata,
struct_pack(
dataset_name := 'MMLU',
source_type := 'documentation',
hf_repo := NULL::VARCHAR,
hf_split := NULL::VARCHAR,
samples_number := 10,
url := ['https://example.test/mmlu']::VARCHAR[],
dataset_url := 'https://example.test/mmlu',
dataset_version := 'v1'
) AS source_data,
'https://example.test/record.json' AS source_record_url,
struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR) AS eval_library,
['first_party']::VARCHAR[] AS evaluator_relationships,
true AS has_first_party,
false AS has_third_party,
'self' AS coverage_cell,
['OpenAI']::VARCHAR[] AS reporting_orgs,
map(['OpenAI'], [0.8]) AS scores_by_organization,
false AS is_summary_score,
NULL::VARCHAR AS summary_score_for,
[] AS aggregate_components,
false AS has_reproducibility_gap,
1.0 AS completeness_score,
false AS is_multi_source,
true AS first_party_only,
false AS has_variant_divergence,
false AS has_cross_party_divergence,
NULL AS evalcards_annotations,
NULL::VARCHAR AS instance_file_path,
NULL::VARCHAR AS instance_file_format,
0::INTEGER AS instance_rows
`,
      path.join(snapshotDir, "eval_results_view.parquet")
    )
  } finally {
    // Release the connection even when a COPY fails, so the test process does not leak it.
    connection.closeSync()
  }

  // Manifest: counts for the single model/eval and the file names of the two summary artifacts.
  await writeFile(
    path.join(snapshotDir, "manifest.json"),
    JSON.stringify({
      generated_at: "2026-05-03T00:00:00Z",
      config_version: 2,
      skipped_configs: [],
      model_count: 1,
      eval_count: 1,
      metric_eval_count: 1,
      source_config_count: 1,
      skipped_config_count: 0,
      summary_artifacts: {
        corpus_aggregates: "headline.json",
        eval_hierarchy: "hierarchy.json",
      },
    })
  )

  // Aggregate blocks; each is reused as both the `overall` value and the `Reasoning` category value.
  const reproducibilityBlock = {
    total_triples: 1,
    triples_with_reproducibility_gap: 0,
    reproducibility_gap_rate: 0,
    agentic_triples: 0,
    per_field_missingness: {
      temperature: {
        missing_count: 0,
        missing_rate: 0,
        denominator: "all_triples",
        denominator_count: 1,
      },
    },
  }
  const completenessBlock = {
    total_triples: 1,
    completeness_avg: 0.75,
    completeness_min: 0.75,
    completeness_max: 0.75,
  }
  const provenanceBlock = {
    total_triples: 1,
    multi_source_triples: 0,
    first_party_only_triples: 1,
    source_type_distribution: {
      first_party: 1,
      third_party: 0,
      collaborative: 0,
      unspecified: 0,
    },
  }
  const comparabilityBlock = {
    total_triples: 1,
    variant_divergent_count: 0,
    cross_party_divergent_count: 0,
    groups_with_variant_check: 1,
    groups_with_cross_party_check: 0,
  }

  // Corpus aggregates file named by `summary_artifacts.corpus_aggregates` in the manifest.
  await writeFile(
    path.join(snapshotDir, "headline.json"),
    JSON.stringify({
      generated_at: "2026-05-03T00:00:00Z",
      signal_version: "1.0",
      stratification_dimensions: ["category"],
      reproducibility: {
        overall: reproducibilityBlock,
        by_category: { Reasoning: reproducibilityBlock },
      },
      completeness: {
        overall: completenessBlock,
        by_category: { Reasoning: completenessBlock },
      },
      provenance: {
        overall: provenanceBlock,
        by_category: { Reasoning: provenanceBlock },
      },
      comparability: {
        overall: comparabilityBlock,
        by_category: { Reasoning: comparabilityBlock },
      },
      developers: [
        {
          developer: "OpenAI",
          route_id: "OpenAI",
          model_count: 1,
          benchmark_count: 1,
          evaluation_count: 1,
          popular_evals: [{ benchmark: "MMLU", model_count: 1 }],
        },
      ],
    })
  )

  // Hierarchy file named by `summary_artifacts.eval_hierarchy` in the manifest.
  await writeFile(
    path.join(snapshotDir, "hierarchy.json"),
    JSON.stringify({
      stats: {
        family_count: 1,
        composite_count: 0,
        standalone_benchmark_count: 1,
        single_benchmark_count: 1,
        slice_count: 0,
        metric_count: 1,
        metric_rows_scanned: 1,
      },
      families: [],
    })
  )
}
// End-to-end check of the v2 data backend: a synthetic snapshot is written to a temp
// directory, the backend is pointed at it through environment variables, and each
// accessor is expected to return the single synthetic model / evaluation / developer.
describe("Stage J view-layer backend", () => {
  // Puts an environment variable back to the value captured before the test,
  // removing it entirely when it was not set beforehand.
  const restoreEnv = (name: "DATA_BACKEND" | "SNAPSHOT_URL", saved: string | undefined) => {
    if (saved == null) {
      delete process.env[name]
      return
    }
    process.env[name] = saved
  }

  it("reads a pinned snapshot through the v2 accessors", async () => {
    const snapshotDir = await mkdtemp(path.join(os.tmpdir(), "eval-card-stage-j-"))
    const savedBackend = process.env.DATA_BACKEND
    const savedSnapshotUrl = process.env.SNAPSHOT_URL

    try {
      await writeSyntheticStageJSnapshot(snapshotDir)

      // The environment is configured before the modules are imported.
      process.env.DATA_BACKEND = "v2"
      process.env.SNAPSHOT_URL = `file://${snapshotDir}`
      const dataBackend = await import("../lib/data-backend")
      const hfData = await import("../lib/hf-data")

      // All accessors are started together and awaited as a group.
      const [
        models,
        evalListData,
        modelSummary,
        evalSummary,
        developers,
        developerSummary,
        manifest,
        hierarchy,
        aggregates,
      ] = await Promise.all([
        dataBackend.getModelCardsLite(),
        dataBackend.getEvalListLiteData(),
        dataBackend.getModelSummaryById("openai%2Fgpt-5"),
        dataBackend.getEvalSummaryById("mmlu"),
        dataBackend.getDeveloperList(),
        dataBackend.getDeveloperSummaryById("OpenAI"),
        dataBackend.getBackendManifestData(),
        dataBackend.getEvalHierarchyData(),
        hfData.fetchCorpusAggregates(),
      ])

      // Model and evaluation listings.
      expect(models[0]).toMatchObject({
        route_id: "openai%2Fgpt-5",
        model_name: "GPT 5",
        evaluations_count: 1,
      })
      expect(evalListData).toMatchObject({
        totalModels: 1,
        evals: [{ evaluation_id: "mmlu", evaluation_name: "MMLU", models_count: 1 }],
      })

      // Per-model and per-evaluation summaries.
      expect(modelSummary?.evaluations_by_category.Reasoning).toHaveLength(1)
      expect(evalSummary?.model_results[0]).toMatchObject({
        model_route_id: "openai%2Fgpt-5",
        score: 0.8,
        result: { metric_summary_id: "mmlu%3Aaccuracy" },
      })

      // Developer views.
      expect(developers[0]).toMatchObject({ developer: "OpenAI", route_id: "OpenAI" })
      expect(developerSummary?.models).toHaveLength(1)

      // Manifest, hierarchy, and corpus aggregates read from the JSON artifacts.
      expect(manifest.model_count).toBe(1)
      expect(hierarchy.stats?.metric_rows_scanned).toBe(1)
      expect(aggregates?.completeness.overall).toMatchObject({
        total_triples: 1,
        completeness_avg: 0.75,
      })
      expect(aggregates?.provenance.overall).toMatchObject({
        total_triples: 1,
        first_party_only_triples: 1,
      })
      expect(aggregates?.comparability.overall).toMatchObject({
        groups_with_variant_check: 1,
        variant_divergent_count: 0,
      })
      expect(aggregates?.comparability.by_category.Reasoning).toBeDefined()
    } finally {
      // Undo the environment changes first, then remove the temporary snapshot directory.
      restoreEnv("DATA_BACKEND", savedBackend)
      restoreEnv("SNAPSHOT_URL", savedSnapshotUrl)
      await rm(snapshotDir, { recursive: true, force: true })
    }
  })
})