Spaces:
Running
Running
| import { mkdir, mkdtemp, rm, writeFile } from "fs/promises" | |
| import os from "os" | |
| import path from "path" | |
| import { DuckDBConnection } from "@duckdb/node-api" | |
| import { describe, expect, it } from "vitest" | |
/**
 * Quote a value as a SQL string literal.
 *
 * The text is wrapped in single quotes and every embedded single quote is
 * doubled, so the result can be interpolated directly into a SQL statement.
 */
function sqlString(value: string) {
  const escaped = value.split("'").join("''")
  return "'" + escaped + "'"
}
/**
 * Run a query and write its result set to a Parquet file.
 *
 * @param connection - DuckDB connection the COPY statement is executed on.
 * @param sql - Query text, embedded verbatim as the COPY subquery.
 * @param outputPath - Destination path, quoted with `sqlString` before use.
 */
async function copyParquet(connection: DuckDBConnection, sql: string, outputPath: string) {
  const destination = sqlString(outputPath)
  const statement = `COPY (${sql}) TO ${destination} (FORMAT parquet)`
  await connection.run(statement)
}
/**
 * Write a minimal synthetic snapshot into `snapshotDir`.
 *
 * The directory is created if needed and populated with:
 * - `models_view.parquet`, `evals_view.parquet` and `eval_results_view.parquet`,
 *   each holding a single row (model `openai/gpt-5`, evaluation `mmlu`);
 * - `manifest.json`, which names `headline.json` and `hierarchy.json` as its
 *   summary artifacts;
 * - `headline.json` and `hierarchy.json` themselves.
 *
 * @param snapshotDir - Directory that receives the snapshot files.
 */
async function writeSyntheticStageJSnapshot(snapshotDir: string) {
  await mkdir(snapshotDir, { recursive: true })

  const connection = await DuckDBConnection.create()
  try {
    // One-row models view. The SQL text is kept flush-left so the statement
    // sent to DuckDB is exactly the literal below.
    await copyParquet(
      connection,
      `
SELECT
TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
'openai/gpt-5' AS model_key,
'openai/gpt-5' AS model_id,
'openai/gpt-5' AS id,
'openai%2Fgpt-5' AS route_id,
'openai%2Fgpt-5' AS model_route_id,
'openai/gpt-5' AS model_family_id,
'GPT 5' AS model_name,
'GPT 5' AS canonical_model_name,
'GPT 5' AS model_family_name,
'OpenAI' AS developer,
DATE '2026-01-01' AS release_date,
'https://example.test/model' AS model_url,
'transformer' AS architecture,
'100B' AS params,
100.0 AS params_billions,
['text']::VARCHAR[] AS input_modalities,
['text']::VARCHAR[] AS output_modalities,
'engine' AS inference_engine,
'platform' AS inference_platform,
1::BIGINT AS evaluations_count,
1::BIGINT AS benchmarks_count,
1::INTEGER AS variant_count,
1::BIGINT AS evaluator_count,
['OpenAI']::VARCHAR[] AS evaluator_names,
1::INTEGER AS source_type_count,
['documentation']::VARCHAR[] AS source_types,
0::BIGINT AS third_party_eval_count,
0.0 AS independent_verification_ratio,
1::BIGINT AS evidence_count,
0::INTEGER AS missing_generation_config_count,
TIMESTAMP '2026-05-03 00:00:00' AS latest_timestamp,
'OpenAI' AS latest_source_name,
['MMLU']::VARCHAR[] AS benchmark_names,
['Reasoning']::VARCHAR[] AS categories,
struct_pack("General" := 0, "Reasoning" := 1, "Agentic" := 0, "Safety" := 0, "Knowledge" := 0) AS category_stats,
'complete' AS reproducibility_status,
struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
struct_pack(
total_results := 1,
total_groups := 1,
multi_source_groups := 0,
first_party_only_groups := 1,
source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
) AS provenance_summary,
struct_pack(
total_groups := 1,
groups_with_variant_check := 0,
groups_with_cross_party_check := 0,
variant_divergent_count := 0,
cross_party_divergent_count := 0
) AS comparability_summary,
[struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR)] AS eval_libraries,
struct_pack(count := 1, min := 0.8, max := 0.8, average := 0.8) AS score_summary,
[struct_pack(benchmark := 'MMLU', benchmarkKey := 'mmlu', score := 0.8, metric := 'accuracy')] AS top_scores,
['https://example.test/source']::VARCHAR[] AS source_urls,
[]::VARCHAR[] AS detail_urls,
[struct_pack(
variant_id := 'default',
variant_key := 'default',
variant_label := 'Default',
variant_display_name := 'GPT 5',
raw_model_ids := ['openai/gpt-5']::VARCHAR[],
family_id := 'openai/gpt-5',
family_name := 'GPT 5',
version_date := NULL::VARCHAR,
version_qualifier := NULL::VARCHAR,
total_evaluations := 1,
last_updated := TIMESTAMP '2026-05-03 00:00:00',
categories_covered := ['Reasoning']::VARCHAR[]
)] AS variants,
['openai/gpt-5']::VARCHAR[] AS raw_model_ids
`,
      path.join(snapshotDir, "models_view.parquet")
    )

    // One-row evaluations view describing the `mmlu` benchmark.
    await copyParquet(
      connection,
      `
SELECT
TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
'mmlu' AS evaluation_id,
'mmlu' AS benchmark_id,
'accuracy' AS primary_metric_id,
'MMLU' AS evaluation_name,
'MMLU' AS canonical_display_name,
'mmlu' AS composite_benchmark_key,
'MMLU' AS composite_benchmark_name,
'mmlu' AS composite_slug,
'MMLU' AS composite_display_name,
'mmlu' AS family_id,
'MMLU' AS family_display_name,
false AS is_slice,
NULL AS parent_benchmark_id,
'Reasoning' AS category,
struct_pack(
evaluation_description := 'Accuracy on MMLU',
lower_is_better := false,
score_type := 'continuous',
min_score := 0.0,
max_score := 1.0,
unit := 'proportion'
) AS metric_config,
1::BIGINT AS models_count,
['OpenAI']::VARCHAR[] AS evaluator_names,
['documentation']::VARCHAR[] AS source_types,
'OpenAI' AS latest_source_name,
0.0 AS third_party_ratio,
0::INTEGER AS missing_generation_config_count,
struct_pack(name := 'GPT 5', score := 0.8) AS best_model,
struct_pack(name := 'GPT 5', score := 0.8) AS worst_model,
0.8 AS avg_score,
0.8 AS avg_score_norm,
0.8 AS top_score,
false AS has_card,
NULL AS benchmark_card,
false AS is_aggregated,
[] AS aggregate_sources,
false AS is_summary_score,
[]::VARCHAR[] AS summary_eval_ids,
struct_pack(domains := ['knowledge']::VARCHAR[], languages := ['en']::VARCHAR[], tasks := ['qa']::VARCHAR[]) AS tags,
struct_pack(
dataset_name := 'MMLU',
source_type := 'documentation',
hf_repo := NULL::VARCHAR,
hf_split := NULL::VARCHAR,
samples_number := 10,
url := ['https://example.test/mmlu']::VARCHAR[],
dataset_url := 'https://example.test/mmlu',
dataset_version := 'v1'
) AS source_data,
struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
struct_pack(
total_results := 1,
total_groups := 1,
multi_source_groups := 0,
first_party_only_groups := 1,
source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
) AS provenance_summary,
struct_pack(
total_groups := 1,
groups_with_variant_check := 0,
groups_with_cross_party_check := 0,
variant_divergent_count := 0,
cross_party_divergent_count := 0
) AS comparability_summary,
struct_pack(available := false, url_count := 0::BIGINT, sample_urls := []::VARCHAR[], models_with_loaded_instances := 0) AS instance_data,
1::INTEGER AS metrics_count,
['Accuracy']::VARCHAR[] AS metric_names,
[struct_pack(
column_key := 'root:accuracy',
metric_summary_id := 'mmlu%3Aaccuracy',
metric_id := 'accuracy',
metric_name := 'accuracy',
display_name := 'Accuracy',
canonical_display_name := 'Accuracy',
lower_is_better := false,
unit := 'proportion',
scope := 'root',
subtask_key := NULL::VARCHAR,
subtask_name := NULL::VARCHAR
)] AS leaderboard_metrics,
[] AS leaderboard_rows,
[struct_pack(
metric_summary_id := 'mmlu%3Aaccuracy',
metric_name := 'accuracy',
display_name := 'Accuracy',
canonical_display_name := 'Accuracy',
metric_key := 'accuracy',
lower_is_better := false,
models_count := 1,
top_score := 0.8,
unit := 'proportion'
)] AS root_metrics,
[] AS subtasks,
0::INTEGER AS subtasks_count
`,
      path.join(snapshotDir, "evals_view.parquet")
    )

    // One-row results view linking the model to the `mmlu` accuracy metric.
    await copyParquet(
      connection,
      `
SELECT
TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
'mmlu' AS evaluation_id,
'mmlu%3Aaccuracy' AS metric_summary_id,
'mmlu' AS benchmark_id,
'accuracy' AS metric_id,
'openai/gpt-5' AS model_key,
'openai/gpt-5' AS model_id,
'openai%2Fgpt-5' AS model_route_id,
struct_pack(
name := 'GPT 5',
id := 'openai/gpt-5',
developer := 'OpenAI',
inference_platform := 'platform',
inference_engine := 'engine',
model_version := NULL::VARCHAR,
architecture := 'transformer',
parameter_count := '100B',
release_date := '2026-01-01',
model_url := 'https://example.test/model',
modalities := struct_pack(input := ['text']::VARCHAR[], output := ['text']::VARCHAR[])
) AS model_info,
'Accuracy' AS metric_display_name,
'proportion' AS metric_unit,
false AS lower_is_better,
'Reasoning' AS category,
0.8 AS score,
struct_pack(
score := 0.8,
standard_error := 0.01,
sample_size := 10,
confidence_interval := struct_pack(lower := 0.7, upper := 0.9, confidence_level := 0.95)
) AS score_details,
1::INTEGER AS fact_row_count,
1::INTEGER AS position,
1::INTEGER AS total,
1.0 AS percentile,
TIMESTAMP '2026-05-03 00:00:00' AS evaluation_timestamp,
struct_pack(
source_name := 'OpenAI report',
source_type := 'documentation',
source_organization_name := 'OpenAI',
source_organization_url := 'https://example.test',
evaluator_relationship := 'first_party',
source_url := 'https://example.test/report',
publication_date := DATE '2026-05-03'
) AS source_metadata,
struct_pack(
dataset_name := 'MMLU',
source_type := 'documentation',
hf_repo := NULL::VARCHAR,
hf_split := NULL::VARCHAR,
samples_number := 10,
url := ['https://example.test/mmlu']::VARCHAR[],
dataset_url := 'https://example.test/mmlu',
dataset_version := 'v1'
) AS source_data,
'https://example.test/record.json' AS source_record_url,
struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR) AS eval_library,
['first_party']::VARCHAR[] AS evaluator_relationships,
true AS has_first_party,
false AS has_third_party,
'self' AS coverage_cell,
['OpenAI']::VARCHAR[] AS reporting_orgs,
map(['OpenAI'], [0.8]) AS scores_by_organization,
false AS is_summary_score,
NULL::VARCHAR AS summary_score_for,
[] AS aggregate_components,
false AS has_reproducibility_gap,
1.0 AS completeness_score,
false AS is_multi_source,
true AS first_party_only,
false AS has_variant_divergence,
false AS has_cross_party_divergence,
NULL AS evalcards_annotations,
NULL::VARCHAR AS instance_file_path,
NULL::VARCHAR AS instance_file_format,
0::INTEGER AS instance_rows
`,
      path.join(snapshotDir, "eval_results_view.parquet")
    )
  } finally {
    // Always release the connection, including when a COPY statement throws.
    connection.closeSync()
  }

  // Manifest: the counts describe the single-row views written above, and
  // `summary_artifacts` names the two JSON files written below.
  await writeFile(
    path.join(snapshotDir, "manifest.json"),
    JSON.stringify({
      generated_at: "2026-05-03T00:00:00Z",
      config_version: 2,
      skipped_configs: [],
      model_count: 1,
      eval_count: 1,
      metric_eval_count: 1,
      source_config_count: 1,
      skipped_config_count: 0,
      summary_artifacts: {
        corpus_aggregates: "headline.json",
        eval_hierarchy: "hierarchy.json",
      },
    })
  )

  // Each block is reused for both the `overall` entry and the single
  // `Reasoning` category entry of its section in headline.json.
  const reproducibilityBlock = {
    total_triples: 1,
    triples_with_reproducibility_gap: 0,
    reproducibility_gap_rate: 0,
    agentic_triples: 0,
    per_field_missingness: {
      temperature: {
        missing_count: 0,
        missing_rate: 0,
        denominator: "all_triples",
        denominator_count: 1,
      },
    },
  }
  const completenessBlock = {
    total_triples: 1,
    completeness_avg: 0.75,
    completeness_min: 0.75,
    completeness_max: 0.75,
  }
  const provenanceBlock = {
    total_triples: 1,
    multi_source_triples: 0,
    first_party_only_triples: 1,
    source_type_distribution: {
      first_party: 1,
      third_party: 0,
      collaborative: 0,
      unspecified: 0,
    },
  }
  const comparabilityBlock = {
    total_triples: 1,
    variant_divergent_count: 0,
    cross_party_divergent_count: 0,
    groups_with_variant_check: 1,
    groups_with_cross_party_check: 0,
  }

  await writeFile(
    path.join(snapshotDir, "headline.json"),
    JSON.stringify({
      generated_at: "2026-05-03T00:00:00Z",
      signal_version: "1.0",
      stratification_dimensions: ["category"],
      reproducibility: {
        overall: reproducibilityBlock,
        by_category: { Reasoning: reproducibilityBlock },
      },
      completeness: {
        overall: completenessBlock,
        by_category: { Reasoning: completenessBlock },
      },
      provenance: {
        overall: provenanceBlock,
        by_category: { Reasoning: provenanceBlock },
      },
      comparability: {
        overall: comparabilityBlock,
        by_category: { Reasoning: comparabilityBlock },
      },
      developers: [
        {
          developer: "OpenAI",
          route_id: "OpenAI",
          model_count: 1,
          benchmark_count: 1,
          evaluation_count: 1,
          popular_evals: [{ benchmark: "MMLU", model_count: 1 }],
        },
      ],
    })
  )

  await writeFile(
    path.join(snapshotDir, "hierarchy.json"),
    JSON.stringify({
      stats: {
        family_count: 1,
        composite_count: 0,
        standalone_benchmark_count: 1,
        single_benchmark_count: 1,
        slice_count: 0,
        metric_count: 1,
        metric_rows_scanned: 1,
      },
      families: [],
    })
  )
}
describe("Stage J view-layer backend", () => {
  // Put an environment variable back to the value captured before the test,
  // removing it entirely when it had no value.
  const restoreEnv = (name: "DATA_BACKEND" | "SNAPSHOT_URL", saved: string | undefined) => {
    if (saved == null) {
      delete process.env[name]
      return
    }
    process.env[name] = saved
  }

  it("reads a pinned snapshot through the v2 accessors", async () => {
    const tmpSnapshotDir = await mkdtemp(path.join(os.tmpdir(), "eval-card-stage-j-"))
    const savedBackend = process.env.DATA_BACKEND
    const savedSnapshotUrl = process.env.SNAPSHOT_URL

    try {
      await writeSyntheticStageJSnapshot(tmpSnapshotDir)

      // Point the backend at the synthetic snapshot before loading it.
      process.env.DATA_BACKEND = "v2"
      process.env.SNAPSHOT_URL = `file://${tmpSnapshotDir}`

      // The modules are imported dynamically, after the environment is set
      // (presumably so they see these values — confirm against lib/data-backend).
      const dataBackend = await import("../lib/data-backend")
      const hfData = await import("../lib/hf-data")

      const results = await Promise.all([
        dataBackend.getModelCardsLite(),
        dataBackend.getEvalListLiteData(),
        dataBackend.getModelSummaryById("openai%2Fgpt-5"),
        dataBackend.getEvalSummaryById("mmlu"),
        dataBackend.getDeveloperList(),
        dataBackend.getDeveloperSummaryById("OpenAI"),
        dataBackend.getBackendManifestData(),
        dataBackend.getEvalHierarchyData(),
        hfData.fetchCorpusAggregates(),
      ])
      const [
        modelCards,
        evalList,
        modelSummary,
        evalSummary,
        developerList,
        developerSummary,
        manifest,
        hierarchy,
        aggregates,
      ] = results

      // Model and evaluation listings.
      expect(modelCards[0]).toMatchObject({
        route_id: "openai%2Fgpt-5",
        model_name: "GPT 5",
        evaluations_count: 1,
      })
      expect(evalList).toMatchObject({
        totalModels: 1,
        evals: [{ evaluation_id: "mmlu", evaluation_name: "MMLU", models_count: 1 }],
      })

      // Per-model and per-evaluation summaries.
      expect(modelSummary?.evaluations_by_category.Reasoning).toHaveLength(1)
      expect(evalSummary?.model_results[0]).toMatchObject({
        model_route_id: "openai%2Fgpt-5",
        score: 0.8,
        result: { metric_summary_id: "mmlu%3Aaccuracy" },
      })

      // Developer data.
      expect(developerList[0]).toMatchObject({ developer: "OpenAI", route_id: "OpenAI" })
      expect(developerSummary?.models).toHaveLength(1)

      // Manifest, hierarchy, and corpus aggregates.
      expect(manifest.model_count).toBe(1)
      expect(hierarchy.stats?.metric_rows_scanned).toBe(1)
      expect(aggregates?.completeness.overall).toMatchObject({
        total_triples: 1,
        completeness_avg: 0.75,
      })
      expect(aggregates?.provenance.overall).toMatchObject({
        total_triples: 1,
        first_party_only_triples: 1,
      })
      expect(aggregates?.comparability.overall).toMatchObject({
        groups_with_variant_check: 1,
        variant_divergent_count: 0,
      })
      expect(aggregates?.comparability.by_category.Reasoning).toBeDefined()
    } finally {
      restoreEnv("DATA_BACKEND", savedBackend)
      restoreEnv("SNAPSHOT_URL", savedSnapshotUrl)
      await rm(tmpSnapshotDir, { recursive: true, force: true })
    }
  })
})