// Leaderboard-parity gate — proves the runtime DuckDB leaderboard query reproduces
// the producer's comparison-index `evals[].metrics[].scores[]` BYTE-FOR-BYTE, on the
// prod-pinned binding, over the live snapshot. This is the core regression gate for
// the comparison-index migration: if the scoped query ever diverges from what the
// monolith served (rank semantics, ordering, identity column, membership), this fails.
//
// Run on linux/amd64 via scripts/migration-gate.sh. SNAPSHOT_URL selects the snapshot
// (default = pinned prod; override to re-verify against a post-rebaseline snapshot).
//
// NOTE this is a MIGRATION-phase gate (it diffs against the comparison-index, which the
// migration eventually deletes). Post-deletion it converts to a golden-fixture test.
import { DuckDBConnection } from "@duckdb/node-api"
// Snapshot base URL taken from the environment. Trailing slashes are trimmed so that
// `${SNAP}/<file>` never contains a doubled slash.
const SNAP = (process.env.SNAPSHOT_URL ?? "").replace(/\/+$/, "")
if (!SNAP) {
  // Exit code 2 = the gate could not run (as opposed to 1 = parity failed).
  console.error("parity: SNAPSHOT_URL required")
  process.exit(2)
}

// Single DuckDB connection shared by every query in this script.
const c = await DuckDBConnection.create()

// Materialise each snapshot parquet file as a table named after the view it backs.
for (const view of ["models_view", "evals_view", "eval_results_view"]) {
  // Double any single quote so the URL cannot terminate the SQL string literal early.
  const source = `${SNAP}/${view}.parquet`.replace(/'/g, "''")
  await c.run(`CREATE OR REPLACE TABLE ${view} AS SELECT * FROM read_parquet('${source}')`)
}
// Run `sql` on the shared connection `c`, read the whole result, and return its rows
// via the reader's `getRowObjectsJson()`.
const rows = async (sql) => {
  const reader = await c.runAndRead(sql)
  // The reader must be fully read before the rows are extracted.
  await reader.readAll()
  return reader.getRowObjectsJson()
}
// Normalise a numeric field before comparison: bigint and string values are coerced
// with Number(); every other value (number, null, undefined, ...) is returned as-is.
const num = (x) => (typeof x === "bigint" || typeof x === "string" ? Number(x) : x)
// Group-key column resolution: scores[].model_family_id is fed the
// GROUP KEY. On the current pre-split snapshot that lives under `model_family_id`; on a
// post-rebaseline snapshot it's `model_group_id` (and `model_family_id` becomes the
// distinct structural id). Detect and use whichever carries the group key.
const cols = new Set((await rows(`DESCRIBE models_view`)).map((r) => r.column_name))
// Prefer `model_group_id` whenever the snapshot has it; otherwise fall back.
const GROUP_KEY = cols.has("model_group_id") ? "model_group_id" : "model_family_id"
console.log(`parity: group-key column = mv.${GROUP_KEY}`)
// Candidate leaderboard query. `src` keeps only result rows that have a score, an
// evaluation id, a metric-summary id and a model route id, and exposes the resolved
// group-key column under the name `model_family_id`. The outer SELECT ranks rows inside
// each (evaluation_id, metric_summary_id) partition — ascending score when
// `lower_is_better`, descending otherwise — and orders the output by partition, then by
// that same sort expression, then by model_route_id to break ties.
const SQL = `
WITH src AS (
  SELECT erv.evaluation_id, erv.metric_summary_id, erv.score, erv.model_route_id,
         mv.${GROUP_KEY} AS model_family_id, COALESCE(mv.model_family_name,'') AS model_family_name,
         COALESCE(mv.developer,'') AS developer, erv.lower_is_better
  FROM eval_results_view erv LEFT JOIN models_view mv ON mv.model_key = erv.model_key
  WHERE erv.score IS NOT NULL AND erv.evaluation_id IS NOT NULL
    AND erv.metric_summary_id IS NOT NULL AND erv.model_route_id IS NOT NULL )
SELECT evaluation_id, metric_summary_id, model_route_id, model_family_id, score,
       RANK() OVER (PARTITION BY evaluation_id, metric_summary_id
                    ORDER BY (CASE WHEN lower_is_better THEN score ELSE -score END) ASC) AS rank,
       COUNT(*) OVER (PARTITION BY evaluation_id, metric_summary_id) AS total
FROM src
ORDER BY evaluation_id, metric_summary_id,
         (CASE WHEN lower_is_better THEN score ELSE -score END) ASC, model_route_id ASC`
// Candidate side: bucket the query rows by "<evaluation_id>|<metric_summary_id>",
// preserving the row order the query returned.
const cand = new Map()
for (const r of await rows(SQL)) {
  const k = `${r.evaluation_id}|${r.metric_summary_id}`
  const bucket = cand.get(k)
  if (bucket) bucket.push(r)
  else cand.set(k, [r])
}

// Live side: the comparison-index published next to the snapshot. A failed download or
// an unexpected document shape means the gate could not run, so report it and exit 2
// instead of surfacing an opaque JSON/TypeError.
const ciRes = await fetch(`${SNAP}/comparison-index.json`)
if (!ciRes.ok) {
  console.error(`parity: fetching comparison-index.json failed with HTTP ${ciRes.status}`)
  process.exit(2)
}
const ci = await ciRes.json()
if (ci === null || typeof ci !== "object" || ci.evals === null || typeof ci.evals !== "object") {
  console.error("parity: comparison-index.json has no `evals` object")
  process.exit(2)
}

// Same key scheme as `cand`; a metric without `scores` maps to an empty list.
const live = new Map()
for (const [evalId, ev] of Object.entries(ci.evals)) {
  for (const m of ev.metrics || []) {
    live.set(`${evalId}|${m.metric_summary_id}`, m.scores || [])
  }
}
// Tallies: `exact` counts keys whose score lists match row-for-row; each *Diff counter
// counts one category of mismatch.
let exact = 0
let lenDiff = 0
let orderDiff = 0
let rankDiff = 0
let totalDiff = 0
let scoreDiff = 0
let famDiff = 0

// Membership: keys present on only one side.
const onlyLive = [...live.keys()].filter((k) => !cand.has(k))
const onlyCand = [...cand.keys()].filter((k) => !live.has(k))

// Up to 8 human-readable examples of mismatches, printed with the summary.
const samples = []
const noteSample = (msg) => { if (samples.length < 8) samples.push(msg) }

// Scores match only when both normalise to real numbers that are equal or within 1e-9.
// A bare `Math.abs(x - y) > 1e-9` test is false for NaN and treats `null - 0` as 0, so
// a missing or non-numeric score would slip through the gate unnoticed.
const scoresMatch = (x, y) => {
  const p = num(x)
  const q = num(y)
  if (typeof p !== "number" || typeof q !== "number") return false
  return p === q || Math.abs(p - q) <= 1e-9
}

for (const [k, L] of live) {
  const C = cand.get(k)
  if (!C) continue // already accounted for in onlyLive
  if (C.length !== L.length) {
    lenDiff++
    noteSample(`LEN ${k}: live=${L.length} cand=${C.length}`)
    continue
  }
  let ok = true
  for (let i = 0; i < L.length; i++) {
    const a = L[i]
    const b = C[i]
    if (a.model_route_id !== b.model_route_id) {
      // Once the order diverges the remaining rows are misaligned, so stop comparing this key.
      orderDiff++
      ok = false
      noteSample(`ORDER ${k}[${i}] live=${a.model_route_id} cand=${b.model_route_id}`)
      break
    }
    if (num(a.rank) !== num(b.rank)) {
      rankDiff++
      ok = false
      noteSample(`RANK ${k}[${i}] live=${a.rank} cand=${b.rank}`)
    }
    if (num(a.total) !== num(b.total)) {
      totalDiff++
      ok = false
      noteSample(`TOTAL ${k}[${i}] live=${a.total} cand=${b.total}`)
    }
    if (!scoresMatch(a.score, b.score)) {
      scoreDiff++
      ok = false
      noteSample(`SCORE ${k}[${i}] live=${a.score} cand=${b.score}`)
    }
    // null and undefined family ids are treated as the same "absent" value.
    if ((a.model_family_id ?? null) !== (b.model_family_id ?? null)) {
      famDiff++
      ok = false
      noteSample(`FAMILY ${k}[${i}] live=${a.model_family_id} cand=${b.model_family_id}`)
    }
  }
  if (ok) exact++
}
// Summary: membership counts and exact matches first, then the per-category diff counts.
console.log(`parity: live=${live.size} candidate=${cand.size} | onlyLive=${onlyLive.length} onlyCand=${onlyCand.length} | exact=${exact}`)
console.log(`parity diffs -> length:${lenDiff} order:${orderDiff} rank:${rankDiff} total:${totalDiff} score:${scoreDiff} familyId:${famDiff}`)
if (samples.length) console.log(" " + samples.join("\n "))
// Membership failures would otherwise fail the gate without naming any key, so show a few.
if (onlyLive.length) console.log(` onlyLive e.g. ${onlyLive.slice(0, 8).join(", ")}`)
if (onlyCand.length) console.log(` onlyCand e.g. ${onlyCand.slice(0, 8).join(", ")}`)

// The gate passes only when there is no membership difference and every diff counter is zero.
const PASS = [onlyLive.length, onlyCand.length, lenDiff, orderDiff, rankDiff, totalDiff, scoreDiff, famDiff]
  .every((count) => count === 0)
console.log(PASS ? "LEADERBOARD PARITY: PASS" : "LEADERBOARD PARITY: FAIL")
// Exit status is the gate result: 0 = parity holds, 1 = parity broken.
process.exit(PASS ? 0 : 1)