j-chim's picture
Update readme for pre-push script
82e8fdb
Raw
History Blame Contribute Delete
5.05 kB
// Leaderboard-parity gate — proves the runtime DuckDB leaderboard query reproduces
// the producer's comparison-index `evals[].metrics[].scores[]` BYTE-FOR-BYTE, on the
// prod-pinned binding, over the live snapshot. This is the core regression gate for
// the comparison-index migration: if the scoped query ever diverges from what the
// monolith served (rank semantics, ordering, identity column, membership), this fails.
//
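// Run on linux/amd64 via scripts/migration-gate.sh. SNAPSHOT_URL selects the snapshot
// (default = pinned prod; override to re-verify against a post-rebaseline snapshot).
// This file has no built-in default: it exits with status 2 when SNAPSHOT_URL is unset,
// so the pinned-prod default is presumably supplied by the wrapper script (confirm there).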
//
// NOTE this is a MIGRATION-phase gate (it diffs against the comparison-index, which the
// migration eventually deletes). Post-deletion it converts to a golden-fixture test.
import { DuckDBConnection } from "@duckdb/node-api"
// Snapshot base URL taken from the environment, with trailing slashes trimmed so that
// file names can be appended with a single "/". The gate cannot run without it.
const SNAP = (process.env.SNAPSHOT_URL || "").replace(/\/+$/, "")
if (SNAP.length === 0) {
  console.error("parity: SNAPSHOT_URL required")
  process.exit(2)
}
// Single DuckDB connection shared by every query in this script.
const c = await DuckDBConnection.create()
// Materialize each snapshot parquet file as a local table named after its view; the
// parquet file is always `<view>.parquet` under the snapshot base URL. Loaded one at a
// time, in this order.
for (const view of ["models_view", "evals_view", "eval_results_view"]) {
  const parquetUrl = `${SNAP}/${view}.parquet`
  await c.run(`CREATE OR REPLACE TABLE ${view} AS SELECT * FROM read_parquet('${parquetUrl}')`)
}
// Run `sql` on the shared connection `c`, drain the result reader, and return every row
// in the object form produced by the reader's getRowObjectsJson().
const rows = async (sql) => {
  const reader = await c.runAndRead(sql)
  await reader.readAll()
  return reader.getRowObjectsJson()
}
// Normalize a value for numeric comparison: bigint and string inputs are converted with
// Number(); every other value (numbers, null, undefined, ...) is returned unchanged.
const num = (x) => {
  switch (typeof x) {
    case "bigint":
    case "string":
      return Number(x)
    default:
      return x
  }
}
// Group-key column resolution. The comparison-index's scores[].model_family_id is fed
// the GROUP KEY. Which models_view column carries that key depends on the snapshot:
//   - current pre-split snapshot:  `model_family_id`
//   - post-rebaseline snapshot:    `model_group_id` (`model_family_id` then becomes the
//                                  distinct structural id)
// So inspect the table's columns and prefer `model_group_id` whenever it exists.
const describedColumns = await rows(`DESCRIBE models_view`)
const cols = new Set()
for (const { column_name } of describedColumns) {
  cols.add(column_name)
}
const hasGroupIdColumn = cols.has("model_group_id")
const GROUP_KEY = hasGroupIdColumn ? "model_group_id" : "model_family_id"
console.log(`parity: group-key column = mv.${GROUP_KEY}`)
// Candidate leaderboard query (the query under test).
//
// `src` CTE: one row per eval_results_view row that has a non-NULL score, evaluation_id,
// metric_summary_id and model_route_id, LEFT JOINed to models_view on model_key. The
// detected group-key column (mv.${GROUP_KEY}) is exposed under the name model_family_id;
// because the join is a LEFT JOIN, results with no matching model keep a NULL there.
//
// Outer SELECT, per (evaluation_id, metric_summary_id) partition:
//   - rank:  RANK() over the score, ascending when lower_is_better is true, otherwise
//            over -score (i.e. highest score first; a NULL lower_is_better also takes
//            this ELSE branch). The rank ordering has no tie-break column.
//   - total: number of rows in the partition.
// The final ORDER BY repeats the same score ordering and adds model_route_id ASC as a
// tie-break, which fixes the row order within each partition.
//
// The text is a runtime string sent to DuckDB verbatim; keep comments out of the template.
const SQL = `
WITH src AS (
SELECT erv.evaluation_id, erv.metric_summary_id, erv.score, erv.model_route_id,
mv.${GROUP_KEY} AS model_family_id, COALESCE(mv.model_family_name,'') AS model_family_name,
COALESCE(mv.developer,'') AS developer, erv.lower_is_better
FROM eval_results_view erv LEFT JOIN models_view mv ON mv.model_key = erv.model_key
WHERE erv.score IS NOT NULL AND erv.evaluation_id IS NOT NULL
AND erv.metric_summary_id IS NOT NULL AND erv.model_route_id IS NOT NULL )
SELECT evaluation_id, metric_summary_id, model_route_id, model_family_id, score,
RANK() OVER (PARTITION BY evaluation_id, metric_summary_id
ORDER BY (CASE WHEN lower_is_better THEN score ELSE -score END) ASC) AS rank,
COUNT(*) OVER (PARTITION BY evaluation_id, metric_summary_id) AS total
FROM src
ORDER BY evaluation_id, metric_summary_id,
(CASE WHEN lower_is_better THEN score ELSE -score END) ASC, model_route_id ASC`
// Candidate side: bucket the query rows by "evaluation_id|metric_summary_id", keeping
// the query's row order inside each bucket.
const cand = new Map()
for (const row of await rows(SQL)) {
  const key = `${row.evaluation_id}|${row.metric_summary_id}`
  let bucket = cand.get(key)
  if (bucket == null) {
    bucket = []
    cand.set(key, bucket)
  }
  bucket.push(row)
}
// Live side: download the producer's comparison-index for the same snapshot and flatten
// it to "evalId|metric_summary_id" -> scores[] (an absent metrics/scores list counts as
// empty), i.e. the same key shape used for the candidate map.
const ciUrl = `${SNAP}/comparison-index.json`
const ciResponse = await fetch(ciUrl)
// fetch() resolves on HTTP error statuses too. Without this check a 404/500 body would be
// fed to the JSON parser and fail with a misleading error instead of the real cause.
if (!ciResponse.ok) {
  throw new Error(`parity: GET ${ciUrl} failed: HTTP ${ciResponse.status} ${ciResponse.statusText}`)
}
const ci = await ciResponse.json()
// Fail with a clear message when the payload is not the expected { evals: {...} } shape.
if (ci === null || typeof ci !== "object" || ci.evals === null || typeof ci.evals !== "object") {
  throw new Error(`parity: ${ciUrl} does not contain an "evals" object`)
}
const live = new Map()
for (const [evalId, ev] of Object.entries(ci.evals)) {
  for (const m of ev.metrics || []) {
    live.set(evalId + "|" + m.metric_summary_id, m.scores || [])
  }
}
// Tallies. `exact` counts leaderboards that match entirely; lenDiff and orderDiff count
// leaderboards (comparison of that leaderboard stops at the first such problem); the
// remaining counters count individual mismatching entries.
let exact = 0
let lenDiff = 0
let orderDiff = 0
let rankDiff = 0
let totalDiff = 0
let scoreDiff = 0
let famDiff = 0
// Membership differences: leaderboard keys present on only one side.
const onlyLive = [...live.keys()].filter((k) => !cand.has(k))
const onlyCand = [...cand.keys()].filter((k) => !live.has(k))
// Up to 8 human-readable examples of length/order mismatches, for the report.
const samples = []

// True when two scores are equal after num() normalization, or differ by at most 1e-9.
// Written as "within tolerance" rather than "outside tolerance" on purpose: if either
// side is missing or non-numeric the difference is NaN, and NaN fails every ordered
// comparison, so such a pair is reported as a mismatch instead of passing silently.
const scoresMatch = (x, y) => {
  const p = num(x)
  const q = num(y)
  if (p === q) return true
  return Math.abs(p - q) <= 1e-9
}

for (const [k, liveScores] of live) {
  const candScores = cand.get(k)
  if (!candScores) continue // already accounted for in onlyLive
  if (candScores.length !== liveScores.length) {
    lenDiff++
    if (samples.length < 8) samples.push(`LEN ${k}: live=${liveScores.length} cand=${candScores.length}`)
    continue
  }
  let ok = true
  for (let i = 0; i < liveScores.length; i++) {
    const a = liveScores[i]
    const b = candScores[i]
    // Position-by-position identity check: once the route ids disagree the remaining
    // entries are misaligned, so stop comparing this leaderboard.
    if (a.model_route_id !== b.model_route_id) {
      orderDiff++
      ok = false
      if (samples.length < 8) samples.push(`ORDER ${k}[${i}] live=${a.model_route_id} cand=${b.model_route_id}`)
      break
    }
    if (num(a.rank) !== num(b.rank)) { rankDiff++; ok = false }
    if (num(a.total) !== num(b.total)) { totalDiff++; ok = false }
    if (!scoresMatch(a.score, b.score)) { scoreDiff++; ok = false }
    // Treat a missing family id and an explicit null as the same value.
    if ((a.model_family_id ?? null) !== (b.model_family_id ?? null)) { famDiff++; ok = false }
  }
  if (ok) exact++
}
// Report: leaderboard counts and membership first, then the per-category mismatch
// tallies, then any collected example mismatches.
console.log(`parity: live=${live.size} candidate=${cand.size} | onlyLive=${onlyLive.length} onlyCand=${onlyCand.length} | exact=${exact}`)
console.log(`parity diffs -> length:${lenDiff} order:${orderDiff} rank:${rankDiff} total:${totalDiff} score:${scoreDiff} familyId:${famDiff}`)
if (samples.length > 0) {
  console.log(" " + samples.join("\n "))
}
// The gate passes only when membership matches in both directions and every mismatch
// counter is zero; the process exit status carries the verdict (0 = pass, 1 = fail).
const mismatchCounts = [onlyLive.length, onlyCand.length, lenDiff, orderDiff, rankDiff, totalDiff, scoreDiff, famDiff]
const PASS = mismatchCounts.every((count) => count === 0)
if (PASS) {
  console.log("LEADERBOARD PARITY: PASS")
} else {
  console.log("LEADERBOARD PARITY: FAIL")
}
process.exit(PASS ? 0 : 1)