general-eval-card / scripts /verify-instance-level-data.mjs
Jenny Chim
Deploy DuckDB-backed frontend to
da8db3e
import "./server-only-shim.mjs"
import fs from "fs"
import path from "path"
// Audit script for `notes/transformations/12-instance-level-data.md`.
//
// Walks .cache/hf-data/models/*.json, finds every result row that has
// `instance_level_data.instance_examples`, replicates the parser's branch
// logic, and reports:
// - Prevalence (% of rows / files with sample data)
// - ild-level shape uniformity (which top-level keys appear)
// - interaction_type distribution
// - Per-output-field branch firing rates (which fallback path wins for each)
// - Distinct example top-level key signatures (top 5)
//
// Output is the raw distribution; the spec's "Pipeline status" section quotes
// these numbers. Run after `pnpm cache-hf-data` to refresh.
// Cache locations are relative paths, so they resolve against the directory
// the script is launched from.
const CACHE_DIR = ".cache/hf-data"
const MODELS_DIR = path.join(CACHE_DIR, "models")

// Without a primed cache there is nothing to audit: print a hint and exit 1.
if (!fs.existsSync(MODELS_DIR)) {
  const hintLines = [
    "=== ERROR: HF data cache missing ===",
    ` Expected: ${MODELS_DIR}`,
    " Prime it first: pnpm cache-hf-data",
  ]
  for (const line of hintLines) console.error(line)
  process.exit(1)
}

// Only the JSON payloads in the models directory are audited.
const files = []
for (const entry of fs.readdirSync(MODELS_DIR)) {
  if (entry.endsWith(".json")) files.push(entry)
}

// Builds a `{ name: 0, ... }` tally whose key order follows the argument
// order; the report prints branches in that order.
const zeroCounters = (...names) => Object.fromEntries(names.map((name) => [name, 0]))

// Tally trackers: one counter per fallback branch, grouped by output field.
const branchHits = {
  input: zeroCounters("string", "input.raw", "prompt", "question", "doc.question", "doc.JSON", "EMPTY"),
  ground_truth: zeroCounters("input.reference", "ground_truth", "target", "gold", "doc.answer", "NONE"),
  response: zeroCounters("output", "response", "model_output", "answer_attribution", "messages", "filtered_resps", "resps", "EMPTY"),
  is_correct: zeroCounters("evaluation.is_correct", "is_correct", "metrics.exact_match", "NONE"),
  sample_id: zeroCounters("sample_id", "doc_id", "id", "index_fallback"),
  choices: zeroCounters("choices", "doc.choices", "NONE"),
}

// Signature / distribution histograms (key -> occurrence count).
const ildKeysSeen = new Map()
const interactionTypes = new Map()
const exampleKeysHistogram = new Map()

// Running totals, mutated while the cache is scanned.
let totalFiles = 0
let filesWithAnyIld = 0
let totalEvals = 0
let evalsWithIld = 0
let totalResults = 0
let resultsWithIld = 0
let totalExamples = 0
let totalInstanceCountSum = 0 // sum of full-set sizes (i.e. URL JSONL totals)
// Ordered fallback chains, one per output field. Each entry pairs a branch
// label (a key of the matching `branchHits` tally) with the predicate that
// selects it. The first predicate that holds wins, and the closing catch-all
// entry guarantees that every chain produces a label.
const FIELD_BRANCH_RULES = [
  ["input", [
    ["string", (ex) => typeof ex.input === "string"],
    ["input.raw", (ex) => ex.input?.raw != null],
    ["prompt", (ex) => ex.prompt],
    ["question", (ex) => ex.question],
    ["doc.question", (ex) => ex.doc?.question],
    ["doc.JSON", (ex) => ex.doc],
    ["EMPTY", () => true],
  ]],
  ["ground_truth", [
    ["input.reference", (ex) => ex.input?.reference != null],
    ["ground_truth", (ex) => ex.ground_truth != null],
    ["target", (ex) => ex.target != null],
    ["gold", (ex) => ex.gold != null],
    ["doc.answer", (ex) => ex.doc?.answer != null],
    ["NONE", () => true],
  ]],
  ["response", [
    ["output", (ex) => ex.output != null],
    ["response", (ex) => ex.response],
    ["model_output", (ex) => ex.model_output],
    ["answer_attribution", (ex) => Array.isArray(ex.answer_attribution) && ex.answer_attribution.length > 0],
    ["messages", (ex) => Array.isArray(ex.messages) && ex.messages.length > 0],
    ["filtered_resps", (ex) => ex.filtered_resps?.[0]?.[0]],
    ["resps", (ex) => ex.resps?.[0]?.[0]],
    ["EMPTY", () => true],
  ]],
  ["is_correct", [
    ["evaluation.is_correct", (ex) => ex.evaluation?.is_correct !== undefined],
    ["is_correct", (ex) => ex.is_correct !== undefined],
    ["metrics.exact_match", (ex) => ex.metrics?.exact_match === 1 || ex.metrics?.exact_match === 0],
    ["NONE", () => true],
  ]],
  ["sample_id", [
    ["sample_id", (ex) => ex.sample_id != null],
    ["doc_id", (ex) => ex.doc_id != null],
    ["id", (ex) => ex.id != null],
    ["index_fallback", () => true],
  ]],
  ["choices", [
    ["choices", (ex) => ex.choices != null],
    ["doc.choices", (ex) => ex.doc?.choices != null],
    ["NONE", () => true],
  ]],
]

/**
 * Records one inline example in the module-level tallies.
 *
 * Falsy and non-object values are ignored. Otherwise the example is counted
 * in `totalExamples`, its sorted top-level key signature is added to
 * `exampleKeysHistogram`, and for every output field exactly one `branchHits`
 * counter is incremented: the first branch of that field's fallback chain
 * whose predicate holds.
 *
 * @param {*} raw - One entry of `instance_level_data.instance_examples`.
 * @returns {void}
 */
function classifyExample(raw) {
  const isRecord = Boolean(raw) && typeof raw === "object"
  if (!isRecord) return

  totalExamples++

  // Histogram of which first-level keys appear together.
  const signature = Object.keys(raw).sort().join(",")
  const seenSoFar = exampleKeysHistogram.get(signature) ?? 0
  exampleKeysHistogram.set(signature, seenSoFar + 1)

  for (const [field, chain] of FIELD_BRANCH_RULES) {
    // `find` stops at the first match, so later predicates are never evaluated.
    const [winner] = chain.find(([, applies]) => applies(raw))
    branchHits[field][winner]++
  }
}
/**
 * Recursively tallies instance-level data (ild) for one hierarchy node.
 *
 * For each entry of `node.metrics` and each of its `model_results`, a result
 * counts as "having ild" when `instance_level_data` is an object whose
 * `instance_examples` is a non-empty array. Such results update the
 * module-level counters and histograms, and every example is handed to
 * `classifyExample`. The same walk is then applied to each `node.subtasks`
 * entry.
 *
 * @param {object} node - Hierarchy node; `metrics` and `subtasks` are optional.
 * @returns {boolean} True when this node or any node beneath it has at least
 *   one result with ild.
 */
function walk(node) {
  let nodeHasIld = false
  for (const m of node.metrics ?? []) {
    totalEvals++
    let evalHasIld = false
    for (const r of m.model_results ?? []) {
      totalResults++
      const ild = r.instance_level_data
      const hasExamples =
        ild != null &&
        typeof ild === "object" &&
        Array.isArray(ild.instance_examples) &&
        ild.instance_examples.length > 0
      if (!hasExamples) continue

      resultsWithIld++
      evalHasIld = true
      nodeHasIld = true

      // Which top-level keys the ild object carries, as a sorted signature.
      const ildSig = Object.keys(ild).sort().join(",")
      ildKeysSeen.set(ildSig, (ildKeysSeen.get(ildSig) ?? 0) + 1)

      if (typeof ild.interaction_type === "string") {
        interactionTypes.set(ild.interaction_type, (interactionTypes.get(ild.interaction_type) ?? 0) + 1)
      }
      if (typeof ild.instance_count === "number") {
        totalInstanceCountSum += ild.instance_count
      }
      for (const ex of ild.instance_examples) {
        classifyExample(ex)
      }
    }
    if (evalHasIld) evalsWithIld++
  }
  for (const s of node.subtasks ?? []) {
    // Propagate hits found in subtasks: a caller asking "does this subtree
    // have ild?" must not get `false` when the data sits only below this node.
    if (walk(s)) nodeHasIld = true
  }
  return nodeHasIld
}
// Scan every cached model file and feed each hierarchy node to `walk`.
// Unreadable or malformed files are skipped (best-effort audit) but are
// reported on stderr so a corrupt cache does not silently skew the numbers.
// Skipped files still count towards `totalFiles`, as they always have.
for (const f of files) {
  totalFiles++
  const filePath = path.join(MODELS_DIR, f)
  let data
  try {
    data = JSON.parse(fs.readFileSync(filePath, "utf-8"))
  } catch (err) {
    console.warn(` [skip] ${filePath}: ${err?.message ?? err}`)
    continue
  }
  let fileHasIld = false
  // `data?.` tolerates a JSON root of `null`; categories that are not arrays
  // are ignored instead of aborting the whole run with a TypeError.
  for (const cat of Object.values(data?.hierarchy_by_category ?? {})) {
    if (!Array.isArray(cat)) continue
    for (const node of cat) {
      if (walk(node)) fileHasIld = true
    }
  }
  if (fileHasIld) filesWithAnyIld++
}
// Formats `n` as a percentage of `total` with two decimals. `total` defaults
// to the number of classified examples; a falsy total yields "-".
function pct(n, total = totalExamples) {
  if (!total) return "-"
  return `${((100 * n) / total).toFixed(2)}%`
}

// Entries of a tally Map, highest count first.
const byCountDesc = (tally) => Array.from(tally.entries()).sort(([, left], [, right]) => right - left)

// Prints a heading followed by one "<count>x: <key>" line per tally entry.
const printTally = (heading, tally) => {
  console.log(heading)
  for (const [key, count] of byCountDesc(tally)) {
    console.log(` ${count}x: ${key}`)
  }
}

// --- Prevalence ---
const prevalenceLines = [
  "=== Audit: instance_level_data prevalence ===",
  ` Total model files: ${totalFiles}`,
  ` Files with any ild: ${filesWithAnyIld} (${pct(filesWithAnyIld, totalFiles)})`,
  ` Total (metric × result) rows: ${totalResults}`,
  ` Result rows with ild: ${resultsWithIld} (${pct(resultsWithIld, totalResults)})`,
  ` Total inline preview examples (≤5 per row): ${totalExamples}`,
  ` Total full samples (sum of instance_count, accessible via source_url): ${totalInstanceCountSum}`,
]
for (const line of prevalenceLines) console.log(line)

// --- Shape and interaction-type distributions ---
printTally("\n=== ild-level top-level key signatures ===", ildKeysSeen)
printTally("\n=== interaction_type distribution ===", interactionTypes)

// --- Which fallback branch fired, per output field ---
console.log("\n=== Per-field branch firing rates ===\n")
for (const field of Object.keys(branchHits)) {
  console.log(`--- ${field} ---`)
  for (const [branch, hits] of Object.entries(branchHits[field])) {
    console.log(` ${branch}: ${hits} (${pct(hits)})`)
  }
  console.log()
}

// --- Most common example shapes; long signatures are cut at 200 characters ---
console.log("=== Distinct example top-level key signatures (top 5) ===")
const topSignatures = byCountDesc(exampleKeysHistogram).slice(0, 5)
for (const [sig, count] of topSignatures) {
  const ellipsis = sig.length > 200 ? "…" : ""
  console.log(` ${count}x: ${sig.slice(0, 200)}${ellipsis}`)
}

// --- Digest meant to be quoted in the spec ---
const rowPrevalence = pct(resultsWithIld, totalResults)
console.log("\n=== Summary for spec ===")
console.log(`Inline preview prevalence: ${resultsWithIld}/${totalResults} rows (${rowPrevalence})`)
console.log(`Full-corpus samples behind source_url: ~${totalInstanceCountSum} (vs ${totalExamples} loaded inline)`)
console.log(`Defensive scaffolding firing rate: see per-field breakdown above. Most branches are 0.00%.`)