import "./server-only-shim.mjs"
import fs from "fs"
import path from "path"

// Audit script for `notes/transformations/12-instance-level-data.md`.
//
// Walks .cache/hf-data/models/*.json, finds every result row that has
// `instance_level_data.instance_examples`, replicates the parser's branch
// logic, and reports:
//   - Prevalence (% of rows / files with sample data)
//   - ild-level shape uniformity (which top-level keys appear)
//   - interaction_type distribution
//   - Per-output-field branch firing rates (which fallback path wins for each)
//   - Distinct example top-level key signatures (top 5)
//
// Output is the raw distribution; the spec's "Pipeline status" section quotes
// these numbers. Run after `pnpm cache-hf-data` to refresh.

const CACHE_DIR = ".cache/hf-data"
const MODELS_DIR = path.join(CACHE_DIR, "models")

// Fail fast with an actionable message when the cache has not been primed.
if (!fs.existsSync(MODELS_DIR)) {
  console.error("=== ERROR: HF data cache missing ===")
  console.error(` Expected: ${MODELS_DIR}`)
  console.error(" Prime it first: pnpm cache-hf-data")
  process.exit(1)
}

const files = fs.readdirSync(MODELS_DIR).filter((f) => f.endsWith(".json"))

// Tally trackers.
// One counter per fallback branch, grouped by output field. Within a field the
// key order matches the order in which classifyExample() tests the branches;
// the last key (EMPTY / NONE / index_fallback) counts examples that matched
// none of the others.
const branchHits = {
  input: { string: 0, "input.raw": 0, prompt: 0, question: 0, "doc.question": 0, "doc.JSON": 0, EMPTY: 0 },
  ground_truth: { "input.reference": 0, ground_truth: 0, target: 0, gold: 0, "doc.answer": 0, NONE: 0 },
  response: {
    output: 0,
    response: 0,
    model_output: 0,
    answer_attribution: 0,
    messages: 0,
    filtered_resps: 0,
    resps: 0,
    EMPTY: 0,
  },
  is_correct: { "evaluation.is_correct": 0, is_correct: 0, "metrics.exact_match": 0, NONE: 0 },
  sample_id: { sample_id: 0, doc_id: 0, id: 0, index_fallback: 0 },
  choices: { choices: 0, "doc.choices": 0, NONE: 0 },
}

let totalFiles = 0
let filesWithAnyIld = 0
// NOTE: totalEvals / evalsWithIld are tallied by walk() but are not part of
// the printed report below.
let totalEvals = 0
let evalsWithIld = 0
let totalResults = 0
let resultsWithIld = 0
let totalExamples = 0
let totalInstanceCountSum = 0 // sum of full-set sizes (i.e. URL JSONL totals)

// Histograms keyed by a signature string, valued by occurrence count.
const ildKeysSeen = new Map()
const interactionTypes = new Map()
const exampleKeysHistogram = new Map()

/**
 * Classify one inline example: for every output field, bump the counter of
 * the first fallback branch that matches, and record the example's sorted
 * top-level key signature.
 *
 * Falsy or non-object values are ignored and do not count towards
 * `totalExamples`.
 *
 * @param {*} raw - One entry of `instance_level_data.instance_examples`.
 * @returns {void}
 */
function classifyExample(raw) {
  if (!raw || typeof raw !== "object") return
  totalExamples++

  // Track first-level keys present
  const sig = Object.keys(raw).sort().join(",")
  exampleKeysHistogram.set(sig, (exampleKeysHistogram.get(sig) ?? 0) + 1)

  // input
  if (typeof raw.input === "string") branchHits.input.string++
  else if (raw.input?.raw != null) branchHits.input["input.raw"]++
  else if (raw.prompt) branchHits.input.prompt++
  else if (raw.question) branchHits.input.question++
  else if (raw.doc?.question) branchHits.input["doc.question"]++
  else if (raw.doc) branchHits.input["doc.JSON"]++
  else branchHits.input.EMPTY++

  // ground_truth
  if (raw.input?.reference != null) branchHits.ground_truth["input.reference"]++
  else if (raw.ground_truth != null) branchHits.ground_truth.ground_truth++
  else if (raw.target != null) branchHits.ground_truth.target++
  else if (raw.gold != null) branchHits.ground_truth.gold++
  else if (raw.doc?.answer != null) branchHits.ground_truth["doc.answer"]++
  else branchHits.ground_truth.NONE++

  // response
  if (raw.output != null) branchHits.response.output++
  else if (raw.response) branchHits.response.response++
  else if (raw.model_output) branchHits.response.model_output++
  else if (Array.isArray(raw.answer_attribution) && raw.answer_attribution.length > 0)
    branchHits.response.answer_attribution++
  else if (Array.isArray(raw.messages) && raw.messages.length > 0) branchHits.response.messages++
  else if (raw.filtered_resps?.[0]?.[0]) branchHits.response.filtered_resps++
  else if (raw.resps?.[0]?.[0]) branchHits.response.resps++
  else branchHits.response.EMPTY++

  // is_correct
  if (raw.evaluation?.is_correct !== undefined) branchHits.is_correct["evaluation.is_correct"]++
  else if (raw.is_correct !== undefined) branchHits.is_correct.is_correct++
  else if (raw.metrics?.exact_match === 1 || raw.metrics?.exact_match === 0)
    branchHits.is_correct["metrics.exact_match"]++
  else branchHits.is_correct.NONE++

  // sample_id
  if (raw.sample_id != null) branchHits.sample_id.sample_id++
  else if (raw.doc_id != null) branchHits.sample_id.doc_id++
  else if (raw.id != null) branchHits.sample_id.id++
  else branchHits.sample_id.index_fallback++

  // choices
  if (raw.choices != null) branchHits.choices.choices++
  else if (raw.doc?.choices != null) branchHits.choices["doc.choices"]++
  else branchHits.choices.NONE++
}

/**
 * Recursively tally one hierarchy node: its `metrics[].model_results[]` rows
 * and then every node under `subtasks`.
 *
 * A result row counts as "with ild" only when `instance_level_data` is an
 * object holding a non-empty `instance_examples` array.
 *
 * @param {object} node - Hierarchy node; `metrics` and `subtasks` are optional.
 * @returns {boolean} True when this node or any of its descendants has at
 *   least one result row with inline instance-level data.
 */
function walk(node) {
  let nodeHasIld = false

  for (const m of node.metrics ?? []) {
    totalEvals++
    let evalHasIld = false

    for (const r of m.model_results ?? []) {
      totalResults++
      const ild = r.instance_level_data
      if (
        ild != null &&
        typeof ild === "object" &&
        Array.isArray(ild.instance_examples) &&
        ild.instance_examples.length > 0
      ) {
        resultsWithIld++
        evalHasIld = true
        nodeHasIld = true

        const ildSig = Object.keys(ild).sort().join(",")
        ildKeysSeen.set(ildSig, (ildKeysSeen.get(ildSig) ?? 0) + 1)

        if (typeof ild.interaction_type === "string") {
          interactionTypes.set(ild.interaction_type, (interactionTypes.get(ild.interaction_type) ?? 0) + 1)
        }
        if (typeof ild.instance_count === "number") {
          totalInstanceCountSum += ild.instance_count
        }
        for (const ex of ild.instance_examples) {
          classifyExample(ex)
        }
      }
    }

    if (evalHasIld) evalsWithIld++
  }

  for (const s of node.subtasks ?? []) {
    // Propagate hits from descendants: a file whose only sample data lives in
    // a subtask must still count towards "Files with any ild". Every subtask
    // is walked unconditionally so the global tallies stay complete.
    if (walk(s)) nodeHasIld = true
  }

  return nodeHasIld
}

for (const f of files) {
  totalFiles++
  let fileHasIld = false
  let data
  try {
    data = JSON.parse(fs.readFileSync(path.join(MODELS_DIR, f), "utf-8"))
  } catch (err) {
    // Best-effort: an unreadable or corrupt cache file is skipped (it still
    // counts in totalFiles), but say so on stderr instead of hiding it.
    console.warn(`Skipping unreadable model file ${f}: ${err?.message ?? err}`)
    continue
  }
  for (const cat of Object.values(data.hierarchy_by_category ?? {})) {
    for (const node of cat) {
      if (walk(node)) fileHasIld = true
    }
  }
  if (fileHasIld) filesWithAnyIld++
}

// Format n/total as a percentage with two decimals; "-" when total is 0.
// The denominator defaults to the number of classified examples.
const pct = (n, total = totalExamples) => (total ? ((100 * n) / total).toFixed(2) + "%" : "-")

console.log("=== Audit: instance_level_data prevalence ===")
console.log(` Total model files: ${totalFiles}`)
console.log(` Files with any ild: ${filesWithAnyIld} (${pct(filesWithAnyIld, totalFiles)})`)
console.log(` Total (metric × result) rows: ${totalResults}`)
console.log(` Result rows with ild: ${resultsWithIld} (${pct(resultsWithIld, totalResults)})`)
console.log(` Total inline preview examples (≤5 per row): ${totalExamples}`)
console.log(` Total full samples (sum of instance_count, accessible via source_url): ${totalInstanceCountSum}`)

console.log("\n=== ild-level top-level key signatures ===")
for (const [k, v] of [...ildKeysSeen.entries()].sort((a, b) => b[1] - a[1])) {
  console.log(` ${v}x: ${k}`)
}

console.log("\n=== interaction_type distribution ===")
for (const [k, v] of [...interactionTypes.entries()].sort((a, b) => b[1] - a[1])) {
  console.log(` ${v}x: ${k}`)
}

console.log("\n=== Per-field branch firing rates ===\n")
for (const [field, branches] of Object.entries(branchHits)) {
  console.log(`--- ${field} ---`)
  for (const [b, n] of Object.entries(branches)) {
    console.log(` ${b}: ${n} (${pct(n)})`)
  }
  console.log()
}

console.log("=== Distinct example top-level key signatures (top 5) ===")
const sorted = [...exampleKeysHistogram.entries()].sort((a, b) => b[1] - a[1]).slice(0, 5)
for (const [sig, count] of sorted) {
  // Signatures can be very long; cap the printed form at 200 characters.
  console.log(` ${count}x: ${sig.substring(0, 200)}${sig.length > 200 ? "…" : ""}`)
}

console.log("\n=== Summary for spec ===")
console.log(`Inline preview prevalence: ${resultsWithIld}/${totalResults} rows (${pct(resultsWithIld, totalResults)})`)
console.log(`Full-corpus samples behind source_url: ~${totalInstanceCountSum} (vs ${totalExamples} loaded inline)`)
console.log(`Defensive scaffolding firing rate: see per-field breakdown above. Most branches are 0.00%.`)