Spaces:

evaleval
/

general-eval-card

Running

File size: 7,904 Bytes

da8db3e

import "./server-only-shim.mjs"
import fs from "fs"
import path from "path"

// Audit script for `notes/transformations/12-instance-level-data.md`.
//
// Walks .cache/hf-data/models/*.json, finds every result row that has a
// non-empty `instance_level_data.instance_examples` array, replicates the
// parser's branch logic, and reports:
//   - Prevalence (% of rows / files with sample data)
//   - ild-level shape uniformity (which top-level keys appear)
//   - interaction_type distribution
//   - Per-output-field branch firing rates (which fallback path wins for each)
//   - Distinct example top-level key signatures (top 5)
//
// Output is the raw distribution; the spec's "Pipeline status" section quotes
// these numbers. Run after `pnpm cache-hf-data` to refresh.

// Location of the locally cached HF data; the JSON files under models/ are
// the audit input.
const CACHE_DIR = ".cache/hf-data"
const MODELS_DIR = path.join(CACHE_DIR, "models")

// Abort early, with a hint on how to prime the cache, when the directory is absent.
if (fs.existsSync(MODELS_DIR) === false) {
  for (const line of [
    "=== ERROR: HF data cache missing ===",
    `  Expected: ${MODELS_DIR}`,
    "  Prime it first:  pnpm cache-hf-data",
  ]) {
    console.error(line)
  }
  process.exit(1)
}

// Only directory entries whose name ends in ".json" are audited.
const files = fs.readdirSync(MODELS_DIR).filter((entry) => entry.endsWith(".json"))

// Tally trackers

// Builds a { label: 0, ... } counter object; insertion order follows the
// argument order, which is also the order the report prints the branches in.
const zeroTally = (...labels) => Object.fromEntries(labels.map((label) => [label, 0]))

// One counter per fallback branch, grouped by output field. Each example
// increments exactly one branch in every group (see classifyExample).
const branchHits = {
  input: zeroTally("string", "input.raw", "prompt", "question", "doc.question", "doc.JSON", "EMPTY"),
  ground_truth: zeroTally("input.reference", "ground_truth", "target", "gold", "doc.answer", "NONE"),
  response: zeroTally(
    "output",
    "response",
    "model_output",
    "answer_attribution",
    "messages",
    "filtered_resps",
    "resps",
    "EMPTY",
  ),
  is_correct: zeroTally("evaluation.is_correct", "is_correct", "metrics.exact_match", "NONE"),
  sample_id: zeroTally("sample_id", "doc_id", "id", "index_fallback"),
  choices: zeroTally("choices", "doc.choices", "NONE"),
}

// File-level counters.
let totalFiles = 0
let filesWithAnyIld = 0

// Metric-level counters (one "eval" per entry of a node's `metrics`).
let totalEvals = 0
let evalsWithIld = 0

// Result-row counters (one row per entry of a metric's `model_results`).
let totalResults = 0
let resultsWithIld = 0

// Example-level counters.
let totalExamples = 0
let totalInstanceCountSum = 0  // sum of full-set sizes (i.e. URL JSONL totals)

// Frequency maps: key signature / interaction type -> number of occurrences.
const ildKeysSeen = new Map()
const interactionTypes = new Map()
const exampleKeysHistogram = new Map()

/**
 * Tallies, for one inline example, which fallback branch fires for each
 * output field, and records the example's top-level key signature.
 *
 * Non-object and null inputs are ignored and not counted. For every other
 * input, `totalExamples` grows by one, `exampleKeysHistogram` gets one hit
 * for the sorted, comma-joined key list, and exactly one counter per field
 * group in `branchHits` is incremented.
 *
 * @param {*} raw - One entry of `instance_examples`.
 * @returns {void}
 */
function classifyExample(raw) {
  if (raw === null || typeof raw !== "object") return
  totalExamples += 1

  // Signature of the first-level keys present on this example.
  const signature = Object.keys(raw).sort().join(",")
  const seenSoFar = exampleKeysHistogram.get(signature) ?? 0
  exampleKeysHistogram.set(signature, seenSoFar + 1)

  // Credits the first rung whose check passes, or `fallback` when none does.
  // Checks are thunks so rungs after the winning one are never evaluated.
  const credit = (field, rungs, fallback) => {
    const winner = rungs.find(([, fires]) => fires())
    branchHits[field][winner === undefined ? fallback : winner[0]] += 1
  }

  // Rung order is the precedence order. Some rungs test truthiness and
  // others test "not null/undefined"; that distinction is deliberate here.
  credit("input", [
    ["string", () => typeof raw.input === "string"],
    ["input.raw", () => raw.input?.raw != null],
    ["prompt", () => raw.prompt],
    ["question", () => raw.question],
    ["doc.question", () => raw.doc?.question],
    ["doc.JSON", () => raw.doc],
  ], "EMPTY")

  credit("ground_truth", [
    ["input.reference", () => raw.input?.reference != null],
    ["ground_truth", () => raw.ground_truth != null],
    ["target", () => raw.target != null],
    ["gold", () => raw.gold != null],
    ["doc.answer", () => raw.doc?.answer != null],
  ], "NONE")

  credit("response", [
    ["output", () => raw.output != null],
    ["response", () => raw.response],
    ["model_output", () => raw.model_output],
    ["answer_attribution", () => Array.isArray(raw.answer_attribution) && raw.answer_attribution.length > 0],
    ["messages", () => Array.isArray(raw.messages) && raw.messages.length > 0],
    ["filtered_resps", () => raw.filtered_resps?.[0]?.[0]],
    ["resps", () => raw.resps?.[0]?.[0]],
  ], "EMPTY")

  credit("is_correct", [
    ["evaluation.is_correct", () => raw.evaluation?.is_correct !== undefined],
    ["is_correct", () => raw.is_correct !== undefined],
    ["metrics.exact_match", () => raw.metrics?.exact_match === 1 || raw.metrics?.exact_match === 0],
  ], "NONE")

  credit("sample_id", [
    ["sample_id", () => raw.sample_id != null],
    ["doc_id", () => raw.doc_id != null],
    ["id", () => raw.id != null],
  ], "index_fallback")

  credit("choices", [
    ["choices", () => raw.choices != null],
    ["doc.choices", () => raw.doc?.choices != null],
  ], "NONE")
}

/**
 * Recursively tallies instance-level data (ild) for one hierarchy node.
 *
 * Every entry of `node.metrics` counts as one eval and every entry of its
 * `model_results` as one result row. A row "has ild" when its
 * `instance_level_data` is an object whose `instance_examples` is a
 * non-empty array; for such rows the ild key signature, `interaction_type`
 * (when a string) and `instance_count` (when a number) are tallied and each
 * example is passed to `classifyExample`. All entries of `node.subtasks`
 * are then walked the same way.
 *
 * @param {object} node - Hierarchy node; `metrics` and `subtasks` are
 *   optional, and a null/undefined node is treated as empty.
 * @returns {boolean} true when this node OR any node below it has at least
 *   one result row with ild.
 */
function walk(node) {
  let nodeHasIld = false
  for (const m of node?.metrics ?? []) {
    totalEvals++
    let evalHasIld = false
    for (const r of m?.model_results ?? []) {
      totalResults++
      const ild = r?.instance_level_data
      if (ild != null && typeof ild === "object" && Array.isArray(ild.instance_examples) && ild.instance_examples.length > 0) {
        resultsWithIld++
        evalHasIld = true
        nodeHasIld = true

        const ildSig = Object.keys(ild).sort().join(",")
        ildKeysSeen.set(ildSig, (ildKeysSeen.get(ildSig) ?? 0) + 1)
        if (typeof ild.interaction_type === "string") {
          interactionTypes.set(ild.interaction_type, (interactionTypes.get(ild.interaction_type) ?? 0) + 1)
        }
        if (typeof ild.instance_count === "number") {
          totalInstanceCountSum += ild.instance_count
        }
        for (const ex of ild.instance_examples) {
          classifyExample(ex)
        }
      }
    }
    if (evalHasIld) evalsWithIld++
  }
  // Propagate the subtasks' result: ild found only in a descendant must still
  // mark the enclosing file as "has ild". Every subtask is walked (no
  // short-circuit) so the global tallies stay complete.
  for (const s of node?.subtasks ?? []) {
    if (walk(s)) nodeHasIld = true
  }
  return nodeHasIld
}

// Parse every cached model file and walk each category's top-level nodes.
// A file counts towards filesWithAnyIld when any walked node reports ild.
for (const f of files) {
  totalFiles++
  let data
  try {
    data = JSON.parse(fs.readFileSync(path.join(MODELS_DIR, f), "utf-8"))
  } catch (err) {
    // The file stays counted in totalFiles (it was incremented above), so
    // say so on stderr instead of skipping it silently.
    console.error(`  [warn] skipping unreadable file ${f}: ${err?.message ?? err}`)
    continue
  }
  let fileHasIld = false
  // `data` may legitimately parse to null or a non-object; treat as empty.
  for (const cat of Object.values(data?.hierarchy_by_category ?? {})) {
    // Only array-valued categories can be iterated as node lists.
    if (!Array.isArray(cat)) continue
    for (const node of cat) {
      if (node != null && walk(node)) fileHasIld = true
    }
  }
  if (fileHasIld) filesWithAnyIld++
}

// Formats `n` as a percentage of `total` with two decimals ("12.34%").
// Returns "-" when `total` is falsy (e.g. 0), avoiding a division by zero.
// The denominator defaults to the running example count.
const pct = (n, total = totalExamples) => {
  if (!total) return "-"
  const share = (100 * n) / total
  return `${share.toFixed(2)}%`
}

// ---- Report ---------------------------------------------------------------

// Entries of a count Map, most frequent first.
const byCountDesc = (counts) => [...counts.entries()].sort(([, a], [, b]) => b - a)

// Prints one "<count>x: <label>" line per Map entry, most frequent first.
const printCounts = (counts) => {
  for (const [label, count] of byCountDesc(counts)) {
    console.log(`  ${count}x: ${label}`)
  }
}

// Shares with an explicit denominator; rowShare is reused in the summary.
const fileShare = pct(filesWithAnyIld, totalFiles)
const rowShare = pct(resultsWithIld, totalResults)

for (const line of [
  "=== Audit: instance_level_data prevalence ===",
  `  Total model files:       ${totalFiles}`,
  `  Files with any ild:      ${filesWithAnyIld}  (${fileShare})`,
  `  Total (metric × result) rows: ${totalResults}`,
  `  Result rows with ild:    ${resultsWithIld}  (${rowShare})`,
  `  Total inline preview examples (≤5 per row): ${totalExamples}`,
  `  Total full samples (sum of instance_count, accessible via source_url):  ${totalInstanceCountSum}`,
]) {
  console.log(line)
}

console.log("\n=== ild-level top-level key signatures ===")
printCounts(ildKeysSeen)

console.log("\n=== interaction_type distribution ===")
printCounts(interactionTypes)

// Branch percentages use pct's default denominator (total examples).
console.log("\n=== Per-field branch firing rates ===\n")
Object.entries(branchHits).forEach(([field, branches]) => {
  console.log(`--- ${field} ---`)
  Object.entries(branches).forEach(([branch, hits]) => {
    console.log(`  ${branch}: ${hits}  (${pct(hits)})`)
  })
  console.log()
})

console.log("=== Distinct example top-level key signatures (top 5) ===")
const sorted = byCountDesc(exampleKeysHistogram).slice(0, 5)
for (const [sig, count] of sorted) {
  // Signatures longer than 200 characters are truncated and marked with an ellipsis.
  const shown = sig.length > 200 ? `${sig.substring(0, 200)}…` : sig
  console.log(`  ${count}x: ${shown}`)
}

console.log("\n=== Summary for spec ===")
console.log(`Inline preview prevalence: ${resultsWithIld}/${totalResults} rows (${rowShare})`)
console.log(`Full-corpus samples behind source_url: ~${totalInstanceCountSum} (vs ${totalExamples} loaded inline)`)
// NOTE(review): the sentence below is static text; it is not derived from the tallies.
console.log(`Defensive scaffolding firing rate: see per-field breakdown above. Most branches are 0.00%.`)