File size: 7,904 Bytes
da8db3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import "./server-only-shim.mjs"
import fs from "fs"
import path from "path"

// Audit script for `notes/transformations/12-instance-level-data.md`.
//
// Walks .cache/hf-data/models/*.json, finds every result row that has
// `instance_level_data.instance_examples`, replicates the parser's branch
// logic, and reports:
//   - Prevalence (% of rows / files with sample data)
//   - ild-level shape uniformity (which top-level keys appear)
//   - interaction_type distribution
//   - Per-output-field branch firing rates (which fallback path wins for each)
//   - Distinct example top-level key signatures (top 5)
//
// Output is the raw distribution; the spec's "Pipeline status" section quotes
// these numbers. Run after `pnpm cache-hf-data` to refresh.

// Root of the local HF data cache and the folder holding one JSON file per model.
const CACHE_DIR = ".cache/hf-data"
const MODELS_DIR = path.join(CACHE_DIR, "models")

// Without the models folder there is nothing to audit: print a hint on stderr
// and exit with a non-zero status.
const cachePresent = fs.existsSync(MODELS_DIR)
if (!cachePresent) {
  const help = [
    "=== ERROR: HF data cache missing ===",
    `  Expected: ${MODELS_DIR}`,
    "  Prime it first:  pnpm cache-hf-data",
  ]
  for (const line of help) console.error(line)
  process.exit(1)
}

// Only directory entries whose name ends in ".json" are audited.
const isJsonFile = (name) => name.endsWith(".json")
const files = fs.readdirSync(MODELS_DIR).filter(isJsonFile)

// Tally trackers

// Builds a counter record in which every listed branch label starts at zero.
// Label order is preserved, so the report prints branches in this order.
const zeroCounters = (...labels) => Object.fromEntries(labels.map((label) => [label, 0]))

// For each output field: how often each fallback branch was the one that fired.
const branchHits = {
  input: zeroCounters("string", "input.raw", "prompt", "question", "doc.question", "doc.JSON", "EMPTY"),
  ground_truth: zeroCounters("input.reference", "ground_truth", "target", "gold", "doc.answer", "NONE"),
  response: zeroCounters(
    "output",
    "response",
    "model_output",
    "answer_attribution",
    "messages",
    "filtered_resps",
    "resps",
    "EMPTY",
  ),
  is_correct: zeroCounters("evaluation.is_correct", "is_correct", "metrics.exact_match", "NONE"),
  sample_id: zeroCounters("sample_id", "doc_id", "id", "index_fallback"),
  choices: zeroCounters("choices", "doc.choices", "NONE"),
}

// Histograms keyed by a signature string, valued by occurrence count.
const ildKeysSeen = new Map()
const interactionTypes = new Map()
const exampleKeysHistogram = new Map()

// Running totals, from coarsest (files) to finest (inline examples).
let totalFiles = 0
let filesWithAnyIld = 0
let totalEvals = 0
let evalsWithIld = 0
let totalResults = 0
let resultsWithIld = 0
let totalExamples = 0
// Sum of the reported instance_count values, i.e. full-set sizes (URL JSONL totals).
let totalInstanceCountSum = 0

// Records, for one inline example, which fallback branch supplies each output
// field. Non-object values (including null) are ignored and not counted.
// Side effects: bumps totalExamples, exampleKeysHistogram and exactly one
// counter per field in branchHits. Returns nothing.
function classifyExample(raw) {
  if (raw === null || typeof raw !== "object") return
  totalExamples += 1

  // Signature = sorted, comma-joined first-level keys of the example.
  const signature = Object.keys(raw).sort().join(",")
  const seenSoFar = exampleKeysHistogram.get(signature) ?? 0
  exampleKeysHistogram.set(signature, seenSoFar + 1)

  // Each picker returns the label of the first branch that matches, in the
  // same precedence order as the checks it replaces; the last label is the
  // "nothing matched" bucket.
  const pickInput = () => {
    if (typeof raw.input === "string") return "string"
    if (raw.input?.raw != null) return "input.raw"
    if (raw.prompt) return "prompt"
    if (raw.question) return "question"
    if (raw.doc?.question) return "doc.question"
    if (raw.doc) return "doc.JSON"
    return "EMPTY"
  }

  const pickGroundTruth = () => {
    if (raw.input?.reference != null) return "input.reference"
    if (raw.ground_truth != null) return "ground_truth"
    if (raw.target != null) return "target"
    if (raw.gold != null) return "gold"
    if (raw.doc?.answer != null) return "doc.answer"
    return "NONE"
  }

  const pickResponse = () => {
    const isNonEmptyArray = (value) => Array.isArray(value) && value.length > 0
    if (raw.output != null) return "output"
    if (raw.response) return "response"
    if (raw.model_output) return "model_output"
    if (isNonEmptyArray(raw.answer_attribution)) return "answer_attribution"
    if (isNonEmptyArray(raw.messages)) return "messages"
    if (raw.filtered_resps?.[0]?.[0]) return "filtered_resps"
    if (raw.resps?.[0]?.[0]) return "resps"
    return "EMPTY"
  }

  const pickIsCorrect = () => {
    if (raw.evaluation?.is_correct !== undefined) return "evaluation.is_correct"
    if (raw.is_correct !== undefined) return "is_correct"
    // Only an exact_match of exactly 1 or 0 counts for this branch.
    const exactMatch = raw.metrics?.exact_match
    if (exactMatch === 1 || exactMatch === 0) return "metrics.exact_match"
    return "NONE"
  }

  const pickSampleId = () => {
    if (raw.sample_id != null) return "sample_id"
    if (raw.doc_id != null) return "doc_id"
    if (raw.id != null) return "id"
    return "index_fallback"
  }

  const pickChoices = () => {
    if (raw.choices != null) return "choices"
    if (raw.doc?.choices != null) return "doc.choices"
    return "NONE"
  }

  branchHits.input[pickInput()] += 1
  branchHits.ground_truth[pickGroundTruth()] += 1
  branchHits.response[pickResponse()] += 1
  branchHits.is_correct[pickIsCorrect()] += 1
  branchHits.sample_id[pickSampleId()] += 1
  branchHits.choices[pickChoices()] += 1
}

// Recursively tallies instance_level_data (ild) coverage for one hierarchy node.
//
// For every entry of node.metrics and every row of its model_results, bumps
// the eval/result counters. A row "has ild" when its instance_level_data is an
// object whose instance_examples is a non-empty array; for such rows the ild
// key signature, interaction_type and instance_count are recorded and each
// example is passed to classifyExample. Then recurses into node.subtasks.
//
// Returns true when this node OR any of its descendant subtasks has at least
// one result row with ild, so callers can decide whether the file has any.
function walk(node) {
  let nodeHasIld = false
  for (const m of node.metrics ?? []) {
    totalEvals++
    let evalHasIld = false
    for (const r of m.model_results ?? []) {
      totalResults++
      const ild = r.instance_level_data
      if (ild != null && typeof ild === "object" && Array.isArray(ild.instance_examples) && ild.instance_examples.length > 0) {
        resultsWithIld++
        evalHasIld = true
        nodeHasIld = true

        const ildSig = Object.keys(ild).sort().join(",")
        ildKeysSeen.set(ildSig, (ildKeysSeen.get(ildSig) ?? 0) + 1)
        if (typeof ild.interaction_type === "string") {
          interactionTypes.set(ild.interaction_type, (interactionTypes.get(ild.interaction_type) ?? 0) + 1)
        }
        if (typeof ild.instance_count === "number") {
          totalInstanceCountSum += ild.instance_count
        }
        for (const ex of ild.instance_examples) {
          classifyExample(ex)
        }
      }
    }
    if (evalHasIld) evalsWithIld++
  }
  for (const s of node.subtasks ?? []) {
    // Always recurse (the counters depend on it), and propagate the result:
    // previously it was dropped, so ild found only in subtasks never marked
    // the file as having ild.
    if (walk(s)) nodeHasIld = true
  }
  return nodeHasIld
}

// Scan every cached model file and feed each hierarchy node to walk().
// A file counts toward filesWithAnyIld when walk() reports ild for at least
// one of its nodes.
for (const f of files) {
  totalFiles++
  let data
  try {
    data = JSON.parse(fs.readFileSync(path.join(MODELS_DIR, f), "utf-8"))
  } catch (err) {
    // Best-effort: an unreadable or corrupt file is skipped (it still counts
    // in totalFiles, as before), but say so on stderr so a skewed denominator
    // does not go unnoticed. stdout output is unchanged.
    console.error(`  WARN: skipping ${f}: ${err?.message ?? err}`)
    continue
  }
  let fileHasIld = false
  // `data` may legally parse to null, and a category value may not be a list;
  // treat both as "no nodes" instead of crashing the whole audit.
  for (const cat of Object.values(data?.hierarchy_by_category ?? {})) {
    if (!Array.isArray(cat)) continue
    for (const node of cat) {
      if (node != null && walk(node)) fileHasIld = true
    }
  }
  if (fileHasIld) filesWithAnyIld++
}

// Formats n as a percentage of total with two decimals (e.g. "12.50%").
// total defaults to the number of inline examples classified so far; a falsy
// total (such as 0) yields "-" instead of dividing.
const pct = (n, total = totalExamples) => {
  if (!total) return "-"
  const share = (100 * n) / total
  return `${share.toFixed(2)}%`
}

// ---- Report ----------------------------------------------------------------
// Everything below only prints the tallies to stdout.

// Entries of a count map, ordered from most to least frequent.
const byCountDesc = (counts) => [...counts.entries()].sort((a, b) => b[1] - a[1])
// Prints each string of `lines` on its own stdout line.
const emit = (lines) => lines.forEach((line) => console.log(line))

// Prevalence: how many files / result rows carry inline sample data.
emit([
  "=== Audit: instance_level_data prevalence ===",
  `  Total model files:       ${totalFiles}`,
  `  Files with any ild:      ${filesWithAnyIld}  (${pct(filesWithAnyIld, totalFiles)})`,
  `  Total (metric × result) rows: ${totalResults}`,
  `  Result rows with ild:    ${resultsWithIld}  (${pct(resultsWithIld, totalResults)})`,
  `  Total inline preview examples (≤5 per row): ${totalExamples}`,
  `  Total full samples (sum of instance_count, accessible via source_url):  ${totalInstanceCountSum}`,
])

// Shape uniformity of the ild object itself.
console.log("\n=== ild-level top-level key signatures ===")
emit(byCountDesc(ildKeysSeen).map(([keys, count]) => `  ${count}x: ${keys}`))

console.log("\n=== interaction_type distribution ===")
emit(byCountDesc(interactionTypes).map(([type, count]) => `  ${count}x: ${type}`))

// Per output field: count and share (of all classified examples) per branch.
console.log("\n=== Per-field branch firing rates ===\n")
Object.entries(branchHits).forEach(([field, branches]) => {
  console.log(`--- ${field} ---`)
  emit(Object.entries(branches).map(([branch, hits]) => `  ${branch}: ${hits}  (${pct(hits)})`))
  console.log()
})

// The five most common example key signatures, each cut to 200 characters.
console.log("=== Distinct example top-level key signatures (top 5) ===")
const sorted = byCountDesc(exampleKeysHistogram).slice(0, 5)
sorted.forEach(([sig, count]) => {
  const shown = sig.substring(0, 200)
  const ellipsis = sig.length > 200 ? "…" : ""
  console.log(`  ${count}x: ${shown}${ellipsis}`)
})

// Condensed lines meant to be quoted in the spec.
emit([
  "\n=== Summary for spec ===",
  `Inline preview prevalence: ${resultsWithIld}/${totalResults} rows (${pct(resultsWithIld, totalResults)})`,
  `Full-corpus samples behind source_url: ~${totalInstanceCountSum} (vs ${totalExamples} loaded inline)`,
  `Defensive scaffolding firing rate: see per-field breakdown above. Most branches are 0.00%.`,
])