// general-eval-card/tests/transformations/instance-level-data.test.ts
// Jenny Chim
// Deploy DuckDB-backed frontend to
// da8db3e
import { describe, expect, it } from "vitest"
// Executable spec for `notes/transformations/12-instance-level-data.md`.
//
// Replicates `parseInstanceLevelData` from `lib/hf-data.ts:933-1043` verbatim.
// `fetchInstanceLevelData` (lib/hf-data.ts:890-917) is currently orphaned;
// not tested here (it just wraps fetch + JSON.parse + re-uses the parser).
/** Normalized view of a single evaluated sample, as returned by `parseInstanceLevelData`. */
interface SampleResult {
  /**
   * First non-nullish of the raw row's `sample_id`, `doc_id`, `id`, otherwise the
   * row's array index as a string. Raw values are passed through without coercion,
   * so a numeric id is still a number at runtime.
   */
  sample_id: string
  /** Prompt text; `""` when no input-like field is present. */
  input: string
  /** Reference answer as a string, when any reference-like field is present. */
  ground_truth?: string
  /** Model response text; `""` when no response-like field is present. */
  response: string
  /** Raw `choices` (or `doc.choices`) passed through untouched. */
  choices?: unknown
  /** Correctness flag, when one can be derived from the row. */
  is_correct?: boolean
  /** Shallow merge of the row's `evaluation`, `performance`, `metadata` and `metrics` objects. */
  metadata?: Record<string, unknown>
}

/** Loosely-typed raw row; its shape is only discovered by probing individual fields. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type RawExample = Record<string, any>

/** Resolves the prompt text; the first matching branch wins. */
function extractInput(raw: RawExample): string {
  // A string `input` always wins, even when it is empty (typeof check, not truthiness).
  if (typeof raw.input === "string") return raw.input
  if (raw.input?.raw) return String(raw.input.raw)
  if (raw.prompt) return raw.prompt
  if (raw.question) return raw.question
  if (raw.doc?.question) return raw.doc.question
  // Last resort: dump the whole doc, capped at 500 characters.
  if (raw.doc) return JSON.stringify(raw.doc).slice(0, 500)
  return ""
}

/** Resolves the reference answer; returns `undefined` when no source field is set. */
function extractGroundTruth(raw: RawExample): string | undefined {
  const reference = raw.input?.reference
  if (reference) {
    return Array.isArray(reference) ? reference.join(", ") : String(reference)
  }
  // The remaining sources use `!= null`, so falsy values such as 0 or "" still count.
  if (raw.ground_truth != null) return String(raw.ground_truth)
  if (raw.target != null) return String(raw.target)
  if (raw.gold != null) return String(raw.gold)
  if (raw.doc?.answer != null) return String(raw.doc.answer)
  return undefined
}

/** Resolves the model response; the first matching branch wins. */
function extractResponse(raw: RawExample): string {
  if (raw.output != null) {
    return typeof raw.output === "string" ? raw.output : JSON.stringify(raw.output)
  }
  if (raw.response) return raw.response
  if (raw.model_output) return raw.model_output

  if (Array.isArray(raw.answer_attribution) && raw.answer_attribution.length > 0) {
    const lastAttribution = raw.answer_attribution[raw.answer_attribution.length - 1]
    // `?.` keeps a null/primitive last entry from throwing; it simply yields "".
    return lastAttribution?.extracted_value ?? ""
  }

  if (Array.isArray(raw.messages) && raw.messages.length > 0) {
    // Search from the end on a copy so the caller's array is not reversed in place.
    // `m?.role` tolerates null entries in the message list.
    const lastAssistant = [...raw.messages].reverse().find((m) => m?.role === "assistant")
    if (!lastAssistant) return ""
    if (typeof lastAssistant.content === "string") return lastAssistant.content
    // JSON.stringify(undefined) is undefined; fall back to "" so the result stays a string.
    return JSON.stringify(lastAssistant.content) ?? ""
  }

  if (raw.filtered_resps?.[0]?.[0]) return raw.filtered_resps[0][0]
  if (raw.resps?.[0]?.[0]) return raw.resps[0][0]
  return ""
}

/** Resolves the correctness flag from `evaluation.is_correct`, `is_correct`, then `metrics.exact_match`. */
function extractIsCorrect(raw: RawExample): boolean | undefined {
  const exactMatch = raw.metrics?.exact_match
  // Only the literal values 1 and 0 map to a boolean; anything else is "unknown".
  const fromExactMatch = exactMatch === 1 ? true : exactMatch === 0 ? false : undefined
  return raw.evaluation?.is_correct ?? raw.is_correct ?? fromExactMatch
}

/** Shallow-merges the four metadata-like objects; later sources overwrite earlier ones. */
function mergeMetadata(raw: RawExample): Record<string, unknown> | undefined {
  const merged: Record<string, unknown> = {}
  for (const source of [raw.evaluation, raw.performance, raw.metadata, raw.metrics]) {
    if (source && typeof source === "object") Object.assign(merged, source)
  }
  return Object.keys(merged).length > 0 ? merged : undefined
}

/**
 * Normalizes instance-level evaluation data into `SampleResult` rows.
 *
 * Local copy of the parser this spec exercises. It keeps the original fallback
 * chains and their precedence, but is hardened against malformed rows: a null
 * entry in `answer_attribution` or `messages` no longer throws, and an assistant
 * message without `content` yields `""` instead of `undefined`.
 *
 * @param data Either an object with an `instance_examples` array, or a bare array of rows.
 * @returns One `SampleResult` per object-valued row, in input order. Non-object rows are
 *   dropped; any other input shape yields `[]`.
 */
function parseInstanceLevelData(data: unknown): SampleResult[] {
  if (!data || typeof data !== "object") return []

  const nested = (data as Record<string, unknown>).instance_examples
  const examples: unknown[] = Array.isArray(nested)
    ? nested
    : Array.isArray(data)
      ? data
      : []

  const results: SampleResult[] = []
  examples.forEach((entry, index) => {
    if (!entry || typeof entry !== "object") return
    const raw = entry as RawExample
    results.push({
      // `index` is the position in the unfiltered list, so dropped rows leave gaps.
      sample_id: raw.sample_id ?? raw.doc_id ?? raw.id ?? String(index),
      input: extractInput(raw),
      ground_truth: extractGroundTruth(raw),
      response: extractResponse(raw),
      choices: raw.choices ?? raw.doc?.choices ?? undefined,
      is_correct: extractIsCorrect(raw),
      metadata: mergeMetadata(raw),
    })
  })
  return results
}
// ---------------------------------------------------------------------------
// Group A — top-level shape guards
// ---------------------------------------------------------------------------
// Inputs that are not a row container must yield [], and non-object rows inside a
// valid container must be dropped without affecting their neighbours.
describe("Group A — top-level shape guards", () => {
  it("returns [] for null", () => {
    expect(parseInstanceLevelData(null)).toEqual([])
  })

  it("returns [] for undefined", () => {
    expect(parseInstanceLevelData(undefined)).toEqual([])
  })

  it("returns [] for non-object (string)", () => {
    expect(parseInstanceLevelData("hello")).toEqual([])
  })

  it("returns [] for non-object (number)", () => {
    expect(parseInstanceLevelData(42)).toEqual([])
  })

  it("returns [] when instance_examples is missing AND data is not an array", () => {
    expect(parseInstanceLevelData({ foo: "bar" })).toEqual([])
  })

  it("returns [] when instance_examples is empty array", () => {
    expect(parseInstanceLevelData({ instance_examples: [] })).toEqual([])
  })

  it("uses data array directly when instance_examples is missing AND data is an array", () => {
    const result = parseInstanceLevelData([{ sample_id: "s1", input: "hello" }])
    expect(result).toHaveLength(1)
    expect(result[0].sample_id).toBe("s1")
    expect(result[0].input).toBe("hello")
  })

  it("filters out null/non-object examples but keeps valid ones", () => {
    const result = parseInstanceLevelData({
      instance_examples: [
        { sample_id: "s1", input: "ok" },
        null,
        "string-not-object",
        { sample_id: "s2", input: "ok2" },
      ],
    })
    expect(result).toHaveLength(2)
    expect(result.map((r) => r.sample_id)).toEqual(["s1", "s2"])
  })
})
// ---------------------------------------------------------------------------
// Group B — input field fallback chain
// ---------------------------------------------------------------------------
// Table-driven: each case feeds one raw row through the parser and checks the
// resolved `input`. Cases are listed in the same order as the parser's branches.
describe("Group B — input fallback chain (in priority order)", () => {
  const inputCases = [
    {
      label: "raw.input as string (branch #1)",
      example: { input: "direct string" },
      expected: "direct string",
    },
    {
      label: "raw.input.raw (branch #2; the production-canonical path, 100% of rows)",
      example: { input: { raw: "from input.raw" } },
      expected: "from input.raw",
    },
    {
      label: "raw.input.raw with non-string value gets stringified",
      example: { input: { raw: 42 } },
      expected: "42",
    },
    {
      label: "raw.prompt (branch #3; defensive — 0% in current production)",
      example: { prompt: "the prompt" },
      expected: "the prompt",
    },
    {
      label: "raw.question (branch #4; defensive)",
      example: { question: "the question" },
      expected: "the question",
    },
    {
      label: "raw.doc.question (branch #5; HELM-style, defensive)",
      example: { doc: { question: "doc question" } },
      expected: "doc question",
    },
    {
      label: "raw.doc fallback to JSON.stringify (branch #6; truncated to 500)",
      example: { doc: { question: null, foo: "bar" } },
      // doc.question is null (falsy), so branch #5 is skipped and the whole doc is stringified.
      expected: JSON.stringify({ question: null, foo: "bar" }).slice(0, 500),
    },
    {
      label: "no input fields → empty string",
      example: { sample_id: "s1" },
      expected: "",
    },
    {
      label: "input.raw takes precedence over prompt",
      example: { input: { raw: "winner" }, prompt: "loser" },
      expected: "winner",
    },
    {
      label: "raw.input is empty string → kept as \"\" (branch #1 matches on typeof; no fallthrough to prompt)",
      example: { input: "", prompt: "fallback" },
      // Branch #1 tests `typeof raw.input === "string"`, which is true for "", so the
      // empty string is assigned and the later branches are never consulted.
      expected: "",
    },
  ]

  it.each(inputCases)("$label", ({ example, expected }) => {
    const result = parseInstanceLevelData({ instance_examples: [example] })
    expect(result[0]?.input).toBe(expected)
  })
})
// ---------------------------------------------------------------------------
// Group C — ground_truth fallback chain
// ---------------------------------------------------------------------------
// Table-driven: each case feeds one raw row through the parser and checks the
// resolved `ground_truth`. Cases follow the parser's branch order.
describe("Group C — ground_truth fallback chain (in priority order)", () => {
  const cases = [
    {
      label: "raw.input.reference as string (branch #1; the production-canonical path)",
      example: { input: { reference: "the answer" } },
      expected: "the answer",
    },
    {
      label: "raw.input.reference as array → joined with ', '",
      example: { input: { reference: ["a", "b", "c"] } },
      expected: "a, b, c",
    },
    {
      label: "raw.input.reference as number → String()",
      example: { input: { reference: 42 } },
      expected: "42",
    },
    {
      label: "raw.ground_truth (branch #2; defensive)",
      example: { ground_truth: "gt" },
      expected: "gt",
    },
    {
      label: "raw.target (branch #3; defensive)",
      example: { target: "the target" },
      expected: "the target",
    },
    {
      label: "raw.gold (branch #4; defensive)",
      example: { gold: "the gold" },
      expected: "the gold",
    },
    {
      label: "raw.doc.answer (branch #5; HELM-style, defensive)",
      example: { doc: { answer: "doc answer" } },
      expected: "doc answer",
    },
    {
      label: "no ground_truth fields → undefined",
      example: { sample_id: "s1" },
      expected: undefined,
    },
    {
      label: "raw.input.reference takes precedence over raw.target",
      example: { input: { reference: "winner" }, target: "loser" },
      expected: "winner",
    },
  ]

  it.each(cases)("$label", ({ example, expected }) => {
    const result = parseInstanceLevelData({ instance_examples: [example] })
    expect(result[0]?.ground_truth).toBe(expected)
  })
})
// ---------------------------------------------------------------------------
// Group D — response fallback chain
// ---------------------------------------------------------------------------
// One test per response branch, in the parser's priority order, plus the
// empty/absent edge cases of the array-based branches.
describe("Group D — response fallback chain (in priority order)", () => {
  it("raw.output as string (branch #1)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ output: "the output" }] })
    expect(r[0]?.response).toBe("the output")
  })

  it("raw.output as object → JSON.stringify (branch #1)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ output: { foo: "bar" } }] })
    expect(r[0]?.response).toBe('{"foo":"bar"}')
  })

  it("raw.response (branch #2; defensive — 0% in current production)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ response: "hello" }] })
    expect(r[0]?.response).toBe("hello")
  })

  it("raw.model_output (branch #3; defensive)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ model_output: "model said" }] })
    expect(r[0]?.response).toBe("model said")
  })

  it("raw.answer_attribution (branch #4; the production majority path, 97.31%)", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ answer_attribution: [{ extracted_value: "first" }, { extracted_value: "last wins" }] }],
    })
    expect(r[0]?.response).toBe("last wins")
  })

  it("raw.answer_attribution last-element with no extracted_value → empty string", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ answer_attribution: [{ foo: "bar" }] }],
    })
    expect(r[0]?.response).toBe("")
  })

  it("raw.answer_attribution as empty array → falls through (length check)", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ answer_attribution: [], messages: [{ role: "assistant", content: "via messages" }] }],
    })
    expect(r[0]?.response).toBe("via messages")
  })

  it("raw.messages reversed-find last assistant (branch #5; production minority 2.49%)", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{
        messages: [
          { role: "user", content: "hi" },
          { role: "assistant", content: "first reply" },
          { role: "user", content: "more" },
          { role: "assistant", content: "last reply" },
        ],
      }],
    })
    expect(r[0]?.response).toBe("last reply")
  })

  it("raw.messages with no assistant role → empty string (no last-assistant found)", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ messages: [{ role: "user", content: "hi" }] }],
    })
    expect(r[0]?.response).toBe("")
  })

  it("raw.messages assistant content as object → JSON.stringify", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ messages: [{ role: "assistant", content: { tool: "x" } }] }],
    })
    expect(r[0]?.response).toBe('{"tool":"x"}')
  })

  it("raw.filtered_resps[0][0] (branch #6; lm-eval-harness, defensive)", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ filtered_resps: [["filtered answer"]] }],
    })
    expect(r[0]?.response).toBe("filtered answer")
  })

  it("raw.resps[0][0] (branch #7; lm-eval-harness, defensive)", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ resps: [["resps answer"]] }],
    })
    expect(r[0]?.response).toBe("resps answer")
  })

  it("no response fields → empty string", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ sample_id: "s1" }] })
    expect(r[0]?.response).toBe("")
  })

  it("raw.output takes precedence over answer_attribution", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ output: "winner", answer_attribution: [{ extracted_value: "loser" }] }],
    })
    expect(r[0]?.response).toBe("winner")
  })
})
// ---------------------------------------------------------------------------
// Group E — is_correct fallback chain
// ---------------------------------------------------------------------------
// Covers the three sources of the correctness flag and the "unknown" outcome.
describe("Group E — is_correct fallback chain", () => {
  it("raw.evaluation.is_correct true (branch #1; production-canonical, 100%)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ evaluation: { is_correct: true } }] })
    expect(r[0]?.is_correct).toBe(true)
  })

  it("raw.evaluation.is_correct false", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ evaluation: { is_correct: false } }] })
    expect(r[0]?.is_correct).toBe(false)
  })

  it("raw.is_correct (branch #2; defensive)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ is_correct: true }] })
    expect(r[0]?.is_correct).toBe(true)
  })

  it("raw.metrics.exact_match === 1 → true (branch #3; defensive)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ metrics: { exact_match: 1 } }] })
    expect(r[0]?.is_correct).toBe(true)
  })

  it("raw.metrics.exact_match === 0 → false", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ metrics: { exact_match: 0 } }] })
    expect(r[0]?.is_correct).toBe(false)
  })

  it("raw.metrics.exact_match === 0.5 (between) → undefined (only literal 0/1 register)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ metrics: { exact_match: 0.5 } }] })
    expect(r[0]?.is_correct).toBeUndefined()
  })

  it("no is_correct fields → undefined", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ sample_id: "s1" }] })
    expect(r[0]?.is_correct).toBeUndefined()
  })
})
// ---------------------------------------------------------------------------
// Group F — sample_id fallback chain
// ---------------------------------------------------------------------------
// Covers the id sources in priority order, the array-index fallback, and the
// fact that `??` (not `||`) is used, so falsy-but-present ids survive.
describe("Group F — sample_id fallback chain", () => {
  it("raw.sample_id (branch #1; production-canonical, 100%)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ sample_id: "s-direct" }] })
    expect(r[0]?.sample_id).toBe("s-direct")
  })

  it("raw.doc_id (branch #2; defensive)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ doc_id: "d-1" }] })
    expect(r[0]?.sample_id).toBe("d-1")
  })

  it("raw.id (branch #3; defensive)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ id: "id-1" }] })
    expect(r[0]?.sample_id).toBe("id-1")
  })

  it("none of the above → array index (branch #4)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ foo: "bar" }, { baz: "qux" }] })
    expect(r[0]?.sample_id).toBe("0")
    expect(r[1]?.sample_id).toBe("1")
  })

  it("raw.sample_id takes precedence over doc_id and id", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ sample_id: "winner", doc_id: "loser1", id: "loser2" }],
    })
    expect(r[0]?.sample_id).toBe("winner")
  })

  it("numeric sample_id (e.g. 0) → preserved as-is via ?? (NOT falsy-rejected)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ sample_id: 0 }] })
    // sample_id ?? doc_id ?? id ?? String(i) — `??` only triggers on null/undefined.
    // 0 is preserved as a number; the type assertion forces it through.
    expect(r[0]?.sample_id).toBe(0 as unknown as string)
  })
})
// ---------------------------------------------------------------------------
// Group G — choices fallback chain
// ---------------------------------------------------------------------------
// `choices` is passed through untouched from the row itself or from `doc`.
describe("Group G — choices fallback chain (0% in current production)", () => {
  it("raw.choices (branch #1)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ choices: ["A", "B", "C"] }] })
    expect(r[0]?.choices).toEqual(["A", "B", "C"])
  })

  it("raw.doc.choices (branch #2; HELM-style)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ doc: { choices: ["X", "Y"] } }] })
    expect(r[0]?.choices).toEqual(["X", "Y"])
  })

  it("none → undefined", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ sample_id: "s1" }] })
    expect(r[0]?.choices).toBeUndefined()
  })
})
// ---------------------------------------------------------------------------
// Group H — metadata merge
// ---------------------------------------------------------------------------
// The four metadata-like objects are shallow-merged in a fixed order; an empty
// merge result is reported as `undefined` rather than `{}`.
describe("Group H — metadata merge (in order: evaluation, performance, metadata, metrics)", () => {
  it("merges all four sources", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{
        evaluation: { is_correct: true, eval_field: "e" },
        performance: { latency_ms: 100 },
        metadata: { tag: "x" },
        metrics: { exact_match: 1 },
      }],
    })
    expect(r[0]?.metadata).toEqual({
      is_correct: true,
      eval_field: "e",
      latency_ms: 100,
      tag: "x",
      exact_match: 1,
    })
  })

  it("later sources overwrite earlier on key collision (metadata wins over evaluation)", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{
        evaluation: { common: "from-eval" },
        metadata: { common: "from-metadata" },
      }],
    })
    expect(r[0]?.metadata?.common).toBe("from-metadata")
  })

  it("returns undefined when all four sources absent", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ sample_id: "s1" }] })
    expect(r[0]?.metadata).toBeUndefined()
  })

  it("returns undefined when all four sources are empty objects", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ evaluation: {}, performance: {}, metadata: {}, metrics: {} }],
    })
    expect(r[0]?.metadata).toBeUndefined()
  })

  it("ignores non-object sources (e.g. evaluation as string)", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{
        evaluation: "not an object",
        metadata: { ok: 1 },
      }],
    })
    expect(r[0]?.metadata).toEqual({ ok: 1 })
  })
})
// ---------------------------------------------------------------------------
// Group I — production-canonical full example (end-to-end)
// ---------------------------------------------------------------------------
// End-to-end check: one realistic row exercising every extraction at once.
describe("Group I — production-canonical full example", () => {
  // Models the exact shape audited 2026-04-28 across all 712 production rows.
  const productionExample = {
    schema_version: "1.0",
    evaluation_id: "swe_bench_verified_mini::abc123",
    model_id: "anthropic__anthropic-claude-3-7-sonnet",
    evaluation_name: "swe_bench_verified_mini",
    sample_id: "instance_42",
    sample_hash: "abc123def456",
    interaction_type: "multi_turn",
    input: { raw: "Fix this bug in this Python file" },
    output: null,
    messages: [
      { role: "user", content: "Fix this bug" },
      { role: "assistant", content: "Here's the fix..." },
    ],
    answer_attribution: [
      { extracted_value: "diff --git a/foo.py b/foo.py..." },
    ],
    evaluation: { is_correct: true, score: 1.0 },
    performance: { latency_ms: 5421 },
    metadata: { difficulty: "medium" },
    token_usage: { input: 1000, output: 500 },
  }

  it("extracts fields per the canonical-shape branches", () => {
    const r = parseInstanceLevelData({ instance_examples: [productionExample] })
    expect(r).toHaveLength(1)
    const sample = r[0]
    expect(sample.sample_id).toBe("instance_42")
    expect(sample.input).toBe("Fix this bug in this Python file") // from input.raw
    // ground_truth is undefined because input.reference is unset (this example doesn't have it)
    expect(sample.ground_truth).toBeUndefined()
    expect(sample.response).toBe("diff --git a/foo.py b/foo.py...") // answer_attribution wins (output is null)
    expect(sample.is_correct).toBe(true)
    expect(sample.choices).toBeUndefined()
    // token_usage is not one of the merged sources, so it must not appear here.
    expect(sample.metadata).toEqual({
      is_correct: true,
      score: 1.0,
      latency_ms: 5421,
      difficulty: "medium",
    })
  })

  it("with input.reference present → ground_truth is set", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ ...productionExample, input: { raw: "Q?", reference: "expected answer" } }],
    })
    expect(r[0].ground_truth).toBe("expected answer")
  })
})