// Source: Hugging Face Space `evaleval/general-eval-card` (status: Running).
// File size: 22,662 bytes. Revision: da8db3e.

import { describe, expect, it } from "vitest"

// Executable spec for `notes/transformations/12-instance-level-data.md`.
//
// Mirrors `parseInstanceLevelData` from `lib/hf-data.ts:933-1043`: the same
// fallback chains in the same priority order, producing the same output shape.
// One deliberate hardening over a verbatim copy: the `answer_attribution` and
// `messages` branches tolerate null/undefined entries instead of throwing
// (marked "Hardening" inline). Port it to `lib/hf-data.ts` to keep both in sync.
// `fetchInstanceLevelData` (lib/hf-data.ts:890-917) is currently orphaned;
// not tested here (it just wraps fetch + JSON.parse + re-uses the parser).

/**
 * Normalized per-sample record returned by `parseInstanceLevelData`.
 *
 * The parser copies most source values through without coercion, so at runtime
 * a field declared `string` here can hold whatever the payload carried (for
 * example a numeric `sample_id`).
 */
interface SampleResult {
  /** First non-nullish of `sample_id`, `doc_id`, `id`; otherwise the array index as a string. */
  sample_id: string
  /** Prompt text; `""` when no input-like field is present. */
  input: string
  /** Reference answer; `undefined` when no reference-like field is present. */
  ground_truth?: string
  /** Model answer; `""` when no response-like field is present. */
  response: string
  /** `choices`, else `doc.choices`, passed through untouched. */
  choices?: unknown
  /** Correctness flag; `undefined` when the payload gives no usable signal. */
  is_correct?: boolean
  /** Shallow merge of `evaluation`, `performance`, `metadata`, `metrics`; omitted when empty. */
  metadata?: Record<string, unknown>
}

/**
 * Converts a raw instance-level payload into a list of `SampleResult` records.
 *
 * @param data Either an object carrying an `instance_examples` array, or a bare
 *   array of examples. Anything else produces `[]`.
 * @returns One record per example that is a non-null object, in input order;
 *   all other entries are dropped. Each field is resolved by its own
 *   first-match-wins fallback chain (see the inline comments).
 */
function parseInstanceLevelData(data: unknown): SampleResult[] {
  if (!data || typeof data !== "object") return []

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const obj = data as Record<string, any>
  // Prefer the wrapped `instance_examples` array; otherwise accept a bare array.
  const examples: unknown[] = Array.isArray(obj.instance_examples)
    ? obj.instance_examples
    : Array.isArray(data)
      ? (data as unknown[])
      : []

  if (examples.length === 0) return []

  return examples
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    .map((raw: any, i: number) => {
      // Non-object entries become null here and are removed by the final filter.
      if (!raw || typeof raw !== "object") return null

      // input: the first branch is a `typeof` check, so an empty-string
      // `raw.input` wins and stops the chain; the later branches are
      // truthiness checks.
      let input = ""
      if (typeof raw.input === "string") {
        input = raw.input
      } else if (raw.input?.raw) {
        input = String(raw.input.raw)
      } else if (raw.prompt) {
        input = raw.prompt
      } else if (raw.question) {
        input = raw.question
      } else if (raw.doc?.question) {
        input = raw.doc.question
      } else if (raw.doc) {
        // Last resort: serialized `doc`, capped at 500 characters.
        input = JSON.stringify(raw.doc).slice(0, 500)
      }

      // ground_truth: `input.reference` is a truthiness check (arrays are
      // joined with ", "); the remaining branches only skip null/undefined.
      let groundTruth: string | undefined
      if (raw.input?.reference) {
        groundTruth = Array.isArray(raw.input.reference)
          ? raw.input.reference.join(", ")
          : String(raw.input.reference)
      } else if (raw.ground_truth != null) {
        groundTruth = String(raw.ground_truth)
      } else if (raw.target != null) {
        groundTruth = String(raw.target)
      } else if (raw.gold != null) {
        groundTruth = String(raw.gold)
      } else if (raw.doc?.answer != null) {
        groundTruth = String(raw.doc.answer)
      }

      // response: `output` wins whenever it is not null/undefined (even when
      // empty); non-string output is serialized as JSON.
      let response = ""
      if (raw.output != null) {
        response = typeof raw.output === "string" ? raw.output : JSON.stringify(raw.output)
      } else if (raw.response) {
        response = raw.response
      } else if (raw.model_output) {
        response = raw.model_output
      } else if (Array.isArray(raw.answer_attribution) && raw.answer_attribution.length > 0) {
        // The LAST attribution entry is the one that counts.
        const attr = raw.answer_attribution[raw.answer_attribution.length - 1]
        // Hardening: a null/undefined last entry reads as "no extracted value"
        // instead of throwing and aborting the whole parse.
        response = attr?.extracted_value ?? ""
      } else if (Array.isArray(raw.messages) && raw.messages.length > 0) {
        // Search a reversed copy so the most recent assistant turn is found first.
        // Hardening: `m?.role` skips null/undefined messages instead of throwing.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const lastAssistant = [...raw.messages].reverse().find((m: any) => m?.role === "assistant")
        if (lastAssistant) {
          response = typeof lastAssistant.content === "string"
            ? lastAssistant.content
            : JSON.stringify(lastAssistant.content)
        }
      } else if (raw.filtered_resps?.[0]?.[0]) {
        response = raw.filtered_resps[0][0]
      } else if (raw.resps?.[0]?.[0]) {
        response = raw.resps[0][0]
      }

      // is_correct: `??` keeps an explicit `false`; `metrics.exact_match` only
      // counts when it is exactly 1 or 0.
      const isCorrect =
        raw.evaluation?.is_correct ??
        raw.is_correct ??
        (raw.metrics?.exact_match === 1 ? true :
         raw.metrics?.exact_match === 0 ? false : undefined)

      // metadata: shallow merge in this order, so later sources overwrite
      // earlier ones on key collisions.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const metadata: Record<string, any> = {}
      if (raw.evaluation && typeof raw.evaluation === "object") {
        Object.assign(metadata, raw.evaluation)
      }
      if (raw.performance && typeof raw.performance === "object") {
        Object.assign(metadata, raw.performance)
      }
      if (raw.metadata && typeof raw.metadata === "object") {
        Object.assign(metadata, raw.metadata)
      }
      if (raw.metrics && typeof raw.metrics === "object") {
        Object.assign(metadata, raw.metrics)
      }

      return {
        // `??` only skips null/undefined, so ids such as 0 or "" are kept as-is.
        sample_id: raw.sample_id ?? raw.doc_id ?? raw.id ?? String(i),
        input,
        ground_truth: groundTruth,
        response,
        choices: raw.choices ?? raw.doc?.choices ?? undefined,
        is_correct: isCorrect,
        metadata: Object.keys(metadata).length > 0 ? metadata : undefined,
      } as SampleResult
    })
    .filter((s): s is SampleResult => s !== null)
}

// ---------------------------------------------------------------------------
// Group A — top-level shape guards
// ---------------------------------------------------------------------------

describe("Group A — top-level shape guards", () => {
  // Shared assertion: the payload is rejected wholesale and yields no samples.
  const expectNoSamples = (payload: unknown): void => {
    expect(parseInstanceLevelData(payload)).toEqual([])
  }

  it("returns [] for null", () => {
    expectNoSamples(null)
  })

  it("returns [] for undefined", () => {
    expectNoSamples(undefined)
  })

  it("returns [] for non-object (string)", () => {
    expectNoSamples("hello")
  })

  it("returns [] for non-object (number)", () => {
    expectNoSamples(42)
  })

  it("returns [] when instance_examples is missing AND data is not an array", () => {
    expectNoSamples({ foo: "bar" })
  })

  it("returns [] when instance_examples is empty array", () => {
    expectNoSamples({ instance_examples: [] })
  })

  it("uses data array directly when instance_examples is missing AND data is an array", () => {
    // A bare array is treated as the example list itself.
    const bareArray = [{ sample_id: "s1", input: "hello" }]
    const parsed = parseInstanceLevelData(bareArray)
    expect(parsed).toHaveLength(1)
    const [only] = parsed
    expect(only.sample_id).toBe("s1")
    expect(only.input).toBe("hello")
  })

  it("filters out null/non-object examples but keeps valid ones", () => {
    // Two usable objects interleaved with entries the parser must discard.
    const mixedEntries = [
      { sample_id: "s1", input: "ok" },
      null,
      "string-not-object",
      { sample_id: "s2", input: "ok2" },
    ]
    const parsed = parseInstanceLevelData({ instance_examples: mixedEntries })
    expect(parsed).toHaveLength(2)
    const keptIds = parsed.map((sample) => sample.sample_id)
    expect(keptIds).toEqual(["s1", "s2"])
  })
})

// ---------------------------------------------------------------------------
// Group B — input field fallback chain
// ---------------------------------------------------------------------------

describe("Group B — input fallback chain (in priority order)", () => {
  // One row per branch of the `input` chain, plus precedence and edge cases.
  // `label` becomes the test title, `example` is the single raw instance fed to
  // the parser, and `expected` is the resulting `input` value.
  const inputCases = [
    {
      label: "raw.input as string (branch #1)",
      example: { input: "direct string" },
      expected: "direct string",
    },
    {
      label: "raw.input.raw (branch #2; the production-canonical path, 100% of rows)",
      example: { input: { raw: "from input.raw" } },
      expected: "from input.raw",
    },
    {
      label: "raw.input.raw with non-string value gets stringified",
      example: { input: { raw: 42 } },
      expected: "42",
    },
    {
      label: "raw.prompt (branch #3; defensive — 0% in current production)",
      example: { prompt: "the prompt" },
      expected: "the prompt",
    },
    {
      label: "raw.question (branch #4; defensive)",
      example: { question: "the question" },
      expected: "the question",
    },
    {
      label: "raw.doc.question (branch #5; HELM-style, defensive)",
      example: { doc: { question: "doc question" } },
      expected: "doc question",
    },
    {
      label: "raw.doc fallback to JSON.stringify (branch #6; truncated to 500)",
      example: { doc: { question: null, foo: "bar" } },
      // ?.question is null which is falsy, so falls through to JSON.stringify(raw.doc)
      expected: JSON.stringify({ question: null, foo: "bar" }).slice(0, 500),
    },
    {
      label: "no input fields → empty string",
      example: { sample_id: "s1" },
      expected: "",
    },
    {
      label: "input.raw takes precedence over prompt",
      example: { input: { raw: "winner" }, prompt: "loser" },
      expected: "winner",
    },
    {
      label: "raw.input is empty string → kept as-is (typeof check; no fallthrough to prompt)",
      example: { input: "", prompt: "fallback" },
      // Branch #1 tests `typeof raw.input === "string"`, which is true for "",
      // so the empty string is assigned and the later truthiness-based
      // branches (including `prompt`) are never consulted.
      expected: "",
    },
  ]

  it.each(inputCases)("$label", ({ example, expected }) => {
    const result = parseInstanceLevelData({ instance_examples: [example] })
    expect(result[0]?.input).toBe(expected)
  })
})

// ---------------------------------------------------------------------------
// Group C — ground_truth fallback chain
// ---------------------------------------------------------------------------

describe("Group C — ground_truth fallback chain (in priority order)", () => {
  // Row shape for the table below: test title, the single raw instance handed to
  // the parser, and the `ground_truth` it should resolve to.
  type GroundTruthCase = {
    label: string
    example: Record<string, unknown>
    expected: string | undefined
  }

  const groundTruthCases: GroundTruthCase[] = [
    // --- input.reference: string, array, and number forms ---
    {
      label: "raw.input.reference as string (branch #1; the production-canonical path)",
      example: { input: { reference: "the answer" } },
      expected: "the answer",
    },
    {
      label: "raw.input.reference as array → joined with ', '",
      example: { input: { reference: ["a", "b", "c"] } },
      expected: "a, b, c",
    },
    {
      label: "raw.input.reference as number → String()",
      example: { input: { reference: 42 } },
      expected: "42",
    },
    // --- flat fallbacks, in priority order ---
    {
      label: "raw.ground_truth (branch #2; defensive)",
      example: { ground_truth: "gt" },
      expected: "gt",
    },
    {
      label: "raw.target (branch #3; defensive)",
      example: { target: "the target" },
      expected: "the target",
    },
    {
      label: "raw.gold (branch #4; defensive)",
      example: { gold: "the gold" },
      expected: "the gold",
    },
    {
      label: "raw.doc.answer (branch #5; HELM-style, defensive)",
      example: { doc: { answer: "doc answer" } },
      expected: "doc answer",
    },
    // --- absence and precedence ---
    {
      label: "no ground_truth fields → undefined",
      example: { sample_id: "s1" },
      expected: undefined,
    },
    {
      label: "raw.input.reference takes precedence over raw.target",
      example: { input: { reference: "winner" }, target: "loser" },
      expected: "winner",
    },
  ]

  it.each(groundTruthCases)("$label", (row) => {
    const [sample] = parseInstanceLevelData({ instance_examples: [row.example] })
    expect(sample?.ground_truth).toBe(row.expected)
  })
})

// ---------------------------------------------------------------------------
// Group D — response fallback chain
// ---------------------------------------------------------------------------

describe("Group D — response fallback chain (in priority order)", () => {
  // Parses a single raw example and returns only the resolved `response`.
  const responseOf = (example: unknown) =>
    parseInstanceLevelData({ instance_examples: [example] })[0]?.response

  it("raw.output as string (branch #1)", () => {
    expect(responseOf({ output: "the output" })).toBe("the output")
  })

  it("raw.output as object → JSON.stringify (branch #1)", () => {
    expect(responseOf({ output: { foo: "bar" } })).toBe('{"foo":"bar"}')
  })

  it("raw.response (branch #2; defensive — 0% in current production)", () => {
    expect(responseOf({ response: "hello" })).toBe("hello")
  })

  it("raw.model_output (branch #3; defensive)", () => {
    expect(responseOf({ model_output: "model said" })).toBe("model said")
  })

  it("raw.answer_attribution (branch #4; the production majority path, 97.31%)", () => {
    // With several attribution entries, the final one supplies the response.
    const attributions = [{ extracted_value: "first" }, { extracted_value: "last wins" }]
    expect(responseOf({ answer_attribution: attributions })).toBe("last wins")
  })

  it("raw.answer_attribution last-element with no extracted_value → empty string", () => {
    expect(responseOf({ answer_attribution: [{ foo: "bar" }] })).toBe("")
  })

  it("raw.answer_attribution as empty array → falls through (length check)", () => {
    const example = {
      answer_attribution: [],
      messages: [{ role: "assistant", content: "via messages" }],
    }
    expect(responseOf(example)).toBe("via messages")
  })

  it("raw.messages reversed-find last assistant (branch #5; production minority 2.49%)", () => {
    // Two assistant turns: the later one must be picked.
    const conversation = [
      { role: "user", content: "hi" },
      { role: "assistant", content: "first reply" },
      { role: "user", content: "more" },
      { role: "assistant", content: "last reply" },
    ]
    expect(responseOf({ messages: conversation })).toBe("last reply")
  })

  it("raw.messages with no assistant role → empty string (no last-assistant found)", () => {
    expect(responseOf({ messages: [{ role: "user", content: "hi" }] })).toBe("")
  })

  it("raw.messages assistant content as object → JSON.stringify", () => {
    const example = { messages: [{ role: "assistant", content: { tool: "x" } }] }
    expect(responseOf(example)).toBe('{"tool":"x"}')
  })

  it("raw.filtered_resps[0][0] (branch #6; lm-eval-harness, defensive)", () => {
    expect(responseOf({ filtered_resps: [["filtered answer"]] })).toBe("filtered answer")
  })

  it("raw.resps[0][0] (branch #7; lm-eval-harness, defensive)", () => {
    expect(responseOf({ resps: [["resps answer"]] })).toBe("resps answer")
  })

  it("no response fields → empty string", () => {
    expect(responseOf({ sample_id: "s1" })).toBe("")
  })

  it("raw.output takes precedence over answer_attribution", () => {
    const example = { output: "winner", answer_attribution: [{ extracted_value: "loser" }] }
    expect(responseOf(example)).toBe("winner")
  })
})

// ---------------------------------------------------------------------------
// Group E — is_correct fallback chain
// ---------------------------------------------------------------------------

describe("Group E — is_correct fallback chain", () => {
  // Parses a single raw example and returns only the resolved `is_correct`.
  const isCorrectOf = (example: unknown) =>
    parseInstanceLevelData({ instance_examples: [example] })[0]?.is_correct

  it("raw.evaluation.is_correct true (branch #1; production-canonical, 100%)", () => {
    const verdict = isCorrectOf({ evaluation: { is_correct: true } })
    expect(verdict).toBe(true)
  })

  it("raw.evaluation.is_correct false", () => {
    const verdict = isCorrectOf({ evaluation: { is_correct: false } })
    expect(verdict).toBe(false)
  })

  it("raw.is_correct (branch #2; defensive)", () => {
    const verdict = isCorrectOf({ is_correct: true })
    expect(verdict).toBe(true)
  })

  it("raw.metrics.exact_match === 1 → true (branch #3; defensive)", () => {
    const verdict = isCorrectOf({ metrics: { exact_match: 1 } })
    expect(verdict).toBe(true)
  })

  it("raw.metrics.exact_match === 0 → false", () => {
    const verdict = isCorrectOf({ metrics: { exact_match: 0 } })
    expect(verdict).toBe(false)
  })

  it("raw.metrics.exact_match === 0.5 (between) → undefined (only literal 0/1 register)", () => {
    const verdict = isCorrectOf({ metrics: { exact_match: 0.5 } })
    expect(verdict).toBeUndefined()
  })

  it("no is_correct fields → undefined", () => {
    const verdict = isCorrectOf({ sample_id: "s1" })
    expect(verdict).toBeUndefined()
  })
})

// ---------------------------------------------------------------------------
// Group F — sample_id fallback chain
// ---------------------------------------------------------------------------

describe("Group F — sample_id fallback chain", () => {
  it("raw.sample_id (branch #1; production-canonical, 100%)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ sample_id: "s-direct" }] })
    expect(r[0]?.sample_id).toBe("s-direct")
  })

  it("raw.doc_id (branch #2; defensive)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ doc_id: "d-1" }] })
    expect(r[0]?.sample_id).toBe("d-1")
  })

  it("raw.id (branch #3; defensive)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ id: "id-1" }] })
    expect(r[0]?.sample_id).toBe("id-1")
  })

  it("none of the above → array index (branch #4)", () => {
    // The fallback id is the example's position in the input list, as a string.
    const r = parseInstanceLevelData({ instance_examples: [{ foo: "bar" }, { baz: "qux" }] })
    expect(r[0]?.sample_id).toBe("0")
    expect(r[1]?.sample_id).toBe("1")
  })

  it("raw.sample_id takes precedence over doc_id and id", () => {
    const r = parseInstanceLevelData({
      instance_examples: [{ sample_id: "winner", doc_id: "loser1", id: "loser2" }],
    })
    expect(r[0]?.sample_id).toBe("winner")
  })

  it("numeric sample_id (e.g. 0) → preserved as-is via ?? (NOT falsy-rejected)", () => {
    const r = parseInstanceLevelData({ instance_examples: [{ sample_id: 0 }] })
    // sample_id ?? doc_id ?? id ?? String(i) — `??` only triggers on null/undefined.
    // 0 is therefore returned unchanged, as a number, even though the declared
    // field type is `string`. `toBe` accepts an expected value of any type, so
    // the number is compared directly with no cast.
    expect(r[0]?.sample_id).toBe(0)
  })
})

// ---------------------------------------------------------------------------
// Group G — choices fallback chain
// ---------------------------------------------------------------------------

describe("Group G — choices fallback chain (0% in current production)", () => {
  // Parses a single raw example and returns only the resolved `choices`.
  const choicesOf = (example: unknown) =>
    parseInstanceLevelData({ instance_examples: [example] })[0]?.choices

  it("raw.choices (branch #1)", () => {
    const topLevel = { choices: ["A", "B", "C"] }
    expect(choicesOf(topLevel)).toEqual(["A", "B", "C"])
  })

  it("raw.doc.choices (branch #2; HELM-style)", () => {
    const nestedInDoc = { doc: { choices: ["X", "Y"] } }
    expect(choicesOf(nestedInDoc)).toEqual(["X", "Y"])
  })

  it("none → undefined", () => {
    const withoutChoices = { sample_id: "s1" }
    expect(choicesOf(withoutChoices)).toBeUndefined()
  })
})

// ---------------------------------------------------------------------------
// Group H — metadata merge
// ---------------------------------------------------------------------------

describe("Group H — metadata merge (in order: evaluation, performance, metadata, metrics)", () => {
  // Parses a single raw example and returns only the merged `metadata`.
  const metadataOf = (example: unknown) =>
    parseInstanceLevelData({ instance_examples: [example] })[0]?.metadata

  it("merges all four sources", () => {
    const example = {
      evaluation: { is_correct: true, eval_field: "e" },
      performance: { latency_ms: 100 },
      metadata: { tag: "x" },
      metrics: { exact_match: 1 },
    }
    // Every key from every source ends up in one flat object.
    const merged = {
      is_correct: true,
      eval_field: "e",
      latency_ms: 100,
      tag: "x",
      exact_match: 1,
    }
    expect(metadataOf(example)).toEqual(merged)
  })

  it("later sources overwrite earlier on key collision (metadata wins over evaluation)", () => {
    const colliding = {
      evaluation: { common: "from-eval" },
      metadata: { common: "from-metadata" },
    }
    expect(metadataOf(colliding)?.common).toBe("from-metadata")
  })

  it("returns undefined when all four sources absent", () => {
    expect(metadataOf({ sample_id: "s1" })).toBeUndefined()
  })

  it("returns undefined when all four sources are empty objects", () => {
    const allEmpty = { evaluation: {}, performance: {}, metadata: {}, metrics: {} }
    expect(metadataOf(allEmpty)).toBeUndefined()
  })

  it("ignores non-object sources (e.g. evaluation as string)", () => {
    const example = {
      evaluation: "not an object",
      metadata: { ok: 1 },
    }
    expect(metadataOf(example)).toEqual({ ok: 1 })
  })
})

// ---------------------------------------------------------------------------
// Group I — production-canonical full example (end-to-end)
// ---------------------------------------------------------------------------

describe("Group I — production-canonical full example", () => {
  // Models the exact shape audited 2026-04-28 across all 712 production rows.
  const productionExample = {
    schema_version: "1.0",
    evaluation_id: "swe_bench_verified_mini::abc123",
    model_id: "anthropic__anthropic-claude-3-7-sonnet",
    evaluation_name: "swe_bench_verified_mini",
    sample_id: "instance_42",
    sample_hash: "abc123def456",
    interaction_type: "multi_turn",
    input: { raw: "Fix this bug in this Python file" },
    output: null,
    messages: [
      { role: "user", content: "Fix this bug" },
      { role: "assistant", content: "Here's the fix..." },
    ],
    answer_attribution: [
      { extracted_value: "diff --git a/foo.py b/foo.py..." },
    ],
    evaluation: { is_correct: true, score: 1.0 },
    performance: { latency_ms: 5421 },
    metadata: { difficulty: "medium" },
    token_usage: { input: 1000, output: 500 },
  }

  it("extracts fields per the canonical-shape branches", () => {
    const parsed = parseInstanceLevelData({ instance_examples: [productionExample] })
    expect(parsed).toHaveLength(1)

    const [sample] = parsed
    expect(sample.sample_id).toBe("instance_42")
    // input comes from input.raw
    expect(sample.input).toBe("Fix this bug in this Python file")
    // The fixture has no input.reference (nor any other reference field), so
    // ground_truth stays unset.
    expect(sample.ground_truth).toBeUndefined()
    // output is null, so the answer_attribution branch supplies the response
    // even though messages is also present.
    expect(sample.response).toBe("diff --git a/foo.py b/foo.py...")
    expect(sample.is_correct).toBe(true)
    expect(sample.choices).toBeUndefined()

    const mergedMetadata = {
      is_correct: true,
      score: 1.0,
      latency_ms: 5421,
      difficulty: "medium",
    }
    expect(sample.metadata).toEqual(mergedMetadata)
  })

  it("with input.reference present → ground_truth is set", () => {
    // Same fixture, but with `input` swapped for one that carries a reference.
    const withReference = {
      ...productionExample,
      input: { raw: "Q?", reference: "expected answer" },
    }
    const [sample] = parseInstanceLevelData({ instance_examples: [withReference] })
    expect(sample.ground_truth).toBe("expected answer")
  })
})