general-eval-card / tests /transformations /setup-alias-merging.test.ts
Jenny Chim
Deploy DuckDB-backed frontend to
da8db3e
import { describe, expect, it } from "vitest"
import { getCanonicalModelIdentity } from "../../lib/model-family"
// Executable spec for `notes/transformations/02-setup-alias-merging.md`.
//
// These tests describe TS as it currently runs in production. Quirks are
// preserved on purpose — the migration target is to move the computation
// upstream without changing what users see, not to fix transformation
// decisions. If a test below looks "wrong" to product sense, that's a
// future product decision (see the "Future product decision" section of the
// spec); fixing it is explicitly out of scope for this migration.
//
// Pipeline-side implementation must produce identical outputs for every
// row. Verify cross-corpus equivalence with `scripts/verify-setup-alias.mjs`
// once pipeline ships.
// ---------------------------------------------------------------------------
// Group A — isSetupAliasQualifier truth table
// ---------------------------------------------------------------------------
//
// Reproduces the function from lib/hf-data.ts:708-720 (and its identical
// twin in scripts/cache-hf-data.mjs:199-211). Pipeline must match exactly.
/**
 * Canonicalises a raw qualifier token so it can be compared against the
 * known setup-alias names.
 *
 * The token is trimmed, lower-cased, and every run of underscores and/or
 * whitespace is collapsed into a single dash.
 *
 * @param value Raw qualifier; may be `null` or `undefined`.
 * @returns The canonical token, or `""` when no value was supplied.
 */
function normalizeSetupAliasQualifier(value: string | null | undefined): string {
  if (value === null || value === undefined) {
    return ""
  }
  const lowered = value.trim().toLowerCase()
  // One dash per separator run: "a _ b" becomes "a-b", not "a---b".
  return lowered.replace(/[_\s]+/g, "-")
}
/**
 * Reports whether a qualifier names a "setup alias".
 *
 * After normalisation a qualifier is an alias when it is exactly `prompt`,
 * `fc` or `function-calling`, or when it begins with `thinking` (so every
 * `thinking-*` budget/effort suffix counts).
 *
 * @param value Raw qualifier; `null`, `undefined` and `""` are never aliases.
 * @returns `true` when the qualifier is a setup alias.
 */
function isSetupAliasQualifier(value: string | null | undefined): boolean {
  const token = normalizeSetupAliasQualifier(value)
  // Prefix rule: any token that starts with "thinking" is an alias.
  if (token.startsWith("thinking")) {
    return true
  }
  // Exact-match rule for the remaining alias names.
  return ["prompt", "fc", "function-calling"].includes(token)
}
// Truth table for isSetupAliasQualifier. Every row is passed to the predicate
// and must produce `expected`; `why` only feeds the generated test title.
describe("Group A — isSetupAliasQualifier", () => {
  const cases: Array<{ input: string | null | undefined; expected: boolean; why: string }> = [
    // Exact alias names, in any letter case.
    { input: "prompt", expected: true, why: "exact: prompt" },
    { input: "Prompt", expected: true, why: "case-insensitive" },
    { input: "PROMPT", expected: true, why: "case-insensitive" },
    { input: "fc", expected: true, why: "exact: fc" },
    { input: "FC", expected: true, why: "case-insensitive" },
    { input: "function-calling", expected: true, why: "exact" },
    { input: "function calling", expected: true, why: "space → dash" },
    { input: "function_calling", expected: true, why: "underscore → dash" },
    // Anything that starts with "thinking" after normalisation.
    { input: "thinking", expected: true, why: "exact thinking" },
    { input: "thinking-1k", expected: true, why: "starts with thinking" },
    { input: "thinking-medium", expected: true, why: "starts with thinking" },
    { input: "thinking-none", expected: true, why: "starts with thinking" },
    { input: "thinking_xhigh", expected: true, why: "underscore → dash, then prefix" },
    { input: "Thinking 1K", expected: true, why: "case + space normalized → starts with thinking" },
    // Qualifiers that are not aliases.
    { input: "high", expected: false, why: "non-alias inference qualifier" },
    { input: "medium", expected: false, why: "non-alias" },
    { input: "low", expected: false, why: "non-alias" },
    { input: "minimal", expected: false, why: "non-alias" },
    { input: "8k", expected: false, why: "context-length without thinking prefix" },
    { input: "16k", expected: false, why: "context-length without thinking prefix" },
    // Empty and missing values.
    { input: "", expected: false, why: "empty" },
    { input: null, expected: false, why: "null" },
    { input: undefined, expected: false, why: "undefined" },
    // Whitespace handling and near-misses.
    { input: " prompt ", expected: true, why: "leading/trailing whitespace trimmed" },
    { input: "prompts", expected: false, why: "trailing s — not exact prompt and no thinking prefix" },
    { input: "fcfc", expected: false, why: "doesn't match exact fc" },
  ]
  it.each(cases)("'$input' → $expected ($why)", ({ input, expected }) => {
    expect(isSetupAliasQualifier(input)).toBe(expected)
  })
})
// ---------------------------------------------------------------------------
// Group B — End-to-end variant normalization (TS-as-is)
// ---------------------------------------------------------------------------
//
// Replicates lib/hf-data.ts:759-786 verbatim, NO date-format fix applied.
// Documents the dashed-date fall-through behaviour as the canonical spec.
/** Minimal shape of a variant record as consumed by `normalizeOne`. */
interface VariantInput {
// Raw variant key. "base" and "default" are special-cased by `normalizeOne`;
// any other value is appended to the family id to build a synthetic model id.
variant_key: string
// Optional display label. `normalizeOne` only reads it when `variant_key` is
// "default", falling back to "Default" when it is absent.
variant_label?: string
}
/**
 * Normalises a single variant of a model family to its displayed key/label.
 *
 * - `"base"` is renamed to `default` / `Default`.
 * - `"default"` keeps its key and its own label (or `Default` if it has none).
 * - Any other key is appended to `familyId` and resolved through
 *   `getCanonicalModelIdentity`. When the result carries a version date and
 *   its qualifier is a setup alias, the variant collapses to that date (used
 *   for both key and label); otherwise the resolved key and label are
 *   returned unchanged.
 *
 * @param familyId Family identifier used as the prefix of the synthetic id.
 * @param variant Variant to normalise.
 * @returns The normalised `variant_key` / `variant_label` pair.
 */
function normalizeOne(familyId: string, variant: VariantInput): { variant_key: string; variant_label: string } {
  const rawKey = variant.variant_key
  switch (rawKey) {
    case "base":
      return { variant_key: "default", variant_label: "Default" }
    case "default":
      return { variant_key: "default", variant_label: variant.variant_label ?? "Default" }
  }

  // The same synthetic string serves as both the id and the name.
  const syntheticId = `${familyId}-${rawKey}`
  const identity = getCanonicalModelIdentity({ id: syntheticId, name: syntheticId })

  const date = identity.versionDate
  if (date && isSetupAliasQualifier(identity.versionQualifier)) {
    // Alias qualifiers are dropped: the date alone identifies the variant.
    return { variant_key: date, variant_label: date }
  }
  return { variant_key: identity.variantKey, variant_label: identity.variantLabel }
}
// End-to-end table for normalizeOne against a fixed family id. Each row gives
// a raw variant key and the exact key/label pair expected back; `why` only
// feeds the generated test title.
describe("Group B — End-to-end variant normalization", () => {
  const familyId = "openai/gpt-5.2"
  const cases = [
    // Special-cased keys.
    { variant_key: "default", expected: { variant_key: "default", variant_label: "Default" }, why: "default passes through" },
    { variant_key: "base", expected: { variant_key: "default", variant_label: "Default" }, why: "base renamed to default" },
    // Date-only keys.
    { variant_key: "20251101", expected: { variant_key: "20251101", variant_label: "2025-11-01" }, why: "YYYYMMDD date-only — preserved as raw token, ISO label" },
    {
      variant_key: "2025-11-01",
      expected: { variant_key: "base", variant_label: "Current" },
      why: "DASHED date-only falls through to base — TS quirk, preserved as canonical for this migration",
    },
    // YYYYMMDD date plus a setup-alias qualifier: collapses to the ISO date.
    {
      variant_key: "20240620-thinking",
      expected: { variant_key: "2024-06-20", variant_label: "2024-06-20" },
      why: "YYYYMMDD + thinking → merge to ISO date",
    },
    {
      variant_key: "20240620-thinking-1k",
      expected: { variant_key: "2024-06-20", variant_label: "2024-06-20" },
      why: "YYYYMMDD + thinking-1k → merge (startsWith match aggregates all thinking budgets)",
    },
    {
      variant_key: "20240620-thinking-medium",
      expected: { variant_key: "2024-06-20", variant_label: "2024-06-20" },
      why: "all thinking-N variants for this YYYYMMDD date collapse together",
    },
    { variant_key: "20240620-fc", expected: { variant_key: "2024-06-20", variant_label: "2024-06-20" }, why: "YYYYMMDD + fc → merge" },
    { variant_key: "20240620-prompt", expected: { variant_key: "2024-06-20", variant_label: "2024-06-20" }, why: "YYYYMMDD + prompt → merge" },
    // YYYYMMDD date plus a non-alias qualifier: kept as its own variant.
    {
      variant_key: "20240620-high",
      expected: { variant_key: "20240620-high", variant_label: "2024-06-20 · High" },
      why: "non-alias qualifier preserved with date",
    },
    // Dashed dates are not recognised and fall through to base.
    {
      variant_key: "2025-12-11-thinking-medium",
      expected: { variant_key: "base", variant_label: "Current" },
      why: "DASHED date — regex doesn't match, falls through to base (TS quirk)",
    },
    {
      variant_key: "2025-12-11-thinking-1k",
      expected: { variant_key: "base", variant_label: "Current" },
      why: "DASHED date with thinking-1k — same fall-through",
    },
    {
      variant_key: "2025-12-11-fc",
      expected: { variant_key: "base", variant_label: "Current" },
      why: "DASHED date + fc — fall-through",
    },
    {
      variant_key: "2025-12-11-high",
      expected: { variant_key: "base", variant_label: "Current" },
      why: "DASHED date + non-alias qualifier — fall-through",
    },
    // No date at all.
    { variant_key: "gpt-foo-bar", expected: { variant_key: "base", variant_label: "Current" }, why: "no date detected" },
  ]
  it.each(cases)("'$variant_key' → '$expected.variant_key' ($why)", ({ variant_key, expected }) => {
    const result = normalizeOne(familyId, { variant_key })
    expect(result.variant_key).toBe(expected.variant_key)
    expect(result.variant_label).toBe(expected.variant_label)
  })
})
// ---------------------------------------------------------------------------
// Group C — Multi-variant deduplication after normalization
// ---------------------------------------------------------------------------
//
// Documents the user-visible aggregation effect: cards with multiple
// dashed-date variants all collapse into a single "base" entry. This is
// TS as-is. If the team later decides users would benefit from
// disaggregation, that's a separate product call (see the spec doc).
/**
 * Normalises every variant of a family and merges those that end up with the
 * same normalised key.
 *
 * The first variant seen for a key creates the entry and fixes its label.
 * Later variants with that key add their `evaluation_count`, union their
 * `raw_model_ids` (deduplicated, default string sort) and replace
 * `last_updated` only when theirs parses to a later date. Missing counts are
 * treated as 0 and missing id lists as empty. Input objects are not mutated.
 *
 * @param familyId Family identifier forwarded to `normalizeOne`.
 * @param variants Raw variants, in the order they should be merged.
 * @returns One merged entry per normalised key, in first-seen order.
 */
function normalizeVariants(
  familyId: string,
  variants: Array<VariantInput & { evaluation_count?: number; raw_model_ids?: string[]; last_updated?: string }>
) {
  const merged = new Map<
    string,
    { variant_key: string; variant_label: string; evaluation_count: number; raw_model_ids: string[]; last_updated?: string }
  >()

  variants.forEach((variant) => {
    const { variant_key, variant_label } = normalizeOne(familyId, variant)
    const count = variant.evaluation_count ?? 0
    const ids = variant.raw_model_ids ?? []
    const stamp = variant.last_updated

    const bucket = merged.get(variant_key)
    if (!bucket) {
      // First occurrence of this key: start a new entry with a sorted copy of the ids.
      merged.set(variant_key, {
        variant_key,
        variant_label,
        evaluation_count: count,
        raw_model_ids: [...ids].sort(),
        last_updated: stamp,
      })
      return
    }

    bucket.evaluation_count += count
    bucket.raw_model_ids = [...new Set([...bucket.raw_model_ids, ...ids])].sort()
    // Keep the most recent timestamp; any timestamp beats a missing one.
    if (stamp && (!bucket.last_updated || new Date(stamp) > new Date(bucket.last_updated))) {
      bucket.last_updated = stamp
    }
  })

  return Array.from(merged.values())
}
// Scenario tests for normalizeVariants: each feeds a realistic variant list
// for one family and pins which normalised keys survive and how the counts
// and raw model ids aggregate.
describe("Group C — Multi-variant deduplication (TS-as-is)", () => {
  it("openai/gpt-5.2: 7 variants (default + 6 dashed-date) collapse into default + base", () => {
    const result = normalizeVariants("openai/gpt-5.2", [
      { variant_key: "default", evaluation_count: 1, raw_model_ids: ["openai/gpt-5.2"] },
      {
        variant_key: "2025-12-11",
        evaluation_count: 3,
        raw_model_ids: ["openai/gpt-5.2-2025-12-11", "openai/gpt-5-2-2025-12-11-fc", "openai/gpt-5-2-2025-12-11-prompt"],
      },
      { variant_key: "2025-12-11-thinking-medium", evaluation_count: 1, raw_model_ids: ["openai/gpt-5-2-2025-12-11-thinking-medium"] },
      { variant_key: "2025-12-11-thinking-low", evaluation_count: 1, raw_model_ids: ["openai/gpt-5-2-2025-12-11-thinking-low"] },
      { variant_key: "2025-12-11-thinking-high", evaluation_count: 1, raw_model_ids: ["openai/gpt-5-2-2025-12-11-thinking-high"] },
      { variant_key: "2025-12-11-thinking-none", evaluation_count: 1, raw_model_ids: ["openai/gpt-5-2-2025-12-11-thinking-none"] },
      { variant_key: "2025-12-11-thinking-xhigh", evaluation_count: 1, raw_model_ids: ["openai/gpt-5-2-2025-12-11-thinking-xhigh"] },
    ])
    // All 6 dashed-date variants collapse into a single "base" entry. This
    // is TS-as-is behaviour and the canonical spec for this migration.
    expect(result.map((v) => v.variant_key).sort()).toEqual(["base", "default"])
    const base = result.find((v) => v.variant_key === "base")
    expect(base).toBeDefined()
    // 3 + 1 + 1 + 1 + 1 + 1 evaluations, and 3 + 5 distinct raw model ids.
    expect(base?.evaluation_count).toBe(8)
    expect(base?.raw_model_ids.length).toBe(8)
  })
  it("anthropic/claude-haiku-4.5: YYYYMMDD-thinking-Nk variants merge into ISO date (startsWith match fires)", () => {
    const result = normalizeVariants("anthropic/claude-haiku-4.5", [
      { variant_key: "default", evaluation_count: 1, raw_model_ids: ["anthropic/claude-haiku-4.5"] },
      { variant_key: "20251001", evaluation_count: 2, raw_model_ids: ["anthropic/claude-haiku-4-5-20251001", "anthropic/claude-haiku-4-5-20251001-fc"] },
      { variant_key: "20251001-thinking-1k", evaluation_count: 1, raw_model_ids: ["anthropic/claude-haiku-4-5-20251001-thinking-1k"] },
      { variant_key: "20251001-thinking-8k", evaluation_count: 1, raw_model_ids: ["anthropic/claude-haiku-4-5-20251001-thinking-8k"] },
      { variant_key: "20251001-thinking-16k", evaluation_count: 1, raw_model_ids: ["anthropic/claude-haiku-4-5-20251001-thinking-16k"] },
      { variant_key: "20251001-thinking-32k", evaluation_count: 1, raw_model_ids: ["anthropic/claude-haiku-4-5-20251001-thinking-32k"] },
    ])
    // YYYYMMDD-thinking-Nk variants merge into "2025-10-01" (ISO) via the
    // startsWith("thinking") match. The base "20251001" stays as YYYYMMDD
    // because it has no qualifier. So they DON'T merge with each other —
    // different normalized keys ("20251001" vs "2025-10-01"). TS quirk.
    const keys = result.map((v) => v.variant_key).sort()
    expect(keys).toContain("default")
    expect(keys).toContain("20251001")
    expect(keys).toContain("2025-10-01")
    expect(keys.length).toBe(3)
    const merged = result.find((v) => v.variant_key === "2025-10-01")
    expect(merged).toBeDefined()
    // Four thinking-Nk variants, one evaluation and one raw model id each.
    expect(merged?.evaluation_count).toBe(4)
    expect(merged?.raw_model_ids.length).toBe(4)
  })
  it("non-alias qualifiers with YYYYMMDD dates preserved as separate variants", () => {
    const result = normalizeVariants("openai/gpt-5", [
      { variant_key: "default", evaluation_count: 1, raw_model_ids: [] },
      { variant_key: "20250807", evaluation_count: 1, raw_model_ids: [] },
      { variant_key: "20250807-high", evaluation_count: 1, raw_model_ids: [] },
      { variant_key: "20250807-low", evaluation_count: 1, raw_model_ids: [] },
      { variant_key: "20250807-medium", evaluation_count: 1, raw_model_ids: [] },
      { variant_key: "20250807-minimal", evaluation_count: 1, raw_model_ids: [] },
    ])
    // None of high/low/medium/minimal is a setup alias, so nothing merges.
    expect(result.map((v) => v.variant_key).sort()).toEqual([
      "20250807",
      "20250807-high",
      "20250807-low",
      "20250807-medium",
      "20250807-minimal",
      "default",
    ])
  })
})