// general-eval-card/tests/transformations/license-normalization.test.ts
// Last commit by j-chim (d249d5b): "WIP: v2 cleanup checkpoint before merging origin/main"
import { describe, expect, it } from "vitest"
// Executable spec for the license-normalization transformation.
//
// These tests describe the `shortenLicense` function as it currently runs in
// production (components/eval-card.tsx). Quirks (lowercase SPDX
// identifiers passing through, free-form CC BY descriptions truncating, the
// asymmetric length>24 vs slice(0,22) cut) are preserved on purpose. The
// migration target is to move this transformation upstream without changing
// what users see.
//
// The pipeline-side implementation must produce identical outputs for every
// case below. Verify cross-corpus equivalence with
// `scripts/verify-license.mjs` once the pipeline ships.
// Reproduces the function verbatim so tests are independent of import path.
function shortenLicense(license: string | null | undefined): string {
if (!license || license === "Not specified") return ""
if (license.toLowerCase().includes("creative commons attribution 4")) return "CC BY 4.0"
if (license.toLowerCase().includes("creative commons zero")) return "CC0"
if (license.toLowerCase().includes("apache license 2") || license.toLowerCase().includes("apache 2")) return "Apache 2.0"
if (license.toLowerCase().includes("mit license")) return "MIT"
if (license.toLowerCase().includes("cc-by-sa")) return "CC BY-SA"
if (license.length > 24) return license.slice(0, 22) + "…"
return license
}
// ---------------------------------------------------------------------------
// Group A — Rule firing order (first match wins)
// ---------------------------------------------------------------------------
describe("Group A β€” rule firing order", () => {
const cases = [
{ input: "Apache License 2.0", expected: "Apache 2.0", why: "rule 4 (matches 'apache license 2')" },
{ input: "Apache 2.0", expected: "Apache 2.0", why: "rule 4 (matches 'apache 2')" },
{ input: "MIT License", expected: "MIT", why: "rule 5" },
{ input: "Creative Commons Attribution 4.0", expected: "CC BY 4.0", why: "rule 2" },
{ input: "Creative Commons Zero v1.0 Universal", expected: "CC0", why: "rule 3" },
{ input: "cc-by-sa-3.0", expected: "CC BY-SA", why: "rule 6" },
{
input: "Open Data Commons Attribution License",
expected: "Open Data Commons Attr…",
why: "rule 7 (truncate, length > 24, no other rule matches)",
},
{
input: "The dataset is made available under a CC BY license.",
expected: "The dataset is made av…",
why: "rule 7 β€” prose form bypasses CC BY 4 rule (substring 'creative commons attribution 4' not present)",
},
{
input: "apache-2.0",
expected: "apache-2.0",
why: "rule 8 (passthrough; SPDX lowercase doesn't contain 'apache license 2' or 'apache 2' with the required space)",
},
{ input: "other", expected: "other", why: "rule 8 (passthrough, length ≀ 24)" },
{ input: "unknown", expected: "unknown", why: "rule 8 (passthrough, length ≀ 24)" },
{ input: "Not specified", expected: "", why: "rule 1 (sentinel for 'no license')" },
{ input: "", expected: "", why: "rule 1 (empty)" },
{ input: null, expected: "", why: "rule 1 (null short-circuits via falsy)" },
{ input: undefined, expected: "", why: "rule 1 (undefined short-circuits via falsy)" },
]
it.each(cases)("'$input' β†’ '$expected' ($why)", ({ input, expected }) => {
expect(shortenLicense(input)).toBe(expected)
})
})
// ---------------------------------------------------------------------------
// Group B — Truncation rule edge cases
// ---------------------------------------------------------------------------
describe("Group B β€” truncation rule", () => {
const cases = [
{
input: "x".repeat(24),
expected: "x".repeat(24),
why: "exactly 24 chars passes through (length > 24 is the cutoff, not >= 24)",
},
{
input: "x".repeat(25),
expected: "x".repeat(22) + "…",
why: "25 chars truncates β€” first 22 + ellipsis",
},
{
input: "x".repeat(50),
expected: "x".repeat(22) + "…",
why: "50 chars truncates to same first-22 form",
},
{
input: "the MIT license is awesome, very based",
expected: "MIT",
why: "rule 5 fires before rule 7 β€” substring 'mit license' matches even within longer prose",
},
{
input: "some apache 2 thing that is long",
expected: "Apache 2.0",
why: "rule 4 fires before rule 7 β€” substring match wins",
},
{
input: "MIT-like license that is custom and long",
expected: "MIT-like license that …",
why: "rule 5 does NOT fire β€” substring 'mit license' (with space) isn't present in 'mit-like license' (with hyphen between)",
},
]
it.each(cases)("'$input' β†’ '$expected' ($why)", ({ input, expected }) => {
expect(shortenLicense(input)).toBe(expected)
})
})
// ---------------------------------------------------------------------------
// Group C — Case sensitivity
// ---------------------------------------------------------------------------
describe("Group C β€” case insensitivity in match rules", () => {
const cases = [
{ input: "APACHE LICENSE 2.0", expected: "Apache 2.0" },
{ input: "apache license 2.0", expected: "Apache 2.0" },
{ input: "Apache License 2.0", expected: "Apache 2.0" },
{ input: "creative commons attribution 4.0", expected: "CC BY 4.0" },
{ input: "CREATIVE COMMONS ATTRIBUTION 4.0", expected: "CC BY 4.0" },
{ input: "Creative Commons Attribution 4.0 International", expected: "CC BY 4.0" },
{ input: "CREATIVE COMMONS ZERO v1.0", expected: "CC0" },
{ input: "Mit License", expected: "MIT" },
{ input: "MIT LICENSE", expected: "MIT" },
{ input: "CC-BY-SA-4.0", expected: "CC BY-SA" },
]
it.each(cases)("'$input' β†’ '$expected'", ({ input, expected }) => {
expect(shortenLicense(input)).toBe(expected)
})
})
// ---------------------------------------------------------------------------
// Group D — Production fixtures (the 11 distinct license strings observed
// in .cache/hf-data/benchmark-metadata.json on 2026-04-28)
// ---------------------------------------------------------------------------
describe("Group D β€” production fixtures (all 11 distinct license strings observed)", () => {
const cases = [
{ input: "Not specified", expected: "" },
{ input: "Apache License 2.0", expected: "Apache 2.0" },
{ input: "MIT License", expected: "MIT" },
{ input: "Open Data Commons Attribution License", expected: "Open Data Commons Attr…" },
{ input: "Creative Commons Attribution 4.0", expected: "CC BY 4.0" },
{ input: "cc-by-sa-3.0", expected: "CC BY-SA" },
{ input: "other", expected: "other" },
{ input: "unknown", expected: "unknown" },
{ input: "Creative Commons Zero v1.0 Universal", expected: "CC0" },
{ input: "apache-2.0", expected: "apache-2.0" },
{ input: "The dataset is made available under a CC BY license.", expected: "The dataset is made av…" },
]
it.each(cases)("'$input' β†’ '$expected'", ({ input, expected }) => {
expect(shortenLicense(input)).toBe(expected)
})
})