Spaces:
Running
Running
| import { describe, expect, it } from "vitest" | |
// Executable spec for the license-normalization transformation.
//
// These tests describe the `shortenLicense` function as it currently runs in
// production (components/eval-card.tsx). Its quirks are preserved on purpose:
// lowercase SPDX identifiers pass through unchanged, free-form CC BY
// descriptions are truncated, and the truncation rule is asymmetric (it fires
// when length > 24 but keeps only slice(0, 22)). The migration target is to
// move this transformation upstream without changing what users see.
//
// The pipeline-side implementation must produce identical outputs for every
// case below. Verify cross-corpus equivalence with
// `scripts/verify-license.mjs` once the pipeline ships.
// Local copy of the `shortenLicense` transformation, kept inside this file so
// the tests are independent of import path. It must stay output-equivalent to
// the function it mirrors: same rules, same order, same strings.
//
// Rules are evaluated top to bottom and the first match wins:
//   1. null / undefined / "" / the "Not specified" sentinel -> ""
//   2-6. case-insensitive substring matches -> fixed short label
//   7. anything longer than 24 characters -> first 22 characters + marker
//   8. everything else passes through unchanged
//
// NOTE(review): the truncation marker "β¦" looks like a mis-encoded "…"
// (U+2026). It is a runtime string that the expectations in this file depend
// on, so it is reproduced as-is here — confirm against the file's real encoding.
function shortenLicense(license: string | null | undefined): string {
  // Rule 1: nothing to show for missing values or the explicit sentinel.
  if (license == null || license === "" || license === "Not specified") {
    return ""
  }

  // Rules 2-6: matching is done on a lowercase copy; the original casing is
  // only used by the fall-through rules at the bottom.
  const folded = license.toLowerCase()
  const substringRules: ReadonlyArray<readonly [readonly string[], string]> = [
    [["creative commons attribution 4"], "CC BY 4.0"],
    [["creative commons zero"], "CC0"],
    [["apache license 2", "apache 2"], "Apache 2.0"],
    [["mit license"], "MIT"],
    [["cc-by-sa"], "CC BY-SA"],
  ]
  for (const [needles, label] of substringRules) {
    if (needles.some((needle) => folded.includes(needle))) {
      return label
    }
  }

  // Rules 7-8: the threshold (24) and the kept prefix (22) differ on purpose.
  return license.length > 24 ? license.slice(0, 22) + "β¦" : license
}
// ---------------------------------------------------------------------------
// Group A: rule firing order (first match wins)
//
// One row per rule of `shortenLicense`, plus the rows that show which inputs
// fall through to truncation or passthrough. `why` names the rule that is
// expected to produce the result and is interpolated into the test title.
//
// NOTE(review): "β", "β¦" and "β€" inside the strings below look like
// mis-encoded "—"/"→", "…" and "≤". They are runtime strings (titles and
// expected values), so they are left untouched — confirm the file's encoding.
// ---------------------------------------------------------------------------
describe("Group A β rule firing order", () => {
  const ruleOrderRows = [
    // Rules 2-6: substring matches.
    {
      input: "Apache License 2.0",
      expected: "Apache 2.0",
      why: "rule 4 (matches 'apache license 2')",
    },
    {
      input: "Apache 2.0",
      expected: "Apache 2.0",
      why: "rule 4 (matches 'apache 2')",
    },
    {
      input: "MIT License",
      expected: "MIT",
      why: "rule 5",
    },
    {
      input: "Creative Commons Attribution 4.0",
      expected: "CC BY 4.0",
      why: "rule 2",
    },
    {
      input: "Creative Commons Zero v1.0 Universal",
      expected: "CC0",
      why: "rule 3",
    },
    {
      input: "cc-by-sa-3.0",
      expected: "CC BY-SA",
      why: "rule 6",
    },
    // Rule 7: truncation when nothing else matched.
    {
      input: "Open Data Commons Attribution License",
      expected: "Open Data Commons Attrβ¦",
      why: "rule 7 (truncate, length > 24, no other rule matches)",
    },
    {
      input: "The dataset is made available under a CC BY license.",
      expected: "The dataset is made avβ¦",
      why: "rule 7 β prose form bypasses CC BY 4 rule (substring 'creative commons attribution 4' not present)",
    },
    // Rule 8: short, unmatched strings pass through unchanged.
    {
      input: "apache-2.0",
      expected: "apache-2.0",
      why: "rule 8 (passthrough; SPDX lowercase doesn't contain 'apache license 2' or 'apache 2' with the required space)",
    },
    {
      input: "other",
      expected: "other",
      why: "rule 8 (passthrough, length β€ 24)",
    },
    {
      input: "unknown",
      expected: "unknown",
      why: "rule 8 (passthrough, length β€ 24)",
    },
    // Rule 1: sentinel and falsy inputs collapse to the empty string.
    {
      input: "Not specified",
      expected: "",
      why: "rule 1 (sentinel for 'no license')",
    },
    {
      input: "",
      expected: "",
      why: "rule 1 (empty)",
    },
    {
      input: null,
      expected: "",
      why: "rule 1 (null short-circuits via falsy)",
    },
    {
      input: undefined,
      expected: "",
      why: "rule 1 (undefined short-circuits via falsy)",
    },
  ]

  it.each(ruleOrderRows)("'$input' β '$expected' ($why)", (row) => {
    const actual = shortenLicense(row.input)
    expect(actual).toBe(row.expected)
  })
})
// ---------------------------------------------------------------------------
// Group B: truncation rule edge cases
//
// Pins the exact boundary (24 passes through, 25 is cut to 22 characters plus
// the marker) and checks that the substring rules win over truncation when a
// long input also contains a matching phrase.
//
// NOTE(review): "β" and "β¦" inside the strings below look like mis-encoded
// "—"/"→" and "…". They are runtime strings (titles and expected values), so
// they are left untouched — confirm the file's encoding.
// ---------------------------------------------------------------------------
describe("Group B β truncation rule", () => {
  // Builds an input of exactly `count` characters that no substring rule matches.
  const xs = (count: number): string => "x".repeat(count)

  const truncationRows = [
    // Length boundary.
    {
      input: xs(24),
      expected: xs(24),
      why: "exactly 24 chars passes through (length > 24 is the cutoff, not >= 24)",
    },
    {
      input: xs(25),
      expected: xs(22) + "β¦",
      why: "25 chars truncates β first 22 + ellipsis",
    },
    {
      input: xs(50),
      expected: xs(22) + "β¦",
      why: "50 chars truncates to same first-22 form",
    },
    // Long inputs that a substring rule catches before truncation.
    {
      input: "the MIT license is awesome, very based",
      expected: "MIT",
      why: "rule 5 fires before rule 7 β substring 'mit license' matches even within longer prose",
    },
    {
      input: "some apache 2 thing that is long",
      expected: "Apache 2.0",
      why: "rule 4 fires before rule 7 β substring match wins",
    },
    // Near miss: no substring rule applies, so truncation takes over.
    {
      input: "MIT-like license that is custom and long",
      expected: "MIT-like license that β¦",
      why: "rule 5 does NOT fire β substring 'mit license' (with space) isn't present in 'mit-like license' (with hyphen between)",
    },
  ]

  it.each(truncationRows)("'$input' β '$expected' ($why)", (row) => {
    const actual = shortenLicense(row.input)
    expect(actual).toBe(row.expected)
  })
})
// ---------------------------------------------------------------------------
// Group C: case sensitivity
//
// Every substring rule must match regardless of the input's casing, and the
// returned label is always the same fixed spelling.
//
// NOTE(review): "β" in the title strings below looks like a mis-encoded
// "—"/"→". Titles are runtime strings, so they are left untouched — confirm
// the file's encoding.
// ---------------------------------------------------------------------------
describe("Group C β case insensitivity in match rules", () => {
  const casingRows = [
    // Apache
    { input: "APACHE LICENSE 2.0", expected: "Apache 2.0" },
    { input: "apache license 2.0", expected: "Apache 2.0" },
    { input: "Apache License 2.0", expected: "Apache 2.0" },
    // Creative Commons Attribution 4
    { input: "creative commons attribution 4.0", expected: "CC BY 4.0" },
    { input: "CREATIVE COMMONS ATTRIBUTION 4.0", expected: "CC BY 4.0" },
    { input: "Creative Commons Attribution 4.0 International", expected: "CC BY 4.0" },
    // Creative Commons Zero
    { input: "CREATIVE COMMONS ZERO v1.0", expected: "CC0" },
    // MIT
    { input: "Mit License", expected: "MIT" },
    { input: "MIT LICENSE", expected: "MIT" },
    // CC BY-SA
    { input: "CC-BY-SA-4.0", expected: "CC BY-SA" },
  ]

  it.each(casingRows)("'$input' β '$expected'", (row) => {
    const actual = shortenLicense(row.input)
    expect(actual).toBe(row.expected)
  })
})
// ---------------------------------------------------------------------------
// Group D: production fixtures (the 11 distinct license strings observed in
// .cache/hf-data/benchmark-metadata.json on 2026-04-28)
//
// Each row pairs an observed license string with the label `shortenLicense`
// is expected to return for it.
//
// NOTE(review): "β" and "β¦" inside the strings below look like mis-encoded
// "—"/"→" and "…". They are runtime strings (titles and expected values), so
// they are left untouched — confirm the file's encoding.
// ---------------------------------------------------------------------------
describe("Group D β production fixtures (all 11 distinct license strings observed)", () => {
  const observedRows = [
    { input: "Not specified", expected: "" },
    { input: "Apache License 2.0", expected: "Apache 2.0" },
    { input: "MIT License", expected: "MIT" },
    {
      input: "Open Data Commons Attribution License",
      expected: "Open Data Commons Attrβ¦",
    },
    { input: "Creative Commons Attribution 4.0", expected: "CC BY 4.0" },
    { input: "cc-by-sa-3.0", expected: "CC BY-SA" },
    { input: "other", expected: "other" },
    { input: "unknown", expected: "unknown" },
    { input: "Creative Commons Zero v1.0 Universal", expected: "CC0" },
    { input: "apache-2.0", expected: "apache-2.0" },
    {
      input: "The dataset is made available under a CC BY license.",
      expected: "The dataset is made avβ¦",
    },
  ]

  it.each(observedRows)("'$input' β '$expected'", (row) => {
    const actual = shortenLicense(row.input)
    expect(actual).toBe(row.expected)
  })
})