// flowos's picture
// iter 14: hybrid regex + fine-tuned EXAONE 4.0 1.2B (F1=0.930)
// 1bcb098 verified
import { describe, it, expect } from "vitest";
import { detectName } from "./name.js";
/**
 * Test suite for `detectName`, exercised here with Korean (Hangul) input.
 *
 * Every case calls `detectName(text)` and checks the number of results it
 * returns; one case additionally checks the `confidence` field of the first
 * result.
 *
 * NOTE: the inputs are Hangul string literals, so this file must be saved and
 * read as UTF-8. If the bytes are decoded with a legacy code page the inputs
 * turn into unrelated characters and the assertions below lose their meaning.
 */
describe("Korean name detection", () => {
  it("detects common Korean names", () => {
    // Names followed by an honorific / copula, and a bare name.
    expect(detectName("김철수 님")).toHaveLength(1);
    expect(detectName("이영희 입니다")).toHaveLength(1);
    expect(detectName("박민수")).toHaveLength(1);
  });

  it("detects names with various top surnames", () => {
    expect(detectName("최지원")).toHaveLength(1);
    expect(detectName("정하나")).toHaveLength(1);
    expect(detectName("강민호")).toHaveLength(1);
  });

  it("detects 2-char names (surname + 1 char)", () => {
    expect(detectName("김솔")).toHaveLength(1);
    expect(detectName("이준")).toHaveLength(1);
  });

  it("rejects single character (surname only)", () => {
    // "김" alone is just a surname, not a full name
    expect(detectName("김 입니다")).toHaveLength(0);
  });

  it("detects multiple names in text", () => {
    // Two names, each with a particle attached, in one sentence.
    const r = detectName("김철수와 이영희가 만났다");
    expect(r).toHaveLength(2);
  });

  it("has lower confidence than other PII types", () => {
    const r = detectName("김철수");
    // Require a detection first: guarding the confidence check with an `if`
    // would let this test pass vacuously when nothing is detected.
    expect(r.length).toBeGreaterThan(0);
    expect(r[0].confidence).toBeLessThan(0.8);
  });

  it("returns empty for non-Korean text", () => {
    expect(detectName("John Smith")).toHaveLength(0);
    expect(detectName("hello world")).toHaveLength(0);
  });

  it("excludes common Korean words that start with surname chars", () => {
    // These words start with top surnames but are not names
    expect(detectName("오늘")).toHaveLength(0);
    expect(detectName("전화")).toHaveLength(0);
    expect(detectName("서버")).toHaveLength(0);
    expect(detectName("주소")).toHaveLength(0);
    expect(detectName("한국")).toHaveLength(0);
    expect(detectName("고객")).toHaveLength(0);
    expect(detectName("최근")).toHaveLength(0);
    expect(detectName("문제")).toHaveLength(0);
    expect(detectName("안내")).toHaveLength(0);
    expect(detectName("신청")).toHaveLength(0);
  });

  it("still detects real names despite exclusion list", () => {
    // These are valid names, not in the exclusion list
    expect(detectName("오세진")).toHaveLength(1);
    expect(detectName("전지현")).toHaveLength(1);
    expect(detectName("서연우")).toHaveLength(1);
  });
});