flowos's picture
iter 14: hybrid regex + fine-tuned EXAONE 4.0 1.2B (F1=0.930)
1bcb098 verified
/**
 * Korean name detection — heuristic based on common surnames.
 * Matches: one of 50 common single-syllable Korean surnames followed by
 * 1-2 Hangul given-name characters.
 */
import type { PIIDetection } from "@klawn/shared";
/**
 * Top Korean surnames by frequency.
 * Every entry is a single Hangul syllable; two-syllable surnames are not covered.
 */
const TOP_SURNAMES: readonly string[] = [
  "김", "이", "박", "최", "정", "강", "조", "윤", "장", "임",
  "한", "오", "서", "신", "권", "황", "안", "송", "류", "전",
  "홍", "고", "문", "양", "손", "배", "백", "허", "유", "남",
  "심", "노", "하", "곽", "성", "차", "주", "우", "구", "민",
  "라", "지", "엄", "원", "천", "방", "공", "진", "감", "변",
];

/**
 * Returns true when the first UTF-16 code unit of `char` is a precomposed
 * Hangul syllable (U+AC00 through U+D7A3). An empty string yields false.
 */
function isHangul(char: string): boolean {
  const code = char.charCodeAt(0);
  return code >= 0xac00 && code <= 0xd7a3;
}

/** Common Korean particles that follow names, as a regex alternation. */
const PARTICLES =
  "와|과|의|은|는|이|가|을|를|에|에게|한테|께|도|만|부터|까지|처럼|같이|보다|로|으로|랑|이랑";

/** The one-syllable subset of PARTICLES, used to undo greedy over-matching. */
const SINGLE_SYLLABLE_PARTICLES = new Set(
  PARTICLES.split("|").filter((particle) => particle.length === 1),
);

/**
 * Builds the global name regex: a surname from TOP_SURNAMES followed by 1-2
 * Hangul syllables.
 *
 * The match must be preceded by start-of-input or a delimiter (whitespace or
 * one of `, ; : ( ) [ ] { }`) and followed by a particle, end-of-input, or a
 * delimiter. Both boundaries are look-arounds, so they are not part of the
 * matched text.
 */
function buildNameRegex(): RegExp {
  const surnames = TOP_SURNAMES.join("|");
  const delimiters = "[\\s,;:()\\[\\]{}]";
  return new RegExp(
    `(?<=^|${delimiters})(?:${surnames})[가-힣]{1,2}(?=(?:${PARTICLES})|$|${delimiters})`,
    "g",
  );
}

const NAME_REGEX = buildNameRegex();

/**
 * Common Korean words that start with surname chars but are NOT names.
 * Reduces false positives from the heuristic name detector.
 */
const COMMON_WORD_EXCLUSIONS = new Set([
  // 오 (surname)
  "오늘", "오전", "오후", "오른",
  // 전 (surname)
  "전화", "전체", "전부", "전달", "전자", "전국", "전문", "전혀",
  // 서 (surname)
  "서버", "서류", "서비", "서울",
  // 주 (surname)
  "주소", "주문", "주요", "주의", "주간", "주말",
  // 한 (surname)
  "한국", "한번", "한편",
  // 고 (surname)
  "고객",
  // 최 (surname)
  "최근", "최대", "최소", "최초",
  // 공 (surname)
  "공지", "공유", "공간",
  // 방 (surname)
  "방법", "방금", "방문",
  // 구 (surname)
  "구매",
  // 남 (surname)
  "남은", "남자",
  // 문 (surname)
  "문의", "문서", "문제",
  // 신 (surname)
  "신청", "신규",
  // 배 (surname)
  "배송",
  // 차 (surname)
  "차이",
  // 안 (surname)
  "안내", "안전",
  // 성 (surname)
  "성공",
  // 강 (surname)
  "강화",
  // 임 (surname)
  "임시",
  // 원 (surname)
  "원래",
]);

/**
 * Returns true when `candidate` is a known common word rather than a name.
 *
 * Besides an exact hit in COMMON_WORD_EXCLUSIONS, this also catches a
 * two-syllable common word that swallowed a one-syllable particle: the regex
 * is greedy, so "오늘은" is matched as three syllables instead of "오늘"
 * followed by the particle "은".
 */
function isCommonWord(candidate: string): boolean {
  if (COMMON_WORD_EXCLUSIONS.has(candidate)) return true;
  return (
    candidate.length === 3 &&
    COMMON_WORD_EXCLUSIONS.has(candidate.slice(0, 2)) &&
    SINGLE_SYLLABLE_PARTICLES.has(candidate.charAt(2))
  );
}

/**
 * Finds likely Korean personal names in `text` using the surname heuristic.
 *
 * @param text - Input to scan.
 * @returns One detection per match, in order of appearance. `start` is the
 *   match offset and `end` is `start` plus the name length (exclusive), both
 *   in UTF-16 code units. Confidence is fixed at 0.6.
 */
export function detectName(text: string): PIIDetection[] {
  const results: PIIDetection[] = [];
  // NAME_REGEX is a shared global-flag regex; rewind it so each call scans from the start.
  NAME_REGEX.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = NAME_REGEX.exec(text)) !== null) {
    const name = match[0];
    // Defensive re-check of what the regex already promises: 2-3 Hangul syllables.
    if (name.length < 2 || name.length > 3 || ![...name].every(isHangul)) continue;
    // Skip common Korean words that are not names.
    if (isCommonWord(name)) continue;
    results.push({
      type: "NAME",
      value: name,
      start: match.index,
      end: match.index + name.length,
      confidence: 0.6, // Heuristic — lower confidence
    });
  }
  return results;
}