/**
 * Korean name detection — heuristic based on common surnames.
 *
 * A candidate is one of the listed surnames followed by 1-2 Hangul syllables,
 * delimited on the left by start-of-text / whitespace / punctuation and on the
 * right by a particle, whitespace, punctuation or end-of-text.
 */

import type { PIIDetection } from "@klawn/shared";

/** Common Korean surnames (50 entries) used as the first syllable of a candidate. */
const TOP_SURNAMES: readonly string[] = [
  "김", "이", "박", "최", "정", "강", "조", "윤", "장", "임",
  "한", "오", "서", "신", "권", "황", "안", "송", "류", "전",
  "홍", "고", "문", "양", "손", "배", "백", "허", "유", "남",
  "심", "노", "하", "곽", "성", "차", "주", "우", "구", "민",
  "라", "지", "엄", "원", "천", "방", "공", "진", "감", "변",
];

/**
 * Returns true when the first UTF-16 code unit of `char` is a precomposed
 * Hangul syllable (U+AC00–U+D7A3).
 */
function isHangul(char: string): boolean {
  const code = char.charCodeAt(0);
  return code >= 0xac00 && code <= 0xd7a3;
}

/** Common Korean particles that follow names (regex alternation, `|`-separated). */
const PARTICLES =
  "와|과|의|은|는|이|가|을|를|에|에게|한테|께|도|만|부터|까지|처럼|같이|보다|로|으로|랑|이랑";

/** The same particles as a set, for exact-suffix lookups. */
const PARTICLE_SET: ReadonlySet<string> = new Set(PARTICLES.split("|"));

/** Character class of whitespace / punctuation accepted on either side of a name. */
const BOUNDARY_CLASS = "[\\s,;:()\\[\\]{}]";

/**
 * Builds the global name regex: surname + 1-2 Hangul syllables.
 *
 * - Lookbehind: start of text or a boundary character.
 * - Lookahead: a particle, end of text, or a boundary character.
 *
 * The quantifier is greedy, so a two-syllable word followed by a one-syllable
 * particle and then a boundary is matched as three syllables
 * (e.g. "오늘은 " yields "오늘은"). `detectName` compensates for that.
 */
function buildNameRegex(): RegExp {
  const surnames = TOP_SURNAMES.join("|");
  return new RegExp(
    `(?<=^|${BOUNDARY_CLASS})(?:${surnames})[가-힣]{1,2}(?=(?:${PARTICLES})|$|${BOUNDARY_CLASS})`,
    "g",
  );
}

const NAME_REGEX = buildNameRegex();

/**
 * Common Korean words that start with a surname syllable but are NOT names.
 * Reduces false positives from the heuristic name detector.
 *
 * NOTE(review): "서비" and "오른" look like prefixes of longer words
 * (서비스, 오른쪽). Lookups are exact-match (optionally plus one trailing
 * particle), so those longer words are still reported — confirm intent.
 */
const COMMON_WORD_EXCLUSIONS: ReadonlySet<string> = new Set([
  // 오 (surname)
  "오늘", "오전", "오후", "오른",
  // 전 (surname)
  "전화", "전체", "전부", "전달", "전자", "전국", "전문", "전혀",
  // 서 (surname)
  "서버", "서류", "서비", "서울",
  // 주 (surname)
  "주소", "주문", "주요", "주의", "주간", "주말",
  // 한 (surname)
  "한국", "한번", "한편",
  // 고 (surname)
  "고객",
  // 최 (surname)
  "최근", "최대", "최소", "최초",
  // 공 (surname)
  "공지", "공유", "공간",
  // 방 (surname)
  "방법", "방금", "방문",
  // 구 (surname)
  "구매",
  // 남 (surname)
  "남은", "남자",
  // 문 (surname)
  "문의", "문서", "문제",
  // 신 (surname)
  "신청", "신규",
  // 배 (surname)
  "배송",
  // 차 (surname)
  "차이",
  // 안 (surname)
  "안내", "안전",
  // 성 (surname)
  "성공",
  // 강 (surname)
  "강화",
  // 임 (surname)
  "임시",
  // 원 (surname)
  "원래",
]);

/**
 * Returns true when `candidate` is an excluded common word, either verbatim or
 * as a two-syllable excluded word with a single-syllable particle attached
 * (the shape produced when the greedy regex swallows the particle).
 *
 * Trade-off: a genuine three-syllable name whose first two syllables are an
 * excluded word and whose last syllable is a particle is skipped as well.
 */
function isCommonWord(candidate: string): boolean {
  if (COMMON_WORD_EXCLUSIONS.has(candidate)) return true;
  if (candidate.length !== 3) return false;
  return (
    COMMON_WORD_EXCLUSIONS.has(candidate.slice(0, 2)) &&
    PARTICLE_SET.has(candidate.slice(2))
  );
}

/**
 * Scans `text` for likely Korean personal names.
 *
 * @param text - Text to scan.
 * @returns One detection per candidate, in order of appearance. `start`/`end`
 *   are UTF-16 offsets into `text` (`end` exclusive); `confidence` is fixed at
 *   0.6 because the match is purely heuristic.
 */
export function detectName(text: string): PIIDetection[] {
  const results: PIIDetection[] = [];

  // The regex is module-level and global, so reset its cursor before each scan.
  NAME_REGEX.lastIndex = 0;

  let match: RegExpExecArray | null;
  while ((match = NAME_REGEX.exec(text)) !== null) {
    const name = match[0];

    // Defensive re-validation: only 2-3 Hangul syllables qualify.
    const wellFormed =
      name.length >= 2 && name.length <= 3 && [...name].every(isHangul);
    if (!wellFormed) continue;

    // Skip common words, including "<word><particle>" forms such as "오늘은".
    if (isCommonWord(name)) continue;

    results.push({
      type: "NAME",
      value: name,
      start: match.index,
      end: match.index + name.length,
      confidence: 0.6, // Heuristic — lower confidence
    });
  }

  return results;
}