flowos's picture
iter 14: hybrid regex + fine-tuned EXAONE 4.0 1.2B (F1=0.930)
1bcb098 verified
/**
 * 한국 주소 (Korean Address) detection.
 * Heuristic: detects Korean address patterns by structural markers.
 * Markers: 시, 도, 구, 군, 읍, 면, 동, 리, 로, 길, 번지, 아파트, 층, 호
 */
import type { PIIDetection } from "@klawn/shared";
// Korean address structural pattern:
//   [시/도] [시/군/구]+ [동/읍/면/리/로/길] [번지] [건물/아파트] [동/층/호]
//   i.e. province/metro city, city/county/district (repeatable),
//   neighborhood/town/township/village/road/street, lot number,
//   building/apartment, building/floor/unit.
// Supports variable admin levels (e.g., 경기도 성남시 분당구 판교로 789번지).
//
// How the expression is built:
//   1. A province or metropolitan-city short name, optionally followed by its
//      administrative suffix (특별시, 광역시, 특별자치시, 특별자치도, 도).
//   2. One to four Hangul words that each end in an administrative or road
//      marker (시, 군, 구, 동, 읍, 면, 리, 로, 길), separated by optional whitespace.
//   3. A lazily matched run of Hangul, digits, whitespace or hyphens, then a
//      number with an optional 번지/번 suffix. Because that run is lazy, the
//      match ends at the first number after the administrative words, so any
//      trailing building/floor/unit text is not part of the match.
//
// The `g` flag makes this regex stateful (`lastIndex`); reset `lastIndex` to 0
// before starting a new scan with `exec`.
const ADDRESS_REGEX =
  /(?:서울|부산|대구|인천|광주|대전|울산|세종|경기|강원|충북|충남|전북|전남|경북|경남|제주)(?:특별시|광역시|특별자치시|특별자치도|도)?\s?(?:[\uAC00-\uD7A3]+(?:시|군|구|동|읍|면|리|로|길)\s?){1,4}[\uAC00-\uD7A3\d\s\-]*?\d+(?:번지|번)?/g;
// Simpler fallback: detect by marker density.
// Markers: 시 (city), 구 (district), 동 (neighborhood), 로 (road), 길 (street) —
// each of these five carries a trailing space — plus 번지 (lot number),
// 아파트 (apartment), 층 (floor) and 호 (unit).
// NOTE(review): nothing in the visible part of this file reads MARKERS; the
// density fallback appears to be unimplemented — confirm before relying on it.
const MARKERS = ["시 ", "구 ", "동 ", "로 ", "길 ", "번지", "아파트", "층", "호"];
/**
 * Finds Korean street/lot addresses in `text` using the module-level
 * `ADDRESS_REGEX`.
 *
 * @param text - Input string to scan.
 * @returns One `ADDRESS` detection per regex match, in order of appearance.
 *   `start`/`end` are the match's offsets into `text` (end exclusive), `value`
 *   is the matched text with surrounding whitespace trimmed, and `confidence`
 *   is fixed at 0.8.
 */
export function detectAddress(text: string): PIIDetection[] {
  const detections: PIIDetection[] = [];

  // Primary: structured regex.
  // ADDRESS_REGEX is global and therefore stateful; rewind it so every call
  // scans from the beginning of `text`.
  ADDRESS_REGEX.lastIndex = 0;

  for (
    let hit = ADDRESS_REGEX.exec(text);
    hit !== null;
    hit = ADDRESS_REGEX.exec(text)
  ) {
    const raw = hit[0];
    const start = hit.index;

    detections.push({
      type: "ADDRESS",
      value: raw.trim(),
      start,
      // Offsets are derived from the untrimmed match length.
      end: start + raw.length,
      confidence: 0.8,
    });
  }

  return detections;
}