"""Personal-data masking pipeline with a Gradio front end.

The file is organised in four parts:

* PART 1 - name tagging and institution keyword replacement
* PART 2 - honorific / particle aware re-tagging and mapping refinement
* PART 3 - masking of e-mail, ID, card, phone, address and school details
* PART 4 - the end-to-end pipeline and the Gradio UI
"""

import re

import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# NOTE: these imports are kept from the original file, but hoisted to the top
# so that the definitions written out below take precedence over the
# same-named imports.  Only ``extract_names`` has no definition in this file.
from part1_name_extract_and_tag import extract_names, apply_name_tags
from part2_suffix_expansion_and_mapping import expand_variation_patterns, boost_mapping_from_context
from part3_sensitive_school_masker import (
    postprocess_sensitive_patterns,
    mask_school_names,
    mask_department_names,
    mask_grade_class,
)


# ---------------------------------------------------------------------------
# PART 1: name tagging + institution keyword replacement
# ---------------------------------------------------------------------------

TAG_PREFIX = "N"


def apply_name_tags(text: str, names: list, start_index: int = 100) -> tuple[str, dict]:
    """Replace every occurrence of each name with a numbered tag.

    Example: ``김철수`` becomes ``N100``.

    Args:
        text: Text to tag.
        names: Names to look for.  Duplicates and empty entries are ignored.
        start_index: Number used for the first tag; tags are zero-padded to
            at least three digits.

    Returns:
        A ``(tagged_text, mapping)`` tuple where ``mapping`` maps each tag
        that was actually used to the name it replaced.
    """
    mapping = {}
    tagged_text = text
    counter = start_index

    # Longest names first so a short name cannot split a longer one.
    # dict.fromkeys de-duplicates while keeping input order, which (unlike
    # set()) makes the tag numbering reproducible between runs.
    unique_names = sorted(
        (name for name in dict.fromkeys(names) if name),
        key=len,
        reverse=True,
    )

    for name in unique_names:
        tag = f"{TAG_PREFIX}{counter:03d}"
        # Match the name only: the surrounding punctuation and any trailing
        # particles must stay in the text untouched.
        pattern = re.compile(re.escape(name), re.IGNORECASE)
        tagged_text, hits = pattern.subn(lambda _m, tag=tag: tag, tagged_text)
        # A tag number is consumed only when the name was actually found.
        if hits > 0:
            mapping[tag] = name
            counter += 1

    return tagged_text, mapping


def replace_institution_keywords(text: str, keywords: list, replace_word: str) -> str:
    """Replace every institution keyword with ``replace_word``.

    Matching is case-insensitive.  All keywords are replaced in a single
    pass with the longest keyword tried first, so a keyword that contains
    another one (``사회복지법인 굿네이버스`` vs. ``굿네이버스``) is replaced as a
    whole, and text inserted by the replacement is never re-scanned.

    Args:
        text: Text to process.
        keywords: Keywords to replace.  Duplicates and empty entries are
            ignored.
        replace_word: Literal replacement text.

    Returns:
        The text with every keyword occurrence replaced.
    """
    ordered = sorted(
        (kw for kw in dict.fromkeys(keywords) if kw),
        key=len,
        reverse=True,
    )
    if not ordered:
        # An empty alternation would match at every position.
        return text

    pattern = re.compile("|".join(re.escape(kw) for kw in ordered), re.IGNORECASE)
    # A function replacement keeps backslashes in replace_word literal.
    return pattern.sub(lambda _m: replace_word, text)


# ---------------------------------------------------------------------------
# PART 2: honorific / particle expansion + tag mapping refinement
# ---------------------------------------------------------------------------

# Honorifics and titles that may directly follow a personal name.
COMMON_SUFFIXES = [
    # Family / relationship
    '어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '딸', '아들',
    '조카', '사촌', '이모', '고모', '삼촌', '숙모', '외삼촌', '할머니', '할아버지',
    '외할머니', '외할아버지', '장모', '장인', '며느리', '사위', '부인', '와이프', '신랑',
    '올케', '형수', '제수씨', '매형', '처제', '시누이',
    # Social / education / occupation
    '학생', '초등학생', '중학생', '고등학생', '수험생', '학부모', '선생', '선생님', '교사',
    '교감', '교장', '담임', '반장', '조교수', '교수', '연구원', '강사', '박사', '석사',
    '학사', '보호자', '피해자', '아동', '주민', '당사자', '대상자', '담당자',
    # Workplace / organisation ranks
    '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원', '팀장',
    '본부장', '센터장', '소장', '실장', '총무', '직원', '매니저', '지점장', '사무장',
    # Medical / other
    '의사', '간호사', '간병인', '기사님', '어르신', '님', '씨',
]

# Particles and endings that may follow a name or an honorific.
COMMON_JOSA = [
    # Basic particles
    '이', '가', '을', '를', '은', '는', '의', '도',
    # Place / direction / target
    '에', '에서', '에게', '께서', '으로', '로', '부터', '까지', '한테',
    # Emphasis / contrast / comparison
    '보다', '보다도', '마저', '조차', '조차도', '까지도', '밖에', '만큼', '만큼은',
    '이라도', '이든지', '이나마', '이건', '이란', '이라서', '이지만',
    # Connective particles
    '이며', '이나', '이거나', '이니까', '이라면', '처럼', '대로', '하고', '그리고', '와', '과',
    # Auxiliary / final endings
    '이기도', '이었던', '이었지만', '이어서', '이었다면', '인', '일', '임', '이란', '이라는',
    # Special / compound particles
    '같은', '같아서', '까지는', '뿐만 아니라', '와는', '와도', '하고도', '으로서', '으로써',
]


def _longest_first_alternation(words) -> str:
    """Return a regex alternation of ``words``, escaped, longest first."""
    unique = sorted(dict.fromkeys(words), key=len, reverse=True)
    return "|".join(re.escape(word) for word in unique)


# Built once: these do not depend on the name being processed.
_SUFFIX_ALTERNATION = _longest_first_alternation(COMMON_SUFFIXES)
_JOSA_ALTERNATION = _longest_first_alternation(COMMON_JOSA)
# Whitespace, opening brackets and straight quotes that may precede a name.
_NAME_PREFIX_CLASS = r"""[\s(\["']*"""


def expand_variation_patterns(text: str, mapping: dict) -> str:
    """Tag remaining ``name + honorific + particle`` forms in tagged text.

    For every ``tag -> name`` pair, any occurrence of the name that is still
    present in ``text`` (optionally preceded by whitespace, brackets or
    quotes and followed by an honorific and/or a particle) has the name
    replaced by the tag.  The surrounding prefix, honorific and particle are
    kept as they are.

    Args:
        text: Text that has already been through ``apply_name_tags``.
        mapping: ``tag -> name`` mapping.

    Returns:
        The text with the remaining name occurrences replaced by their tags.
    """
    for tag, base in mapping.items():
        if not base:
            continue
        pattern = re.compile(
            rf"({_NAME_PREFIX_CLASS}){re.escape(base)}"
            rf"((?:{_SUFFIX_ALTERNATION})?(?:{_JOSA_ALTERNATION})?)",
            re.IGNORECASE,
        )
        # Rebuild from the captured groups rather than str.replace(): under
        # IGNORECASE the matched spelling may differ from ``base``.
        text = pattern.sub(
            lambda m, tag=tag: m.group(1) + tag + m.group(2),
            text,
        )
    return text


def boost_mapping_from_context(text: str, mapping: dict) -> dict:
    """Refine the mapping with the honorific that follows each tag.

    The first occurrence of each tag in ``text`` is inspected.  If it is
    followed (directly or after one whitespace character) by an entry of
    ``COMMON_SUFFIXES``, the mapping value becomes ``name + honorific``
    (e.g. ``김철수 선생님``); otherwise the name is kept unchanged.

    Args:
        text: Tagged text.
        mapping: ``tag -> name`` mapping.

    Returns:
        A new dict with the same keys as ``mapping``.
    """
    updated = {}
    for tag, base in mapping.items():
        # (?!\d) stops "N100" from matching the front of "N1000".
        pattern = re.compile(
            rf"{re.escape(tag)}(?!\d)(\s?(?:{_SUFFIX_ALTERNATION}))?"
        )
        match = pattern.search(text)
        honorific = match.group(1) if match else None
        updated[tag] = base + honorific if honorific else base
    return updated


# ---------------------------------------------------------------------------
# PART 3: sensitive-data masker + school / grade / department masker
# ---------------------------------------------------------------------------

_EMAIL_RE = re.compile(r"[\w\.-]+@[\w\.-]+")
# 16-digit card number, optionally grouped 4-4-4-4.
_CARD_RE = re.compile(r"\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}")
# Resident registration number: 6 digits + 7 digits.
_RRN_RE = re.compile(r"\d{6}[- ]?\d{7}")
# Mobile-style number: 3-4-4 digits, separators optional.
_PHONE_RE = re.compile(r"\d{3}[- ]?\d{4}[- ]?\d{4}")
# Landline-style number with explicit separators, e.g. 02-123-4567.
_LANDLINE_RE = re.compile(r"(?<!\d)\d{2,3}[- ]\d{3,4}[- ]\d{4}(?!\d)")
_DONG_RE = re.compile(r"\d{1,3}동")
_HO_RE = re.compile(r"\d{1,4}호")


def postprocess_sensitive_patterns(text: str) -> str:
    """Mask e-mail addresses, ID / card / phone numbers and unit numbers.

    The substitutions run in a fixed order: e-mail, card number, resident
    registration number, phone numbers, then building (``동``) and unit
    (``호``) numbers.  The card pattern runs before the shorter numeric
    patterns so that a 16-digit number is masked as a whole instead of
    leaving its first or last digits visible.

    Args:
        text: Text to mask.

    Returns:
        The masked text.
    """
    text = _EMAIL_RE.sub("******@***.***", text)
    text = _CARD_RE.sub("****-****-****-****", text)
    text = _RRN_RE.sub("******-*******", text)
    text = _PHONE_RE.sub("***-****-****", text)
    text = _LANDLINE_RE.sub("***-****-****", text)
    text = _DONG_RE.sub("***동", text)
    text = _HO_RE.sub("****호", text)
    return text


# Initial consonants U+1100..U+1112, indexed by the syllable's initial index.
# NOTE(review): these are conjoining Hangul Jamo; the examples quoted for the
# school / department maskers use compatibility jamo (U+3131 block).  Output
# code points are left unchanged here - confirm which form is wanted.
_CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]


def to_chosung(text: str) -> str:
    """Replace every Hangul syllable with its initial consonant.

    Characters outside the range ``가``..``힣`` are copied unchanged.

    Args:
        text: Text to convert (used for school and department names).

    Returns:
        The converted text, one output character per input character.
    """
    first_syllable = ord('가')
    converted = []
    for ch in text:
        if '가' <= ch <= '힣':
            # Each initial consonant covers 21 vowels * 28 finals = 588
            # consecutive syllables.
            converted.append(_CHOSUNG_LIST[(ord(ch) - first_syllable) // 588])
        else:
            converted.append(ch)
    return "".join(converted)


def mask_school_names(text: str) -> str:
    """Reduce school names to initial consonants, keeping the school type.

    A run of 2-20 Hangul syllables followed by ``초등학교``, ``중학교``,
    ``고등학교`` or ``대학교`` has the leading run passed through
    ``to_chosung``; the school-type word is kept.
    """
    def replace_school(m):
        return to_chosung(m.group(1)) + m.group(2)

    return re.sub(
        r"([가-힣]{2,20})(초등학교|중학교|고등학교|대학교)",
        replace_school,
        text,
    )


def mask_department_names(text: str) -> str:
    """Reduce department names to initial consonants, keeping ``학과``.

    A run of 2-20 Hangul syllables followed by ``학과`` has the leading run
    passed through ``to_chosung``.
    """
    return re.sub(
        r"([가-힣]{2,20})학과",
        lambda m: to_chosung(m.group(1)) + "학과",
        text,
    )


def mask_grade_class(text: str) -> str:
    """Mask grade and class numbers (``2학년 3반`` becomes ``*학년 *반``).

    A grade without a class (``2학년``) becomes ``*학년`` only; the class part
    is emitted only when one was present.  Class numbers may have one or two
    digits.
    """
    def _mask(m):
        return "*학년 *반" if m.group(1) else "*학년"

    return re.sub(r"\d학년(\s?\d{1,2}반)?", _mask, text)


# ---------------------------------------------------------------------------
# PART 4: full pipeline + Gradio UI
# ---------------------------------------------------------------------------

def apply_full_masking(text: str, keyword_str: str, replace_word: str):
    """Run the complete masking pipeline on ``text``.

    Steps, in order: institution keyword replacement, sensitive-pattern
    masking, school / department / grade masking, name extraction and
    tagging, re-tagging of honorific / particle variants, and mapping
    refinement.

    Args:
        text: Source text.
        keyword_str: Comma-separated institution keywords.
        replace_word: Text that replaces each institution keyword.

    Returns:
        A ``(masked_text, mapping_text)`` tuple; ``mapping_text`` has one
        ``tag → value`` line per tag.
    """
    # 1. Institution keyword replacement.
    keywords = [k.strip() for k in (keyword_str or "").split(",") if k.strip()]
    text = replace_institution_keywords(text, keywords, replace_word)

    # 2. Sensitive data, then school / department / grade masking.
    text = postprocess_sensitive_patterns(text)
    text = mask_school_names(text)
    text = mask_department_names(text)
    text = mask_grade_class(text)

    # 3. Name extraction + tagging.
    names = extract_names(text)
    tagged, mapping = apply_name_tags(text, names)

    # 4. Expand derived expressions and refine the mapping.
    tagged = expand_variation_patterns(tagged, mapping)
    mapping = boost_mapping_from_context(tagged, mapping)

    # 5. Render the mapping for display.
    mapping_text = "\n".join(f"{k} → {v}" for k, v in mapping.items())
    return tagged, mapping_text


# UI
with gr.Blocks() as demo:
    gr.Markdown("🧠 **v5.0 마스킹 통합 시스템** — 키워드, 이름, 개인정보, 학교 마스킹")
    input_text = gr.Textbox(lines=15, label="📄 원문 텍스트")
    keyword_input = gr.Textbox(
        lines=1,
        label="기관 키워드 (쉼표로 구분)",
        value="굿네이버스, 사회복지법인 굿네이버스",
    )
    replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
    run_button = gr.Button("🚀 실행")
    masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
    mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)

    run_button.click(
        fn=apply_full_masking,
        inputs=[input_text, keyword_input, replace_input],
        outputs=[masked_output, mapping_output],
    )

demo.launch()