Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -192,63 +192,64 @@ def sanitize_sensitive_info(text, keyword_string, replace_word):
|
|
| 192 |
|
| 193 |
# 🔹 마스킹 함수 (정리된 최종본)
|
| 194 |
def extract_names(text):
|
| 195 |
-
|
|
|
|
| 196 |
|
| 197 |
def refactored_mask_names(text, names):
|
|
|
|
| 198 |
mapping = {}
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
return text
|
| 209 |
|
| 210 |
-
def sanitize_sensitive_info(text,
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
for kw in keywords:
|
| 213 |
-
pattern = rf"{re.escape(kw)}(?=\W|$)"
|
| 214 |
text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
|
| 215 |
|
| 216 |
-
|
| 217 |
-
text = re.sub(r"
|
| 218 |
return text
|
| 219 |
|
| 220 |
-
|
| 221 |
-
# ✅ 핵심 apply_masking 함수
|
| 222 |
def apply_masking(text, keyword_str, replace_word):
|
| 223 |
keywords = [kw.strip() for kw in keyword_str.split(",") if kw.strip()]
|
| 224 |
-
|
| 225 |
names = extract_names(text)
|
| 226 |
masked_text, name_mapping = refactored_mask_names(text, names)
|
| 227 |
-
sanitized_text = sanitize_sensitive_info(masked_text,
|
| 228 |
final_text = final_name_remask_exact_only(sanitized_text, name_mapping)
|
| 229 |
-
|
| 230 |
mapping_table = "\n".join(f"{k} → {v}" for k, v in name_mapping.items())
|
| 231 |
return final_text, mapping_table
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
# ✅ Gradio UI
|
| 235 |
-
with gr.Blocks() as demo:
|
| 236 |
-
gr.Markdown("""
|
| 237 |
-
🛡️ **민감정보 마스킹 [ver2]**
|
| 238 |
-
이름 + 민감정보 + 기관 키워드 마스킹기 (초성 기반 + 후처리 강화)
|
| 239 |
-
""")
|
| 240 |
-
|
| 241 |
-
input_text = gr.Textbox(lines=10, label="📅 원본 텍스트 입력")
|
| 242 |
-
keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
|
| 243 |
-
replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
|
| 244 |
-
run_button = gr.Button("🚀 마스킹 실행")
|
| 245 |
-
masked_output = gr.Textbox(lines=10, label="🔐 마스킹 결과")
|
| 246 |
-
mapping_output = gr.Textbox(lines=5, label="🏷️ 이름 태그 매핑", interactive=False)
|
| 247 |
-
|
| 248 |
-
run_button.click(
|
| 249 |
-
fn=apply_masking,
|
| 250 |
-
inputs=[input_text, keyword_input, replace_input],
|
| 251 |
-
outputs=[masked_output, mapping_output]
|
| 252 |
-
)
|
| 253 |
-
|
| 254 |
demo.launch()
|
|
|
|
| 192 |
|
| 193 |
# 🔹 마스킹 함수 (정리된 최종본)
|
| 194 |
def extract_names(text, known_names=("홍길동",)):
    """Return the known names that occur in ``text``.

    This is a placeholder detector: it performs plain substring matching
    against a fixed candidate list rather than real name recognition.

    Args:
        text: Text to scan. ``None`` or an empty string yields no names.
        known_names: Candidate names to look for. The default is the single
            test name the original stub detected, so existing one-argument
            callers see the same result as before.

    Returns:
        A list of the candidates found in ``text``, in ``known_names`` order.
    """
    if not text:
        return []
    # Skip empty candidates: "" is a substring of every string.
    return [name for name in known_names if name and name in text]
|
| 197 |
|
| 198 |
def refactored_mask_names(text, names):
    """Replace every stand-alone occurrence of each name with a numbered tag.

    A name is matched only as a whole token: it must not be preceded by a
    word/Hangul character, and it must be followed either by a non-word
    character (or end of text) or by exactly one of the listed particles
    (josa), which is kept after the tag ("홍길동은" -> "<tag>은").

    Each distinct name gets exactly one tag, and all of its occurrences are
    replaced in a single pass, whichever particle follows them. Previously
    only the first particle that matched was handled, so later occurrences
    with a different particle were left unmasked.

    Args:
        text: Source text.
        names: Iterable of names to mask. Empty entries and duplicates are
            ignored.

    Returns:
        A ``(masked_text, mapping)`` tuple, where ``mapping`` maps each tag
        (``TAG_PREFIX`` followed by a zero-padded three-digit counter) to the
        original name. Names that never occur in ``text`` get no tag.
    """
    particles = ["은", "는", "이", "가", "을", "를", "께서", "도", "만", "의", "에서"]
    # Longest first so a multi-character particle is preferred by the alternation.
    particle_alt = "|".join(
        re.escape(p) for p in sorted(particles, key=len, reverse=True)
    )

    # dict.fromkeys de-duplicates while keeping first-seen order; an empty
    # name would otherwise match stand-alone particles.
    unique_names = list(dict.fromkeys(name for name in names if name))
    patterns = {
        name: re.compile(
            rf"(?<![\w가-힣]){re.escape(name)}({particle_alt})?(?![\w가-힣])"
        )
        for name in unique_names
    }

    # Keep the original numbering order: names seen with a particle are
    # tagged first, then names that only occur bare.
    with_particle = [
        name
        for name in unique_names
        if any(m.group(1) for m in patterns[name].finditer(text))
    ]
    bare_only = [
        name
        for name in unique_names
        if name not in with_particle and patterns[name].search(text)
    ]

    masked = text
    mapping = {}
    for counter, name in enumerate(with_particle + bare_only, start=1):
        tag = f"{TAG_PREFIX}{counter:03d}"
        mapping[tag] = name
        # Callable replacement inserts the tag verbatim (no template escapes)
        # and re-attaches the particle when one was matched.
        masked = patterns[name].sub(
            lambda m, tag=tag: tag + (m.group(1) or ""), masked
        )
    return masked, mapping
|
| 225 |
+
|
| 226 |
+
def final_name_remask_exact_only(text, mapping_dict):
    """Replace stand-alone occurrences of each mapped name with its tag.

    Only exact tokens are replaced: a name directly preceded or followed by
    a word/Hangul character (for example a name with a particle attached)
    is left as it is.

    Args:
        text: Text to process.
        mapping_dict: Mapping of tag -> original name.

    Returns:
        ``text`` with every exact occurrence of each name replaced by its tag.
    """
    for tag, name in mapping_dict.items():
        if not name:
            # With an empty name the pattern is just two lookarounds and
            # would insert the tag at empty positions in the text.
            continue
        pattern = rf"(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])"
        # A callable replacement inserts the tag verbatim; a string would be
        # parsed as a template, so backslashes in a tag could raise.
        text = re.sub(pattern, lambda _m, tag=tag: tag, text)
    return text
|
| 231 |
|
| 232 |
+
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Mask grade/class details, organisation keywords, ID numbers and street numbers.

    The text is first passed through ``postprocess_sensitive_patterns``,
    ``mask_school_names`` and ``mask_department`` (defined elsewhere in this
    file), then the regex-based replacements below are applied in order.

    Args:
        text: Text to sanitise.
        keyword_string: Comma-separated keywords to replace; blank entries
            are dropped.
        replace_word: Literal text substituted for every keyword match.

    Returns:
        The sanitised text.
    """
    text = postprocess_sensitive_patterns(text)
    text = mask_school_names(text)
    text = mask_department(text)
    # "3학년 2반" -> "*학년 *반"; the class part is optional.
    text = re.sub(
        r"(\d)학년(\s?(\d)반)?",
        lambda m: "*학년" + (" *반" if m.group(3) else ""),
        text,
    )

    user_keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
    # NOTE(review): entries of REGEX_KEYWORDS_TO_MASK go through re.escape
    # below, so they are matched literally despite the name; confirm intended.
    keywords = list(dict.fromkeys(user_keywords + list(REGEX_KEYWORDS_TO_MASK)))
    # Longest first: otherwise a short keyword that is part of a longer one
    # is replaced first and the longer phrase is only partly masked.
    keywords.sort(key=len, reverse=True)
    for kw in keywords:
        # The keyword must be followed by a non-word character or end of text.
        pattern = rf"{re.escape(kw)}(?=\W|$)"
        # Callable replacement: replace_word is user input and must be
        # inserted verbatim, not parsed as a replacement template.
        text = re.sub(pattern, lambda _m: replace_word, text, flags=re.IGNORECASE)

    # 6 digits, a hyphen, 7 digits: only the first digit after the hyphen is kept.
    # NOTE(review): the star counts (7 and 5) differ from the digit counts
    # they replace (6 and 6); confirm whether that is intended.
    text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
    # Road name ending in 대로/로/길 followed by a number (and optional
    # 호/번길/가 suffix): the number part becomes " ***".
    text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
    return text
|
| 246 |
|
|
|
|
|
|
|
| 247 |
def apply_masking(text, keyword_str, replace_word):
    """Run the full masking pipeline on ``text``.

    Steps, in order: detect names, replace them with tags, sanitise other
    sensitive patterns and organisation keywords, then re-apply the name
    tags to any exact name that is still present.

    Args:
        text: Original input text.
        keyword_str: Comma-separated organisation keywords; parsed inside
            ``sanitize_sensitive_info``.
        replace_word: Text substituted for each keyword.

    Returns:
        A ``(final_text, mapping_table)`` tuple. ``mapping_table`` has one
        ``"tag → name"`` line per masked name, joined by newlines.
    """
    names = extract_names(text)
    masked_text, name_mapping = refactored_mask_names(text, names)
    sanitized_text = sanitize_sensitive_info(masked_text, keyword_str, replace_word)
    # Second pass over the sanitised text with the same tag mapping.
    final_text = final_name_remask_exact_only(sanitized_text, name_mapping)
    mapping_table = "\n".join(f"{k} → {v}" for k, v in name_mapping.items())
    return final_text, mapping_table
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
demo.launch()
|