blueradiance committed on
Commit
d39741b
·
verified ·
1 Parent(s): c944b5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -42
app.py CHANGED
@@ -192,63 +192,64 @@ def sanitize_sensitive_info(text, keyword_string, replace_word):
192
 
193
  # 🔹 마스킹 함수 (정리된 최종본)
194
  def extract_names(text):
195
- return ["홍길동"] # 예시
 
196
 
197
  def refactored_mask_names(text, names):
 
198
  mapping = {}
199
- for i, name in enumerate(names):
200
- tag = f"N{i+1:03}"
201
- text = text.replace(name, tag)
202
- mapping[name] = tag
203
- return text, mapping
204
-
205
- def final_name_remask_exact_only(text, mapping):
206
- for name, tag in mapping.items():
207
- text = text.replace(name, tag)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  return text
209
 
210
- def sanitize_sensitive_info(text, keywords, replace_word):
211
- # 기관 키워드 치환
 
 
 
 
 
212
  for kw in keywords:
213
- pattern = rf"{re.escape(kw)}(?=\W|$)" # 조사 대응
214
  text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
215
 
216
- # 이메일 마스킹 예시
217
- text = re.sub(r"\b[\w\.-]+@", "******@", text)
218
  return text
219
 
220
-
221
- # ✅ 핵심 apply_masking 함수
222
  def apply_masking(text, keyword_str, replace_word):
223
  keywords = [kw.strip() for kw in keyword_str.split(",") if kw.strip()]
224
-
225
  names = extract_names(text)
226
  masked_text, name_mapping = refactored_mask_names(text, names)
227
- sanitized_text = sanitize_sensitive_info(masked_text, keywords, replace_word)
228
  final_text = final_name_remask_exact_only(sanitized_text, name_mapping)
229
-
230
  mapping_table = "\n".join(f"{k} → {v}" for k, v in name_mapping.items())
231
  return final_text, mapping_table
232
-
233
-
234
- # ✅ Gradio UI
235
  # Build the Gradio UI: a header, three input boxes, a run button and two
  # output boxes, all inside one Blocks container bound to `demo`.
  with gr.Blocks() as demo:
      gr.Markdown("""
      🛡️ **민감정보 마스킹 [ver2]**
      이름 + 민감정보 + 기관 키워드 마스킹기 (초성 기반 + 후처리 강화)
      """)

      # Inputs: source text, comma-separated keywords, replacement text.
      input_text = gr.Textbox(lines=10, label="📅 원본 텍스트 입력")
      keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
      replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
      run_button = gr.Button("🚀 마스킹 실행")
      # Outputs: masked text and the name/tag mapping table (read-only).
      masked_output = gr.Textbox(lines=10, label="🔐 마스킹 결과")
      mapping_output = gr.Textbox(lines=5, label="🏷️ 이름 태그 매핑", interactive=False)

      # Clicking the button calls apply_masking with the three inputs and
      # writes its two return values to the two output boxes.
      run_button.click(
          fn=apply_masking,
          inputs=[input_text, keyword_input, replace_input],
          outputs=[masked_output, mapping_output]
      )

  demo.launch()
 
192
 
193
  # 🔹 마스킹 함수 (정리된 최종본)
194
  def extract_names(text):
195
+ # 이름이 '홍길동'이면 테스트으로 감지
196
+ return ["홍길동"] if "홍길동" in text else []
197
 
198
  def refactored_mask_names(text, names):
199
+ counter = 1
200
  mapping = {}
201
+ used_names = set()
202
+ masked = text
203
+
204
+ for name in names:
205
+ # 조사 구분 있는 경우
206
+ for josa in ["은", "는", "이", "가", "을", "를", "께서", "도", "만", "의", "에서"]:
207
+ pattern = rf'(?<![\w가-힣]){re.escape(name)}{josa}(?![\w가-힣])'
208
+ if re.search(pattern, masked):
209
+ tag = f"{TAG_PREFIX}{counter:03d}"
210
+ mapping[tag] = name
211
+ masked = re.sub(pattern, tag + josa, masked)
212
+ counter += 1
213
+ used_names.add(name)
214
+ break
215
+ for name in names:
216
+ if name in used_names:
217
+ continue
218
+ pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
219
+ if re.search(pattern, masked):
220
+ tag = f"{TAG_PREFIX}{counter:03d}"
221
+ mapping[tag] = name
222
+ masked = re.sub(pattern, tag, masked)
223
+ counter += 1
224
+ return masked, mapping
225
+
226
+ def final_name_remask_exact_only(text, mapping_dict):
227
+ for tag, name in mapping_dict.items():
228
+ pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
229
+ text = re.sub(pattern, tag, text)
230
  return text
231
 
232
+ def sanitize_sensitive_info(text, keyword_string, replace_word):
233
+ text = postprocess_sensitive_patterns(text)
234
+ text = mask_school_names(text)
235
+ text = mask_department(text)
236
+ text = re.sub(r"(\d)학년(\s?(\d)반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)
237
+
238
+ keywords = [k.strip() for k in keyword_string.split(",") if k.strip()] + list(REGEX_KEYWORDS_TO_MASK)
239
  for kw in keywords:
240
+ pattern = rf"{re.escape(kw)}(?=\W|$)"
241
  text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
242
 
243
+ text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
244
+ text = re.sub(r"([-]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
245
  return text
246
 
 
 
247
  def apply_masking(text, keyword_str, replace_word):
248
  keywords = [kw.strip() for kw in keyword_str.split(",") if kw.strip()]
 
249
  names = extract_names(text)
250
  masked_text, name_mapping = refactored_mask_names(text, names)
251
+ sanitized_text = sanitize_sensitive_info(masked_text, keyword_str, replace_word)
252
  final_text = final_name_remask_exact_only(sanitized_text, name_mapping)
 
253
  mapping_table = "\n".join(f"{k} → {v}" for k, v in name_mapping.items())
254
  return final_text, mapping_table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  # Launch the `demo` app object (defined elsewhere in app.py).
  demo.launch()