blueradiance committed on
Commit
ee2c558
·
verified ·
1 Parent(s): 8959389

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -95
app.py CHANGED
@@ -38,13 +38,13 @@ def extract_names(text):
38
  if len(name) >= 2 and name not in names:
39
  names.append(name)
40
 
 
41
  title_suffixes = [
42
  '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원', '실장', '팀장', '소장', '국장', '본부장',
43
  '선생님', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '박사', '석사', '학사',
44
  '학생', '고등학생', '중학생', '초등학생', '학부모', '수험생',
45
  '주임', '총무', '회장', '부회장', '사무장', '간호사', '의사', '원장', '기사님', '매니저', '지점장'
46
  ]
47
-
48
  pattern = r'\b([가-힣]{2,4})(' + '|'.join(title_suffixes) + r')\b'
49
  matches = re.findall(pattern, text)
50
  for match in matches:
@@ -52,6 +52,25 @@ def extract_names(text):
52
  if name not in names:
53
  names.append(name)
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  return names
56
 
57
  def refactored_mask_names(original_text, names, start_counter=100):
@@ -95,97 +114,3 @@ def to_chosung(text):
95
  else:
96
  result += ch
97
  return result
98
-
99
- def mask_department(text):
100
- text = re.sub(r"([가-힣]{2,20}학과)", lambda m: to_chosung(m.group(1)[:-2]) + "학과", text)
101
- return text
102
-
103
- def mask_general_human_terms(text):
104
- human_terms = [
105
- '엄마', '아빠', '어머니', '아버지', '부모', '부모님', '자식', '아들', '딸',
106
- '할아버지', '할머니', '외할아버지', '외할머니',
107
- '형', '누나', '오빠', '언니', '동생', '형제', '자매',
108
- '이모', '고모', '삼촌', '외삼촌', '숙모', '고모부', '이모부', '조카', '손자', '손녀', '사촌',
109
- '사위', '며느리', '장모', '장인', '처제', '시누이', '형수', '제수씨', '매형', '올케',
110
- '아동', '아이', '피해자', '당사자', '보호자', '가족',
111
- r'[가-힣]{1,3}씨', r'[가-힣]{1,3}님', r'[가-힣]{1,3}양', r'[가-힣]{1,3}군', r'[가-힣]{1,3}어르신'
112
- ]
113
-
114
- for term in human_terms:
115
- pattern = rf'\b{term}\b'
116
- text = re.sub(pattern, '○○○', text)
117
-
118
- return text
119
-
120
- def sanitize_sensitive_info(text, keyword_string, replace_word):
121
- text = mask_school_names(text)
122
- text = mask_department(text)
123
- text = mask_general_human_terms(text)
124
-
125
- text = re.sub(r"(\d)학년(\s?(\d)반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)
126
- text = re.sub(r"(\d)학년\s?(\d)반", r"*학년 *반", text)
127
-
128
- keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
129
- for kw in keywords:
130
- pattern = rf"\b{re.escape(kw)}\b"
131
- text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
132
- text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
133
- text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
134
- text = re.sub(r"(\d{1,3})번지", r"***번지", text)
135
- text = re.sub(r"(\d{1,3})동", r"***동", text)
136
- text = re.sub(r"(\d{1,4})호", r"****호", text)
137
- text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
138
- text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
139
- text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
140
- text = re.sub(r"(\d{2,6})[-]?(\d{2,6})[-]?(\d{2,6})",
141
- lambda m: f"{m.group(1)[:2]}{'*'*(len(m.group(1))-2)}{'*'*len(m.group(2))}{m.group(3)[-4:]}", text)
142
- text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
143
- lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
144
- text = re.sub(r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})",
145
- lambda m: f"{m.group(1)}.{m.group(2)}.*.*", text)
146
- text = re.sub(r"([가-힣]{1,10})(은행|동|로|길)\s?([\d\-]{4,})",
147
- lambda m: m.group(1) + m.group(2) + " " + re.sub(r"\d", "*", m.group(3)), text)
148
-
149
- return text
150
-
151
- def final_name_remask_exact_only(text, mapping_dict):
152
- for tag, name in mapping_dict.items():
153
- pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
154
- text = re.sub(pattern, tag, text)
155
- return text
156
-
157
- def apply_masking(text, keywords, replace_word):
158
- names = extract_names(text)
159
- masked, mapping = refactored_mask_names(text, names)
160
- sanitized = sanitize_sensitive_info(masked, keywords, replace_word)
161
- sanitized = final_name_remask_exact_only(sanitized, mapping)
162
- mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
163
- return sanitized, mapping_table
164
-
165
- def remask_with_mapping(text, mapping_string):
166
- mapping = {}
167
- for line in mapping_string.strip().split("\n"):
168
- if "→" in line:
169
- tag, name = line.split("→")
170
- mapping[tag.strip()] = name.strip()
171
- for tag, name in mapping.items():
172
- pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
173
- text = re.sub(pattern, tag, text)
174
- return text
175
-
176
- with gr.Blocks() as demo:
177
- gr.Markdown("""
178
- 🛡️ **민감정보 마스킹 [땡땡이 마스킹]**
179
- 이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반)
180
- ⚠️ *완벽하지 않을 수 있습니다. 반드시 직접 최종 점검하세요.*
181
- """)
182
- input_text = gr.Textbox(lines=15, label="📥 원본 텍스트 입력")
183
- keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, good neighbors, gn, 사회복지법인 굿네이버스")
184
- replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
185
- run_button = gr.Button("🚀 마스킹 실행")
186
- masked_output = gr.Textbox(lines=15, label="🔐 마스킹된 텍스트")
187
- mapping_output = gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑", interactive=False)
188
-
189
- run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
190
-
191
- demo.launch()
 
38
  if len(name) >= 2 and name not in names:
39
  names.append(name)
40
 
41
+ # 붙임형 직함 기반
42
  title_suffixes = [
43
  '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원', '실장', '팀장', '소장', '국장', '본부장',
44
  '선생님', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '박사', '석사', '학사',
45
  '학생', '고등학생', '중학생', '초등학생', '학부모', '수험생',
46
  '주임', '총무', '회장', '부회장', '사무장', '간호사', '의사', '원장', '기사님', '매니저', '지점장'
47
  ]
 
48
  pattern = r'\b([가-힣]{2,4})(' + '|'.join(title_suffixes) + r')\b'
49
  matches = re.findall(pattern, text)
50
  for match in matches:
 
52
  if name not in names:
53
  names.append(name)
54
 
55
+ # 띄어쓰기 있는 지칭어 형태에서도 이름 추출
56
+ honorific_suffixes = [
57
+ '어머니', '아버지', '엄마', '아빠', '할머니', '할아버지', '외할머니', '외할아버지',
58
+ '형', '누나', '언니', '오빠', '동생', '아들', '딸',
59
+ '이모', '고모', '삼촌', '숙모', '외삼촌', '고모부', '이모부', '조카', '사촌',
60
+ '남편', '아내', '부인', '와이프', '신랑', '장모', '장인', '사위', '며느리',
61
+ '올케', '형수', '제수씨', '매형', '처제', '시누이',
62
+ '대표', '사장', '부장', '차장', '과장', '대리', '주임', '직원', '팀장', '실장', '원장', '이사',
63
+ '선생님', '선생', '교사', '교장', '교감', '조교수', '교수', '연구원', '강사', '학부모',
64
+ '학생', '수험생', '초등학생', '중학생', '고등학생',
65
+ '의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민'
66
+ ]
67
+ spaced_pattern = r'\b([가-힣]{2,4})\s+(' + '|'.join(honorific_suffixes) + r')\b'
68
+ spaced_matches = re.findall(spaced_pattern, text)
69
+ for match in spaced_matches:
70
+ name = match[0]
71
+ if name not in names:
72
+ names.append(name)
73
+
74
  return names
75
 
76
  def refactored_mask_names(original_text, names, start_counter=100):
 
114
  else:
115
  result += ch
116
  return result