Spaces:

blueradiance
/

Masking2

Runtime error

App Files Files Community

Masking2 / app.py

blueradiance

Update app.py

8f37fc1 verified 9 months ago

raw

history blame contribute delete

9.14 kB

	# 📦 PART 1: 이름 추출기 + 태그 치환기

	from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
	import re

	# Prefix of every generated name tag (e.g. "N" -> N100, N101, ...).
	TAG_PREFIX = "N"


	def apply_name_tags(text: str, names: list, start_index: int = 100) -> tuple[str, dict]:
	    """Replace each name in *text* with a numbered tag (e.g. 김철수 -> N100).

	    A name is replaced only where it stands on a boundary: it must be at the
	    start of the text or follow whitespace / an opening bracket / a quote,
	    and it must be at the end of the text or be followed by Hangul,
	    whitespace or punctuation. The surrounding characters are kept.

	    Args:
	        text: Text to tag.
	        names: Candidate names; duplicates and empty entries are ignored.
	        start_index: Number used for the first tag.

	    Returns:
	        A ``(tagged_text, mapping)`` tuple where ``mapping`` maps each tag
	        that was actually used to the name it replaced.
	    """
	    mapping = {}
	    tagged_text = text
	    counter = start_index

	    # Deduplicate in first-seen order (a set would make tag numbering vary
	    # between runs), then try longer names first so a short name cannot
	    # break up a longer one that contains it. sort() is stable.
	    unique_names = [name for name in dict.fromkeys(names) if name]
	    unique_names.sort(key=len, reverse=True)

	    for name in unique_names:
	        tag = f"{TAG_PREFIX}{counter:03d}"
	        # Lookarounds instead of consuming groups: the delimiters stay in the
	        # text, and "not preceded/followed by a non-delimiter" also accepts
	        # the very start and end of the text.
	        pattern = re.compile(
	            rf"(?<![^\s(\[\"']){re.escape(name)}(?![^가-힣\s.,;:!?()\[\]\"'])",
	            re.IGNORECASE,
	        )
	        tagged_text, n_subs = pattern.subn(lambda _m, tag=tag: tag, tagged_text)
	        if n_subs > 0:
	            mapping[tag] = name
	            # Only consume a number when the tag was really used.
	            counter += 1
	    return tagged_text, mapping


	def replace_institution_keywords(text: str, keywords: list, replace_word: str) -> str:
	    """Replace every institution keyword in *text* with *replace_word*.

	    A keyword is replaced only on a boundary: at the start of the text or
	    after whitespace / an opening bracket / a quote, and at the end of the
	    text or before Hangul, whitespace, punctuation, a bracket or a quote.
	    Matching ignores case. Surrounding characters are left untouched.

	    Args:
	        text: Text to process.
	        keywords: Keywords to replace; empty entries are ignored.
	        replace_word: Literal replacement text (never treated as a regex
	            template).

	    Returns:
	        The text with all keyword occurrences replaced.
	    """
	    # Longest keyword first, so "사회복지법인 굿네이버스" wins over the
	    # shorter "굿네이버스" it contains.
	    ordered = sorted((kw for kw in dict.fromkeys(keywords) if kw), key=len, reverse=True)
	    if not ordered:
	        return text

	    alternation = "|".join(re.escape(kw) for kw in ordered)
	    # "Not preceded by a non-delimiter" is true after a delimiter and also at
	    # the start of the text; likewise for the trailing check and the end.
	    before = r"(?<![^\s(\[\"'‘“])"
	    after = r"(?![^가-힣\s.,;:!?()\[\]\"'”’])"
	    pattern = re.compile(f"{before}(?:{alternation}){after}", re.IGNORECASE)
	    return pattern.sub(lambda _m: replace_word, text)



	# 📦 PART 2 (Extended & Fixed): 호칭/조사 확장기 + 태그 매핑 보정기 - 특수문자 오류 수정판

	import re

	# Honorifics and titles that may directly follow a name.
	COMMON_SUFFIXES = [
	    # Family / relationship
	    '어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생',
	    '딸', '아들', '조카', '사촌', '이모', '고모', '삼촌', '숙모', '외삼촌',
	    '할머니', '할아버지', '외할머니', '외할아버지', '장모', '장인', '며느리', '사위',
	    '부인', '와이프', '신랑', '올케', '형수', '제수씨', '매형', '처제', '시누이',

	    # Social / education / occupation
	    '학생', '초등학생', '중학생', '고등학생', '수험생', '학부모', '선생', '선생님', '교사',
	    '교감', '교장', '담임', '반장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
	    '보호자', '피해자', '아동', '주민', '당사자', '대상자', '담당자',

	    # Workplace / organisation ranks
	    '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원', '팀장', '본부장',
	    '센터장', '소장', '실장', '총무', '직원', '매니저', '지점장', '사무장',

	    # Medical / other
	    '의사', '간호사', '간병인', '기사님', '어르신', '님', '씨'
	]

	# Particles (josa) and endings that may follow a name or a name + honorific.
	COMMON_JOSA = [
	    # Basic particles
	    '이', '가', '을', '를', '은', '는', '의', '도',

	    # Location / direction / target
	    '에', '에서', '에게', '께서', '으로', '로', '부터', '까지', '한테',

	    # Emphasis / contrast / comparison
	    '보다', '보다도', '마저', '조차', '조차도', '까지도', '밖에', '만큼', '만큼은',
	    '이라도', '이든지', '이나마', '이건', '이란', '이라서', '이지만',

	    # Connective particles
	    '이며', '이나', '이거나', '이니까', '이라면', '처럼', '대로', '하고', '그리고', '와', '과',

	    # Auxiliary / sentence-final endings
	    '이기도', '이었던', '이었지만', '이어서', '이었다면', '인', '일', '임', '이란', '이라는',

	    # Special / compound forms
	    '같은', '같아서', '까지는', '뿐만 아니라', '와는', '와도', '하고도', '으로서', '으로써'
	]


	def _longest_first_alternation(words) -> str:
	    """Return a regex alternation of *words*, escaped, longest word first."""
	    # Alternation is tried left to right, so '선생' listed before '선생님'
	    # would stop early; sorting by length avoids that.
	    unique = sorted(dict.fromkeys(words), key=len, reverse=True)
	    return "|".join(re.escape(word) for word in unique)


	def expand_variation_patterns(text: str, mapping: dict) -> str:
	    """Re-tag names that still appear in already tagged text.

	    For every ``tag -> name`` pair, each remaining occurrence of the name
	    (optionally preceded by whitespace / brackets / quotes and optionally
	    followed by an honorific from COMMON_SUFFIXES and a particle from
	    COMMON_JOSA) has the name part replaced by the tag. The characters
	    matched around the name are kept verbatim.

	    Args:
	        text: Text that may still contain raw names.
	        mapping: Tag -> name mapping; entries with an empty name are skipped.

	    Returns:
	        The text with the remaining name occurrences replaced by their tags.
	    """
	    suffix = _longest_first_alternation(COMMON_SUFFIXES)
	    josa = _longest_first_alternation(COMMON_JOSA)

	    for tag, base in mapping.items():
	        if not base:
	            # An empty name would match at every position.
	            continue
	        pattern = re.compile(
	            rf"(?P<lead>[\s(\[\"']*){re.escape(base)}"
	            rf"(?P<tail>(?:{suffix})?(?:{josa})?)",
	            re.IGNORECASE,
	        )
	        # Rebuild from the groups rather than str.replace on the match, so a
	        # case-insensitive hit is replaced as well.
	        text = pattern.sub(
	            lambda m, tag=tag: m.group("lead") + tag + m.group("tail"),
	            text,
	        )
	    return text


	def boost_mapping_from_context(text: str, mapping: dict) -> dict:
	    """Refine the tag mapping using how each tag appears in the tagged text.

	    For every tag, the first occurrence in *text* is located. Any run of
	    Hangul characters attached directly to the tag (an honorific and/or a
	    particle, e.g. ``N100씨가``) is appended to the mapped name, so the
	    mapping shows the expression as it appeared. If the tag does not occur,
	    or nothing is attached to it, the original name is kept.

	    Args:
	        text: Tagged text.
	        mapping: Tag -> name mapping.

	    Returns:
	        A new dict with the same tags and the refined values; *mapping*
	        itself is not modified.
	    """
	    updated = {}
	    for tag, base in mapping.items():
	        # (?!\d) keeps a tag such as N100 from matching inside N1000.
	        match = re.search(rf"{re.escape(tag)}(?!\d)([가-힣]*)", text)
	        updated[tag] = base + match.group(1) if match else base
	    return updated



	# 📦 PART 3: 민감정보 마스커 + 학교/학년/학과 마스커

	import re

	def postprocess_sensitive_patterns(text: str) -> str:
	    """Mask e-mail addresses, ID/card/phone numbers and building/unit numbers.

	    Replacements are applied in this order:
	      1. e-mail address            -> "****@.**"
	      2. 16-digit card number      -> "*--**"
	      3. resident registration no. -> "****-*****"   (6 + 7 digits)
	      4. 3-4-4 digit phone number  -> "*--**"
	      5. "<digits>동" / "<digits>호" -> "***동" / "****호"

	    Args:
	        text: Text to mask.

	    Returns:
	        The masked text.
	    """
	    # ASCII-only classes: Python's \w also matches Hangul, which made the
	    # pattern swallow Korean text written right next to the address.
	    text = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+", "****@.**", text)  # e-mail
	    # Whole 16-digit card numbers go first; otherwise the shorter patterns
	    # below mask only part of the number and leave digits visible.
	    text = re.sub(r"(?<!\d)\d{4}(?:[- ]?\d{4}){3}(?!\d)", "*--**", text)  # card
	    text = re.sub(r"(\d{6})[- ]?(\d{7})", "****-*****", text)  # resident registration number
	    text = re.sub(r"(\d{3})[- ]?(\d{4})[- ]?(\d{4})", "*--**", text)  # phone
	    text = re.sub(r"(\d{1,3})동", "***동", text)  # building number
	    text = re.sub(r"(\d{1,4})호", "****호", text)  # unit number
	    return text

	def to_chosung(text: str) -> str:
	    """Replace every Hangul syllable in *text* with its initial consonant.

	    Characters outside the precomposed syllable range (가-힣) are returned
	    unchanged. Used to abbreviate school and department names.

	    Args:
	        text: Text to convert.

	    Returns:
	        The converted text, e.g. "연세" -> "ㅇㅅ".
	    """
	    # The 19 initial consonants as Hangul *Compatibility* Jamo
	    # (ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ), in syllable-block order. These are the
	    # standalone letters; the conjoining Jamo at U+1100.. are meant to be
	    # combined with a following vowel and are different code points.
	    initials = (
	        "\u3131\u3132\u3134\u3137\u3138\u3139\u3141\u3142\u3143\u3145"
	        "\u3146\u3147\u3148\u3149\u314a\u314b\u314c\u314d\u314e"
	    )
	    first_syllable = ord("가")
	    # Each initial consonant covers 21 vowels * 28 finals = 588 syllables.
	    return "".join(
	        initials[(ord(ch) - first_syllable) // 588] if "가" <= ch <= "힣" else ch
	        for ch in text
	    )

	def mask_school_names(text: str) -> str:
	    """Abbreviate school names to initial consonants, keeping the school type.

	    A run of 2-20 Hangul syllables followed by 초등학교, 중학교, 고등학교 or
	    대학교 is passed through ``to_chosung``; the school-type suffix is kept
	    as is (e.g. 연세대학교 -> ㅇㅅ대학교).

	    Args:
	        text: Text to mask.

	    Returns:
	        The text with school names abbreviated.
	    """
	    def replace_school(m):
	        return to_chosung(m.group(1)) + m.group(2)

	    # Plain "|" alternation: an escaped "\|" is a literal pipe character and
	    # would make the pattern match none of the school types.
	    return re.sub(
	        r"([가-힣]{2,20})(초등학교|중학교|고등학교|대학교)",
	        replace_school,
	        text,
	    )

	def mask_department_names(text: str) -> str:
	    """Abbreviate department names to initial consonants (국문학과 -> ㄱㅁ학과).

	    A run of 2-20 Hangul syllables followed by "학과" is passed through
	    ``to_chosung``; the "학과" suffix itself is kept.
	    """
	    department = re.compile(r"([가-힣]{2,20})학과")

	    def _abbreviate(found):
	        # Only the captured name part is converted.
	        return to_chosung(found.group(1)) + "학과"

	    return department.sub(_abbreviate, text)

	def mask_grade_class(text: str) -> str:
	    """Strip the numbers from school grade / class expressions.

	    "<digits>학년" optionally followed by "<digits>반" is masked:
	    "2학년 3반" -> "학년 반", "2학년" -> "학년".

	    Args:
	        text: Text to mask.

	    Returns:
	        The masked text.
	    """
	    def _mask(m):
	        # Emit "반" only when a class was actually present in the text.
	        return "학년 반" if m.group(1) else "학년"

	    # \d+ rather than \d: a two-digit class such as "10반" must be masked too.
	    return re.sub(r"\d+학년(\s?\d+반)?", _mask, text)



	# 📦 PART 4: 기관 키워드 치환기 + Gradio UI 실행기

	import re
	import gradio as gr
	from part1_name_extract_and_tag import extract_names, apply_name_tags
	from part2_suffix_expansion_and_mapping import expand_variation_patterns, boost_mapping_from_context
	from part3_sensitive_school_masker import (
	postprocess_sensitive_patterns,
	mask_school_names,
	mask_department_names,
	mask_grade_class
	)

	def replace_institution_keywords(text: str, keywords: list, replace_word: str) -> str:
	    """Replace every institution keyword in *text* with *replace_word*.

	    A keyword is replaced only on a boundary: at the start of the text or
	    after whitespace / an opening bracket / a straight or curly quote, and
	    at the end of the text or before Hangul, whitespace, punctuation, a
	    bracket or a quote. Matching ignores case. Surrounding characters are
	    left untouched.

	    Args:
	        text: Text to process.
	        keywords: Keywords to replace; empty entries are ignored.
	        replace_word: Literal replacement text (never treated as a regex
	            template).

	    Returns:
	        The text with all keyword occurrences replaced.
	    """
	    # Longest keyword first, so "사회복지법인 굿네이버스" wins over the
	    # shorter "굿네이버스" it contains.
	    ordered = sorted((kw for kw in dict.fromkeys(keywords) if kw), key=len, reverse=True)
	    if not ordered:
	        return text

	    alternation = "|".join(re.escape(kw) for kw in ordered)
	    # "Not preceded by a non-delimiter" is true after a delimiter and also at
	    # the start of the text; likewise for the trailing check and the end.
	    before = r"(?<![^\s(\[\"'‘“])"
	    after = r"(?![^가-힣\s.,;:!?()\[\]\"'”’])"
	    pattern = re.compile(f"{before}(?:{alternation}){after}", re.IGNORECASE)
	    return pattern.sub(lambda _m: replace_word, text)

	def apply_full_masking(text: str, keyword_str: str, replace_word: str):
	    """Run the complete masking pipeline and return (masked text, mapping text).

	    Steps: institution keyword replacement, sensitive-pattern and
	    school/department/grade masking, name extraction and tagging, expansion
	    of name variations, mapping refinement, and formatting of the mapping
	    as one "tag → value" line per entry.
	    """
	    # 1. Institution keywords: comma-separated, blanks dropped.
	    keywords = []
	    for piece in keyword_str.split(","):
	        piece = piece.strip()
	        if piece:
	            keywords.append(piece)
	    masked = replace_institution_keywords(text, keywords, replace_word)

	    # 2. Sensitive data, then school / department / grade information.
	    masked = postprocess_sensitive_patterns(masked)
	    masked = mask_school_names(masked)
	    masked = mask_department_names(masked)
	    masked = mask_grade_class(masked)

	    # 3. Extract names and replace them with tags.
	    found_names = extract_names(masked)
	    tagged, mapping = apply_name_tags(masked, found_names)

	    # 4. Catch derived forms of the names and refine the mapping.
	    tagged = expand_variation_patterns(tagged, mapping)
	    mapping = boost_mapping_from_context(tagged, mapping)

	    # 5. Render the mapping for display.
	    mapping_lines = []
	    for k, v in mapping.items():
	        mapping_lines.append(f"{k} → {v}")
	    return tagged, "\n".join(mapping_lines)

	# Build and launch the Gradio UI.
	# NOTE(review): the statements after `with` presumably belong inside its
	# body; the indentation appears to have been lost in this copy - confirm.
	with gr.Blocks() as demo:
	# Title shown at the top of the page.
	gr.Markdown("🧠 v5.0 마스킹 통합 시스템 — 키워드, 이름, 개인정보, 학교 마스킹")
	# Inputs: source text, comma-separated institution keywords, replacement word.
	input_text = gr.Textbox(lines=15, label="📄 원문 텍스트")
	keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
	replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
	run_button = gr.Button("🚀 실행")
	# Outputs: masked text and the tag mapping (the latter is not editable).
	masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
	mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)
	# Clicking the button calls apply_full_masking with the three inputs; its
	# two return values are routed to the two output boxes.
	run_button.click(fn=apply_full_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])

	# Start the app when this module is executed.
	demo.launch()