Spaces:

blueradiance
/

Masking2

Runtime error

App Files Files Community

blueradiance committed on Apr 17, 2025

Commit

8429f23

verified ·

1 Parent(s): b99d196

Upload APP.py

Browse files

Files changed (1) hide show

APP.py +48 -0

APP.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# 📦 PART 1: 이름 추출기 + 태그 치환기
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+import re
# Prefix placed in front of the numeric counter when a name tag is built.
TAG_PREFIX = "N"

# Model setup: load the tokenizer and token-classification weights once at
# import time and wrap them in a NER pipeline shared by the whole module.
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Exception words: entity strings that must never be treated as names
# (excluded from tagging).
NAME_ENTITY_EXCEPTIONS = {
    '법적',
    '사회적',
    '행정적',
    '심리적',
    '의료적',
    '법률적',
    '해당',
    '본인',
    '소속',
    '상담',
}
def extract_names(text: str) -> list:
    """Extract name candidates from *text* with the module's NER pipeline.

    Only entities whose ``entity_group`` is ``"PS"`` are considered.  The
    ``##`` word-piece markers are removed from each entity string, and a
    candidate is kept only when it is at least two characters long and is
    not listed in ``NAME_ENTITY_EXCEPTIONS``.

    Args:
        text: Raw input text.

    Returns:
        The unique name candidates, in order of first appearance in the
        pipeline output.  Empty or whitespace-only input yields ``[]``
        without invoking the model.
    """
    # Nothing to analyse: skip the (expensive) model call entirely.
    if not text or not text.strip():
        return []

    # A dict is used as an ordered set.  De-duplicating through ``set`` would
    # return the names in hash order, which changes between interpreter runs
    # and would make any numbering derived from this list unstable.
    unique_names = {}
    for entity in ner_pipeline(text):
        if entity.get("entity_group") != "PS":
            continue
        name = entity["word"].replace("##", "").strip()
        if len(name) >= 2 and name not in NAME_ENTITY_EXCEPTIONS:
            unique_names[name] = None
    return list(unique_names)
# Particles and honorifics that are commonly written directly after a name in
# Korean text.  This is a heuristic list, not an exhaustive grammar.
_NAME_SUFFIXES = (
    "에게서", "한테서",
    "에게", "한테", "께서", "에서", "으로", "이랑", "하고",
    "부터", "까지", "처럼", "보다",
    "이", "가", "은", "는", "을", "를", "의", "에",
    "와", "과", "도", "만", "로", "랑", "께",
    "씨", "님",
)


def apply_name_tags(text: str, names: list, start_index: int = 100) -> tuple[str, dict]:
    """Replace each name found in *text* with a tag such as ``N100``.

    A name is replaced when it is not preceded by a word character and is
    followed either by a non-word boundary or by one of ``_NAME_SUFFIXES``
    and then a boundary, so ``김철수가`` becomes ``N100가`` while ``김철수연``
    is left untouched.  The suffix itself is kept in the text.

    Args:
        text: Text to tag.
        names: Candidate names.  Empty/blank entries and duplicates are
            ignored.
        start_index: Number used for the first tag; it is zero-padded to at
            least three digits.

    Returns:
        ``(tagged_text, mapping)`` where ``mapping`` maps each tag to the name
        it replaced.  Tags are numbered in the order of *names*, and only
        names that actually occur in *text* consume a number.
    """
    # An empty name would produce a pattern that matches zero-width positions
    # and scatter tags through the text, so such entries are dropped.
    candidates = [name for name in dict.fromkeys(names) if name and name.strip()]
    if not candidates:
        return text, {}

    # Longest alternative first, so a multi-token name wins over a shorter
    # name contained in it.
    alternation = "|".join(
        re.escape(name) for name in sorted(candidates, key=len, reverse=True)
    )
    suffixes = "|".join(re.escape(suffix) for suffix in _NAME_SUFFIXES)
    pattern = re.compile(
        rf"(?<![\w가-힣])(?:{alternation})(?=(?:{suffixes})?(?![\w가-힣]))"
    )

    # First pass: find out which names occur, so numbering can follow the
    # order of ``names`` and skip the ones that never match.
    found = {match.group(0) for match in pattern.finditer(text)}
    tag_for = {}
    counter = start_index
    for name in candidates:
        if name in found:
            tag_for[name] = f"{TAG_PREFIX}{counter:03d}"
            counter += 1

    # Second pass: substitute everything at once, so text inserted as a tag
    # can never be matched again by another name.
    tagged_text = pattern.sub(lambda match: tag_for[match.group(0)], text)
    mapping = {tag: name for name, tag in tag_for.items()}
    return tagged_text, mapping