blueradiance committed on
Commit
8429f23
·
verified ·
1 Parent(s): b99d196

Upload APP.py

Browse files
Files changed (1) hide show
  1. APP.py +48 -0
APP.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # 📦 PART 1: 이름 추출기 + 태그 치환기
3
+
4
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
5
+ import re
6
+
7
# Prefix placed in front of the numeric counter when building name tags.
TAG_PREFIX = "N"

# Model setup: load the tokenizer and token-classification model once at
# import time and wrap them in a NER pipeline that groups sub-word pieces.
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Exception words: never treated as names, so they are excluded from tagging.
NAME_ENTITY_EXCEPTIONS = {
    '법적', '사회적', '행정적', '심리적', '의료적',
    '법률적', '해당', '본인', '소속', '상담',
}
19
+
20
def extract_names(text: str) -> list:
    """Extract person-name candidates from *text* using the NER pipeline.

    Only entities whose ``entity_group`` is ``"PS"`` are kept. Each candidate
    has any ``##`` word-piece markers removed and is stripped of surrounding
    whitespace; candidates shorter than two characters or listed in
    ``NAME_ENTITY_EXCEPTIONS`` are dropped.

    Args:
        text: Input text to scan.

    Returns:
        Unique names in order of first appearance in the pipeline output.
        Blank input yields an empty list.
    """
    # Nothing to recognise in blank input; skip the model call entirely.
    if not text or not text.strip():
        return []

    names = []
    for entity in ner_pipeline(text):
        if entity.get("entity_group") != "PS":
            continue
        name = entity["word"].replace("##", "").strip()
        if len(name) >= 2 and name not in NAME_ENTITY_EXCEPTIONS:
            names.append(name)

    # De-duplicate while preserving order. A plain set() would hand the names
    # back in a hash-dependent order that can differ between interpreter runs,
    # which in turn would make downstream tag numbering unstable.
    return list(dict.fromkeys(names))
32
+
33
def apply_name_tags(text: str, names: list, start_index: int = 100) -> tuple[str, dict]:
    """Replace each name in *text* with a numbered tag, e.g. 김철수 -> N100.

    Names are processed in the order given. A name is replaced only where it
    is not directly preceded or followed by a word character or a Hangul
    syllable, so it is never substituted inside a longer word.

    Args:
        text: Text in which names should be replaced.
        names: Names to replace. Empty or whitespace-only entries are ignored.
        start_index: Number used for the first tag; formatted with at least
            three digits after ``TAG_PREFIX``.

    Returns:
        A ``(tagged_text, mapping)`` tuple where ``mapping`` maps each tag that
        was actually inserted to the original name it replaced.
    """
    mapping = {}
    tagged_text = text
    counter = start_index
    for name in names:
        # An empty/blank name would compile to a pattern that matches between
        # any two non-word characters and scatter tags through the text.
        if not name or not name.strip():
            continue
        tag = f"{TAG_PREFIX}{counter:03d}"
        pattern = re.compile(rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])')
        tagged_text, n = pattern.subn(tag, tagged_text)
        if n > 0:
            mapping[tag] = name
            # Advance only when the tag was used, keeping numbers contiguous.
            counter += 1
    return tagged_text, mapping