blueradiance committed on
Commit
cca37d8
·
verified ·
1 Parent(s): 8429f23

Upload 6 files

Browse files
Files changed (6) hide show
  1. LICENSE +1 -0
  2. README.md +6 -9
  3. app.py +48 -0
  4. description.md +24 -0
  5. gitattributes +35 -0
  6. requirements.txt +3 -0
LICENSE ADDED
@@ -0,0 +1 @@
 
 
1
+ 비영리/개인 내부용. 무단 수정 및 재배포 금지. 출처 명시 필수 (blueradiance / masking-app)
README.md CHANGED
@@ -1,13 +1,10 @@
1
  ---
2
- title: Masking2
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.25.2
8
  app_file: app.py
9
  pinned: false
10
- license: other
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: 민감정보마스킹 [땡땡이 마스킹]
3
+ emoji: 🛡️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.16.0
8
  app_file: app.py
9
  pinned: false
10
+ ---
 
 
 
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # 📦 PART 1: 이름 추출기 + 태그 치환기
3
+
4
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
5
+ import re
6
+
7
# Prefix of the placeholder tags that replace detected names (e.g. N100).
TAG_PREFIX = "N"

# Model configuration: load the tokenizer and token-classification model
# once at import time and wrap them in a NER pipeline.
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Exception words: never tagged, even when the model reports them as names.
NAME_ENTITY_EXCEPTIONS = {
    '법적', '사회적', '행정적', '심리적', '의료적',
    '법률적', '해당', '본인', '소속', '상담',
}
19
+
20
def extract_names(text: str) -> list:
    """Extract candidate person names from *text* with the KoELECTRA NER pipeline.

    Only entities whose ``entity_group`` is ``"PS"`` are considered. WordPiece
    ``##`` markers and surrounding whitespace are removed from each candidate,
    and candidates shorter than two characters or listed in
    ``NAME_ENTITY_EXCEPTIONS`` are dropped.

    Args:
        text: Raw input text to scan.

    Returns:
        The unique names, in order of first appearance in the pipeline
        output. A blank or empty *text* yields an empty list.
    """
    # Nothing to recognise; skip the model call entirely.
    if not text or not text.strip():
        return []

    # A dict is used as an ordered set: iterating a plain ``set`` of strings
    # is hash-order dependent, which would make the tag numbers assigned
    # downstream vary between runs for the same input.
    unique_names = {}
    for entity in ner_pipeline(text):
        if entity.get("entity_group") != "PS":
            continue
        name = entity["word"].replace("##", "").strip()
        if len(name) >= 2 and name not in NAME_ENTITY_EXCEPTIONS:
            unique_names.setdefault(name, None)
    return list(unique_names)
32
+
33
def apply_name_tags(text: str, names: list, start_index: int = 100) -> tuple[str, dict]:
    """Replace every listed name in *text* with a numbered tag (김철수 -> N100).

    A tag is ``TAG_PREFIX`` followed by a counter zero-padded to three
    digits. The counter starts at *start_index* and advances only when a
    name was actually found, so the tags that are handed out stay contiguous.

    A name is replaced only where it is not directly preceded or followed by
    a word character or Hangul syllable.

    Args:
        text: Text in which names are to be replaced.
        names: Names to look for; empty, blank, or ``None`` entries are ignored.
        start_index: Number used for the first tag.

    Returns:
        A ``(tagged_text, mapping)`` tuple, where ``mapping`` maps each tag
        that was used to the name it replaced.
    """
    mapping = {}
    tagged_text = text
    counter = start_index
    for name in names:
        # An empty name would compile to a zero-width pattern that matches
        # every gap between two non-word characters and sprays tags there.
        if not name or not name.strip():
            continue
        tag = f"{TAG_PREFIX}{counter:03d}"
        # NOTE(review): the lookahead also rejects a name that is directly
        # followed by a Hangul particle (e.g. 김철수는) - confirm intended.
        pattern = re.compile(rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])')
        tagged_text, n = pattern.subn(tag, tagged_text)
        if n > 0:
            mapping[tag] = name
            counter += 1
    return tagged_text, mapping
description.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🔐 이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반)
2
+
3
+ 초중고 학교명은 초성으로 마스킹되고, 학과/과, 학년·반 정보도 자동으로 처리됩니다.
4
+ 기관명도 설정해서 `"우리기관"` 같은 식으로 바꿀 수 있어요!
5
+
6
+ ---
7
+
8
+ ## 💡 민감정보 마스킹 (땡땡이 마스킹)
9
+
10
+ 예:
11
+ - 전화번호 → `010-****-1234`
12
+ - 주소 → `서울시 ***동 ***번지`
13
+ - 이메일, 주민번호, IP, 날짜 등 자동 치환됩니다.
14
+
15
+ ---
16
+
17
+ **제작자**: `blueradiance`
18
+
19
+ ---
20
+
21
+ ### 🛠 사용법
22
+ 1. 왼쪽 입력창에 마스킹할 원문을 붙여넣고
23
+ 2. 아래 `🚀 마스킹 실행` 버튼 클릭!
24
+ 3. 오른쪽 창에 결과와 이름 태그 매핑이 표시됩니다.
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ torch
3
+ transformers