Spaces:
Running
Running
jkkim
PII ๋ถ์ ์คํ ๋์ฑ
3์ข
โ NRP/DATE_TIME ๊ฐ์ค์น 0 + ์ ๊ท์ ๊ฐํ + ํ์ฒ๋ฆฌ ํํฐ
64f5afe | """๋ฌธ์ ๋ณด์๋ฑ๊ธ ๋ถ๋ฅ๊ธฐ (Rule-based / Zero-shot) | |
| ==================================================== | |
| SPEC_ํ์ผ๋ถ๋ฅ_PoC.md ยง2.2 / ยง2.3 / ยง10 ๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ํ **๊ฐ๋จํ** 1์ฐจ ๋ถ๋ฅ ๋ชจ๋ธ. | |
| ์ค๊ณ ์๋ | |
| --------- | |
| - SPEC ์ ์ต์ข ๋ชฉํ๋ KoELECTRA + mDeBERTa ์์๋ธ + ์ฌ์ฉ์ ํผ๋๋ฐฑ ๋ฃจํ์ง๋ง, | |
| ๊ทธ ๋ชจ๋ธ์ **์ฌ์ฉ์ ๋ ์ด๋ธ์ด ๋์ ๋์ด์ผ** ํ์ต์ด ๊ฐ๋ฅํ๋ค (์ฝ๋์คํํธ ๋ฌธ์ ). | |
| - ๋ณธ ๋ชจ๋์ **๋ถํธ์คํธ๋ฉ 1๋จ๊ณ โ zero-shot ์๋ ๋ ์ด๋ธ ์์ฑ๊ธฐ**. | |
| (SPEC ยง2.3 โ "๋ฃฐ + ํค์๋ ๋งค์นญ๋ง์ผ๋ก ์๋ ๋ ์ด๋ธ") | |
| - ์ ์(score) ๊ธฐ๋ฐ ๊ฒฐ์ โ ์๊ณ๊ฐ(threshold) ์ผ๋ก C / S / O ๋ถ๋ฅ. | |
| - "์ ๊ทธ ๋ฑ๊ธ์ธ๊ฐ" ๋ฅผ ํจ๊ป ๋๋ ค์ค๋ค (SPEC ๊ธฐ๋ฅ 4 โ XAI). | |
| C / S / O ์ ์ (SPEC ยง2.2) | |
| -------------------------- | |
| - **C** Critical / ์ํ : ์ง์ ์๋ณ์(์ฃผ๋ฏผ๋ฒํธยท์ฌ๊ถยท์นด๋ ๋ฑ) ๋๋ ๊ฐํ ๋ฑ๊ธ ๋ง์ปค | |
| - **S** Sensitive / ๋ฏผ๊ฐ : API ํค, ์ฌ์ ์๋ฒํธ, ๋ด๋ถ ํ๋ก์ ํธ๋ช , VIP ๋ฑ | |
| - **O** Open / ๊ณต๊ฐ : ๊ทธ ์ธ | |
| ์์ | |
| ---- | |
| score = ฮฃ (w_e ยท count_e) + ฮฃ (w_k ยท min(count_k, 3)) | |
| (entity) (grade keyword) | |
| grade = C if score >= C_THRESHOLD | |
| S if score >= S_THRESHOLD | |
| O otherwise | |
| ์ ๋ขฐ๋(confidence) = 0.55 + 0.4 ยท tanh(margin / 2) | |
| margin = ์๊ณ๊ฐ์์ ๋จ์ด์ง ๊ฑฐ๋ฆฌ (๋ฐด๋ ๋ด๋ถ ํญ์ ์์ชฝ ๊ฒฝ๊ณ๊น์ง์ ์ต์๊ฑฐ๋ฆฌ) | |
| """ | |
| from __future__ import annotations | |
| import math | |
| import re | |
| from typing import Iterable | |
# Grade -> integer rank (SPEC §10.1; used for the gap computation).
# Grades are listed from least to most restrictive, so the index is the rank.
GRADE_RANK = {grade: rank for rank, grade in enumerate("OSC")}
# Points added per single match, keyed by entity_type.
ENTITY_WEIGHTS: dict[str, float] = {
    # --- Critical candidates (direct identifiers) ---
    "KR_RRN": 5.0,  # Korean resident registration number
    "KR_PASSPORT": 4.5,
    "CREDIT_CARD": 4.5,
    "US_SSN": 4.5,
    "IBAN_CODE": 3.0,
    # --- Sensitive candidates (account / business / internal) ---
    "AWS_ACCESS_KEY": 3.5,
    "GENERIC_API_KEY": 1.0,  # frequent false positives (32-char arbitrary tokens) -> lowered
    "KR_BIZ_NO": 2.5,
    "VIP_NAMES": 2.0,
    "INTERNAL_PROJECTS": 2.0,
    "KR_ADDRESS": 1.5,
    # --- Weak identifiers ---
    "IP_ADDRESS": 0.4,
    "KR_PHONE": 0.5,
    "PHONE_NUMBER": 0.5,
    "EMAIL_ADDRESS": 0.4,
    "PERSON": 0.3,
    "LOCATION": 0.15,  # ordinary place names are common -> lowered
    "ORGANIZATION": 0.15,  # ordinary company names are common -> lowered
    "URL": 0.0,  # generic URL — not PII
    "DATE_TIME": 0.0,  # generic dates / version numbers (e.g. 1.9.16) — not PII
    # --- Non-PII (NER model noise) — listed explicitly with weight 0 so they do
    # --- not pick up the fallback default weight (formerly 0.3) and dominate training
    "NRP": 0.0,  # NORP = Nationalities/Religious/Political (spaCy English NER)
    "NORP": 0.0,  # spaCy's original label (Presidio abbreviates it to NRP)
    # --- Foreign PII irrelevant to a Korean environment (Presidio default
    # --- regexes; false positives on Korean text)
    "US_DRIVER_LICENSE": 0.0,  # many false positives on 2-character tokens such as s3, y3, X0
    "US_ITIN": 0.0,
    "US_PASSPORT": 0.0,
    "IN_PAN": 0.0,
}

# Weight for entity types not listed above. Kept small so that unlisted
# entities do not dominate training (previously 0.3, now 0.05).
DEFAULT_ENTITY_WEIGHT = 0.05
# Grade keywords searched for in the document body — an explicit label embedded
# in the text is reflected strongly in the score.
# Each entry: (lowercased keyword, weight, display label)
#
# NOTE: the Korean literals below were restored from a mis-decoded (mojibake)
# copy of this file; in that form they could never match real Korean text.
GRADE_KEYWORDS: list[tuple[str, float, str]] = [
    ("극비", 4.0, "극비"),  # "top secret"
    ("top secret", 4.0, "Top Secret"),
    ("대외비", 3.0, "대외비"),  # "not for external release"
    ("기밀", 3.0, "기밀"),  # "classified"
    ("confidential", 3.0, "Confidential"),
    ("secret", 2.5, "Secret"),
    ("내부용", 1.5, "내부용"),  # "for internal use"
    ("사내한정", 1.5, "사내한정"),  # "company-internal only"
    ("internal use", 1.5, "Internal Use"),
    ("restricted", 1.5, "Restricted"),
    ("private", 1.0, "Private"),
    ("개인정보", 1.0, "개인정보 라벨"),  # "personal information" / "... label"
]
# Cap on how many occurrences of the same keyword are counted — prevents abuse
# such as repeating "secret secret secret ...".
KW_COUNT_CAP = 3

# Score thresholds. Values chosen for now; to be calibrated with Platt scaling
# (SPEC §10.4) once data has accumulated.
C_THRESHOLD = 5.0
S_THRESHOLD = 2.0

# Base version at the time the module ships (immutable).
CLASSIFIER_VERSION = "rule-v1"
# Active version; updated after training / hot-swap.
_active_version = CLASSIFIER_VERSION
def active_version() -> str:
    """Return the currently active model version label.

    Per the original note, this is what /api/analyze responses and similar
    callers report.
    """
    return _active_version
def set_active_version(v: str) -> None:
    """Replace the active version label with *v*.

    Per the original note, train.apply_new_weights() calls this after a
    hot-swap so the reported version label follows the new weights.
    """
    global _active_version
    _active_version = v
# ---------------------------------------------------------------------------
# Extra grade keywords (user-added) — merged into GRADE_KEYWORDS at boot / persisted
# ---------------------------------------------------------------------------
| import json as _json | |
| from pathlib import Path as _Path | |
# JSON file holding the user-added grade keywords; lives next to this module.
_EXTRA_KW_FILE = _Path(__file__).resolve().with_name("extra_grade_keywords.json")
def _load_extra_kw() -> list[dict]:
    """Read the user-defined grade keywords from the extras JSON file.

    Loading is best-effort: a missing, unreadable, undecodable or syntactically
    invalid file yields an empty list instead of an exception.

    Returns:
        The dict entries stored in the file. If the top-level JSON value is not
        a list the result is empty; list items that are not dicts are dropped,
        so callers can safely use ``.get`` on every element.
    """
    try:
        # EAFP: a missing file surfaces as FileNotFoundError (an OSError).
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        data = _json.loads(_EXTRA_KW_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return []
    if not isinstance(data, list):
        return []
    return [item for item in data if isinstance(item, dict)]
def _save_extra_kw(items: list[dict]) -> None:
    """Persist the user-defined grade keywords to the extras JSON file.

    The payload is written to a sibling temporary file and then renamed over
    the real file, so an interrupted write cannot leave truncated JSON behind.

    Args:
        items: Keyword entries to store; serialized as indented UTF-8 JSON with
            non-ASCII characters kept as-is.

    Raises:
        OSError: If the temporary file cannot be written or renamed.
        TypeError: If *items* contains values that are not JSON-serializable.
    """
    payload = _json.dumps(items, ensure_ascii=False, indent=2)
    tmp_path = _EXTRA_KW_FILE.with_name(_EXTRA_KW_FILE.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # Path.replace overwrites an existing destination in a single rename step.
    tmp_path.replace(_EXTRA_KW_FILE)
def _is_builtin_kw(label: str) -> bool:
    """Report whether *label* is the display label of a built-in grade keyword.

    Built-in means present in the _BUILTIN_GRADE_KEYWORDS snapshot; per the
    original note such keywords cannot be deleted by users.
    """
    builtin_labels = {display for _kw, _weight, display in _BUILTIN_GRADE_KEYWORDS}
    return label in builtin_labels
# Snapshot of the built-in keywords, taken before any extras are merged, so that
# loading the extras several times never appends duplicates.
_BUILTIN_GRADE_KEYWORDS: list[tuple[str, float, str]] = GRADE_KEYWORDS.copy()
def reload_extra_grade_keywords() -> int:
    """Rebuild GRADE_KEYWORDS as built-ins + the user-defined keywords on disk.

    Malformed stored entries (non-dict items, missing / non-string / blank
    keywords, weights that are not finite numbers) are skipped rather than
    raised, because this function also runs at import time.

    Returns:
        The number of user-defined entries actually merged into GRADE_KEYWORDS.
    """
    rebuilt: list[tuple[str, float, str]] = list(_BUILTIN_GRADE_KEYWORDS)
    merged = 0
    for entry in _load_extra_kw():
        if not isinstance(entry, dict):
            continue
        raw_keyword = entry.get("keyword")
        if not isinstance(raw_keyword, str):
            continue
        # Same normalization add_extra_grade_keyword applies before saving.
        keyword = raw_keyword.strip().lower()
        if not keyword:
            continue
        try:
            weight = float(entry.get("weight", 1.0))
        except (TypeError, ValueError):
            continue
        if not math.isfinite(weight):
            continue
        rebuilt.append((keyword, weight, entry.get("label") or raw_keyword))
        merged += 1
    # Swap the contents in one step (same list object) so readers never observe
    # a cleared, half-built keyword list.
    GRADE_KEYWORDS[:] = rebuilt
    return merged
def add_extra_grade_keyword(keyword: str, weight: float, label: str) -> dict:
    """Add one user-defined grade keyword, or update it if it already exists.

    The keyword is stripped and lowercased before it is stored. The resulting
    list is saved to the extras file and GRADE_KEYWORDS is rebuilt from it.

    Args:
        keyword: Text to look for in document bodies.
        weight: Score added per counted occurrence; must be a finite number.
        label: Display label; falls back to the normalized keyword when empty
            or whitespace-only.

    Returns:
        {keyword, weight, label, action} with action 'added' or 'updated'.

    Raises:
        ValueError: If the keyword is empty after stripping, or the weight is
            not a finite number.
        TypeError: If the weight cannot be converted to float.
    """
    keyword = (keyword or "").strip().lower()
    if not keyword:
        raise ValueError("keyword is empty")
    # Validate before touching the file: a NaN/inf weight would propagate into
    # every score that counts this keyword.
    weight = float(weight)
    if not math.isfinite(weight):
        raise ValueError("weight must be a finite number")
    # Strip first, then fall back, so a whitespace-only label does not become "".
    label = (label or "").strip() or keyword

    items = _load_extra_kw()
    action = "added"
    for item in items:
        if not isinstance(item, dict):
            continue
        if str(item.get("keyword") or "").strip().lower() == keyword:
            item["weight"] = weight
            item["label"] = label
            action = "updated"
            break
    else:
        items.append({"keyword": keyword, "weight": weight, "label": label})
    _save_extra_kw(items)
    reload_extra_grade_keywords()
    return {"keyword": keyword, "weight": weight, "label": label, "action": action}
def remove_extra_grade_keyword(keyword: str) -> dict:
    """Delete a user-added grade keyword.

    Only the extras file is searched, so built-in keywords are never removed.

    Returns:
        {keyword, removed: True} when an entry was deleted, otherwise
        {keyword, removed: False, reason}.
    """
    keyword = (keyword or "").strip().lower()
    stored = _load_extra_kw()
    kept = [entry for entry in stored if entry.get("keyword", "").lower() != keyword]
    if len(kept) == len(stored):
        # Nothing was filtered out: the keyword is not among the extras.
        return {"keyword": keyword, "removed": False, "reason": "not in extras (built-in or unknown)"}
    _save_extra_kw(kept)
    reload_extra_grade_keywords()
    return {"keyword": keyword, "removed": True}
def list_extra_grade_keywords() -> list[dict]:
    """Return the user-added grade keywords as currently stored on disk."""
    stored = _load_extra_kw()
    return stored
# Merge the persisted user keywords automatically when the module is loaded.
reload_extra_grade_keywords()
def _grade_for_score(score: float) -> tuple[str, str, float]:
    """Map a score to ``(grade, label, margin)``.

    ``margin`` is the distance from *score* to the nearest threshold that
    bounds its grade band.

    NOTE: the Korean display labels were restored from a mis-decoded (mojibake)
    copy of this file: 위험 = Critical, 민감 = Sensitive, 공개 = Open.
    """
    if score >= C_THRESHOLD:
        return "C", "위험 (Critical)", score - C_THRESHOLD
    if score >= S_THRESHOLD:
        # The S band has a boundary on each side; the nearer one sets the margin.
        return "S", "민감 (Sensitive)", min(score - S_THRESHOLD, C_THRESHOLD - score)
    return "O", "공개 (Open)", S_THRESHOLD - score
def _confidence(margin: float) -> float:
    """Convert a margin into a confidence value in the range 0.55..0.95.

    A margin of zero (score exactly on a threshold) gives 0.55; negative
    margins are treated as zero.
    """
    clamped = max(margin, 0.0)
    return round(0.55 + 0.4 * math.tanh(clamped / 2.0), 3)
def _scan_keywords(text: str) -> list[dict]:
    """Match grade keywords in the body text and return contribution dicts.

    The text is lowercased and each keyword is counted with ``str.count``
    (plain, non-overlapping substring occurrences). At most KW_COUNT_CAP
    occurrences per keyword contribute to the score; keywords that do not
    occur produce no entry.
    """
    if not text:
        return []
    haystack = text.lower()
    hits = (
        (label, weight, haystack.count(keyword))
        for keyword, weight, label in GRADE_KEYWORDS
    )
    return [
        {
            "kind": "keyword",
            "label": label,
            "weight": round(weight, 2),
            "count": found,
            "counted": min(found, KW_COUNT_CAP),
            "contribution": round(weight * min(found, KW_COUNT_CAP), 2),
        }
        for label, weight, found in hits
        if found > 0
    ]
def _scan_entities(findings: Iterable[dict]) -> list[dict]:
    """Aggregate findings into one contribution dict per entity_type.

    Findings without a truthy ``entity_type`` are ignored. Entity types missing
    from ENTITY_WEIGHTS are scored with DEFAULT_ENTITY_WEIGHT. Entries come out
    in the order each entity type was first seen.
    """
    tally: dict[str, int] = {}
    for finding in findings or []:
        entity_type = finding.get("entity_type")
        if entity_type:
            tally[entity_type] = tally.get(entity_type, 0) + 1

    def _row(entity_type: str, seen: int) -> dict:
        weight = ENTITY_WEIGHTS.get(entity_type, DEFAULT_ENTITY_WEIGHT)
        return {
            "kind": "entity",
            "label": entity_type,
            "weight": round(weight, 2),
            "count": seen,
            "contribution": round(weight * seen, 2),
        }

    return [_row(entity_type, seen) for entity_type, seen in tally.items()]
def classify(findings: Iterable[dict] | None, text: str | None) -> dict:
    """Classify a document from its findings and body text.

    Args:
        findings: Findings list produced by /api/analyze (each item needs an
            ``entity_type`` key). ``None`` is treated as no findings.
        text: Body text used for the grade-keyword scan. ``None`` or ``""``
            skips the keyword scan.

    Returns:
        A dict with:
            grade: "C" | "S" | "O"
            grade_label: display label of the grade
            score: total score, rounded to 2 decimals
            confidence: 0.55..0.95
            thresholds: {"C": C_THRESHOLD, "S": S_THRESHOLD}
            reasons: entity/keyword items, largest contribution first (max 20)
            version: CLASSIFIER_VERSION (the base version constant)
            text_chars: length of the scanned body text
    """
    reasons = _scan_entities(findings or []) + _scan_keywords(text or "")
    reasons.sort(key=lambda r: -r["contribution"])
    # Each contribution is already rounded to 2 decimals, so the true total is a
    # multiple of 0.01. Round the float sum back onto that grid BEFORE grading:
    # otherwise 0.9 + 0.5 + 0.4 + 0.15 + 0.05 sums to 1.9999999999999998 and is
    # graded below a 2.0 threshold while the reported score reads 2.0.
    score = round(sum(r["contribution"] for r in reasons), 2)
    grade, grade_label, margin = _grade_for_score(score)
    return {
        "grade": grade,
        "grade_label": grade_label,
        "score": score,
        "confidence": _confidence(margin),
        "thresholds": {"C": C_THRESHOLD, "S": S_THRESHOLD},
        "reasons": reasons[:20],
        "version": CLASSIFIER_VERSION,
        "text_chars": len(text or ""),
    }
def gap(ai_grade: str, user_grade: str) -> int:
    """Return |aiRank - userRank| (SPEC §10.1).

    0 = match, 1 = adjacent mismatch, 2 = opposite. A grade that is not a key
    of GRADE_RANK is ranked 0.
    """
    ai_rank = GRADE_RANK.get(ai_grade, 0)
    user_rank = GRADE_RANK.get(user_grade, 0)
    return abs(ai_rank - user_rank)
def sample_weight(g: int) -> float:
    """Return the training sample weight for gap *g* (SPEC §10.2): 1 + 1.5 * gap."""
    base, per_gap = 1.0, 1.5
    return base + per_gap * g