hetest / pseudo_framework.py
jkkim
๐Ÿ‡ฏ๐Ÿ‡ต ์ผ๋ณธ์–ด PII + APPI/ใƒžใ‚คใƒŠใƒณใƒใƒผๆณ• ์ปดํ”Œ๋ผ์ด์–ธ์Šค ํŠธ๋ž™ ์ถ”๊ฐ€
ee1da18
"""
ํŒŒ์ผ/๋ฌธ์„œ ๊ฐ€๋ช…ํ™”ยท์ต๋ช…ํ™” PoC ํ”„๋ ˆ์ž„์›Œํฌ.
ํ‘œ์ค€ ๊ธฐ๋ฐ˜:
- ISO/IEC 20889:2018 Privacy enhancing data de-identification terminology
and classification of techniques
- ISO/IEC 27559:2022 Privacy enhancing data de-identification framework
์šฉ์–ด ๊ธฐ๋ฐ˜:
- W3C DPV 2.0 (Data Privacy Vocabulary) โ€” dpv-pd:* compact IRI
๊ทœ์ œ ๋งคํŠธ๋ฆญ์Šค:
- KR ๊ฐœ์ธ์ •๋ณด๋ณดํ˜ธ๋ฒ• + ๊ฐœ์ธ์ •๋ณด๋ณดํ˜ธ์œ„์›ํšŒ ๊ฐ€๋ช…์ •๋ณด ์ฒ˜๋ฆฌ ๊ฐ€์ด๋“œ๋ผ์ธ
- JP ๅ€‹ไบบๆƒ…ๅ ฑไฟ่ญทๆณ• (APPI) + ไปฎๅๅŠ ๅทฅๆƒ…ๅ ฑใƒปๅŒฟๅๅŠ ๅทฅๆƒ…ๅ ฑใฎไฝœๆˆๅŸบๆบ–
- US HIPAA Safe Harbor (45 CFR ยง164.514(b)(2)) + CCPA/CPRA + NIST SP 800-188
- EU GDPR Art. 4(5), Recital 26 + EDPB Guidelines 01/2025 + WP29 Op. 05/2014
๋ณธ ๋ชจ๋“ˆ์€ PII Scanner ์˜ ๊ฒ€์ถœ ๊ฒฐ๊ณผ(findings)๋ฅผ ์ž…๋ ฅ์œผ๋กœ ๋ฐ›์•„,
ISO 20889 ์˜ ๊ธฐ๋ฒ• ์นดํƒˆ๋กœ๊ทธ์— ๋”ฐ๋ผ ๋ณ€ํ™˜์„ ์ ์šฉํ•˜๊ณ 
๊ด€ํ• ๋ณ„ ์ค€์ˆ˜ ์—ฌ๋ถ€๋ฅผ ํŒ์ •ํ•ฉ๋‹ˆ๋‹ค (PoC โ€” ์‹ค ์šด์˜ ์‹œ ๋ฒ•๋ฌด ๊ฒ€ํ†  ํ•„์ˆ˜).
"""
from __future__ import annotations
import hashlib
import hmac
import re
import secrets
from dataclasses import asdict, dataclass, field
from typing import Dict, List, Optional, Tuple
# =========================================================================
# DPV mapping - PII Scanner entity_type -> DPV 2.0 personal data category
# =========================================================================
# Format: entity_type -> (dpv_concept, explanatory note)
DPV_CATEGORY: Dict[str, Tuple[str, str]] = {
    # Korean (KR_*) entity types
    "KR_RRN": ("dpv-pd:NationalIdentificationNumber", "์ง์ ‘์‹๋ณ„์ž: ํ•œ๊ตญ ์ฃผ๋ฏผ๋“ฑ๋ก๋ฒˆํ˜ธ"),
    "KR_PASSPORT": ("dpv-pd:PassportNumber", "์ง์ ‘์‹๋ณ„์ž: ํ•œ๊ตญ ์—ฌ๊ถŒ๋ฒˆํ˜ธ"),
    "KR_PHONE": ("dpv-pd:TelephoneNumber", "์ง์ ‘์‹๋ณ„์ž: ํ•œ๊ตญ ํœด๋Œ€ํฐ"),
    "KR_BIZ_NO": ("dpv-pd:Identifier", "๋ฒ•์ธ ์‹๋ณ„์ž (์‚ฌ์—…์ž๋“ฑ๋ก๋ฒˆํ˜ธ)"),
    "KR_ADDRESS": ("dpv-pd:HomeAddress", "์ค€์‹๋ณ„์ž: ํ•œ๊ตญ ์ฃผ์†Œ"),
    # Entity types without a country prefix (plus US_SSN)
    "EMAIL_ADDRESS": ("dpv-pd:EmailAddress", "์ง์ ‘์‹๋ณ„์ž: ์ด๋ฉ”์ผ"),
    "PHONE_NUMBER": ("dpv-pd:TelephoneNumber", "์ง์ ‘์‹๋ณ„์ž: ์ผ๋ฐ˜ ์ „ํ™”๋ฒˆํ˜ธ"),
    "CREDIT_CARD": ("dpv-pd:CreditCardNumber", "๋ฏผ๊ฐ/๊ธˆ์œต: ์‹ ์šฉ์นด๋“œ"),
    "US_SSN": ("dpv-pd:NationalIdentificationNumber", "์ง์ ‘์‹๋ณ„์ž: ๋ฏธ๊ตญ SSN"),
    "URL": ("dpv-pd:URL", "์ค€์‹๋ณ„์ž(๊ฐ€๋Šฅ): URL"),
    "IP_ADDRESS": ("dpv-pd:IPAddress", "์ค€์‹๋ณ„์ž/Tracking"),
    "IBAN_CODE": ("dpv-pd:BankAccount", "๋ฏผ๊ฐ/๊ธˆ์œต: IBAN"),
    "VIP_PERSON": ("dpv-pd:Name", "์ง์ ‘์‹๋ณ„์ž: ์ž์—ฐ์ธ ์ด๋ฆ„"),
    # Entries mapped to dpv:NonPersonalData (internal code names, credentials)
    "INTERNAL_PROJECT": ("dpv:NonPersonalData", "๋‚ด๋ถ€ ์ฝ”๋“œ๋ช… (PII ์•„๋‹˜)"),
    "AWS_ACCESS_KEY": ("dpv:NonPersonalData", "๋น„๋ฐ€: ์ž๊ฒฉ์ฆ๋ช…"),
    "GENERIC_API_KEY": ("dpv:NonPersonalData", "๋น„๋ฐ€: ์ž๊ฒฉ์ฆ๋ช… ํ›„๋ณด"),
    # Japanese PII - APPI / My Number Act
    "JP_MY_NUMBER": ("dpv-pd:NationalIdentificationNumber", "ๅ€‹ไบบ่ญ˜ๅˆฅ็ฌฆๅท: ใƒžใ‚คใƒŠใƒณใƒใƒผ (12์ž๋ฆฌ). ๋ณ„๋„๋ฒ•(ใƒžใ‚คใƒŠใƒณใƒใƒผๆณ•) ์ ์šฉ"),
    "JP_PASSPORT": ("dpv-pd:PassportNumber", "ๅ€‹ไบบ่ญ˜ๅˆฅ็ฌฆๅท: ์ผ๋ณธ ์—ฌ๊ถŒ๋ฒˆํ˜ธ"),
    "JP_DRIVERS_LICENSE": ("dpv-pd:Identifier", "ๅ€‹ไบบ่ญ˜ๅˆฅ็ฌฆๅท: ์šด์ „๋ฉดํ—ˆ๋ฒˆํ˜ธ (12์ž๋ฆฌ)"),
    "JP_PHONE": ("dpv-pd:TelephoneNumber", "์ง์ ‘์‹๋ณ„์ž: ์ผ๋ณธ ์ „ํ™”๋ฒˆํ˜ธ"),
    "JP_POSTAL_CODE": ("dpv-pd:HomeAddress", "์ค€์‹๋ณ„์ž: ์ผ๋ณธ ์šฐํŽธ๋ฒˆํ˜ธ"),
    "JP_ADDRESS": ("dpv-pd:HomeAddress", "์ค€์‹๋ณ„์ž: ์ผ๋ณธ ์ฃผ์†Œ"),
    "JP_CORPORATE_NUMBER": ("dpv:NonPersonalData", "๋ฒ•์ธ๋ฒˆํ˜ธ (13์ž๋ฆฌ) โ€” ๅ€‹ไบบๆƒ…ๅ ฑ ์•„๋‹˜"),
    "JP_BANK_ACCOUNT": ("dpv-pd:BankAccount", "๋ฏผ๊ฐ/๊ธˆ์œต: ์ผ๋ณธ ์€ํ–‰๊ตฌ์ขŒ"),
}
# =========================================================================
# Per-jurisdiction regulatory matrix
# =========================================================================
# How each jurisdiction classifies an entity_type (direct/quasi/sensitive/secret)
JURISDICTION: Dict[str, Dict] = {
    # Each entry carries display metadata (name/law/url/notes) plus lists of
    # entity types per classification label; classify_entity() reads the lists.
    "KR": {
        "name": "๋Œ€ํ•œ๋ฏผ๊ตญ",
        "law": "๊ฐœ์ธ์ •๋ณด๋ณดํ˜ธ๋ฒ• ยง2ยทยง28-2~7 ยท ๊ฐ€๋ช…์ •๋ณด ์ฒ˜๋ฆฌ ๊ฐ€์ด๋“œ๋ผ์ธ (PIPC)",
        "url": "https://www.pipc.go.kr/",
        "notes": (
            "์ง์ ‘์‹๋ณ„์ž๋Š” ๊ฐ€๋ช…ํ™” ์‹œ ๋น„๊ฐ€์—ญ์  ๋ณ€ํ™˜ ํ•„์š”. ์ถ”๊ฐ€์ •๋ณด(๋งคํ•‘ ํ‚ค ๋“ฑ)๋Š” "
            "๋ณ„๋„ ๋ถ„๋ฆฌ ๋ณด๊ด€. ์ค€์‹๋ณ„์ž ์กฐํ•ฉ์œผ๋กœ ์žฌ์‹๋ณ„ ๊ฐ€๋Šฅ์„ฑ์ด ๋‚ฎ์•„์•ผ ํ•จ "
            "(k-์ต๋ช…์„ฑยทl-๋‹ค์–‘์„ฑ ๋“ฑ ์ ์ •์„ฑ ๊ฒ€ํ† ). ์ต๋ช…์ •๋ณด๋Š” ์–ด๋–ค ์ถ”๊ฐ€์ •๋ณด๋กœ๋„ "
            "๋ณต์› ๋ถˆ๊ฐ€๋Šฅํ•ด์•ผ ํ•จ (ยง2 ์ œ1ํ˜ธ์˜2)."
        ),
        "direct": ["KR_RRN", "KR_PASSPORT", "EMAIL_ADDRESS", "PHONE_NUMBER", "KR_PHONE", "VIP_PERSON", "US_SSN"],
        "quasi": ["KR_ADDRESS", "KR_BIZ_NO", "IP_ADDRESS", "URL"],
        "sensitive":["CREDIT_CARD", "IBAN_CODE"],
        "secret": ["AWS_ACCESS_KEY", "GENERIC_API_KEY"],
    },
    "JP": {
        "name": "ๆ—ฅๆœฌ",
        "law": "ๅ€‹ไบบๆƒ…ๅ ฑไฟ่ญทๆณ• (APPI) ยง2ยทยง16-2ยทยง35-2ยทยง43 / ใƒžใ‚คใƒŠใƒณใƒใƒผๆณ• ยง3ยทยง19 / PPCใ€ŽไปฎๅๅŠ ๅทฅๆƒ…ๅ ฑใƒปๅŒฟๅๅŠ ๅทฅๆƒ…ๅ ฑใฎไฝœๆˆๅŸบๆบ–ใ€",
        "url": "https://www.ppc.go.jp/",
        "notes": (
            "ๅ€‹ไบบ่ญ˜ๅˆฅ็ฌฆๅท(ใƒžใ‚คใƒŠใƒณใƒใƒผยทๆ—…ๅˆธยท้‹่ปขๅ…่จฑยทๆŒ‡็ด‹ ๋“ฑ)๋Š” ไปฎๅๅŠ ๅทฅ ์‹œ "
            "ๅ‰Š้™ค ๋˜๋Š” ๅพฉๅ…ƒไธๅฏ่ƒฝๅŒ–. ่ฆ้…ๆ…ฎๅ€‹ไบบๆƒ…ๅ ฑ(๋ณ‘๋ ฅยท๋ฒ”์ฃ„๊ฒฝ๋ ฅ ๋“ฑ)๋Š” ์‚ฌ์ „ "
            "๋™์˜ ํ•„์ˆ˜. ใƒžใ‚คใƒŠใƒณใƒใƒผใฏ็‰นๅฎšๅ€‹ไบบๆƒ…ๅ ฑใจใ—ใฆๅˆฅ้€”ๅŽณๆ ผ่ฆๅˆถ("
            "ๅˆฉ็”จ็›ฎ็š„้™ๅฎšยทๆš—ๅทๅŒ–ๅฟ…้ ˆ). ไปฎๅๅŠ ๅทฅๆƒ…ๅ ฑ๋Š” ์ถ”๊ฐ€์ •๋ณด ๋ณ„๋„ ๊ด€๋ฆฌ ์‹œ "
            "์‹๋ณ„ ๊ฐ€๋Šฅ. ๅŒฟๅๅŠ ๅทฅๆƒ…ๅ ฑ๋Š” ๅพฉๅ…ƒไธๅฏ่ƒฝ + ๅŠ ๅทฅๆ–นๆณ• ์ •๋ณด ๋ณด์กด ํ•„์š”."
        ),
        # Individual identification codes (enumerated by cabinet order) -
        # personal information on their own, i.e. direct identifiers
        "direct": ["JP_MY_NUMBER", "JP_PASSPORT", "JP_DRIVERS_LICENSE",
                   "EMAIL_ADDRESS", "JP_PHONE", "PHONE_NUMBER", "VIP_PERSON"],
        "quasi": ["JP_ADDRESS", "JP_POSTAL_CODE", "IP_ADDRESS", "URL"],
        # APPI special-care-required personal information - separate opt-in
        # consent is mandatory (kept under its own label)
        "sensitive_appi": [], # extend here when medical / medical-history detectors are added
        "sensitive": ["CREDIT_CARD", "JP_BANK_ACCOUNT", "IBAN_CODE"],
        "secret": ["AWS_ACCESS_KEY", "GENERIC_API_KEY"],
        # My Number Act applies - suppress is always enforced, and the verdict
        # is "insufficient" when the item is not handled that way
        "my_number_act": ["JP_MY_NUMBER"],
    },
    "US": {
        "name": "United States",
        "law": "HIPAA Safe Harbor (45 CFR ยง164.514(b)(2)) ยท CCPA/CPRA ยท NIST SP 800-188",
        "url": "https://www.hhs.gov/hipaa/",
        "notes": (
            "HIPAA Safe Harbor 18๊ฐ€์ง€ ์‹๋ณ„์ž ๋ชจ๋‘ ์ œ๊ฑฐ/์ผ๋ฐ˜ํ™” โ€” ์ด๋ฆ„ยท์ฃผ์†Œ "
            "(์•ž 3์ž๋ฆฌ ZIP ๋งŒ ๊ฐ€๋Šฅ, ์ธ๊ตฌ โ‰ฅ20,000), ๋ชจ๋“  ๋‚ ์งœ(์—ฐ๋„๋งŒ), ์ „ํ™”ยทํŒฉ์Šคยท"
            "์ด๋ฉ”์ผยทSSNยท๊ณ„์ •ยท์ธ์ฆ์„œยท์ฐจ๋Ÿ‰ยท๊ธฐ๊ธฐยทURLยทIPยท์ƒ์ฒด์ •๋ณดยท์‚ฌ์ง„. CCPA "
            "deidentified data ๋Š” 'cannot reasonably identify' + ๊ธฐ์ˆ ยท๊ณ„์•ฝ ํ†ต์ œ."
        ),
        "direct": ["KR_RRN", "KR_PASSPORT", "EMAIL_ADDRESS", "PHONE_NUMBER", "KR_PHONE", "VIP_PERSON", "US_SSN", "URL", "IP_ADDRESS", "CREDIT_CARD", "IBAN_CODE"],
        "quasi": ["KR_ADDRESS", "KR_BIZ_NO"],
        "sensitive":[],
        "secret": ["AWS_ACCESS_KEY", "GENERIC_API_KEY"],
    },
    "EU": {
        "name": "European Union",
        "law": "GDPR Art. 4(5), Recital 26 ยท EDPB Guidelines 01/2025 on Pseudonymisation ยท WP29 Op. 05/2014",
        "url": "https://edpb.europa.eu/",
        "notes": (
            "Pseudonymisation = ์ถ”๊ฐ€์ •๋ณด๋ฅผ ๋ณ„๋„๋กœ ๋ณด๊ด€ํ•˜๊ณ  ๊ธฐ์ˆ ยท์กฐ์ง์  ์กฐ์น˜๋กœ "
            "์žฌ์‹๋ณ„์„ ์ฐจ๋‹จ(Art 4(5)). Anonymisation = ์–ด๋–ค ํ•ฉ๋ฆฌ์  ์ˆ˜๋‹จ์œผ๋กœ๋„ "
            "์žฌ์‹๋ณ„ ๋ถˆ๊ฐ€๋Šฅ (Recital 26). Special categories(Art 9: ๊ฑด๊ฐ•ยท์ƒ์ฒดยท"
            "๋ฏผ์กฑ ๋“ฑ) ์€ ์ถ”๊ฐ€ ๋ณดํ˜ธ. Singling-out, linkability, inference 3๊ฐœ "
            "๋ฆฌ์Šคํฌ๊ฐ€ ๋ชจ๋‘ ์ œ๊ฑฐ๋˜์–ด์•ผ ์ต๋ช…."
        ),
        "direct": ["KR_RRN", "KR_PASSPORT", "EMAIL_ADDRESS", "PHONE_NUMBER", "KR_PHONE", "VIP_PERSON", "US_SSN", "IP_ADDRESS"],
        "quasi": ["KR_ADDRESS", "KR_BIZ_NO", "URL"],
        "sensitive":["CREDIT_CARD", "IBAN_CODE"],
        "secret": ["AWS_ACCESS_KEY", "GENERIC_API_KEY"],
    },
}
# =========================================================================
# Recommended techniques (citing the ISO/IEC 20889 classification)
# =========================================================================
TECHNIQUE: Dict[str, Tuple[str, str, str]] = {
    # entity -> (technique_id, ISO 20889 citation, DPV expression)
    # technique_id is resolved by Pseudonymizer.transform to the method
    # named "_" + technique_id.
    "KR_RRN": ("tokenize_random", "ISO 20889 ยง8.4 Tokenization (random) โ€” ์ง์ ‘์‹๋ณ„์ž, ๋งคํ•‘ ํ‚ค๋Š” ๋ถ„๋ฆฌ ๋ณด๊ด€", "dpv:Pseudonymisation+dpv:Tokenisation"),
    "KR_PASSPORT": ("tokenize_random", "ISO 20889 ยง8.4 Tokenization", "dpv:Pseudonymisation+dpv:Tokenisation"),
    "KR_PHONE": ("mask_partial", "ISO 20889 ยง7.5 Masking โ€” ์•ž 3 / ๋’ค 4 ์œ ์ง€", "dpv:DataMasking"),
    "PHONE_NUMBER": ("mask_partial", "ISO 20889 ยง7.5 Masking", "dpv:DataMasking"),
    "EMAIL_ADDRESS": ("hash_local_keep_domain", "ISO 20889 ยง8.4 Cryptographic โ€” local ๋ถ€๋ถ„ HMAC-BLAKE2b, ๋„๋ฉ”์ธ ์œ ์ง€", "dpv:Pseudonymisation+dpv:Encryption"),
    "KR_ADDRESS": ("generalize_to_city", "ISO 20889 ยง7.2 Generalization โ€” ์‹œยท๋„ ๋‹จ์œ„๊นŒ์ง€ ์ผ๋ฐ˜ํ™” (HIPAA SH ยง164.514(b)(2)(i)(B) ์™€ ์ •ํ•ฉ)", "dpv:Generalisation"),
    "CREDIT_CARD": ("mask_pan", "PCI-DSS Req 3.4 โ€” ์•ž 6 / ๋’ค 4 ์œ ์ง€, ์ค‘๊ฐ„ ๋งˆ์Šคํ‚น", "dpv:DataMasking"),
    "US_SSN": ("tokenize_random", "์ง์ ‘์‹๋ณ„์ž โ†’ ๋น„๊ฐ€์—ญ ํ† ํฐ", "dpv:Pseudonymisation+dpv:Tokenisation"),
    "IBAN_CODE": ("mask_partial", "ISO 20889 ยง7.5 Masking โ€” ๊ตญ๊ฐ€์ฝ”๋“œ+์ฒดํฌ๋””์ง€ํŠธ ์œ ์ง€, ๊ณ„์ขŒ๋ถ€ ์ค‘๊ฐ„ ๋งˆ์Šคํ‚น", "dpv:DataMasking"),
    "VIP_PERSON": ("pseudonym_consistent", "ISO 20889 ยง8.4 Pseudonymisation โ€” ๋™์ผ์ธ ์ผ๊ด€ ๋งคํ•‘(HMAC)", "dpv:Pseudonymisation"),
    "AWS_ACCESS_KEY": ("suppress", "๋น„๋ฐ€ ์ž๊ฒฉ์ฆ๋ช… โ€” ์ฆ‰์‹œ ํšŒ์ „(rotate) + ์™„์ „ ์ œ๊ฑฐ", "dpv:Erasure"),
    "GENERIC_API_KEY": ("suppress", "๋น„๋ฐ€ ํ›„๋ณด โ€” ๋ณด์ˆ˜์  ์™„์ „ ์ œ๊ฑฐ", "dpv:Erasure"),
    "INTERNAL_PROJECT": ("tokenize_random", "๋‚ด๋ถ€ ์ฝ”๋“œ๋ช… โ€” ์™ธ๋ถ€ ๋…ธ์ถœ ์‹œ ๋ฌด์ž‘์œ„ ํ† ํฐ", "dpv:Pseudonymisation"),
    "KR_BIZ_NO": ("mask_partial", "๋ฒ•์ธ ID โ€” ๋ถ€๋ถ„ ๋งˆ์Šคํ‚น (์ „์ฒด ๋น„์‹๋ณ„ ์‹œ์—๋Š” ํ† ํฐํ™”)", "dpv:DataMasking"),
    "URL": ("generalize_url", "ISO 20889 ยง7.2 โ€” ํ˜ธ์ŠคํŠธ๋งŒ ์œ ์ง€, ๊ฒฝ๋กœ/์ฟผ๋ฆฌ ์ œ๊ฑฐ", "dpv:Generalisation"),
    "IP_ADDRESS": ("ip_truncate", "ISO 20889 ยง7.2 โ€” IPv4 ๋งˆ์ง€๋ง‰ ์˜ฅํ…Ÿ ์ ˆ๋‹จ (/24)", "dpv:Generalisation"),
    # Recommended techniques for Japanese PII
    "JP_MY_NUMBER": ("suppress", "ใƒžใ‚คใƒŠใƒณใƒใƒผๆณ• ยง19 ๅˆฉ็”จ็›ฎ็š„ๅค–ไฟ็ฎก็ฆๆญข โ€” ไปฎๅๅŠ ๅทฅยทๅŒฟๅๅŠ ๅทฅ ๋ชจ๋‘ ์™„์ „ ์ œ๊ฑฐ", "dpv:Erasure"),
    "JP_PASSPORT": ("tokenize_random", "ISO 20889 ยง8.4 โ€” ๅ€‹ไบบ่ญ˜ๅˆฅ็ฌฆๅท (ๆ—…ๅˆธ็•ชๅท) ๋น„๊ฐ€์—ญ ํ† ํฐํ™”", "dpv:Pseudonymisation+dpv:Tokenisation"),
    "JP_DRIVERS_LICENSE": ("tokenize_random", "ISO 20889 ยง8.4 โ€” ๅ€‹ไบบ่ญ˜ๅˆฅ็ฌฆๅท (ๅ…ฌๅฎ‰ๅง”ๅ“กไผš็™บ่กŒ็•ชๅท) ๋น„๊ฐ€์—ญ ํ† ํฐํ™”", "dpv:Pseudonymisation+dpv:Tokenisation"),
    "JP_PHONE": ("mask_partial", "ISO 20889 ยง7.5 โ€” ์•ž 3 / ๋’ค 4 ์œ ์ง€ (ํ•œ๊ตญ KR_PHONE ๊ณผ ๋™์ผ ์ •์ฑ…)", "dpv:DataMasking"),
    "JP_POSTAL_CODE": ("generalize_postal", "์•ž 3์ž๋ฆฌ (ํ–‰์ •๊ตฌ์—ญ ๋‹จ์œ„) ๋งŒ ์œ ์ง€ โ€” HIPAA SH ์˜ ZIP3 ์ •ํ•ฉ", "dpv:Generalisation"),
    "JP_ADDRESS": ("generalize_to_city", "ISO 20889 ยง7.2 โ€” ้ƒฝ้“ๅบœ็œŒยทๅธ‚ๅŒบ็”บๆ‘ ๋‹จ์œ„๊นŒ์ง€ ์ผ๋ฐ˜ํ™”", "dpv:Generalisation"),
    "JP_CORPORATE_NUMBER": ("identity", "๋ฒ•์ธ๋ฒˆํ˜ธ๋Š” ๅ€‹ไบบๆƒ…ๅ ฑ ์•„๋‹˜ (ๆณ•ไบบ็จŽๆณ• ยง10-3 ๅ…ฌ้–‹) โ€” ๊ธฐ๋ณธ ๋ณด์กด", "dpv:NonPersonalData"),
    "JP_BANK_ACCOUNT": ("mask_partial", "ISO 20889 ยง7.5 โ€” ๆœซๅฐพ4ๆก ์œ ์ง€, ๋ณธ๋ฌธ ๋งˆ์Šคํ‚น", "dpv:DataMasking"),
}
# =========================================================================
# Data classes
# =========================================================================
@dataclass
class EntityRecord:
    """One selected finding together with its transformation and labels.

    Instances are built by run() and serialised with dataclasses.asdict().
    """

    index: int  # 1-based position among the selected findings
    entity_type: str  # scanner entity type, e.g. "KR_RRN"
    original: str  # detected text as reported by the finding
    start: int  # start offset of the span in the input text
    end: int  # end offset of the span (used as an exclusive slice bound)
    score: float  # detector score; 0 when the finding carries none
    dpv_concept: str  # DPV concept from DPV_CATEGORY
    dpv_note: str  # explanatory note from DPV_CATEGORY
    technique: str  # technique id from TECHNIQUE (e.g. "suppress")
    technique_note: str  # citation / rationale text for the technique
    technique_dpv: str  # DPV expression describing the technique
    transformed: str  # replacement text produced by the Pseudonymizer
    # {jurisdiction: 'direct'|'quasi'|'sensitive'|'secret'|'sensitive_appi'
    #                |'my_number_act'|'unmapped'} as returned by classify_entity
    classifications: Dict[str, str]
@dataclass
class ComplianceVerdict:
    """Heuristic compliance result for one jurisdiction (PoC).

    Produced by evaluate_compliance(); name/law/url/notes are copied from the
    matching JURISDICTION entry.
    """

    jurisdiction: str  # jurisdiction code, a key of JURISDICTION
    name: str  # display name of the jurisdiction
    law: str  # cited laws / guidelines
    url: str  # regulator URL
    notes: str  # summary notes for the jurisdiction
    treatment_level: str  # 'pseudonymization' | 'anonymization'
    # Entity counts per classification label (direct/quasi/sensitive/secret/
    # sensitive_appi/my_number_act); the 'unmapped' bucket is left out.
    counts: Dict[str, int]
    untreated: List[str]  # sorted, de-duplicated entity types left unchanged
    verdict: str  # 'compliant' | 'partial' | 'insufficient'
    rationale: str  # human-readable explanation of the verdict
    requirements_met: List[str]  # checklist items considered satisfied
    requirements_pending: List[str]  # checklist items still to be evidenced
# =========================================================================
# Transformers (one per ISO 20889 technique)
# =========================================================================
class Pseudonymizer:
    """Collection of de-identification transforms, one method per technique id.

    The salt and the consistency mapping table are kept on the instance. In
    production this mapping is the 'additional information' of GDPR Art 4(5)
    and should be stored separately (KMS / HSM); this PoC keeps it in memory.
    """

    def __init__(self, salt: Optional[bytes] = None, anonymize: bool = False):
        """Create a transformer.

        Args:
            salt: HMAC key for the keyed-hash techniques. A falsy value
                (None or empty bytes) is replaced by 16 random bytes.
            anonymize: when True, the consistency mapping is switched off, so
                repeated values no longer map to the same token.
        """
        self.salt = salt or secrets.token_bytes(16)
        self.anonymize = anonymize  # True also disables the consistency mapping
        # (entity_type, original value) -> token already issued for it
        self.consistent: Dict[Tuple[str, str], str] = {}
        # entity_type -> number of tokens issued so far
        self._counter: Dict[str, int] = {}

    def transform(self, entity_type: str, value: str) -> Tuple[str, str, str, str]:
        """Apply the technique registered for *entity_type* to *value*.

        Returns:
            (transformed text, technique_id, ISO 20889 note, DPV expression).
            Entity types missing from TECHNIQUE, and technique ids without a
            matching ``_<id>`` method, are suppressed.
        """
        tech_id, note, dpv = TECHNIQUE.get(
            entity_type,
            ("suppress", "๊ธฐ๋ณธ ์ •์ฑ… โ€” ๋ฏธ์ง€์ • ์—”ํ‹ฐํ‹ฐ๋Š” ๋ณด์ˆ˜์ ์œผ๋กœ ์ œ๊ฑฐ", "dpv:Erasure"),
        )
        method = getattr(self, f"_{tech_id}", self._suppress)
        return method(entity_type, value), tech_id, note, dpv

    # ---- technique implementations ----
    def _suppress(self, et, v):
        """Replace the value with a fixed redaction marker."""
        return "[REDACTED]"

    def _tokenize_random(self, et, v):
        """Return a per-entity-type sequential token such as ``<KR_RRN_0001>``.

        In pseudonymization mode the same (type, value) pair always yields the
        same token; in anonymization mode every call consumes a new number.
        """
        if not self.anonymize:
            key = (et, v)
            if key in self.consistent:
                return self.consistent[key]
        n = self._counter.get(et, 0) + 1
        self._counter[et] = n
        token = f"<{et}_{n:04d}>"
        if not self.anonymize:
            self.consistent[(et, v)] = token
        return token

    def _pseudonym_consistent(self, et, v):
        """Return a ``<PERSON_XXXXXXXX>`` pseudonym derived from an HMAC of *v*."""
        if self.anonymize:
            # Anonymization: identity is not preserved either -> random each time
            return f"<PERSON_{secrets.token_hex(3).upper()}>"
        key = (et, v)
        if key in self.consistent:
            return self.consistent[key]
        h = hmac.new(self.salt, v.encode("utf-8"), hashlib.blake2b).hexdigest()[:8]
        token = f"<PERSON_{h.upper()}>"
        self.consistent[key] = token
        return token

    def _mask_partial(self, et, v):
        """Mask the middle digits, keeping the first 3 and the last 4.

        Non-digit characters are left in place. When there are too few digits
        for at least one of them to be masked (7 or fewer), the value is
        suppressed instead of being returned unchanged.
        """
        keep_front, keep_back = 3, 4
        digit_pos = [i for i, c in enumerate(v) if c.isdigit()]
        # With exactly keep_front + keep_back digits the masked slice would be
        # empty and the original would leak through, so fail closed.
        if len(digit_pos) <= keep_front + keep_back:
            return self._suppress(et, v)
        masked = set(digit_pos[keep_front : len(digit_pos) - keep_back])
        return "".join("*" if i in masked else c for i, c in enumerate(v))

    def _mask_pan(self, et, v):
        """Mask a card number, keeping the first 6 and the last 4 digits.

        Values with fewer than 13 digits are suppressed.
        """
        digit_pos = [i for i, c in enumerate(v) if c.isdigit()]
        if len(digit_pos) < 13:
            return self._suppress(et, v)
        masked = set(digit_pos[6 : len(digit_pos) - 4])
        return "".join("*" if i in masked else c for i, c in enumerate(v))

    def _hash_local_keep_domain(self, et, v):
        """Replace the local part of an e-mail address with a keyed hash.

        The domain (everything after the first ``@``) is kept; values without
        an ``@`` are suppressed.
        """
        if "@" not in v:
            return self._suppress(et, v)
        local, domain = v.split("@", 1)
        h = hmac.new(self.salt, local.encode("utf-8"), hashlib.blake2b).hexdigest()[:8]
        return f"user-{h}@{domain}"

    # Address prefixes recognised by _generalize_to_city (Korean regions).
    _KR_PROVINCES = (
        "์„œ์šธ", "๋ถ€์‚ฐ", "๋Œ€๊ตฌ", "์ธ์ฒœ", "๊ด‘์ฃผ", "๋Œ€์ „", "์šธ์‚ฐ", "์„ธ์ข…",
        "๊ฒฝ๊ธฐ", "๊ฐ•์›", "์ถฉ๋ถ", "์ถฉ๋‚จ", "์ „๋ถ", "์ „๋‚จ", "๊ฒฝ๋ถ", "๊ฒฝ๋‚จ", "์ œ์ฃผ",
    )
    # Address prefixes recognised by _generalize_to_city (Japanese prefectures).
    _JP_PROVINCES = (
        "ๆฑไบฌ้ƒฝ", "ไบฌ้ƒฝๅบœ", "ๅคง้˜ชๅบœ", "ๅŒ—ๆตท้“",
        "็ฅžๅฅˆๅท็œŒ", "ๅŸผ็މ็œŒ", "ๅƒ่‘‰็œŒ", "่ŒจๅŸŽ็œŒ", "ๆ ƒๆœจ็œŒ", "็พค้ฆฌ็œŒ",
        "ๆ„›็Ÿฅ็œŒ", "ๅฒ้˜œ็œŒ", "ไธ‰้‡็œŒ", "้™ๅฒก็œŒ",
        "ๅ…ตๅบซ็œŒ", "ๅฅˆ่‰ฏ็œŒ", "ๅ’ŒๆญŒๅฑฑ็œŒ", "ๆป‹่ณ€็œŒ",
        "็ฆๅฒก็œŒ", "ไฝ่ณ€็œŒ", "้•ทๅดŽ็œŒ", "็†Šๆœฌ็œŒ", "ๅคงๅˆ†็œŒ", "ๅฎฎๅดŽ็œŒ", "้นฟๅ…ๅณถ็œŒ", "ๆฒ–็ธ„็œŒ",
        "ๅฎฎๅŸŽ็œŒ", "็ฆๅณถ็œŒ", "ๅฑฑๅฝข็œŒ", "็ง‹็”ฐ็œŒ", "ๅฒฉๆ‰‹็œŒ", "้’ๆฃฎ็œŒ",
        "ๆ–ฐๆฝŸ็œŒ", "ๅฏŒๅฑฑ็œŒ", "็Ÿณๅท็œŒ", "็ฆไบ•็œŒ", "ๅฑฑๆขจ็œŒ", "้•ท้‡Ž็œŒ",
        "้ณฅๅ–็œŒ", "ๅณถๆ น็œŒ", "ๅฒกๅฑฑ็œŒ", "ๅบƒๅณถ็œŒ", "ๅฑฑๅฃ็œŒ",
        "ๅพณๅณถ็œŒ", "้ฆ™ๅท็œŒ", "ๆ„›ๅช›็œŒ", "้ซ˜็Ÿฅ็œŒ",
    )

    def _generalize_to_city(self, et, v):
        """Reduce an address to its leading region name.

        The value must start with one of the known prefixes; otherwise it is
        suppressed.
        """
        # JP first (longer match)
        for p in self._JP_PROVINCES:
            if v.startswith(p):
                return f"{p} (ไปฅไธ‹ไธ€่ˆฌๅŒ–)"
        for p in self._KR_PROVINCES:
            if v.startswith(p):
                return f"{p} (์ดํ•˜ ์ผ๋ฐ˜ํ™”)"
        return self._suppress(et, v)

    def _generalize_postal(self, et, v):
        """Japanese postal code - keep only the first 3 digits."""
        m = re.search(r"(\d{3})-?\d{4}", v)
        if m:
            return f"ใ€’{m.group(1)}-****"
        return self._suppress(et, v)

    def _identity(self, et, v):
        """No transformation - for items that are not personal information."""
        return v

    def _ip_truncate(self, et, v):
        """Zero the last IPv4 octet and append ``/24``; suppress anything else."""
        m = re.match(r"(\d+)\.(\d+)\.(\d+)\.\d+", v)
        if m:
            return f"{m.group(1)}.{m.group(2)}.{m.group(3)}.0/24"
        return self._suppress(et, v)

    def _generalize_url(self, et, v):
        """Keep only scheme and host of an http(s) URL.

        The host ends at the first ``/``, ``?`` or ``#`` so that a query
        string directly after the host is not retained, and any
        ``userinfo@`` prefix is dropped. Values that do not look like an
        http(s) URL are suppressed rather than returned verbatim.
        """
        m = re.match(r"(https?://)(?:[^/?#]*@)?([^/?#]+)", v, flags=re.IGNORECASE)
        if m:
            return f"{m.group(1)}{m.group(2)}/[โ€ฆ]"
        return self._suppress(et, v)
# =========================================================================
# Classification / evaluation functions
# =========================================================================
def classify_entity(entity_type: str, jurisdictions: List[str]) -> Dict[str, str]:
    """Label *entity_type* under each requested jurisdiction.

    Returns a dict ``{jurisdiction: label}``. The label is the first rule
    list (in the precedence order below) that contains the entity type, or
    ``"unmapped"`` when none does or the jurisdiction is unknown.
    """
    # The My Number Act list is checked first (JP only): it is a stricter
    # classification than "direct", followed by the APPI special-care label.
    precedence = (
        "my_number_act",
        "sensitive_appi",
        "direct",
        "quasi",
        "sensitive",
        "secret",
    )
    labels: Dict[str, str] = {}
    for code in jurisdictions:
        ruleset = JURISDICTION.get(code)
        label = "unmapped"
        if ruleset:
            label = next(
                (cat for cat in precedence if entity_type in ruleset.get(cat, [])),
                "unmapped",
            )
        labels[code] = label
    return labels
def _requirements_per_jurisdiction(j: str, level: str) -> List[str]:
"""๊ด€ํ• /์ฒ˜๋ฆฌ์ˆ˜์ค€๋ณ„ ํ•ต์‹ฌ ์š”๊ตฌ์‚ฌํ•ญ ์ฒดํฌ๋ฆฌ์ŠคํŠธ."""
common = [
"์ง์ ‘์‹๋ณ„์ž ๋ชจ๋‘ ๋ณ€ํ™˜/์ œ๊ฑฐ",
"๋ฏผ๊ฐ์ •๋ณด(๊ธˆ์œตยท๊ฑด๊ฐ• ๋“ฑ) ๋งˆ์Šคํ‚น/์ œ๊ฑฐ",
"๋น„๋ฐ€ ์ž๊ฒฉ์ฆ๋ช… ์™„์ „ ์ œ๊ฑฐ",
]
by_level = {
"pseudonymization": [
"์ถ”๊ฐ€์ •๋ณด(๋งคํ•‘ยทํ‚ค) ๋ณ„๋„ ๋ถ„๋ฆฌ ๋ณด๊ด€",
"์žฌ์‹๋ณ„ ์‹œ๋„ ๋ฐฉ์ง€ ๊ธฐ์ˆ ยท์กฐ์ง์  ์กฐ์น˜",
],
"anonymization": [
"์ถ”๊ฐ€์ •๋ณด๋ฅผ ํฌํ•จํ•œ ์–ด๋–ค ํ•ฉ๋ฆฌ์  ์ˆ˜๋‹จ์œผ๋กœ๋„ ์žฌ์‹๋ณ„ ๋ถˆ๊ฐ€",
"์ค€์‹๋ณ„์ž ์กฐํ•ฉ ์žฌ์‹๋ณ„ ์œ„ํ—˜ ๊ฒ€์ฆ (k-์ต๋ช…์„ฑ ๋“ฑ)",
"๊ฒฐ๊ณผ์˜ ๋ถ„ํฌยท์ผ๋ฐ˜ํ™” ์ˆ˜์ค€ ํ†ต๊ณ„์  ๊ฒ€์ฆ",
],
}
j_specific = {
"KR": ["๊ฐ€๋ช…์ •๋ณด ์ ์ •์„ฑ ๊ฒ€ํ† (๊ฐ€๋ช…์ •๋ณด ์ฒ˜๋ฆฌ ๊ฐ€์ด๋“œ๋ผ์ธ)"] if level == "pseudonymization"
else ["์ ์ •์„ฑ ํ‰๊ฐ€ + ์ถ”๊ฐ€์ •๋ณด ํ๊ธฐ"],
"JP": ["ไปฎๅๅŠ ๅทฅๆƒ…ๅ ฑใฎๅฎ‰ๅ…จ็ฎก็†ๆŽช็ฝฎ (APPI ยง35-2)"] if level == "pseudonymization"
else ["ๅŒฟๅๅŠ ๅทฅๆƒ…ๅ ฑใฎไฝœๆˆๆ–นๆณ•็ญ‰ใฎๅ…ฌ่กจ (APPI ยง43)"],
"US": ["HIPAA Safe Harbor 18 ์‹๋ณ„์ž ํ•ญ๋ชฉ ๋ชจ๋‘ ์ฒ˜๋ฆฌ",
"Expert Determination ํŠธ๋ž™ ์‹œ ํ†ต๊ณ„ ์ „๋ฌธ๊ฐ€ ๊ฒ€์ฆ"],
"EU": ["EDPB 01/2025 โ€” singling-out / linkability / inference 3๊ฐœ ์œ„ํ—˜ ํ‰๊ฐ€",
"Art 32 ์ ์ ˆํ•œ ๋ณด์•ˆ์กฐ์น˜"],
}
return common + by_level[level] + j_specific.get(j, [])
def evaluate_compliance(
    entities: List[EntityRecord],
    jurisdictions: List[str],
    treatment_level: str,
) -> List[ComplianceVerdict]:
    """Produce one heuristic verdict per known jurisdiction (PoC).

    Jurisdictions missing from JURISDICTION are skipped. For each remaining
    one the verdict is decided in this order:

    1. a "my_number_act" entity whose technique is not "suppress"
       -> "insufficient";
    2. a direct/secret/sensitive/sensitive_appi/my_number_act entity whose
       transformed text equals the original -> "insufficient";
    3. anonymization with two or more quasi-identifiers -> "partial";
    4. otherwise -> "compliant".
    """
    # Labels whose entities must not come out of the transform unchanged.
    must_change = {"direct", "secret", "sensitive", "sensitive_appi", "my_number_act"}
    anonymizing = treatment_level == "anonymization"
    verdicts: List[ComplianceVerdict] = []

    for j in jurisdictions:
        rules = JURISDICTION.get(j)
        if not rules:
            continue

        tally = dict.fromkeys(
            ("direct", "quasi", "sensitive", "secret",
             "sensitive_appi", "my_number_act", "unmapped"),
            0,
        )
        leftover: List[str] = []
        # My Number Act - anything other than the 'suppress' technique
        # (= [REDACTED]) counts as a violation.
        my_number_bad: List[str] = []
        # Special-care personal information - opt-in consent cannot be judged
        # automatically, so these are collected as a pending item.
        appi_sensitive: List[str] = []

        for rec in entities:
            label = rec.classifications.get(j, "unmapped")
            tally[label] = tally.get(label, 0) + 1
            if label in must_change and rec.transformed == rec.original:
                leftover.append(rec.entity_type)
            if label == "my_number_act":
                if rec.technique != "suppress":
                    my_number_bad.append(rec.entity_type)
            elif label == "sensitive_appi":
                appi_sensitive.append(rec.entity_type)

        reqs = _requirements_per_jurisdiction(j, treatment_level)
        quasi_total = tally.get("quasi", 0)

        # Simple heuristic decision (PoC). The My Number Act check comes
        # first and overrides every other outcome.
        if my_number_bad:
            verdict = "insufficient"
            offenders = ', '.join(sorted(set(my_number_bad)))
            rationale = (
                f"ใƒžใ‚คใƒŠใƒณใƒใƒผๆณ• ยง19 ์œ„๋ฐ˜: {offenders} โ€” "
                "๋งˆ์ด๋„˜๋ฒ„๋Š” ๅˆฉ็”จ็›ฎ็š„ๅค–ไฟ็ฎก็ฆๆญข โ†’ ๋งˆ์Šคํ‚น/ํ† ํฐํ™” ๋ถˆ๊ฐ€, ์™„์ „ ์ œ๊ฑฐ(suppress) ํ•„์ˆ˜."
            )
            met = ["๊ฒ€์ถœ/๋ถ„๋ฅ˜ ์™„๋ฃŒ"]
            pending = [
                "JP_MY_NUMBER ์˜ technique ์„ 'suppress' ๋กœ ๋ณ€๊ฒฝ (์™„์ „ ์ œ๊ฑฐ)",
                "ใƒžใ‚คใƒŠใƒณใƒใƒผๆณ• ยง12 ๅฎ‰ๅ…จ็ฎก็†ๆŽช็ฝฎ โ€” ๆš—ๅทๅŒ–ยท์ ‘๊ทผํ†ต์ œ ์ฆ๋น™",
                "็›ฎ็š„ๅค–ๅˆฉ็”จยทๆไพ›็ฆๆญข (ๆณ• ยง20)",
            ] + reqs
        elif leftover:
            verdict = "insufficient"
            missed = ', '.join(sorted(set(leftover)))
            rationale = (
                f"๋ฏธ์ฒ˜๋ฆฌ ์ง์ ‘/๋ฏผ๊ฐ/๋น„๋ฐ€ ํ•ญ๋ชฉ ์กด์žฌ: {missed} โ€” "
                f"{rules['name']} ๊ธฐ์ค€ {treatment_level} ๋ฏธ์ถฉ์กฑ."
            )
            met = ["๊ฒ€์ถœ/๋ถ„๋ฅ˜ ์™„๋ฃŒ"]
            pending = ["๋ฏธ์ฒ˜๋ฆฌ ํ•ญ๋ชฉ ๋ณ€ํ™˜"] + reqs
        elif anonymizing and quasi_total >= 2:
            # Anonymization with several quasi-identifiers left: only
            # partial, manual verification is required.
            verdict = "partial"
            rationale = (
                f"์ค€์‹๋ณ„์ž {quasi_total}๊ฐœ โ€” ์กฐํ•ฉ ์žฌ์‹๋ณ„ ์œ„ํ—˜. "
                "k-์ต๋ช…์„ฑยทl-๋‹ค์–‘์„ฑยทt-๊ทผ์ ‘์„ฑ ๋“ฑ ์ •๋Ÿ‰ ๊ฒ€์ฆ ํ•„์š”."
            )
            met = [
                "์ง์ ‘ยท๋ฏผ๊ฐยท๋น„๋ฐ€ ๋ชจ๋‘ ๋ณ€ํ™˜",
                "PII ๊ฒ€์ถœยท๋ถ„๋ฅ˜ยทDPV ๋งคํ•‘ ์™„๋ฃŒ",
            ]
            pending = [
                "์ค€์‹๋ณ„์ž ์กฐํ•ฉ์— ๋Œ€ํ•œ ์žฌ์‹๋ณ„ ์œ„ํ—˜ ์ •๋Ÿ‰ ํ‰๊ฐ€",
                "์ถ”๊ฐ€์ •๋ณด(๋งคํ•‘) ํ๊ธฐ ์ ˆ์ฐจ",
            ]
        elif anonymizing:
            verdict = "compliant"
            rationale = (
                "์ง์ ‘ยท๋ฏผ๊ฐยท๋น„๋ฐ€ ๋ชจ๋‘ ๋ณ€ํ™˜๋˜์—ˆ๊ณ  ์ค€์‹๋ณ„์ž ์ผ๋ฐ˜ํ™” ์ ์šฉ. "
                "๋‹จ PoC ํœด๋ฆฌ์Šคํ‹ฑ โ€” ์‹ค ์šด์˜ ์‹œ ํ†ต๊ณ„์  ์ ์ •์„ฑ ๊ฒ€ํ†  ํ•„์ˆ˜."
            )
            met = [
                "์ง์ ‘ยท๋ฏผ๊ฐยท๋น„๋ฐ€ ๋ชจ๋‘ ๋ณ€ํ™˜",
                "์ค€์‹๋ณ„์ž ์ผ๋ฐ˜ํ™”/์ œ๊ฑฐ",
                "DPV ๋งคํ•‘ + ๊ด€ํ•  ๋ถ„๋ฅ˜ ๋ช…์„ธํ™”",
            ]
            pending = [
                "์ถ”๊ฐ€์ •๋ณด(๋งคํ•‘) ํ๊ธฐ ๋˜๋Š” ๋ถ„๋ฆฌ ํ๊ธฐ ์ฆ๋น™",
                "ํ†ต๊ณ„์  ์žฌ์‹๋ณ„ ์œ„ํ—˜ ํ‰๊ฐ€ ๋ณด๊ณ ์„œ",
            ]
        else:
            # Pseudonymization - treated as compliant on the premise that the
            # additional information is stored separately.
            verdict = "compliant"
            n_treated = tally["direct"] + tally["sensitive"] + tally["secret"]
            rationale = (
                f"์ง์ ‘ยท๋ฏผ๊ฐยท๋น„๋ฐ€ {n_treated}๊ฑด ๋ชจ๋‘ ๋ณ€ํ™˜ ์™„๋ฃŒ. ๋งคํ•‘ ํ…Œ์ด๋ธ”์€ "
                "๋ณธ PoC ๊ฐ€ ๋ฉ”๋ชจ๋ฆฌ์— ๋ณด๊ด€ โ€” ์‹ค ์šด์˜ ์‹œ KMS/HSM ๋ถ„๋ฆฌ ๋ณด๊ด€ ํ•„์š”."
            )
            met = [
                "์ง์ ‘์‹๋ณ„์ž ๋ชจ๋‘ ๋ณ€ํ™˜/์ œ๊ฑฐ",
                "๋ฏผ๊ฐ์ •๋ณด ๋งˆ์Šคํ‚น/์ œ๊ฑฐ",
                "๋น„๋ฐ€ ์ž๊ฒฉ์ฆ๋ช… ์™„์ „ ์ œ๊ฑฐ",
                "DPV ๋งคํ•‘ + ์ฒ˜๋ฆฌ ํ๋ฆ„ ๋ฌธ์„œํ™”",
            ]
            pending = [
                "๋งคํ•‘ ํ…Œ์ด๋ธ”์˜ ๋ณ„๋„ ๋ณด๊ด€ (KMSยทHSM)",
                "์žฌ์‹๋ณ„ ์‹œ๋„ ๋ฐฉ์ง€ ๊ธฐ์ˆ ยท์กฐ์ง์  ์กฐ์น˜ (Art 32 / APPI ๅฎ‰ๅ…จ็ฎก็†)",
                "(KR) ๊ฐ€๋ช…์ •๋ณด ์ ์ •์„ฑ ๊ฒ€ํ† ",
            ]

        # KR only: stress the guideline's adequacy review.
        if j == "KR":
            pending.append("๊ฐ€๋ช…์ •๋ณด ์ฒ˜๋ฆฌ ๊ฐ€์ด๋“œ๋ผ์ธ ยงIII-3 ์ ์ •์„ฑ ๊ฒ€ํ† ์œ„์›ํšŒ ์˜์‚ฌ๋ก")
        # JP special-care personal information: consent cannot be verified
        # automatically, so it is always reported as pending.
        if j == "JP" and appi_sensitive:
            distinct = len(set(appi_sensitive))
            pending.append(
                f"่ฆ้…ๆ…ฎๅ€‹ไบบๆƒ…ๅ ฑ {distinct}๊ฑด โ€” "
                "APPI ยง20 ์‚ฌ์ „ ์˜ตํŠธ์ธ ๋™์˜ ํ™•๋ณด ์ฆ๋น™ ํ•„์š”"
            )

        reported_counts = {
            label: total for label, total in tally.items() if label != "unmapped"
        }
        verdicts.append(ComplianceVerdict(
            jurisdiction=j,
            name=rules["name"],
            law=rules["law"],
            url=rules["url"],
            notes=rules["notes"],
            treatment_level=treatment_level,
            counts=reported_counts,
            untreated=sorted(set(leftover)),
            verdict=verdict,
            rationale=rationale,
            requirements_met=met,
            requirements_pending=pending,
        ))
    return verdicts
# =========================================================================
# Main entry point - text + findings -> transformed text + evaluation
# =========================================================================
def run(
    text: str,
    findings: List[Dict],
    jurisdictions: List[str],
    treatment_level: str,
    salt: Optional[bytes] = None,
) -> Dict:
    """Transform *text* using PII Scanner *findings* and evaluate compliance.

    An unrecognised *treatment_level* falls back to "pseudonymization", and
    when none of *jurisdictions* is known, all known jurisdictions are used.
    Each finding is read via its "start", "end", "entity_type" and "text"
    keys, plus an optional "score".

    Returns a dict with the effective level and jurisdictions, the original
    and transformed text, the entity records and verdicts (as plain dicts),
    and a demo copy of the consistency mapping.
    """
    if treatment_level not in ("pseudonymization", "anonymization"):
        treatment_level = "pseudonymization"

    valid = [code for code in jurisdictions if code in JURISDICTION] or list(JURISDICTION.keys())
    pz = Pseudonymizer(salt=salt, anonymize=(treatment_level == "anonymization"))

    # Order by start offset (higher score first on ties), then keep only
    # findings that begin at or after the end of the previously kept one.
    ordered = sorted(findings, key=lambda f: (f["start"], -f.get("score", 0)))
    chosen = []
    boundary = -1
    for finding in ordered:
        if finding["start"] < boundary:
            continue
        chosen.append(finding)
        boundary = finding["end"]

    records: List[EntityRecord] = []
    for position, finding in enumerate(chosen, start=1):
        et = finding["entity_type"]
        original = finding["text"]
        dpv_concept, dpv_note = DPV_CATEGORY.get(et, ("dpv:NonPersonalData", "DPV ๋งคํ•‘ ์—†์Œ"))
        transformed, tech_id, tech_note, tech_dpv = pz.transform(et, original)
        classifications = classify_entity(et, valid)
        records.append(
            EntityRecord(
                index=position,
                entity_type=et,
                original=original,
                start=finding["start"],
                end=finding["end"],
                score=float(finding.get("score", 0)),
                dpv_concept=dpv_concept,
                dpv_note=dpv_note,
                technique=tech_id,
                technique_note=tech_note,
                technique_dpv=tech_dpv,
                transformed=transformed,
                classifications=classifications,
            )
        )

    # Substitute from the end towards the front so earlier offsets stay valid.
    out_text = text
    for rec in sorted(records, key=lambda r: r.start, reverse=True):
        head = out_text[: rec.start]
        tail = out_text[rec.end :]
        out_text = head + rec.transformed + tail

    verdicts = evaluate_compliance(records, valid, treatment_level)

    # Mapping table (PoC demo only - must never be part of a production response).
    mapping_demo = []
    for (mapped_type, mapped_original), token in pz.consistent.items():
        mapping_demo.append(
            {"entity_type": mapped_type, "original": mapped_original, "token": token}
        )

    return {
        "treatment_level": treatment_level,
        "jurisdictions": valid,
        "original_text": text,
        "transformed_text": out_text,
        "entities": [asdict(rec) for rec in records],
        "verdicts": [asdict(v) for v in verdicts],
        "mapping_demo": mapping_demo,
    }