x / gdpr_filter.py
SsebaA's picture
Update gdpr_filter.py
20bf715 verified
import re
import logging
logger = logging.getLogger(__name__)
class GDPRFilter:
    """Regex-based masking of personal data in Swedish free text.

    The class is used purely as a namespace: every method is static and the
    patterns are plain class-level strings. ``apply_filter`` is the entry
    point; it replaces matches with bracketed placeholders such as
    ``[PERSONNR]``, ``[TELEFON]``, ``[EMAIL]``, ``[DATUM]``, ``[ADRESS]``,
    ``[ORT]``, ``[NAMN]`` and ``[LÄKEMEDEL]``.
    """

    # Personal identity number: optional "19"/"20" century prefix, six digits,
    # a hyphen or en dash, then four digits.
    PERSONNUMMER_PATTERN = r'\b(?:19|20)?\d{6}[-–]\d{4}\b'
    # E-mail address. The top-level domain is any run of two or more word
    # characters, so long TLDs (".online", ".museum") are masked as well.
    EMAIL_PATTERN = r'\b[\w\.\-]+@[\w\.\-]+\.\w{2,}\b'
    # Phone number introduced by "+46" or by a leading "0". The word boundary
    # is attached to the "0" alternative only: "\b" directly before "+" would
    # require a preceding word character and therefore never match a "+46"
    # number that follows whitespace or starts the text.
    PHONE_PATTERN = r'(?:\+46|\b0)[\s\-]?\d{1,3}[\s\-]?\d{3,4}[\s\-]?\d{2,4}\b'
    # Dates: YYYY-MM-DD (hyphen or en dash), and D/M/Y-style dates separated
    # by "/", "-" or ".".
    DATE_PATTERNS = [
        r'\b\d{4}[-–]\d{2}[-–]\d{2}\b',
        r'\b\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4}\b'
    ]
    # Capitalised word ending in a street-type suffix, followed by a house
    # number with an optional letter.
    ADDRESS_PATTERN = r'\b[A-ZÅÄÖ][a-zåäö]+(?:gatan|vägen|gränd|torget|plan|allén|stigen|backen|torg)\s\d+[A-Za-z]?\b'
    # Keyword groups; a hit on any of them counts as "medication context".
    MEDICATION_CONTEXT_KEYWORDS = [
        r'\b(läkemedel|medicin|mediciner|drog|droger)\b',
        r'\b(dos|doser|dosering|doseras)\b',
        r'\b(mg|ml|gram|tabletter?|kapslar?|ampuller?|dropp?ar?|injektioner?)\b',
        r'\b(ta|tar|tog|tar in|skall ta|ska ta|bör ta|måste ta|börja ta|slutat ta)\b',
        r'\b(recept|receptfritt|förskrivit|ordinerat)\b',
        r'\b(apotek|apoteket|farmaceut)\b',
        r'\b(två gånger|tre gånger|en gång|fyra gånger|dagligen|morgon|kväll|middag|natt)\b',
    ]
    # Word endings used by is_likely_medication as a suffix heuristic.
    MEDICATION_PATTERNS = [
        r'(ol|in|stat|pril|zol|am|um|ase|mycin|axin|cillin)$',
    ]
    # Medication names; a matched word pair that starts with one of these is
    # left unmasked by apply_filter.
    MEDICATION_EXCLUSIONS = [
        'Metoprolol', 'Atorvastatin', 'Losartan', 'Omeprazol',
        'Lisinopril', 'Simvastatin', 'Ramipril', 'Bisoprolol',
        'Warfarin', 'Insulin', 'Aspirin', 'Paracetamol', 'Ibuprofen',
        'Amoxicillin', 'Penicillin', 'Tetracyclin', 'Doxycyclin',
    ]

    @staticmethod
    def has_medication_context(text: str) -> bool:
        """Return True if any MEDICATION_CONTEXT_KEYWORDS pattern occurs in *text*.

        Matching is case-insensitive.
        """
        return any(
            re.search(pattern, text, re.IGNORECASE)
            for pattern in GDPRFilter.MEDICATION_CONTEXT_KEYWORDS
        )

    @staticmethod
    def is_likely_medication(word: str) -> bool:
        """Return True if *word* has at least six characters and ends in one of
        the MEDICATION_PATTERNS suffixes (case-insensitive).
        """
        if len(word) < 6:
            return False
        return any(
            re.search(pattern, word, re.IGNORECASE)
            for pattern in GDPRFilter.MEDICATION_PATTERNS
        )

    @staticmethod
    def apply_filter(text: str) -> str:
        """Mask personal data in *text* and return the masked string.

        Returns ``""`` when *text* is empty, ``None`` or whitespace only.
        The substitutions run in a fixed order: personal identity numbers,
        phone numbers, e-mail addresses, dates, street addresses, place names
        after residence phrases, and finally pairs of capitalised words
        (treated as names unless they look like medication).
        """
        if not text or not text.strip():
            return ""

        text = re.sub(GDPRFilter.PERSONNUMMER_PATTERN, '[PERSONNR]', text)
        text = re.sub(GDPRFilter.PHONE_PATTERN, '[TELEFON]', text)

        def mask_labelled_number(match):
            # The digit group may end in whitespace that the character class
            # swallowed; put it back so the placeholder is not glued to the
            # following word.
            number = match.group(2)
            trailing_space = number[len(number.rstrip()):]
            return f'{match.group(1)} [TELEFON]{trailing_space}'

        # Numbers announced by a keyword ("mobil", "telefon", "nummer", ...)
        # that PHONE_PATTERN did not catch.
        text = re.sub(
            r'(?i)(mobil(?:nummer)?|telefon(?:nummer)?|nummer)[\s:]+(\d[\d\s\-]{6,})',
            mask_labelled_number, text
        )
        text = re.sub(GDPRFilter.EMAIL_PATTERN, '[EMAIL]', text)
        for pattern in GDPRFilter.DATE_PATTERNS:
            text = re.sub(pattern, '[DATUM]', text)
        text = re.sub(GDPRFilter.ADDRESS_PATTERN, '[ADRESS]', text)
        # Place name after "bor i/på", "i stadsdelen", "i området".
        # NOTE(review): the leading (?i) applies to the whole pattern, so the
        # [A-ZÅÄÖ] classes also accept lowercase words after these phrases.
        text = re.sub(
            r'(?i)(bor\s+(?:i|på|kvar\s+i)|i\s+stadsdelen|i\s+området)\s+([A-ZÅÄÖ][a-zåäö]{2,}(?:\s+[A-ZÅÄÖ][a-zåäö]+)?)',
            r'\1 [ORT]', text
        )
        # Drop a short digit group left over directly after a placeholder
        # (e.g. the last pair of a phone number written in spaced groups).
        text = re.sub(r'(\[TELEFON\])\s+\d{1,4}\b', r'\1', text)
        text = re.sub(r'(\[PERSONNR\])\s+\d{1,4}\b', r'\1', text)

        # Evaluated once, on the already masked text.
        has_med_context = GDPRFilter.has_medication_context(text)

        def replace_name(match):
            word = match.group(0)
            # Apply the suffix heuristic to the last word of the pair only.
            # Passing the whole pair made the six-character guard ineffective
            # (a pair is always at least seven characters long), so short
            # names such as "Sven Adam" were classified as medication.
            last_word = word.split()[-1]
            if GDPRFilter.is_likely_medication(last_word):
                if has_med_context:
                    return word
                return '[LÄKEMEDEL]'
            if any(word.startswith(ex) for ex in GDPRFilter.MEDICATION_EXCLUSIONS):
                return word
            return '[NAMN]'

        # Two consecutive capitalised words separated by a single whitespace
        # character.
        text = re.sub(
            r'\b[A-ZÅÄÖ][a-zåäö]{2,}\s[A-ZÅÄÖ][a-zåäö]{2,}\b',
            replace_name, text
        )
        return text
def apply_gdpr_filter(text: str) -> str:
    """Return *text* after passing it through ``GDPRFilter.apply_filter``."""
    masked = GDPRFilter.apply_filter(text)
    return masked
def apply_dual_layer_gdpr(input_text: str, output_text: str):
    """Filter both texts with ``apply_gdpr_filter``.

    Returns a ``(filtered_input, filtered_output)`` tuple; the input text is
    processed first.
    """
    filtered_input = apply_gdpr_filter(input_text)
    filtered_output = apply_gdpr_filter(output_text)
    return filtered_input, filtered_output
if __name__ == "__main__":
    # Manual smoke run: print each sample sentence next to its filtered form.
    banner = "\n" + "="*80
    samples = (
        "Patienten tar Metoprolol 50mg två gånger dagligen",
        "Patienten är allergisk mot Metoprolol",
        "Jag heter Anna och mitt personnummer är 880415-1234",
        "Erik tar Atorvastatin 40mg enligt recept och är allergisk mot Penicillin",
    )
    print(banner)
    print("GDPR FILTER TESTS")
    print("="*80)
    for sample in samples:
        masked = apply_gdpr_filter(sample)
        print(f"\nINPUT: {sample}")
        print(f"OUTPUT: {masked}")
    print(banner)