"""Regex-based masking of personal data in Swedish free text.

Personal identifiers (personnummer, phone numbers, e-mail addresses, dates,
street addresses, place names and two-word capitalised names) are replaced
with bracketed placeholders such as ``[PERSONNR]`` or ``[NAMN]``.
"""
import re
import logging

logger = logging.getLogger(__name__)


class GDPRFilter:
    """Namespace for the masking patterns and the static filter functions."""

    # Swedish personal identity number: optional century, six digits,
    # hyphen or en dash, four digits.
    PERSONNUMMER_PATTERN = r'\b(?:19|20)?\d{6}[-–]\d{4}\b'

    # The final label accepts two or more word characters so that long TLDs
    # (".online", ".museum", ...) are masked as well.
    EMAIL_PATTERN = r'\b[\w\.\-]+@[\w\.\-]+\.\w{2,}\b'

    # The "+46" alternative must not be preceded by \b: there is no word
    # boundary between whitespace (or start of text) and "+", so a leading
    # \b would prevent "+46 70 ..." from ever matching. The domestic "0"
    # prefix keeps its word boundary.
    PHONE_PATTERN = (
        r'(?:\+46|\b0)[\s\-]?\d{1,3}[\s\-]?\d{3,4}[\s\-]?\d{2,4}\b'
    )

    # ISO-like dates (YYYY-MM-DD) and D/M/Y style dates with /, - or .
    DATE_PATTERNS = [
        r'\b\d{4}[-–]\d{2}[-–]\d{2}\b',
        r'\b\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4}\b',
    ]

    # Capitalised word ending in a street suffix, followed by a house number.
    ADDRESS_PATTERN = (
        r'\b[A-ZÅÄÖ][a-zåäö]+'
        r'(?:gatan|vägen|gränd|torget|plan|allén|stigen|backen|torg)'
        r'\s\d+[A-Za-z]?\b'
    )

    # Words whose presence anywhere in the text counts as medication context.
    MEDICATION_CONTEXT_KEYWORDS = [
        r'\b(läkemedel|medicin|mediciner|drog|droger)\b',
        r'\b(dos|doser|dosering|doseras)\b',
        r'\b(mg|ml|gram|tabletter?|kapslar?|ampuller?|dropp?ar?|injektioner?)\b',
        r'\b(ta|tar|tog|tar in|skall ta|ska ta|bör ta|måste ta|börja ta|slutat ta)\b',
        r'\b(recept|receptfritt|förskrivit|ordinerat)\b',
        r'\b(apotek|apoteket|farmaceut)\b',
        r'\b(två gånger|tre gånger|en gång|fyra gånger|dagligen|morgon|kväll|middag|natt)\b',
    ]

    # Word endings treated as typical for medication names.
    MEDICATION_PATTERNS = [
        r'(ol|in|stat|pril|zol|am|um|ase|mycin|axin|cillin)$',
    ]

    # Known medication names that must never be masked as a person name.
    MEDICATION_EXCLUSIONS = [
        'Metoprolol', 'Atorvastatin', 'Losartan', 'Omeprazol', 'Lisinopril',
        'Simvastatin', 'Ramipril', 'Bisoprolol', 'Warfarin', 'Insulin',
        'Aspirin', 'Paracetamol', 'Ibuprofen', 'Amoxicillin', 'Penicillin',
        'Tetracyclin', 'Doxycyclin',
    ]

    @staticmethod
    def has_medication_context(text: str) -> bool:
        """Return True if any medication context keyword occurs in *text*.

        Matching is case-insensitive.
        """
        return any(
            re.search(pattern, text, re.IGNORECASE)
            for pattern in GDPRFilter.MEDICATION_CONTEXT_KEYWORDS
        )

    @staticmethod
    def is_likely_medication(word: str) -> bool:
        """Return True if *word* ends with a medication-like suffix.

        Strings shorter than six characters are never considered medication.
        Matching is case-insensitive.
        """
        if len(word) < 6:
            return False
        return any(
            re.search(pattern, word, re.IGNORECASE)
            for pattern in GDPRFilter.MEDICATION_PATTERNS
        )

    @staticmethod
    def apply_filter(text: str) -> str:
        """Return *text* with personal data replaced by placeholders.

        Empty, whitespace-only or falsy input yields ``""``. The
        substitutions run in a fixed order: personnummer, phone numbers,
        e-mail, dates, street addresses, place names and finally names.
        """
        if not text or not text.strip():
            return ""

        text = re.sub(GDPRFilter.PERSONNUMMER_PATTERN, '[PERSONNR]', text)
        text = re.sub(GDPRFilter.PHONE_PATTERN, '[TELEFON]', text)
        # Digit runs introduced by a phone keyword are masked even when they
        # do not look like a Swedish number to PHONE_PATTERN.
        text = re.sub(
            r'(?i)(mobil(?:nummer)?|telefon(?:nummer)?|nummer)[\s:]+(\d[\d\s\-]{6,})',
            r'\1 [TELEFON]',
            text
        )
        text = re.sub(GDPRFilter.EMAIL_PATTERN, '[EMAIL]', text)

        for pattern in GDPRFilter.DATE_PATTERNS:
            text = re.sub(pattern, '[DATUM]', text)

        text = re.sub(GDPRFilter.ADDRESS_PATTERN, '[ADRESS]', text)
        # NOTE(review): the global (?i) flag also makes [A-ZÅÄÖ] match
        # lowercase letters, so any word of three or more letters after
        # "bor i" etc. is masked, not only capitalised place names.
        # Left as-is because tightening it would reduce masking; confirm
        # the intended behaviour.
        text = re.sub(
            r'(?i)(bor\s+(?:i|på|kvar\s+i)|i\s+stadsdelen|i\s+området)\s+([A-ZÅÄÖ][a-zåäö]{2,}(?:\s+[A-ZÅÄÖ][a-zåäö]+)?)',
            r'\1 [ORT]',
            text
        )

        # Drop a short digit group left directly after a placeholder, i.e.
        # the tail of a number that was only partially matched.
        text = re.sub(r'(\[TELEFON\])\s+\d{1,4}\b', r'\1', text)
        text = re.sub(r'(\[PERSONNR\])\s+\d{1,4}\b', r'\1', text)

        # Evaluated once, on the text as it looks after the masking above.
        has_med_context = GDPRFilter.has_medication_context(text)

        def replace_name(match):
            """Choose the replacement for one two-word capitalised match."""
            word = match.group(0)
            # NOTE(review): `word` is the full two-word match, so the suffix
            # test effectively looks at the end of the second word; a surname
            # with a medication-like ending is kept when the text has
            # medication context. Confirm this trade-off is acceptable.
            if GDPRFilter.is_likely_medication(word):
                if has_med_context:
                    return word
                return '[LÄKEMEDEL]'
            if any(word.startswith(ex) for ex in GDPRFilter.MEDICATION_EXCLUSIONS):
                return word
            return '[NAMN]'

        # Two consecutive capitalised words are treated as a candidate name.
        text = re.sub(
            r'\b[A-ZÅÄÖ][a-zåäö]{2,}\s[A-ZÅÄÖ][a-zåäö]{2,}\b',
            replace_name,
            text
        )
        return text


def apply_gdpr_filter(text: str) -> str:
    """Module-level shortcut for :meth:`GDPRFilter.apply_filter`."""
    return GDPRFilter.apply_filter(text)


def apply_dual_layer_gdpr(input_text: str, output_text: str) -> tuple:
    """Filter both texts and return them as ``(input, output)``."""
    return apply_gdpr_filter(input_text), apply_gdpr_filter(output_text)


if __name__ == "__main__":
    tests = [
        "Patienten tar Metoprolol 50mg två gånger dagligen",
        "Patienten är allergisk mot Metoprolol",
        "Jag heter Anna och mitt personnummer är 880415-1234",
        "Erik tar Atorvastatin 40mg enligt recept och är allergisk mot Penicillin",
    ]

    print("\n" + "="*80)
    print("GDPR FILTER TESTS")
    print("="*80)

    for text in tests:
        result = apply_gdpr_filter(text)
        print(f"\nINPUT: {text}")
        print(f"OUTPUT: {result}")

    print("\n" + "="*80)