| import re |
| import logging |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class GDPRFilter:
    """Regex-based redaction of personal data in Swedish free text.

    ``apply_filter`` replaces matches with bracketed placeholders:
    ``[PERSONNR]``, ``[TELEFON]``, ``[EMAIL]``, ``[DATUM]``, ``[ADRESS]``,
    ``[ORT]``, ``[NAMN]`` and ``[LÄKEMEDEL]``. All detection is heuristic
    pattern matching; there is no dictionary or model behind it.
    """

    # Optional century (19/20), six digits, hyphen or en dash, four digits.
    PERSONNUMMER_PATTERN = r'\b(?:19|20)?\d{6}[-–]\d{4}\b'
    # The last domain label may be any length >= 2 so that addresses on long
    # TLDs (".online", ".museum", ...) are redacted as well.
    EMAIL_PATTERN = r'\b[\w\.\-]+@[\w\.\-]+\.\w{2,}\b'
    # "+46" must NOT be preceded by \b: a word boundary can never sit between
    # whitespace (or start of string) and "+", which would make the
    # international prefix unmatchable in normal text. Only the national
    # "0" prefix is anchored on a word boundary.
    PHONE_PATTERN = r'(?:\+46|\b0)[\s\-]?\d{1,3}[\s\-]?\d{3,4}[\s\-]?\d{2,4}\b'
    DATE_PATTERNS = [
        # Year-first, e.g. 2024-01-31 (hyphen or en dash).
        r'\b\d{4}[-–]\d{2}[-–]\d{2}\b',
        # Day/month first with "/", "-" or "." separators, 2-4 digit year.
        r'\b\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4}\b',
    ]
    # Capitalised word ending in a street-type suffix, then a house number
    # with an optional letter.
    ADDRESS_PATTERN = r'\b[A-ZÅÄÖ][a-zåäö]+(?:gatan|vägen|gränd|torget|plan|allén|stigen|backen|torg)\s\d+[A-Za-z]?\b'

    # Any single hit (searched case-insensitively) marks the whole text as
    # having medication context.
    MEDICATION_CONTEXT_KEYWORDS = [
        r'\b(läkemedel|medicin|mediciner|drog|droger)\b',
        r'\b(dos|doser|dosering|doseras)\b',
        r'\b(mg|ml|gram|tabletter?|kapslar?|ampuller?|dropp?ar?|injektioner?)\b',
        r'\b(ta|tar|tog|tar in|skall ta|ska ta|bör ta|måste ta|börja ta|slutat ta)\b',
        r'\b(recept|receptfritt|förskrivit|ordinerat)\b',
        r'\b(apotek|apoteket|farmaceut)\b',
        r'\b(två gånger|tre gånger|en gång|fyra gånger|dagligen|morgon|kväll|middag|natt)\b',
    ]

    # Word endings treated as drug-name suffixes; anchored at end of string.
    MEDICATION_PATTERNS = [
        r'(ol|in|stat|pril|zol|am|um|ase|mycin|axin|cillin)$',
    ]

    # Known drug names; a name match that starts with one of these is left
    # untouched by apply_filter.
    MEDICATION_EXCLUSIONS = [
        'Metoprolol', 'Atorvastatin', 'Losartan', 'Omeprazol',
        'Lisinopril', 'Simvastatin', 'Ramipril', 'Bisoprolol',
        'Warfarin', 'Insulin', 'Aspirin', 'Paracetamol', 'Ibuprofen',
        'Amoxicillin', 'Penicillin', 'Tetracyclin', 'Doxycyclin',
    ]

    @staticmethod
    def has_medication_context(text: str) -> bool:
        """Return True if any medication context keyword occurs in *text*."""
        return any(
            re.search(pattern, text, re.IGNORECASE)
            for pattern in GDPRFilter.MEDICATION_CONTEXT_KEYWORDS
        )

    @staticmethod
    def is_likely_medication(word: str) -> bool:
        """Return True if *word* has >= 6 characters and a drug-like ending."""
        if len(word) < 6:
            return False
        return any(
            re.search(pattern, word, re.IGNORECASE)
            for pattern in GDPRFilter.MEDICATION_PATTERNS
        )

    @staticmethod
    def apply_filter(text: str) -> str:
        """Return *text* with detected personal data replaced by placeholders.

        Empty, ``None`` or whitespace-only input yields ``""``. The
        substitutions run in a fixed order (personnummer, phone, e-mail,
        dates, street address, locality, names); later steps see the
        placeholders inserted by earlier ones.
        """
        if not text or not text.strip():
            return ""

        def mask_labelled_number(match):
            # The digit run is greedy over whitespace, so it may end in
            # spaces; put them back so the placeholder is not glued onto
            # the following word.
            number = match.group(2)
            trailing_ws = number[len(number.rstrip()):]
            return f"{match.group(1)} [TELEFON]{trailing_ws}"

        text = re.sub(GDPRFilter.PERSONNUMMER_PATTERN, '[PERSONNR]', text)
        text = re.sub(GDPRFilter.PHONE_PATTERN, '[TELEFON]', text)
        # Digit runs announced by a keyword ("telefon: ...", "mobilnummer ...")
        # that the generic phone pattern did not catch.
        text = re.sub(
            r'(?i)(mobil(?:nummer)?|telefon(?:nummer)?|nummer)[\s:]+(\d[\d\s\-]{6,})',
            mask_labelled_number, text
        )
        text = re.sub(GDPRFilter.EMAIL_PATTERN, '[EMAIL]', text)
        for pattern in GDPRFilter.DATE_PATTERNS:
            text = re.sub(pattern, '[DATUM]', text)
        text = re.sub(GDPRFilter.ADDRESS_PATTERN, '[ADRESS]', text)
        # NOTE(review): the global (?i) also applies to the [A-ZÅÄÖ] class, so
        # lower-case words after "bor i" etc. are replaced too ("bor i ett
        # hus"). Confirm whether that over-redaction is intended.
        text = re.sub(
            r'(?i)(bor\s+(?:i|på|kvar\s+i)|i\s+stadsdelen|i\s+området)\s+([A-ZÅÄÖ][a-zåäö]{2,}(?:\s+[A-ZÅÄÖ][a-zåäö]+)?)',
            r'\1 [ORT]', text
        )
        # Drop a short leftover digit group directly after a placeholder.
        text = re.sub(r'(\[TELEFON\])\s+\d{1,4}\b', r'\1', text)
        text = re.sub(r'(\[PERSONNR\])\s+\d{1,4}\b', r'\1', text)

        # Evaluated once, on the partially redacted text.
        has_med_context = GDPRFilter.has_medication_context(text)

        def replace_name(match):
            word = match.group(0)
            # NOTE(review): *word* is the whole two-word match, so the suffix
            # test effectively looks at the second word only. A surname with
            # a drug-like ending (e.g. "-in") is therefore kept verbatim when
            # medication context is present - confirm this is acceptable.
            if GDPRFilter.is_likely_medication(word):
                if has_med_context:
                    return word
                return '[LÄKEMEDEL]'
            if any(word.startswith(ex) for ex in GDPRFilter.MEDICATION_EXCLUSIONS):
                return word
            return '[NAMN]'

        # Two consecutive capitalised words (3+ letters each) are treated as
        # a personal name.
        text = re.sub(
            r'\b[A-ZÅÄÖ][a-zåäö]{2,}\s[A-ZÅÄÖ][a-zåäö]{2,}\b',
            replace_name, text
        )

        return text
|
|
|
|
def apply_gdpr_filter(text: str) -> str:
    """Redact personal data in *text* by delegating to GDPRFilter.apply_filter."""
    redacted = GDPRFilter.apply_filter(text)
    return redacted
|
|
|
|
def apply_dual_layer_gdpr(input_text: str, output_text: str):
    """Filter both texts independently.

    Returns a ``(filtered_input, filtered_output)`` tuple, each element being
    the result of ``apply_gdpr_filter`` on the corresponding argument.
    """
    filtered_input = apply_gdpr_filter(input_text)
    filtered_output = apply_gdpr_filter(output_text)
    return filtered_input, filtered_output
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: print each sample sentence next to its filtered form.
    banner = "=" * 80
    samples = (
        "Patienten tar Metoprolol 50mg två gånger dagligen",
        "Patienten är allergisk mot Metoprolol",
        "Jag heter Anna och mitt personnummer är 880415-1234",
        "Erik tar Atorvastatin 40mg enligt recept och är allergisk mot Penicillin",
    )

    print("\n" + banner)
    print("GDPR FILTER TESTS")
    print(banner)

    for sample in samples:
        filtered = apply_gdpr_filter(sample)
        print(f"\nINPUT: {sample}")
        print(f"OUTPUT: {filtered}")

    print("\n" + banner)