Update gdpr_filter.py
Browse files- gdpr_filter.py +20 -16
gdpr_filter.py
CHANGED
|
@@ -11,34 +11,27 @@ from config import Config
|
|
| 11 |
def apply_gdpr_filter(text: str) -> str:
|
| 12 |
"""
|
| 13 |
Identifierar och ersätter personuppgifter i svensk text.
|
| 14 |
-
|
| 15 |
-
Hanterar:
|
| 16 |
-
- Svenska personnummer (YYYYMMDD-XXXX eller YYMMDD-XXXX)
|
| 17 |
-
- Telefonnummer (svenska format)
|
| 18 |
-
- E-postadresser
|
| 19 |
-
- Datum (flera format)
|
| 20 |
-
- Gatuadresser (vanliga svenska suffix)
|
| 21 |
-
- Egennamn (för- och efternamn, två versaler)
|
| 22 |
-
|
| 23 |
-
Returnerar anonymiserad text med platshållare.
|
| 24 |
"""
|
| 25 |
|
| 26 |
-
# --- Personnummer ---
|
| 27 |
text = re.sub(r'\b(?:19|20)?\d{6}[-–]\d{4}\b', '[PERSONNR]', text)
|
| 28 |
|
| 29 |
-
# --- Telefonnummer ---
|
| 30 |
text = re.sub(
|
| 31 |
r'\b(?:\+46|0)[\s\-]?\d{1,3}[\s\-]?\d{3,4}[\s\-]?\d{2,4}\b',
|
| 32 |
'[TELEFON]', text
|
| 33 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# --- E-post ---
|
| 36 |
text = re.sub(r'\b[\w\.\-]+@[\w\.\-]+\.\w{2,4}\b', '[EMAIL]', text)
|
| 37 |
|
| 38 |
-
# --- Datum
|
| 39 |
text = re.sub(r'\b\d{4}[-–]\d{2}[-–]\d{2}\b', '[DATUM]', text)
|
| 40 |
-
|
| 41 |
-
# --- Datum svenska format ---
|
| 42 |
text = re.sub(r'\b\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4}\b', '[DATUM]', text)
|
| 43 |
|
| 44 |
# --- Gatuadresser ---
|
|
@@ -47,8 +40,19 @@ def apply_gdpr_filter(text: str) -> str:
|
|
| 47 |
'[ADRESS]', text
|
| 48 |
)
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
# --- Egennamn (körs sist) ---
|
| 51 |
-
# Undviker termer i exclusion list (t.ex. läkemedelsnamn)
|
| 52 |
exclusions = Config.GDPR_EXCLUSION_LIST
|
| 53 |
|
| 54 |
def replace_name(match):
|
|
|
|
| 11 |
def apply_gdpr_filter(text: str) -> str:
|
| 12 |
"""
|
| 13 |
Identifierar och ersätter personuppgifter i svensk text.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
+
# --- Personnummer (innan andra nummer-regler) ---
|
| 17 |
text = re.sub(r'\b(?:19|20)?\d{6}[-–]\d{4}\b', '[PERSONNR]', text)
|
| 18 |
|
| 19 |
+
# --- Telefonnummer (inkl. mellanslag/bindestreck mellan grupper) ---
|
| 20 |
text = re.sub(
|
| 21 |
r'\b(?:\+46|0)[\s\-]?\d{1,3}[\s\-]?\d{3,4}[\s\-]?\d{2,4}\b',
|
| 22 |
'[TELEFON]', text
|
| 23 |
)
|
| 24 |
+
# Löst stående långa nummer (minst 7 tecken: siffror, mellanslag, bindestreck) efter "mobil/telefon/nummer"
|
| 25 |
+
text = re.sub(
|
| 26 |
+
r'(?i)(mobil(?:nummer)?|telefon(?:nummer)?|nummer)[\s:]+(\d[\d\s\-]{6,})',
|
| 27 |
+
r'\1 [TELEFON]', text
|
| 28 |
+
)
|
| 29 |
|
| 30 |
# --- E-post ---
|
| 31 |
text = re.sub(r'\b[\w\.\-]+@[\w\.\-]+\.\w{2,4}\b', '[EMAIL]', text)
|
| 32 |
|
| 33 |
+
# --- Datum ---
|
| 34 |
text = re.sub(r'\b\d{4}[-–]\d{2}[-–]\d{2}\b', '[DATUM]', text)
|
|
|
|
|
|
|
| 35 |
text = re.sub(r'\b\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4}\b', '[DATUM]', text)
|
| 36 |
|
| 37 |
# --- Gatuadresser ---
|
|
|
|
| 40 |
'[ADRESS]', text
|
| 41 |
)
|
| 42 |
|
| 43 |
+
# --- Svenska ortsnamn/stadsdelar efter "bor i/på", "bor kvar i", "i stadsdelen", "i området" ---
|
| 44 |
+
# Matchar: "bor i Björkhagen", "bor på Södermalm", "i stadsdelen Majorna"
|
| 45 |
+
text = re.sub(
|
| 46 |
+
r'(?i)(bor\s+(?:i|på|kvar\s+i)|i\s+stadsdelen|i\s+området)\s+([A-ZÅÄÖ][a-zåäö]{2,}(?:\s+[A-ZÅÄÖ][a-zåäö]+)?)',
|
| 47 |
+
r'\1 [ORT]', text
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
# --- Lösa 1-4-siffriga tal direkt efter redan anonymiserade [TELEFON]/[PERSONNR] ---
|
| 51 |
+
# Exempel: "[TELEFON] 67" → "[TELEFON]"
|
| 52 |
+
text = re.sub(r'(\[TELEFON\])\s+\d{1,4}\b', r'\1', text)
|
| 53 |
+
text = re.sub(r'(\[PERSONNR\])\s+\d{1,4}\b', r'\1', text)
|
| 54 |
+
|
| 55 |
# --- Egennamn (körs sist) ---
|
|
|
|
| 56 |
exclusions = Config.GDPR_EXCLUSION_LIST
|
| 57 |
|
| 58 |
def replace_name(match):
|