refined phone number regex
Browse files
utils.py
CHANGED
|
@@ -50,12 +50,8 @@ class PIIMasker:
|
|
| 50 |
# Define regex patterns for different entity types
|
| 51 |
self.patterns = {
|
| 52 |
"email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
|
| 53 |
-
#
|
| 54 |
-
|
| 55 |
-
# - Various delimiter formats: spaces, hyphens, periods, or nothing
|
| 56 |
-
# - Different grouping patterns for various countries
|
| 57 |
-
# - Overall length between 8-15 digits (excluding formatting characters)
|
| 58 |
-
"phone_number": r'\b(?:(?:\+|00)[1-9]\d{0,3}[\s.-]?)?(?:\(?\d{1,5}\)?[\s.-]?)?(?:\d{1,5}[\s.-]?)??(?:\d{1,5}[\s.-]?)??(?:\d{1,5}[\s.-]?)?\d{1,5}(?:[\s.-]?\d{1,5})?\b',
|
| 59 |
# Card number regex: common formats, allows optional spaces/hyphens
|
| 60 |
"credit_debit_no": r'\b(?:(?:\d{4}[\s-]?){3}\d{4}|\d{13,19})\b',
|
| 61 |
# CVV: 3 or 4 digits, ensuring it's a standalone number (word boundary)
|
|
@@ -76,14 +72,17 @@ class PIIMasker:
|
|
| 76 |
start, end = match.span()
|
| 77 |
value = match.group()
|
| 78 |
|
| 79 |
-
# Specific verifications
|
| 80 |
if entity_type == "credit_debit_no":
|
| 81 |
if not self.verify_credit_card(text, match):
|
| 82 |
continue
|
| 83 |
elif entity_type == "cvv_no":
|
| 84 |
if not self.verify_cvv(text, match):
|
| 85 |
continue
|
| 86 |
-
elif entity_type == "
|
|
|
|
|
|
|
|
|
|
| 87 |
if not self._verify_with_context(text, start, end, ["birth", "dob", "born"]):
|
| 88 |
continue
|
| 89 |
|
|
@@ -91,7 +90,7 @@ class PIIMasker:
|
|
| 91 |
# This is a simple check; more robust overlap handling is done later
|
| 92 |
is_substring_of_existing = False
|
| 93 |
for existing_entity in entities:
|
| 94 |
-
if existing_entity.start <= start and existing_entity.end >= end and existing_entity.value != value
|
| 95 |
is_substring_of_existing = True
|
| 96 |
break
|
| 97 |
if is_substring_of_existing:
|
|
@@ -173,6 +172,49 @@ class PIIMasker:
|
|
| 173 |
|
| 174 |
return is_cvv_context and not (is_date_context and looks_like_year)
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
def detect_name_entities(self, text: str) -> List[Entity]:
|
| 178 |
"""Detect name entities using SpaCy NER"""
|
|
@@ -298,13 +340,10 @@ class PIIMasker:
|
|
| 298 |
entities = self.detect_all_entities(text)
|
| 299 |
entity_info = [entity.to_dict() for entity in entities]
|
| 300 |
|
| 301 |
-
masked_text = list(text) # Use list of chars for easier replacement
|
| 302 |
-
|
| 303 |
# Sort entities by start position to ensure correct masking,
|
| 304 |
# longest first at same start to prevent partial masking by shorter entities
|
| 305 |
entities.sort(key=lambda x: (x.start, -(x.end - x.start)))
|
| 306 |
|
| 307 |
-
offset = 0
|
| 308 |
new_text_parts = []
|
| 309 |
current_pos = 0
|
| 310 |
|
|
@@ -313,8 +352,8 @@ class PIIMasker:
|
|
| 313 |
if entity.start > current_pos:
|
| 314 |
new_text_parts.append(text[current_pos:entity.start])
|
| 315 |
|
| 316 |
-
# Add the mask
|
| 317 |
-
mask = f"[{entity.entity_type}]"
|
| 318 |
new_text_parts.append(mask)
|
| 319 |
|
| 320 |
current_pos = entity.end
|
|
|
|
| 50 |
# Define regex patterns for different entity types
|
| 51 |
self.patterns = {
|
| 52 |
"email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
|
| 53 |
+
# More precise phone number regex that handles international formats while avoiding false positives
|
| 54 |
+
"phone_number": r'\b(?:(?:\+|00)[1-9]\d{0,3}[-\s.]?)(?:\(?\d{1,5}\)?[-\s.]?)?(?:\d{1,5}[-\s.]?){1,4}\d{1,5}\b',
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# Card number regex: common formats, allows optional spaces/hyphens
|
| 56 |
"credit_debit_no": r'\b(?:(?:\d{4}[\s-]?){3}\d{4}|\d{13,19})\b',
|
| 57 |
# CVV: 3 or 4 digits, ensuring it's a standalone number (word boundary)
|
|
|
|
| 72 |
start, end = match.span()
|
| 73 |
value = match.group()
|
| 74 |
|
| 75 |
+
# Specific verifications for each entity type
|
| 76 |
if entity_type == "credit_debit_no":
|
| 77 |
if not self.verify_credit_card(text, match):
|
| 78 |
continue
|
| 79 |
elif entity_type == "cvv_no":
|
| 80 |
if not self.verify_cvv(text, match):
|
| 81 |
continue
|
| 82 |
+
elif entity_type == "phone_number":
|
| 83 |
+
if not self.verify_phone_number(text, match):
|
| 84 |
+
continue
|
| 85 |
+
elif entity_type == "dob":
|
| 86 |
if not self._verify_with_context(text, start, end, ["birth", "dob", "born"]):
|
| 87 |
continue
|
| 88 |
|
|
|
|
| 90 |
# This is a simple check; more robust overlap handling is done later
|
| 91 |
is_substring_of_existing = False
|
| 92 |
for existing_entity in entities:
|
| 93 |
+
if existing_entity.start <= start and existing_entity.end >= end and existing_entity.value != value:
|
| 94 |
is_substring_of_existing = True
|
| 95 |
break
|
| 96 |
if is_substring_of_existing:
|
|
|
|
| 172 |
|
| 173 |
return is_cvv_context and not (is_date_context and looks_like_year)
|
| 174 |
|
| 175 |
+
def verify_phone_number(self, text: str, match: re.Match) -> bool:
    """
    Decide whether a regex match is plausibly a phone number.

    The match is accepted only if it has a realistic digit count AND at
    least one of: separator-formatted digit groups, an international
    prefix ("+" or "00"), or a phone-related keyword nearby.

    This helps prevent:
    1. CVV numbers being detected as phone numbers
    2. Parts of a phone number being detected as separate numbers
    3. Random digit sequences being detected as phone numbers

    Args:
        text: The full text the match was found in (used for context).
        match: The regex match object for the candidate phone number.

    Returns:
        True if the candidate passes the digit-count gate and one of the
        formatting / prefix / context checks, False otherwise.
    """
    value = match.group()
    start, end = match.span()

    # 1. Digit-count gate (formatting characters are ignored).
    #    Every accepting rule below needs at least 7 digits, so anything
    #    shorter is rejected up front; more than 15 digits is unrealistic.
    digit_count = sum(1 for c in value if c.isdigit())
    if not 7 <= digit_count <= 15:
        return False

    # 2. Context check: look for phone-related words around the match.
    context_window = 50
    context_before = text[max(0, start - context_window):start].lower()
    context_after = text[end:end + context_window].lower()

    phone_keywords = (
        "phone", "call", "tel", "telephone", "contact", "dial",
        "mobile", "cell", "number", "direct", "office", "fax",
    )
    # Anchor each keyword to the START of a word. A plain substring test
    # would fire on unrelated words ("hotel" -> "tel", "cancelled" ->
    # "cell", "basically" -> "call"); leaving the end unanchored still
    # accepts inflections such as "phones" or "calling".
    keyword_re = re.compile(r"\b(?:" + "|".join(phone_keywords) + r")")
    has_phone_context = bool(
        keyword_re.search(context_before) or keyword_re.search(context_after)
    )

    # 3. Formatting check: digit groups joined by hyphens/whitespace, with
    #    an optional "+CC" prefix and optional parenthesised area code,
    #    indicate a deliberately written phone number.
    is_clean_formatted = bool(
        re.search(r'(?:\+\d{1,4}[-\s])?(?:\(\d+\)[-\s]?)?\d+(?:[-\s]\d+)+', value)
    )

    # An unformatted number that starts with "+" or "00" is likely an
    # international number, but needs one more digit to be convincing.
    has_intl_prefix = value.startswith(('+', '00'))

    return (
        is_clean_formatted
        or has_phone_context
        or (has_intl_prefix and digit_count >= 8)
    )
|
| 218 |
|
| 219 |
def detect_name_entities(self, text: str) -> List[Entity]:
|
| 220 |
"""Detect name entities using SpaCy NER"""
|
|
|
|
| 340 |
entities = self.detect_all_entities(text)
|
| 341 |
entity_info = [entity.to_dict() for entity in entities]
|
| 342 |
|
|
|
|
|
|
|
| 343 |
# Sort entities by start position to ensure correct masking,
|
| 344 |
# longest first at same start to prevent partial masking by shorter entities
|
| 345 |
entities.sort(key=lambda x: (x.start, -(x.end - x.start)))
|
| 346 |
|
|
|
|
| 347 |
new_text_parts = []
|
| 348 |
current_pos = 0
|
| 349 |
|
|
|
|
| 352 |
if entity.start > current_pos:
|
| 353 |
new_text_parts.append(text[current_pos:entity.start])
|
| 354 |
|
| 355 |
+
# Add the mask with entity type in uppercase for better visibility
|
| 356 |
+
mask = f"[{entity.entity_type.upper()}]"
|
| 357 |
new_text_parts.append(mask)
|
| 358 |
|
| 359 |
current_pos = entity.end
|