Sparkonix committed on
Commit
32134ac
·
1 Parent(s): 7030c07

refined phone number regex

Browse files
Files changed (1) hide show
  1. utils.py +53 -14
utils.py CHANGED
@@ -50,12 +50,8 @@ class PIIMasker:
50
  # Define regex patterns for different entity types
51
  self.patterns = {
52
  "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
53
- # Enhanced phone number regex that handles international formats:
54
- # - International prefix with country code: +XX, +XXX (optional)
55
- # - Various delimiter formats: spaces, hyphens, periods, or nothing
56
- # - Different grouping patterns for various countries
57
- # - Overall length between 8-15 digits (excluding formatting characters)
58
- "phone_number": r'\b(?:(?:\+|00)[1-9]\d{0,3}[\s.-]?)?(?:\(?\d{1,5}\)?[\s.-]?)?(?:\d{1,5}[\s.-]?)??(?:\d{1,5}[\s.-]?)??(?:\d{1,5}[\s.-]?)?\d{1,5}(?:[\s.-]?\d{1,5})?\b',
59
  # Card number regex: common formats, allows optional spaces/hyphens
60
  "credit_debit_no": r'\b(?:(?:\d{4}[\s-]?){3}\d{4}|\d{13,19})\b',
61
  # CVV: 3 or 4 digits, ensuring it's a standalone number (word boundary)
@@ -76,14 +72,17 @@ class PIIMasker:
76
  start, end = match.span()
77
  value = match.group()
78
 
79
- # Specific verifications
80
  if entity_type == "credit_debit_no":
81
  if not self.verify_credit_card(text, match):
82
  continue
83
  elif entity_type == "cvv_no":
84
  if not self.verify_cvv(text, match):
85
  continue
86
- elif entity_type == "dob": # Using the generic context verifier for DOB
 
 
 
87
  if not self._verify_with_context(text, start, end, ["birth", "dob", "born"]):
88
  continue
89
 
@@ -91,7 +90,7 @@ class PIIMasker:
91
  # This is a simple check; more robust overlap handling is done later
92
  is_substring_of_existing = False
93
  for existing_entity in entities:
94
- if existing_entity.start <= start and existing_entity.end >= end and existing_entity.value != value :
95
  is_substring_of_existing = True
96
  break
97
  if is_substring_of_existing:
@@ -173,6 +172,49 @@ class PIIMasker:
173
 
174
  return is_cvv_context and not (is_date_context and looks_like_year)
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
  def detect_name_entities(self, text: str) -> List[Entity]:
178
  """Detect name entities using SpaCy NER"""
@@ -298,13 +340,10 @@ class PIIMasker:
298
  entities = self.detect_all_entities(text)
299
  entity_info = [entity.to_dict() for entity in entities]
300
 
301
- masked_text = list(text) # Use list of chars for easier replacement
302
-
303
  # Sort entities by start position to ensure correct masking,
304
  # longest first at same start to prevent partial masking by shorter entities
305
  entities.sort(key=lambda x: (x.start, -(x.end - x.start)))
306
 
307
- offset = 0
308
  new_text_parts = []
309
  current_pos = 0
310
 
@@ -313,8 +352,8 @@ class PIIMasker:
313
  if entity.start > current_pos:
314
  new_text_parts.append(text[current_pos:entity.start])
315
 
316
- # Add the mask
317
- mask = f"[{entity.entity_type}]" # Changed to upper for clarity
318
  new_text_parts.append(mask)
319
 
320
  current_pos = entity.end
 
50
  # Define regex patterns for different entity types
51
  self.patterns = {
52
  "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
53
+ # More precise phone number regex that handles international formats while avoiding false positives
54
+ "phone_number": r'\b(?:(?:\+|00)[1-9]\d{0,3}[-\s.]?)(?:\(?\d{1,5}\)?[-\s.]?)?(?:\d{1,5}[-\s.]?){1,4}\d{1,5}\b',
 
 
 
 
55
  # Card number regex: common formats, allows optional spaces/hyphens
56
  "credit_debit_no": r'\b(?:(?:\d{4}[\s-]?){3}\d{4}|\d{13,19})\b',
57
  # CVV: 3 or 4 digits, ensuring it's a standalone number (word boundary)
 
72
  start, end = match.span()
73
  value = match.group()
74
 
75
+ # Specific verifications for each entity type
76
  if entity_type == "credit_debit_no":
77
  if not self.verify_credit_card(text, match):
78
  continue
79
  elif entity_type == "cvv_no":
80
  if not self.verify_cvv(text, match):
81
  continue
82
+ elif entity_type == "phone_number":
83
+ if not self.verify_phone_number(text, match):
84
+ continue
85
+ elif entity_type == "dob":
86
  if not self._verify_with_context(text, start, end, ["birth", "dob", "born"]):
87
  continue
88
 
 
90
  # This is a simple check; more robust overlap handling is done later
91
  is_substring_of_existing = False
92
  for existing_entity in entities:
93
+ if existing_entity.start <= start and existing_entity.end >= end and existing_entity.value != value:
94
  is_substring_of_existing = True
95
  break
96
  if is_substring_of_existing:
 
172
 
173
  return is_cvv_context and not (is_date_context and looks_like_year)
174
 
175
+ def verify_phone_number(self, text: str, match: re.Match) -> bool:
176
+ """
177
+ Verify if a match is actually a phone number using validation rules and context.
178
+
179
+ This helps prevent:
180
+ 1. CVV numbers being detected as phone numbers
181
+ 2. Parts of a phone number being detected as separate numbers
182
+ 3. Random digit sequences being detected as phone numbers
183
+ """
184
+ value = match.group()
185
+ start, end = match.span()
186
+
187
+ # 1. Minimum digit count check (excluding formatting chars)
188
+ digit_count = sum(1 for c in value if c.isdigit())
189
+ if digit_count < 6:
190
+ return False # Too few digits to be a valid phone number
191
+
192
+ if digit_count > 15:
193
+ return False # Too many digits to be a realistic phone number
194
+
195
+ # 2. Context check for phone numbers
196
+ context_window = 50
197
+ context_before = text[max(0, start - context_window):start].lower()
198
+ context_after = text[end:min(len(text), end + context_window)].lower()
199
+
200
+ phone_keywords = [
201
+ "phone", "call", "tel", "telephone", "contact", "dial",
202
+ "mobile", "cell", "number", "direct", "office", "fax"
203
+ ]
204
+
205
+ # If phone context keywords are found, increase confidence
206
+ has_phone_context = any(kw in context_before or kw in context_after for kw in phone_keywords)
207
+
208
+ # 3. Check if this is likely part of a larger number or another entity
209
+ # Look for specific formatted patterns that indicate complete phone numbers
210
+ is_clean_formatted = bool(re.search(r'(?:\+\d{1,4}[-\s])?(?:\(\d+\)[-\s]?)?\d+(?:[-\s]\d+)+', value))
211
+
212
+ # If not properly formatted but has a plus sign, it's likely an international number
213
+ has_intl_prefix = value.startswith('+') or value.startswith('00')
214
+
215
+ # If it has at least some formatting and reasonable digit count, or has clear phone context,
216
+ # we'll consider it a valid phone number
217
+ return (is_clean_formatted and digit_count >= 7) or (has_intl_prefix and digit_count >= 8) or (has_phone_context and digit_count >= 7)
218
 
219
  def detect_name_entities(self, text: str) -> List[Entity]:
220
  """Detect name entities using SpaCy NER"""
 
340
  entities = self.detect_all_entities(text)
341
  entity_info = [entity.to_dict() for entity in entities]
342
 
 
 
343
  # Sort entities by start position to ensure correct masking,
344
  # longest first at same start to prevent partial masking by shorter entities
345
  entities.sort(key=lambda x: (x.start, -(x.end - x.start)))
346
 
 
347
  new_text_parts = []
348
  current_pos = 0
349
 
 
352
  if entity.start > current_pos:
353
  new_text_parts.append(text[current_pos:entity.start])
354
 
355
+ # Add the mask with entity type in uppercase for better visibility
356
+ mask = f"[{entity.entity_type.upper()}]"
357
  new_text_parts.append(mask)
358
 
359
  current_pos = entity.end