Sparkonix committed on
Commit
adf8222
·
1 Parent(s): 32134ac

fix refined phone number regex

Browse files
Files changed (1) hide show
  1. utils.py +58 -52
utils.py CHANGED
@@ -50,8 +50,8 @@ class PIIMasker:
50
  # Define regex patterns for different entity types
51
  self.patterns = {
52
  "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
53
- # More precise phone number regex that handles international formats while avoiding false positives
54
- "phone_number": r'\b(?:(?:\+|00)[1-9]\d{0,3}[-\s.]?)(?:\(?\d{1,5}\)?[-\s.]?)?(?:\d{1,5}[-\s.]?){1,4}\d{1,5}\b',
55
  # Card number regex: common formats, allows optional spaces/hyphens
56
  "credit_debit_no": r'\b(?:(?:\d{4}[\s-]?){3}\d{4}|\d{13,19})\b',
57
  # CVV: 3 or 4 digits, ensuring it's a standalone number (word boundary)
@@ -128,7 +128,7 @@ class PIIMasker:
128
 
129
  def verify_cvv(self, text: str, match: re.Match) -> bool:
130
  """Verify if a 3-4 digit number is actually a CVV using contextual clues"""
131
- context_window = 30
132
  start, end = match.span()
133
  value = match.group()
134
 
@@ -139,88 +139,94 @@ class PIIMasker:
139
  if char_before.isdigit() or char_after.isdigit():
140
  return False # It's part of a larger number
141
 
 
 
 
 
142
  context_before = text[max(0, start - context_window):start].lower()
143
  context_after = text[end:min(len(text), end + context_window)].lower()
144
 
145
- cvv_keywords = ["cvv", "cvc", "csc", "security code", "card verification", "verification no"]
146
- date_keywords = ["date", "year", "/", "-", "born", "age", "since", "established", "version", "model", "grade"] # More exhaustive
 
 
 
 
 
147
 
 
148
  is_cvv_context = any(keyword in context_before or keyword in context_after for keyword in cvv_keywords)
149
 
150
- # If it looks like a year in common contexts, it's probably not a CVV
151
- # e.g. "since 2023", "class of 99", "born 1990"
152
- if value.isdigit() and (1900 <= int(value) <= 2100 if len(value) == 4 else False):
153
- year_context_keywords = ["year", "born", "fiscal", "established", "since", "class of", "ended", "began", "joined"]
154
- if any(kw in context_before for kw in year_context_keywords):
155
- return False # Likely a year
156
- # If it's MM/YY or MM/YYYY context, it's expiry, not CVV
157
- if re.search(r'\b(0[1-9]|1[0-2])[/\s-]$', context_before.strip()): # Ends with MM/
158
- return False # Part of an expiry date
159
 
160
- is_date_context = any(keyword in context_before or keyword in context_after for keyword in date_keywords)
 
 
 
161
 
162
- # Check if the number itself looks like a year in typical CVV lengths
163
- looks_like_year = False
164
- if len(value) == 2 and value.isdigit(): # e.g. "23" for year in expiry
165
- if any(k in context_before for k in ["expiry", "exp", "valid thru", "good thru"]) or \
166
- re.search(r'\b(0[1-9]|1[0-2])[/\s-]$', context_before.strip()):
167
- looks_like_year = True # It's the YY part of an expiry
168
- elif len(value) == 4 and value.isdigit() and (1900 <= int(value) <= 2100):
169
- if any(k in (context_before + context_after) for k in ["year", "born", "fiscal"]):
170
- looks_like_year = True
171
-
172
-
173
- return is_cvv_context and not (is_date_context and looks_like_year)
174
 
175
  def verify_phone_number(self, text: str, match: re.Match) -> bool:
176
  """
177
  Verify if a match is actually a phone number using validation rules and context.
178
-
179
- This helps prevent:
180
- 1. CVV numbers being detected as phone numbers
181
- 2. Parts of a phone number being detected as separate numbers
182
- 3. Random digit sequences being detected as phone numbers
183
  """
184
  value = match.group()
185
  start, end = match.span()
186
 
187
- # 1. Minimum digit count check (excluding formatting chars)
188
- digit_count = sum(1 for c in value if c.isdigit())
189
- if digit_count < 6:
190
- return False # Too few digits to be a valid phone number
191
 
192
- if digit_count > 15:
193
- return False # Too many digits to be a realistic phone number
194
-
195
- # 2. Context check for phone numbers
 
196
  context_window = 50
197
  context_before = text[max(0, start - context_window):start].lower()
198
  context_after = text[end:min(len(text), end + context_window)].lower()
199
 
 
200
  phone_keywords = [
201
- "phone", "call", "tel", "telephone", "contact", "dial",
202
- "mobile", "cell", "number", "direct", "office", "fax"
 
203
  ]
204
 
205
- # If phone context keywords are found, increase confidence
206
  has_phone_context = any(kw in context_before or kw in context_after for kw in phone_keywords)
207
 
208
- # 3. Check if this is likely part of a larger number or another entity
209
- # Look for specific formatted patterns that indicate complete phone numbers
210
- is_clean_formatted = bool(re.search(r'(?:\+\d{1,4}[-\s])?(?:\(\d+\)[-\s]?)?\d+(?:[-\s]\d+)+', value))
211
 
212
- # If not properly formatted but has a plus sign, it's likely an international number
213
  has_intl_prefix = value.startswith('+') or value.startswith('00')
214
 
215
- # If it has at least some formatting and reasonable digit count, or has clear phone context,
216
- # we'll consider it a valid phone number
217
- return (is_clean_formatted and digit_count >= 7) or (has_intl_prefix and digit_count >= 8) or (has_phone_context and digit_count >= 7)
 
 
 
 
 
 
218
 
219
  def detect_name_entities(self, text: str) -> List[Entity]:
220
  """Detect name entities using SpaCy NER"""
221
  entities = []
222
  doc = self.nlp(text)
223
-
224
  for ent in doc.ents:
225
  # Use PER for person, common in many models like xx_ent_wiki_sm
226
  # Also checking for PERSON as some models might use it.
@@ -298,7 +304,7 @@ class PIIMasker:
298
  # res is longer, current is dominated
299
  temp_resolved.append(res_entity)
300
  is_overlapped_or_contained = True # Mark current as handled
301
- break # Current is dominated
302
  else: # Same length, keep existing one (res_entity)
303
  temp_resolved.append(res_entity)
304
  is_overlapped_or_contained = True # Mark current as handled
 
50
  # Define regex patterns for different entity types
51
  self.patterns = {
52
  "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
53
+ # Simplified phone regex to capture both standard and international formats
54
+ "phone_number": r'\b(?:(?:\+|00)[1-9]\d{0,3}[-\s.]?)?(?:\(?\d{1,5}\)?[-\s.]?)?\d{1,5}(?:[-\s.]\d{1,5}){1,4}\b',
55
  # Card number regex: common formats, allows optional spaces/hyphens
56
  "credit_debit_no": r'\b(?:(?:\d{4}[\s-]?){3}\d{4}|\d{13,19})\b',
57
  # CVV: 3 or 4 digits, ensuring it's a standalone number (word boundary)
 
128
 
129
  def verify_cvv(self, text: str, match: re.Match) -> bool:
130
  """Verify if a 3-4 digit number is actually a CVV using contextual clues"""
131
+ context_window = 50
132
  start, end = match.span()
133
  value = match.group()
134
 
 
139
  if char_before.isdigit() or char_after.isdigit():
140
  return False # It's part of a larger number
141
 
142
+ # Only consider 3-4 digit numbers
143
+ if not value.isdigit() or len(value) < 3 or len(value) > 4:
144
+ return False
145
+
146
  context_before = text[max(0, start - context_window):start].lower()
147
  context_after = text[end:min(len(text), end + context_window)].lower()
148
 
149
+ # Expanded list of CVV-related keywords to improve detection
150
+ cvv_keywords = [
151
+ "cvv", "cvc", "csc", "security code", "card verification", "verification no",
152
+ "security", "security number", "cv2", "card code", "security value"
153
+ ]
154
+
155
+ date_keywords = ["date", "year", "/", "born", "age", "since", "established"]
156
 
157
+ # Look for CVV context clues
158
  is_cvv_context = any(keyword in context_before or keyword in context_after for keyword in cvv_keywords)
159
 
160
+ # If explicitly mentioned as a CVV, immediately return true
161
+ if is_cvv_context:
162
+ return True
 
 
 
 
 
 
163
 
164
+ # If it looks like a year, reject it
165
+ if len(value) == 4 and 1900 <= int(value) <= 2100:
166
+ if any(k in context_before or k in context_after for k in ["year", "born", "established", "since"]):
167
+ return False
168
 
169
+ # If in expiry date context, reject it
170
+ if re.search(r'\b(0[1-9]|1[0-2])[/\s-]$', context_before.strip()):
171
+ return False
172
+
173
+ # If no context clues but we have a credit card mention nearby, it could be a CVV
174
+ card_context = any(k in context_before or k in context_after
175
+ for k in ["card", "credit", "visa", "mastercard", "amex", "discover"])
176
+
177
+ return is_cvv_context or (card_context and len(value) in [3, 4])
 
 
 
178
 
179
  def verify_phone_number(self, text: str, match: re.Match) -> bool:
180
  """
181
  Verify if a match is actually a phone number using validation rules and context.
 
 
 
 
 
182
  """
183
  value = match.group()
184
  start, end = match.span()
185
 
186
+ # Extract only digits to count them
187
+ digits = ''.join(c for c in value if c.isdigit())
188
+ digit_count = len(digits)
 
189
 
190
+ # Most phone numbers worldwide have between 7 and 15 digits
191
+ if digit_count < 7 or digit_count > 15:
192
+ return False
193
+
194
+ # Check for common phone number indicators
195
  context_window = 50
196
  context_before = text[max(0, start - context_window):start].lower()
197
  context_after = text[end:min(len(text), end + context_window)].lower()
198
 
199
+ # Expanded phone keywords
200
  phone_keywords = [
201
+ "phone", "call", "tel", "telephone", "contact", "dial", "mobile", "cell",
202
+ "number", "direct", "office", "fax", "reach me at", "call me", "contact me",
203
+ "line", "extension", "ext", "phone number"
204
  ]
205
 
206
+ # Check for phone context
207
  has_phone_context = any(kw in context_before or kw in context_after for kw in phone_keywords)
208
 
209
+ # Check for formatting that indicates a phone number
210
+ has_phone_formatting = bool(re.search(r'[-\s.()\+]', value))
 
211
 
212
+ # Check for international prefix
213
  has_intl_prefix = value.startswith('+') or value.startswith('00')
214
 
215
+ # Return true if any of these conditions are met:
216
+ # 1. Has explicit phone context
217
+ # 2. Has phone-like formatting AND reasonable digit count
218
+ # 3. Has international prefix AND reasonable digit count
219
+ # 4. Has 10 digits exactly (common in many countries) with formatting
220
+ return has_phone_context or \
221
+ (has_phone_formatting and digit_count >= 7) or \
222
+ (has_intl_prefix) or \
223
+ (digit_count == 10 and has_phone_formatting)
224
 
225
  def detect_name_entities(self, text: str) -> List[Entity]:
226
  """Detect name entities using SpaCy NER"""
227
  entities = []
228
  doc = self.nlp(text)
229
+
230
  for ent in doc.ents:
231
  # Use PER for person, common in many models like xx_ent_wiki_sm
232
  # Also checking for PERSON as some models might use it.
 
304
  # res is longer, current is dominated
305
  temp_resolved.append(res_entity)
306
  is_overlapped_or_contained = True # Mark current as handled
307
+ break
308
  else: # Same length, keep existing one (res_entity)
309
  temp_resolved.append(res_entity)
310
  is_overlapped_or_contained = True # Mark current as handled