fix refined phone number regex
Browse files
utils.py
CHANGED
|
@@ -50,8 +50,8 @@ class PIIMasker:
|
|
| 50 |
# Define regex patterns for different entity types
|
| 51 |
self.patterns = {
|
| 52 |
"email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
|
| 53 |
-
#
|
| 54 |
-
"phone_number": r'\b(?:(?:\+|00)[1-9]\d{0,3}[-\s.]?)(?:\(?\d{1,5}\)?[-\s.]?)
|
| 55 |
# Card number regex: common formats, allows optional spaces/hyphens
|
| 56 |
"credit_debit_no": r'\b(?:(?:\d{4}[\s-]?){3}\d{4}|\d{13,19})\b',
|
| 57 |
# CVV: 3 or 4 digits, ensuring it's a standalone number (word boundary)
|
|
@@ -128,7 +128,7 @@ class PIIMasker:
|
|
| 128 |
|
| 129 |
def verify_cvv(self, text: str, match: re.Match) -> bool:
|
| 130 |
"""Verify if a 3-4 digit number is actually a CVV using contextual clues"""
|
| 131 |
-
context_window =
|
| 132 |
start, end = match.span()
|
| 133 |
value = match.group()
|
| 134 |
|
|
@@ -139,88 +139,94 @@ class PIIMasker:
|
|
| 139 |
if char_before.isdigit() or char_after.isdigit():
|
| 140 |
return False # It's part of a larger number
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
context_before = text[max(0, start - context_window):start].lower()
|
| 143 |
context_after = text[end:min(len(text), end + context_window)].lower()
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
|
|
|
| 148 |
is_cvv_context = any(keyword in context_before or keyword in context_after for keyword in cvv_keywords)
|
| 149 |
|
| 150 |
-
# If
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
year_context_keywords = ["year", "born", "fiscal", "established", "since", "class of", "ended", "began", "joined"]
|
| 154 |
-
if any(kw in context_before for kw in year_context_keywords):
|
| 155 |
-
return False # Likely a year
|
| 156 |
-
# If it's MM/YY or MM/YYYY context, it's expiry, not CVV
|
| 157 |
-
if re.search(r'\b(0[1-9]|1[0-2])[/\s-]$', context_before.strip()): # Ends with MM/
|
| 158 |
-
return False # Part of an expiry date
|
| 159 |
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
#
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
return is_cvv_context and not (is_date_context and looks_like_year)
|
| 174 |
|
| 175 |
def verify_phone_number(self, text: str, match: re.Match) -> bool:
|
| 176 |
"""
|
| 177 |
Verify if a match is actually a phone number using validation rules and context.
|
| 178 |
-
|
| 179 |
-
This helps prevent:
|
| 180 |
-
1. CVV numbers being detected as phone numbers
|
| 181 |
-
2. Parts of a phone number being detected as separate numbers
|
| 182 |
-
3. Random digit sequences being detected as phone numbers
|
| 183 |
"""
|
| 184 |
value = match.group()
|
| 185 |
start, end = match.span()
|
| 186 |
|
| 187 |
-
#
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
return False # Too few digits to be a valid phone number
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
|
|
|
| 196 |
context_window = 50
|
| 197 |
context_before = text[max(0, start - context_window):start].lower()
|
| 198 |
context_after = text[end:min(len(text), end + context_window)].lower()
|
| 199 |
|
|
|
|
| 200 |
phone_keywords = [
|
| 201 |
-
"phone", "call", "tel", "telephone", "contact", "dial",
|
| 202 |
-
"
|
|
|
|
| 203 |
]
|
| 204 |
|
| 205 |
-
#
|
| 206 |
has_phone_context = any(kw in context_before or kw in context_after for kw in phone_keywords)
|
| 207 |
|
| 208 |
-
#
|
| 209 |
-
|
| 210 |
-
is_clean_formatted = bool(re.search(r'(?:\+\d{1,4}[-\s])?(?:\(\d+\)[-\s]?)?\d+(?:[-\s]\d+)+', value))
|
| 211 |
|
| 212 |
-
#
|
| 213 |
has_intl_prefix = value.startswith('+') or value.startswith('00')
|
| 214 |
|
| 215 |
-
#
|
| 216 |
-
#
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
def detect_name_entities(self, text: str) -> List[Entity]:
|
| 220 |
"""Detect name entities using SpaCy NER"""
|
| 221 |
entities = []
|
| 222 |
doc = self.nlp(text)
|
| 223 |
-
|
| 224 |
for ent in doc.ents:
|
| 225 |
# Use PER for person, common in many models like xx_ent_wiki_sm
|
| 226 |
# Also checking for PERSON as some models might use it.
|
|
@@ -298,7 +304,7 @@ class PIIMasker:
|
|
| 298 |
# res is longer, current is dominated
|
| 299 |
temp_resolved.append(res_entity)
|
| 300 |
is_overlapped_or_contained = True # Mark current as handled
|
| 301 |
-
break
|
| 302 |
else: # Same length, keep existing one (res_entity)
|
| 303 |
temp_resolved.append(res_entity)
|
| 304 |
is_overlapped_or_contained = True # Mark current as handled
|
|
|
|
| 50 |
# Define regex patterns for different entity types
|
| 51 |
self.patterns = {
|
| 52 |
"email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
|
| 53 |
+
# Simplified phone regex to capture both standard and international formats
|
| 54 |
+
"phone_number": r'\b(?:(?:\+|00)[1-9]\d{0,3}[-\s.]?)?(?:\(?\d{1,5}\)?[-\s.]?)?\d{1,5}(?:[-\s.]\d{1,5}){1,4}\b',
|
| 55 |
# Card number regex: common formats, allows optional spaces/hyphens
|
| 56 |
"credit_debit_no": r'\b(?:(?:\d{4}[\s-]?){3}\d{4}|\d{13,19})\b',
|
| 57 |
# CVV: 3 or 4 digits, ensuring it's a standalone number (word boundary)
|
|
|
|
| 128 |
|
| 129 |
def verify_cvv(self, text: str, match: re.Match) -> bool:
|
| 130 |
"""Verify if a 3-4 digit number is actually a CVV using contextual clues"""
|
| 131 |
+
context_window = 50
|
| 132 |
start, end = match.span()
|
| 133 |
value = match.group()
|
| 134 |
|
|
|
|
| 139 |
if char_before.isdigit() or char_after.isdigit():
|
| 140 |
return False # It's part of a larger number
|
| 141 |
|
| 142 |
+
# Only consider 3-4 digit numbers
|
| 143 |
+
if not value.isdigit() or len(value) < 3 or len(value) > 4:
|
| 144 |
+
return False
|
| 145 |
+
|
| 146 |
context_before = text[max(0, start - context_window):start].lower()
|
| 147 |
context_after = text[end:min(len(text), end + context_window)].lower()
|
| 148 |
|
| 149 |
+
# Expanded list of CVV-related keywords to improve detection
|
| 150 |
+
cvv_keywords = [
|
| 151 |
+
"cvv", "cvc", "csc", "security code", "card verification", "verification no",
|
| 152 |
+
"security", "security number", "cv2", "card code", "security value"
|
| 153 |
+
]
|
| 154 |
+
|
| 155 |
+
date_keywords = ["date", "year", "/", "born", "age", "since", "established"]
|
| 156 |
|
| 157 |
+
# Look for CVV context clues
|
| 158 |
is_cvv_context = any(keyword in context_before or keyword in context_after for keyword in cvv_keywords)
|
| 159 |
|
| 160 |
+
# If explicitly mentioned as a CVV, immediately return true
|
| 161 |
+
if is_cvv_context:
|
| 162 |
+
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
+
# If it looks like a year, reject it
|
| 165 |
+
if len(value) == 4 and 1900 <= int(value) <= 2100:
|
| 166 |
+
if any(k in context_before or k in context_after for k in ["year", "born", "established", "since"]):
|
| 167 |
+
return False
|
| 168 |
|
| 169 |
+
# If in expiry date context, reject it
|
| 170 |
+
if re.search(r'\b(0[1-9]|1[0-2])[/\s-]$', context_before.strip()):
|
| 171 |
+
return False
|
| 172 |
+
|
| 173 |
+
# If no context clues but we have a credit card mention nearby, it could be a CVV
|
| 174 |
+
card_context = any(k in context_before or k in context_after
|
| 175 |
+
for k in ["card", "credit", "visa", "mastercard", "amex", "discover"])
|
| 176 |
+
|
| 177 |
+
return is_cvv_context or (card_context and len(value) in [3, 4])
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
def verify_phone_number(self, text: str, match: re.Match) -> bool:
    """
    Verify if a match is actually a phone number using validation rules and context.

    Args:
        text: The full text in which the candidate was found.
        match: Regex match object covering the candidate phone number.

    Returns:
        False when the candidate has fewer than 7 or more than 15 digits.
        Otherwise True when at least one of the following holds:
          1. the matched value contains phone-style punctuation
             (hyphen, whitespace, dot, parentheses or '+');
          2. the matched value starts with an international prefix ('+' or '00');
          3. a phone-related keyword starts a word within 50 characters
             before or after the match.
    """
    value = match.group()
    start, end = match.span()

    # Most phone numbers worldwide have between 7 and 15 digits
    digit_count = sum(1 for c in value if c.isdigit())
    if digit_count < 7 or digit_count > 15:
        return False

    # Formatting inside the match (separators, parentheses, '+') is accepted
    # on its own; the digit count was already validated above.
    if re.search(r'[-\s.()+]', value):
        return True

    # International prefix
    if value.startswith(('+', '00')):
        return True

    # Bare digit run: require a phone-related keyword nearby.
    context_window = 50
    phone_keywords = (
        "phone", "call", "tel", "telephone", "contact", "dial", "mobile", "cell",
        "number", "direct", "office", "fax", "reach me at", "call me", "contact me",
        "line", "extension", "ext", "phone number",
    )
    # Keywords must begin at a word boundary so that "tel" does not fire on
    # "hotel", "ext" on "next"/"text", or "line" on "online". Prefix matches
    # such as "phones" or "calling" are still accepted.
    keyword_re = re.compile(
        r'\b(?:' + '|'.join(re.escape(kw) for kw in phone_keywords) + r')',
        re.IGNORECASE,
    )

    # Search the original string with pos/endpos instead of slicing it: the
    # boundary check then sees the real character before the window, so a
    # window that cuts a word in half cannot fabricate a keyword.
    window_start = max(0, start - context_window)
    window_end = min(len(text), end + context_window)
    if keyword_re.search(text, window_start, start):
        return True
    return keyword_re.search(text, end, window_end) is not None
|
| 224 |
|
| 225 |
def detect_name_entities(self, text: str) -> List[Entity]:
|
| 226 |
"""Detect name entities using SpaCy NER"""
|
| 227 |
entities = []
|
| 228 |
doc = self.nlp(text)
|
| 229 |
+
|
| 230 |
for ent in doc.ents:
|
| 231 |
# Use PER for person, common in many models like xx_ent_wiki_sm
|
| 232 |
# Also checking for PERSON as some models might use it.
|
|
|
|
| 304 |
# res is longer, current is dominated
|
| 305 |
temp_resolved.append(res_entity)
|
| 306 |
is_overlapped_or_contained = True # Mark current as handled
|
| 307 |
+
break
|
| 308 |
else: # Same length, keep existing one (res_entity)
|
| 309 |
temp_resolved.append(res_entity)
|
| 310 |
is_overlapped_or_contained = True # Mark current as handled
|