# iris_backend/backend/src/preprocess/regex_pii.py
# Muhammed Sameer
# Initial commit - Iris Full (under development)
# ea9ca44
import re
# Regex Constants

# Email address. The local part accepts word characters, dots, hyphens,
# plus signs (sub-addressing such as "name+jobs@host") and percent signs, so
# the whole address is captured instead of only the part after the "+".
# The domain must end in a dot followed by at least one word character.
EMAIL_REGEX = r"[\w.+%-]+@[\w.-]+\.\w+"

# Phone number candidates for international and Indian formats.
# Layout: optional country code (1-3 digits, optional leading "+"), then
# three groups of 2-5 digits and an optional trailing group of 1-4 digits,
# each separated by any run of "-", "." or space (with "(" / ")" tolerated
# around the first group).
# Matches:
#   +91 98765 43210
#   +91-98765-43210
#   9876543210
#   0987-654-3210
#   (0)9876543210   (only the digits after the "(0)" prefix are matched)
# NOTE: the pattern alone enforces no minimum length (six digits are enough)
# and a match may begin with separator characters, so callers are expected
# to count digits and trim the match themselves.
PHONE_REGEX = r"(?:\+?(\d{1,3}))?[-. (]*(\d{2,5})[-. )]*(\d{2,5})[-. ]*(\d{2,5})(?:[-. ]*(\d{1,4}))?"

# http(s) URL, running until whitespace, a comma, a closing parenthesis or a
# quote character. Trailing sentence punctuation such as "." is NOT excluded.
URL_REGEX = r"https?://[^\s,)\"']+"
def extract_contact_info_regex(text: str) -> dict:
    """
    Extracts Phone, Email, and Links (LinkedIn, GitHub, Portfolio) using Regex.

    Args:
        text: Free-form text to scan. Empty or None input yields a result
            whose values are all None.

    Returns:
        A dictionary with the keys "email", "phone", "linkedin", "github"
        and "portfolio", suitable for merging into the final profile payload.
        Each value is the first matching string, or None when nothing matched.
    """
    if not text:
        return {
            "email": None,
            "phone": None,
            "linkedin": None,
            "github": None,
            "portfolio": None,
        }

    # 1. Email
    email_match = re.search(EMAIL_REGEX, text)
    email = email_match.group(0) if email_match else None

    # 2. Phone
    # Blank out emails and URLs first so digit runs inside them (profile ids,
    # numeric handles) are not mistaken for a phone number. A newline is used
    # as filler because PHONE_REGEX never matches across one, so the text on
    # either side of a removed span cannot fuse into a single candidate.
    phone_text = re.sub(EMAIL_REGEX, "\n", text)
    phone_text = re.sub(URL_REGEX, "\n", phone_text)

    phone = None
    # Heuristic: pick the first candidate that carries at least 10 digits.
    for match in re.finditer(PHONE_REGEX, phone_text):
        # The pattern may swallow separators that precede the number
        # (e.g. " - 98765 43210"); they are not part of the phone.
        candidate = match.group(0).lstrip("-. ")
        if ")" not in candidate:
            # An opening parenthesis whose partner lies outside the match
            # would leave the result unbalanced, so drop it as well.
            candidate = candidate.lstrip("-. (")
        if len(re.sub(r"\D", "", candidate)) >= 10:
            phone = candidate.strip()
            break

    # 3. Links
    # URL_REGEX stops at whitespace, commas, ")" and quotes but keeps
    # sentence punctuation glued to the end of a link, so trim that here.
    links = [url.rstrip(".;:!?>") for url in re.findall(URL_REGEX, text)]

    # Host names are case-insensitive, so compare against a lowered copy.
    linkedin = next((url for url in links if "linkedin.com" in url.lower()), None)
    github = next((url for url in links if "github.com" in url.lower()), None)

    # Portfolio is the first link that is not one of the known social /
    # search domains listed below.
    exclude_domains = (
        "linkedin.com",
        "github.com",
        "google.com",
        "facebook.com",
        "twitter.com",
        "instagram.com",
    )
    portfolio = next(
        (
            url
            for url in links
            if not any(domain in url.lower() for domain in exclude_domains)
        ),
        None,
    )

    return {
        "email": email,  # While auth handles this, extracting it doesn't hurt
        "phone": phone,
        "linkedin": linkedin,
        "github": github,
        "portfolio": portfolio,
    }
def mask_contact_info_regex(text: str, *, min_phone_digits: int = 10) -> str:
    """
    Replaces Phone, Email, and Links with [..._REDACTED] placeholders
    to prevent PII leakage to LLMs.

    Args:
        text: Text to sanitise. Empty or None input is returned unchanged.
        min_phone_digits: Minimum number of digits a PHONE_REGEX match must
            contain before it is redacted. The default of 10 mirrors the
            heuristic used when extracting a phone number and keeps short
            digit groups such as year ranges ("2019-2023") readable. Pass a
            smaller value to mask more aggressively.

    Returns:
        The text with emails replaced by "[EMAIL_REDACTED]", phone numbers
        by "[PHONE_REDACTED]" and http(s) URLs by "[LINK_REDACTED]".
    """
    if not text:
        return text

    # Mask Emails
    text = re.sub(EMAIL_REGEX, "[EMAIL_REDACTED]", text)

    # Mask Phone Numbers
    def _mask_phone(match):
        candidate = match.group(0)
        # PHONE_REGEX alone accepts as few as six digits; leave such short
        # runs (dates, years, small ids) untouched.
        if len(re.sub(r"\D", "", candidate)) < min_phone_digits:
            return candidate
        # The pattern may swallow separators in front of the number; keep
        # them so "Phone: 98..." does not collapse into "Phone:[...]".
        number = candidate.lstrip("-. ")
        leading = candidate[: len(candidate) - len(number)]
        return leading + "[PHONE_REDACTED]"

    text = re.sub(PHONE_REGEX, _mask_phone, text)

    # Mask Links
    # Every URL is masked, not only the LinkedIn/GitHub ones, so that a
    # portfolio link cannot leak a personal domain name.
    text = re.sub(URL_REGEX, "[LINK_REDACTED]", text)
    return text