Spaces:

sameer2026
/

iris_backend

Sleeping

iris_backend / backend /src /preprocess /regex_pii.py

Muhammed Sameer

Initial commit - Iris Full (under development)

ea9ca44 4 months ago

2.58 kB

	import re

	# Regex Constants
	EMAIL_REGEX = r"[\w\.-]+@[\w\.-]+\.\w+"
	# Robust Regex for International & Indian formats
	# Matches:
	# +91 98765 43210
	# +91-98765-43210
	# 9876543210
	# 0987-654-3210
	# (0)9876543210
	PHONE_REGEX = r"(?:\+?(\d{1,3}))?[-. (](\d{2,5})[-. )](\d{2,5})[-. ](\d{2,5})(?:[-. ](\d{1,4}))?"
	URL_REGEX = r"https?://[^\s,)\"']+"

	def extract_contact_info_regex(text: str) -> dict:
	"""
	Extracts Phone, Email, and Links (LinkedIn, GitHub, Portfolio) using Regex.
	Returns a dictionary suitable for merging into the final profile payload.
	"""

	# 1. Email
	email_match = re.search(EMAIL_REGEX, text)
	email = email_match.group(0) if email_match else None

	# 2. Phone
	# Find all matches and pick the longest/most likely one
	phone_matches = re.finditer(PHONE_REGEX, text)
	phone = None
	# Heuristic: Pick the first valid-looking match that is at least 10 chars
	for match in phone_matches:
	p = match.group(0)
	if len(re.sub(r"\D", "", p)) >= 10:
	phone = p.strip()
	break

	# 3. Links
	links = re.findall(URL_REGEX, text)

	linkedin = next((l for l in links if "linkedin.com" in l), None)
	github = next((l for l in links if "github.com" in l), None)

	# Portfolio is any other link that isn't specific social media
	# Excluding common junk like google.com or fonts.googleapis if they appear
	exclude_domains = ["linkedin.com", "github.com", "google.com", "facebook.com", "twitter.com", "instagram.com"]
	portfolio = None
	for l in links:
	if not any(d in l for d in exclude_domains):
	portfolio = l
	break

	return {
	"email": email, # While auth handles this, extracting it doesn't hurt
	"phone": phone,
	"linkedin": linkedin,
	"github": github,
	"portfolio": portfolio
	}

	def mask_contact_info_regex(text: str) -> str:
	"""
	Replaces Phone, Email, and Links with [REDACTED] placeholders
	to prevent PII leakage to LLMs.
	"""

	# Mask Emails
	text = re.sub(EMAIL_REGEX, "[EMAIL_REDACTED]", text)

	# Mask Phone Numbers
	# Using a slightly more aggressive regex for masking to catch variants
	text = re.sub(PHONE_REGEX, "[PHONE_REDACTED]", text)

	# Mask Links
	# We mask ALL links to be safe, or just the specific ones?
	# User said extract specific ones.
	# Safer to mask all URLs to prevent "portfolio" leaking personal domain names.
	text = re.sub(URL_REGEX, "[LINK_REDACTED]", text)

	return text