Spaces:
Sleeping
Sleeping
| import re | |
# Regex Constants

# Email: local part, "@", then a dotted domain ending in a word-character label.
# The local part also accepts "+" and "%" so that plus-addressed mail such as
# "john+jobs@example.com" is matched as a whole instead of as "jobs@example.com".
EMAIL_REGEX = r"[\w\.+%-]+@[\w\.-]+\.\w+"

# Phone: loose pattern for international & Indian formats.
# Structure: optional country code (optional "+" and 1-3 digits), then three
# groups of 2-5 digits, then an optional trailing group of 1-4 digits. Groups
# may be separated by any mix of "-", ".", space and parentheses.
# Matches:
#   +91 98765 43210
#   +91-98765-43210
#   9876543210
#   0987-654-3210
#   (0)9876543210
# NOTE: any run of six or more digits also satisfies this pattern, so callers
# that need a real phone number should filter matches by digit count.
PHONE_REGEX = r"(?:\+?(\d{1,3}))?[-. (]*(\d{2,5})[-. )]*(\d{2,5})[-. ]*(\d{2,5})(?:[-. ]*(\d{1,4}))?"

# URL: http/https links, running up to the first whitespace, comma, closing
# parenthesis or quote character.
URL_REGEX = r"https?://[^\s,)\"']+"
def extract_contact_info_regex(text: str) -> dict:
    """
    Extract the email, phone number and profile links from free text using regex.

    Args:
        text: Text to scan. ``None`` or an empty string yields a result in
            which every field is ``None``.

    Returns:
        A dict with the keys ``email``, ``phone``, ``linkedin``, ``github``
        and ``portfolio``, suitable for merging into a profile payload. Each
        value is the first suitable match found in ``text``, or ``None`` when
        nothing matched.
    """
    result = {
        "email": None,  # While auth handles this, extracting it doesn't hurt
        "phone": None,
        "linkedin": None,
        "github": None,
        "portfolio": None,
    }
    if not text:
        return result

    # 1. Email: first address in the text.
    email_match = re.search(EMAIL_REGEX, text)
    if email_match:
        result["email"] = email_match.group(0)

    # 2. Phone: PHONE_REGEX is loose, so take the first match that carries at
    # least 10 digits.
    for match in re.finditer(PHONE_REGEX, text):
        candidate = match.group(0)
        if len(re.sub(r"\D", "", candidate)) < 10:
            continue
        # The pattern can swallow separators that precede the number
        # ("Tel - 98765 43210"); drop them but keep a leading "+" or "(".
        phone = candidate.lstrip("-. ")
        # An opening parenthesis with no closing partner belongs to the
        # surrounding text ("(9876543210)"), not to the number.
        if phone.startswith("(") and ")" not in phone:
            phone = phone.lstrip("(-. ")
        result["phone"] = phone
        break

    # 3. Links: strip sentence punctuation that URL_REGEX leaves attached to
    # the end of a URL, and compare domains case-insensitively.
    links = [url.rstrip(".;:!?") for url in re.findall(URL_REGEX, text)]
    lowered = [(url, url.lower()) for url in links]

    result["linkedin"] = next(
        (url for url, low in lowered if "linkedin.com" in low), None
    )
    result["github"] = next(
        (url for url, low in lowered if "github.com" in low), None
    )

    # Portfolio is the first link that is not a known social/utility domain.
    # "googleapis.com" is listed separately because "google.com" is not a
    # substring of hosts such as fonts.googleapis.com.
    excluded_domains = (
        "linkedin.com",
        "github.com",
        "google.com",
        "googleapis.com",
        "facebook.com",
        "twitter.com",
        "instagram.com",
    )
    result["portfolio"] = next(
        (
            url
            for url, low in lowered
            if not any(domain in low for domain in excluded_domains)
        ),
        None,
    )

    return result
def mask_contact_info_regex(text: str) -> str:
    """
    Replace emails, phone numbers and URLs in ``text`` with redaction
    placeholders to prevent PII leakage to LLMs.

    Args:
        text: Text to sanitise. ``None`` or an empty string returns ``""``.

    Returns:
        The text with emails replaced by ``[EMAIL_REDACTED]``, phone-like
        digit groups by ``[PHONE_REDACTED]`` and http/https URLs by
        ``[LINK_REDACTED]``.
    """
    if not text:
        return ""

    def _redact_phone(match):
        # PHONE_REGEX may consume the separators in front of the digits
        # (e.g. the space in "Call 98765 43210"). Keep them so the placeholder
        # is not glued to the preceding word.
        matched = match.group(0)
        prefix_len = len(matched) - len(matched.lstrip("-. "))
        return matched[:prefix_len] + "[PHONE_REDACTED]"

    # Mask Emails
    text = re.sub(EMAIL_REGEX, "[EMAIL_REDACTED]", text)

    # Mask Phone Numbers
    # NOTE(review): unlike extraction, no minimum digit count is applied here,
    # so masking is broader: any run of six or more digits is redacted,
    # including non-phone numbers such as "2018 - 2020".
    text = re.sub(PHONE_REGEX, _redact_phone, text)

    # Mask Links
    # All URLs are masked, not only the extracted profile links, so that a
    # personal domain name in a portfolio link cannot leak.
    text = re.sub(URL_REGEX, "[LINK_REDACTED]", text)

    return text