File size: 2,577 Bytes
ea9ca44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re

# Regex Constants
# Email: a run of word characters, dots or hyphens, an "@", then a domain
# that ends in a dot followed by word characters.
EMAIL_REGEX = r"[\w\.-]+@[\w\.-]+\.\w+"
# Phone: permissive pattern aimed at international and Indian formats.
# Structure: an optional country code (optional "+", 1-3 digits), then three
# mandatory groups of 2-5 digits and an optional final group of 1-4 digits.
# Groups may be separated by "-", "." or spaces, and the first group may be
# wrapped in "(" / ")".
# Examples matched in full:
#   +91 98765 43210
#   +91-98765-43210
#   9876543210
#   0987-654-3210
# For "(0)9876543210" only the "9876543210" part is matched.
# NOTE: the shortest possible match is just six digits, and a match may begin
# with a separator character, so callers that need a real phone number should
# validate the digit count of each match.
PHONE_REGEX = r"(?:\+?(\d{1,3}))?[-. (]*(\d{2,5})[-. )]*(\d{2,5})[-. ]*(\d{2,5})(?:[-. ]*(\d{1,4}))?"
# URL: "http://" or "https://" followed by everything up to the next
# whitespace, comma, closing parenthesis or quote character. Trailing
# sentence punctuation such as "." is therefore part of the match.
URL_REGEX = r"https?://[^\s,)\"']+"

def extract_contact_info_regex(text: str) -> dict:
    """
    Extract the first email, phone number and profile links found in ``text``.

    Args:
        text: Raw text to scan. Empty or ``None`` input yields a result in
            which every value is ``None``.

    Returns:
        A dictionary with the keys ``email``, ``phone``, ``linkedin``,
        ``github`` and ``portfolio``, suitable for merging into the final
        profile payload. Each value is the first suitable match as a string,
        or ``None`` when nothing was found.
    """
    if not text:
        return {
            "email": None,
            "phone": None,
            "linkedin": None,
            "github": None,
            "portfolio": None,
        }

    # 1. Email
    email_match = re.search(EMAIL_REGEX, text)
    email = email_match.group(0) if email_match else None

    # 2. Links
    # URL_REGEX only stops at whitespace, commas, ")" and quotes, so a URL that
    # ends a sentence would keep the trailing punctuation; trim it off.
    links = [url.rstrip(".;:!?") for url in re.findall(URL_REGEX, text)]
    # Host names are case-insensitive, so compare against a lowercased copy
    # while returning the URL exactly as written.
    lowered_links = [(url, url.lower()) for url in links]

    linkedin = next(
        (url for url, low in lowered_links if "linkedin.com" in low), None
    )
    github = next(
        (url for url, low in lowered_links if "github.com" in low), None
    )

    # Portfolio is the first link that is neither a known social network nor
    # common junk such as Google / Google Fonts URLs.
    exclude_domains = (
        "linkedin.com",
        "github.com",
        "google.com",
        "googleapis.com",
        "facebook.com",
        "twitter.com",
        "instagram.com",
    )
    portfolio = next(
        (
            url
            for url, low in lowered_links
            if not any(domain in low for domain in exclude_domains)
        ),
        None,
    )

    # 3. Phone
    # Blank out URLs and emails first so digits inside them (numeric profile
    # ids, numeric mail aliases) are not mistaken for a phone number. A newline
    # is used as filler because PHONE_REGEX does not accept it as a separator,
    # so digits on either side of a removed span cannot be glued together.
    searchable = re.sub(URL_REGEX, "\n", text)
    searchable = re.sub(EMAIL_REGEX, "\n", searchable)

    phone = None
    # Heuristic: pick the first match that contains at least 10 digits.
    for match in re.finditer(PHONE_REGEX, searchable):
        # The pattern may swallow separators that precede the number
        # (e.g. the "." in "Tel.98765..."); they are not part of the number.
        candidate = match.group(0).lstrip("-. ")
        # Keep "(" only when it is closed inside the match, as in "(987) 654".
        if candidate.startswith("(") and ")" not in candidate:
            candidate = candidate.lstrip("-. (")
        if len(re.sub(r"\D", "", candidate)) >= 10:
            phone = candidate
            break

    return {
        "email": email,  # Auth already provides this; extracting it is harmless
        "phone": phone,
        "linkedin": linkedin,
        "github": github,
        "portfolio": portfolio,
    }

def mask_contact_info_regex(text: str, *, min_phone_digits: int = 10) -> str:
    """
    Replace emails, phone numbers and links with ``[..._REDACTED]``
    placeholders to prevent PII leakage to LLMs.

    Args:
        text: Text to sanitise. Empty or ``None`` input yields ``""``.
        min_phone_digits: Minimum number of digits a ``PHONE_REGEX`` match
            must contain before it is masked. ``PHONE_REGEX`` matches runs of
            as few as six digits, which also covers non-PII content such as
            the year range "2019-2021". The default of 10 is the same
            threshold ``extract_contact_info_regex`` uses to accept a phone
            number. Pass ``0`` to mask every match.

    Returns:
        A copy of ``text`` in which emails are replaced by
        ``[EMAIL_REDACTED]``, phone numbers by ``[PHONE_REDACTED]`` and every
        http(s) URL by ``[LINK_REDACTED]``.
    """
    if not text:
        return ""

    def _redact_phone(match):
        """Return the placeholder for a real-looking phone match, else the match unchanged."""
        raw = match.group(0)
        if len(re.sub(r"\D", "", raw)) < min_phone_digits:
            # Too few digits to be a phone number (dates, short ids): keep it.
            return raw
        # The pattern may swallow separators in front of the number; keep
        # them so "Tel: 98765..." does not collapse into "Tel:[PHONE_...]".
        core = raw.lstrip("-. ")
        # An opening "(" that is not closed inside the match belongs to the
        # surrounding text, not to the number.
        if core.startswith("(") and ")" not in core:
            core = core.lstrip("-. (")
        return raw[: len(raw) - len(core)] + "[PHONE_REDACTED]"

    # Mask Emails
    text = re.sub(EMAIL_REGEX, "[EMAIL_REDACTED]", text)

    # Mask Phone Numbers
    text = re.sub(PHONE_REGEX, _redact_phone, text)

    # Mask Links
    # All URLs are masked, not only the LinkedIn/GitHub ones, so that a
    # personal "portfolio" domain name cannot leak either.
    text = re.sub(URL_REGEX, "[LINK_REDACTED]", text)

    return text