File size: 2,165 Bytes
ea9ca44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import re

# Skills recognised by the keyword matcher, in the order they are reported.
_COMMON_SKILLS = (
    "Python", "Java", "JavaScript", "TypeScript", "C++", "C#", "SQL", "NoSQL",
    "React", "Angular", "Vue", "Node.js", "Django", "Flask", "FastAPI",
    "AWS", "Azure", "GCP", "Docker", "Kubernetes", "Git", "CI/CD",
    "Machine Learning", "Deep Learning", "NLP", "Pandas", "NumPy", "TensorFlow", "PyTorch",
)

# Loose phone pattern: optional country code, optional area code, then 3 + 4 digits.
_PHONE_RE = re.compile(r"(\+?\d{1,3}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}")

# The phone pattern also matches shorter digit runs such as the date range
# "2019-2021"; candidates with fewer digits than this are skipped.
_MIN_PHONE_DIGITS = 10

_URL_RE = re.compile(r"https?://[^\s]+")

# Punctuation that trails a URL in running text ("see https://x.dev/me).") and
# would otherwise be captured as part of the link.
_URL_TRAILING_PUNCTUATION = ".,;:!?)]}>'\""

_SUMMARY_LIMIT = 500


def _compile_skill_pattern(skill):
    """Return a case-insensitive whole-token pattern for *skill*.

    ``\\b`` only works next to a word character, so ``\\bC\\+\\+\\b`` never
    matches "C++," or "C++ ". A word-edge lookaround is therefore added only
    on the sides where the skill itself starts/ends with a word character.
    """
    lead = r"(?<!\w)" if re.match(r"\w", skill) else ""
    trail = r"(?!\w)" if re.search(r"\w\Z", skill) else ""
    return re.compile(lead + re.escape(skill) + trail, re.IGNORECASE)


# Compiled once at import time instead of on every call.
_SKILL_PATTERNS = tuple(
    (skill, _compile_skill_pattern(skill)) for skill in _COMMON_SKILLS
)


def extract_fallback(text: str) -> dict:
    """
    A dumb Regex-based fallback extractor if Gemini fails.

    Extracts a phone number, LinkedIn / GitHub / portfolio links and
    keyword-matched skills from raw text. The e-mail address is not extracted:
    the payload never carried it (the original note says the backend relies on
    auth for that).

    Args:
        text: Raw text to scan. ``None`` or an empty string is treated as
            empty text and yields an "empty" payload instead of raising.

    Returns:
        A dict with the schema keys ``headline``, ``summary``, ``skills``,
        ``technical_skills``, ``education``, ``work_experience``,
        ``certifications``, ``languages`` and ``experience_years``, plus the
        extra ``phone``, ``linkedin``, ``github`` and ``portfolio`` fields.
        Fields that cannot be determined are ``None`` or an empty list.
    """
    if not text:
        text = ""

    # 1. Phone: first candidate with enough digits to be a real number, so
    #    that year ranges like "2019-2021" are not reported as a phone.
    phone = next(
        (
            candidate.group(0)
            for candidate in _PHONE_RE.finditer(text)
            if sum(ch.isdigit() for ch in candidate.group(0)) >= _MIN_PHONE_DIGITS
        ),
        None,
    )

    # 2. Links (LinkedIn / GitHub / Portfolio)
    links = [
        raw.rstrip(_URL_TRAILING_PUNCTUATION) for raw in _URL_RE.findall(text)
    ]
    linkedin = next((link for link in links if "linkedin.com" in link), None)
    github = next((link for link in links if "github.com" in link), None)
    # The portfolio is the first link that is on neither site; a second
    # GitHub/LinkedIn URL must not be mistaken for a personal site.
    portfolio = next(
        (
            link
            for link in links
            if "linkedin.com" not in link and "github.com" not in link
        ),
        None,
    )

    # 3. Keyword Matching for Skills (Static List)
    found_skills = [
        skill for skill, pattern in _SKILL_PATTERNS if pattern.search(text)
    ]

    # Fallback summary is just the first 500 chars.
    if len(text) > _SUMMARY_LIMIT:
        summary = text[:_SUMMARY_LIMIT] + "..."
    else:
        summary = text

    # 4. Construct Payload (Matches Schema)
    return {
        "headline": None,
        "summary": summary,
        "skills": found_skills,
        # Independent copy so mutating one list never changes the other.
        "technical_skills": list(found_skills),
        "education": [],
        "work_experience": [],
        "certifications": [],
        "languages": [],
        "experience_years": None,
        # Extra fields specific to Supabase Ingest (mapped later)
        "phone": phone,
        "linkedin": linkedin,
        "github": github,
        "portfolio": portfolio,
    }