Spaces:

Aramente
/

bored-cv-api

Running

File size: 16,419 Bytes

"""Heuristic parser for LinkedIn PDF exports. Zero LLM calls, instant results.

LinkedIn PDFs use a two-column layout where the right column (skills, languages)
is interleaved line-by-line with the left column (experience). We handle this
by dumping everything between Experience and Education into one stream, then
filtering out right-column noise using known patterns.
"""

import re
from app.models import Education, Experience, Profile


# --- Patterns ---
# All patterns below are applied to single, already-stripped lines unless noted.

# A date-range line: "<month> <year> - <month> <year>" or "<month> <year> - present".
# Month names are accepted in English and French (abbreviated or full); the
# separator may be a hyphen, en dash or em dash.
# Groups: 1 = start date, 2 = end date (or a "present"-style keyword),
# 3 = optional trailing parenthesised text (e.g. a duration).
# The pattern is anchored at the start only, so extra trailing text is tolerated.
DATE_LINE = re.compile(
    r"^((?:jan|fév|fé|mar|avr|mai|jun|jui|aoû|ao|sep|oct|nov|déc|"
    r"january|february|march|april|may|june|july|august|september|october|november|december|"
    r"janvier|février|mars|avril|juin|juillet|août|septembre|octobre|novembre|décembre)"
    r"\w*\.?\s+\d{4})"
    r"\s*[-–—]\s*"
    r"((?:jan|fév|fé|mar|avr|mai|jun|jui|aoû|ao|sep|oct|nov|déc|"
    r"january|february|march|april|may|june|july|august|september|october|november|december|"
    r"janvier|février|mars|avril|juin|juillet|août|septembre|octobre|novembre|décembre)"
    r"\w*\.?\s+\d{4}|present|présent|aujourd'hui|current|now)"
    r"\s*(\(.*\))?",
    re.IGNORECASE,
)

# Right-column noise patterns
# A language name (English or French spelling) followed by an opening
# parenthesis, e.g. "English (Native or Bilingual)".
LANGUAGE_LINE = re.compile(
    r"^(French|English|Spanish|German|Italian|Portuguese|Chinese|Japanese|Korean|Arabic|Russian|Dutch|Hindi|Mandarin|Cantonese|"
    r"Français|Anglais|Espagnol|Allemand|Italien|Portugais|Chinois|Japonais|Coréen|Arabe|Russe|Néerlandais)"
    r"\s*\(", re.IGNORECASE
)
# Page footer of the form "Page 1 of 3" (whole line, case-sensitive).
PAGE_MARKER = re.compile(r"^Page \d+ of \d+$")
# Phone number: optional "+", then 9 to 17 characters that start and end with
# a digit and contain only digits, whitespace, "-" or "." in between.
# NOTE: "\s" also matches newlines, so searching multi-line text can span lines.
PHONE_RE = re.compile(r"(\+?\d[\d\s\-\.]{7,15}\d)")
# E-mail address (unanchored; intended for use with .search()).
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
# LinkedIn profile URL, with optional scheme and "www." prefix.
LINKEDIN_RE = re.compile(r"((?:https?://)?(?:www\.)?linkedin\.com/in/[\w-]+)")
# A line that STARTS with one of a fixed list of place names. This is a prefix
# match with no end anchor, so any line beginning with e.g. "Paris" matches.
LOCATION_LINE = re.compile(
    r"^(Paris|Lyon|Marseille|Toulouse|Nantes|Bordeaux|Lille|Strasbourg|London|Berlin|"
    r"New York|San Francisco|Remote|France|Région de Paris|Paris Area|Île-de-France)",
    re.IGNORECASE
)

# Section name -> pattern matching that section's heading. Each pattern must
# match the whole line (anchored with ^ and $), case-insensitively, in English
# or French.
SECTION_HEADERS = {
    "experience": re.compile(r"^(Experience|Expérience)$", re.IGNORECASE),
    "education": re.compile(r"^(Education|Formation)$", re.IGNORECASE),
    "skills": re.compile(r"^(Skills|Compétences|Top Skills|Principales compétences)$", re.IGNORECASE),
    "languages": re.compile(r"^(Languages|Langues)$", re.IGNORECASE),
    "summary": re.compile(r"^(Summary|Résumé|About|À propos)$", re.IGNORECASE),
    "certifications": re.compile(r"^(Certifications|Licences|Licenses)$", re.IGNORECASE),
    "honors": re.compile(r"^(Honors|Distinctions|Honors & Awards)$", re.IGNORECASE),
    "volunteer": re.compile(r"^(Volunteer Experience|Bénévolat)$", re.IGNORECASE),
}

# Sections that end the experience block
# (values are keys of SECTION_HEADERS).
END_EXPERIENCE_SECTIONS = {"education", "certifications", "honors", "volunteer"}


def _section_type(line: str) -> str | None:
    """Return the SECTION_HEADERS key whose pattern matches *line*, else None.

    The line is stripped of surrounding whitespace before matching. When
    several patterns could match, the first one in dictionary order wins.
    """
    candidate = line.strip()
    return next(
        (key for key, pattern in SECTION_HEADERS.items() if pattern.match(candidate)),
        None,
    )


def _is_right_column_noise(line: str) -> bool:
    """Return True if *line* is right-column or layout noise.

    A line counts as noise when, after stripping, it is one of:
      * a language proficiency line (see LANGUAGE_LINE),
      * the bare label "(LinkedIn)" or "(Mobile)",
      * a page footer (see PAGE_MARKER).

    Duration-only lines such as "4 ans 3 mois" are deliberately NOT noise:
    they mark a multi-role company group and are needed by the experience
    parser. The former explicit branch for them returned False right before
    the final ``return False`` and was therefore dead code; it has been removed
    without changing any result.
    """
    stripped = line.strip()
    if stripped in ("(LinkedIn)", "(Mobile)"):
        return True
    return bool(LANGUAGE_LINE.match(stripped) or PAGE_MARKER.match(stripped))


def _is_location_line(line: str) -> bool:
    """Return True if the stripped *line* starts with a place name known to LOCATION_LINE."""
    text = line.strip()
    return LOCATION_LINE.match(text) is not None


def parse_linkedin_heuristic(raw_text: str) -> Profile | None:
    """Parse LinkedIn PDF text into a Profile using heuristics.

    Args:
        raw_text: Text extracted from the PDF, one layout line per text line.

    Returns:
        A Profile, or None when the text does not look like a LinkedIn PDF:
        fewer than 5 non-empty lines, no Experience heading, no name, no
        experiences, or a result rejected by ``_is_coherent``.
    """
    # Work on non-empty, already-stripped lines from here on.
    lines = [ln.strip() for ln in raw_text.split("\n") if ln.strip()]
    if len(lines) < 5:
        return None

    # --- Phase 1: Find section boundaries ---
    exp_start = None
    exp_end = None
    edu_start = None
    skills_start = None
    summary_start = None

    for i, line in enumerate(lines):
        sec = _section_type(line)
        if sec is None:
            continue

        if exp_start is None:
            # Still in the header: only these headings matter here.
            if sec == "experience":
                exp_start = i + 1
            elif sec == "skills":
                skills_start = i + 1
            elif sec == "summary":
                summary_start = i + 1
            continue

        # Past the Experience heading: the first end-section closes the block.
        if sec in END_EXPERIENCE_SECTIONS and exp_end is None:
            exp_end = i
        # Track Education independently, so it is still found when another
        # section (certifications, honors, volunteer) closed the experience
        # block first.
        if sec == "education" and edu_start is None:
            edu_start = i + 1

    if exp_start is None:
        return None

    if exp_end is None:
        exp_end = len(lines)

    # --- Phase 2: Parse header (everything before experience) ---
    # exp_start points one past the heading, so exp_start - 1 excludes it.
    header_lines = lines[:exp_start - 1]
    name, email, phone, linkedin, location, summary = _parse_header(
        header_lines, lines[summary_start:exp_start - 1] if summary_start else []
    )

    # --- Phase 3: Parse skills (before experience) ---
    skills: list[str] = []
    if skills_start:
        for line in lines[skills_start:exp_start - 1]:
            if _section_type(line):
                break  # next heading ends the skills list
            if not _is_right_column_noise(line):
                skills.append(line)

    # --- Phase 4: Parse experience block ---
    # Separate interleaved right-column content from the experience stream.
    languages: list[str] = []
    exp_clean: list[str] = []

    for line in lines[exp_start:exp_end]:
        if _section_type(line) in ("skills", "languages"):
            continue  # Skip right-column section headers
        if LANGUAGE_LINE.match(line):
            # Only genuine language lines are collected as languages.
            languages.append(line)
            continue
        if _is_right_column_noise(line):
            # Page markers and "(LinkedIn)" / "(Mobile)" labels: drop them
            # instead of reporting them as languages.
            continue
        exp_clean.append(line)

    experiences = _parse_experiences(exp_clean)

    # --- Phase 5: Parse education ---
    education: list[Education] = []
    if edu_start:
        edu_clean = []
        for line in lines[edu_start:]:
            sec = _section_type(line)
            if sec and sec != "education":
                break  # Stop at the next section
            if not PAGE_MARKER.match(line):
                edu_clean.append(line)
        education = _parse_education(edu_clean)

    # Title = first experience title if available
    title = experiences[0].title if experiences else ""

    profile = Profile(
        name=name,
        title=title,
        location=location,
        email=email,
        phone=phone,
        linkedin=linkedin,
        summary=summary,
        experiences=experiences,
        education=education,
        skills=skills,
        languages=languages,
    )

    if profile.name and len(profile.experiences) > 0 and _is_coherent(profile, raw_text):
        return profile
    return None


def _is_coherent(profile: Profile, raw_text: str) -> bool:
    """Sanity-check a parsed profile against the text it came from.

    Returns False (so the caller can fall back to another parser) when any of
    these hold:
      * the name has fewer than two whitespace-separated words,
      * fewer than 30% of the experiences carry a company name,
      * the experience count is below half of ``max(1, len(raw_text) // 2000)``
        (integer division),
      * the name does not occur, case-insensitively, in ``raw_text``.
    """
    full_name = profile.name
    roles = profile.experiences

    # A single-word "name" is most likely a heading or a label.
    if len(full_name.split()) < 2:
        return False

    # Too many company-less experiences means the lookback went wrong.
    if roles:
        named = [role for role in roles if role.company]
        if len(named) / len(roles) < 0.3:
            return False

    # Plausibility floor derived from the size of the document.
    floor = max(1, len(raw_text) // 2000)
    if len(roles) < floor // 2:
        return False

    # Finally, the extracted name must literally be present in the source.
    return full_name.lower() in raw_text.lower()


def _parse_header(
    header_lines: list[str], summary_lines: list[str]
) -> tuple[str, str, str, str, str, str]:
    """Extract contact info from the header (the lines before Experience).

    Args:
        header_lines: All lines preceding the Experience heading.
        summary_lines: Lines belonging to the summary section (may be empty).

    Returns:
        ``(name, email, phone, linkedin, location, summary)``; every field is
        an empty string when nothing was found.
    """
    name = ""
    email = ""
    phone = ""
    linkedin = ""
    location = ""
    summary = ""

    all_text = "\n".join(header_lines)

    m = EMAIL_RE.search(all_text)
    if m:
        email = m.group()

    # Search the phone number one line at a time: PHONE_RE allows whitespace
    # inside the number and "\s" matches "\n", so searching the joined text
    # could glue digits from the following line onto the number.
    for line in header_lines:
        m = PHONE_RE.search(line)
        if m:
            phone = m.group().strip()
            break

    m = LINKEDIN_RE.search(all_text)
    if m:
        linkedin = m.group()

    for line in header_lines:
        stripped = line.strip()
        if stripped in ("(LinkedIn)", "(Mobile)") or PAGE_MARKER.match(stripped):
            continue
        if EMAIL_RE.search(stripped) or LINKEDIN_RE.search(stripped):
            continue

        # Drop a leading "Coordonnées " / "Contact " label that the two-column
        # layout can put on the same line as real content.
        cleaned = stripped
        for prefix in ("Coordonnées ", "Contact "):
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):]

        # Phone-only line
        if PHONE_RE.match(cleaned) and cleaned.replace(" ", "").replace("+", "").replace("-", "").replace(".", "").isdigit():
            continue

        # The first remaining short line is taken as the name; after that, the
        # first line starting with a known place name becomes the location.
        if not name and cleaned and len(cleaned) < 60:
            name = cleaned
        elif not location and _is_location_line(cleaned):
            location = cleaned

    if summary_lines:
        # Join the summary into one paragraph, skipping blanks and headings.
        summary = " ".join(
            ln.strip() for ln in summary_lines if ln.strip() and not _section_type(ln)
        )

    return name, email, phone, linkedin, location, summary


# A line starting with a number followed by a duration unit in French or
# English ("an", "mois", "year", "month"; prefix match, so plurals also match),
# e.g. "4 ans 3 mois". Used to recognise the duration line of a multi-role
# company group.
DUR_PATTERN = re.compile(r"^\d+\s+(an|mois|year|month)", re.IGNORECASE)


def _find_title_company_before_date(lines: list[str], date_idx: int) -> tuple[str, str, bool]:
    """Infer company and title from the lines just above a date line.

    Up to four lines preceding ``lines[date_idx]`` are inspected, nearest
    first. Blank lines, location lines and bullet lines are ignored, and the
    scan stops early at a previous date line.

    Returns (company, title, is_multi_role).
    """
    bullet_prefixes = ("- ", "* ", "• ", "· ")
    lowest = max(date_idx - 4, 0)

    # Walk upwards from the date line, collecting usable lines nearest-first.
    nearest_first: list[str] = []
    for pos in reversed(range(lowest, date_idx)):
        text = lines[pos].strip()
        if not text:
            continue
        if DATE_LINE.match(text):
            break  # reached the previous experience's date
        if _is_location_line(text) or text.startswith(bullet_prefixes):
            continue  # locations and bullets are never a title or company
        nearest_first.append(text)
    candidates = nearest_first[::-1]  # restore document order

    # A duration line among the candidates signals a multi-role company group.
    dur_idx = next(
        (pos for pos, text in enumerate(candidates) if DUR_PATTERN.match(text)),
        None,
    )

    # Only usable when something precedes the duration line (index > 0);
    # a duration at index 0 falls through to the generic handling below.
    if dur_idx:
        # Layout: Company / Duration / Title / Date
        after_duration = candidates[dur_idx + 1:]
        role = after_duration[0] if after_duration else ""
        return candidates[dur_idx - 1], role, True

    if len(candidates) >= 2:
        # Layout: Company / Title / Date
        company, role = candidates[-2:]
        return company, role, False
    if candidates:
        # A single line is reported as the company, with an empty title.
        return candidates[0], "", False
    return "", "", False


def _parse_experiences(lines: list[str]) -> list[Experience]:
    """Build experiences from the cleaned experience block.

    Every date line anchors one entry: company and title are looked up in the
    lines above it, and the description is gathered from the lines below it up
    to the next date line. Consecutive entries at the same company are merged
    before returning.
    """
    bullet_prefixes = ("- ", "* ", "• ", "· ")

    # Pass 1: one entry per date line, with metadata taken from the lookback.
    entries: list[dict] = []
    for pos, raw in enumerate(lines):
        text = raw.strip()
        if not DATE_LINE.match(text):
            continue
        company, title, is_multi = _find_title_company_before_date(lines, pos)
        entries.append({
            "idx": pos,
            "company": company,
            "title": title,
            "is_multi": is_multi,
            # Drop a trailing parenthesised part such as "(2 ans)".
            "dates": re.sub(r"\s*\(.*?\)\s*$", "", text),
        })

    # Propagate the group company to the roles following a multi-role header.
    # Only the first role of a group shows Company + Duration; later roles show
    # Title + Date, so the lookback reports their title as the company.
    # NOTE(review): once a group company is set it is only replaced by the next
    # multi-role header, so every later single-role entry inherits it too.
    active_group = None
    for entry in entries:
        if entry["is_multi"]:
            active_group = entry["company"]
            continue
        if not active_group:
            continue
        if entry["company"] and not entry["title"]:
            # Single lookback candidate: it was the title, not the company.
            entry["title"] = entry["company"]
        entry["company"] = active_group

    # Pass 2: collect description lines between consecutive date lines.
    # Company/title strings of any entry are metadata, never description.
    meta_lines = {
        value
        for entry in entries
        for value in (entry["company"], entry["title"])
        if value
    }

    # Each entry's text runs up to the next entry's date line (or the end).
    stops = [entry["idx"] for entry in entries[1:]] + [len(lines)]

    experiences: list[Experience] = []
    for entry, stop in zip(entries, stops):
        bullets = []
        for raw in lines[entry["idx"] + 1:stop]:
            text = raw.strip()
            if not text or text in meta_lines:
                continue
            if _is_location_line(text) or DUR_PATTERN.match(text):
                continue
            if text.startswith(bullet_prefixes):
                bullets.append(text.lstrip("-*•· ").strip())
            elif len(text) > 10:
                # Unmarked lines are kept only when long enough to be prose.
                bullets.append(text)

        experiences.append(Experience(
            title=entry["title"],
            company=entry["company"],
            dates=entry["dates"],
            description=" ".join(bullets[:3]),
            bullets=bullets,
        ))

    return _merge_same_company(experiences)


def _merge_same_company(experiences: list[Experience]) -> list[Experience]:
    """Merge consecutive experiences at the same company into a single entry.

    Company names are compared case-insensitively; entries without a company
    are never merged. For a merged entry:
      * ``title`` is the role titles joined with " → " in input order, skipping
        a role whose title equals the previous role's title,
      * ``dates`` is the date strings joined with " / ", later input first,
      * ``bullets`` holds each role's bullets preceded by a "[<role title>]"
        label (roles without bullets contribute nothing),
      * ``description`` is the first entry's description.

    Args:
        experiences: Experiences in document order.

    Returns:
        A new list; the input list is not modified.
    """
    if not experiences:
        return experiences

    merged: list[Experience] = []
    # State of the entry at merged[-1]:
    group_open = False  # True once it is a merge result with labelled bullets
    last_title = ""     # title of the most recent individual role merged in

    for exp in experiences:
        prev = merged[-1] if merged else None
        same_company = (
            prev is not None
            and exp.company
            and prev.company
            and exp.company.lower() == prev.company.lower()
        )
        if not same_company:
            merged.append(exp)
            group_open = False
            last_title = exp.title
            continue

        # Combine titles. Compare with the last individual role, not with the
        # already-combined title, so "A → B" followed by another "B" stays
        # "A → B" instead of becoming "A → B → B".
        if exp.title and exp.title.lower() != last_title.lower():
            combined_title = f"{prev.title} → {exp.title}"
            last_title = exp.title
        else:
            combined_title = prev.title

        # Combine dates: the newly merged role's dates go first.
        combined_dates = f"{exp.dates} / {prev.dates}" if exp.dates != prev.dates else prev.dates

        # Combine bullets, prefixed with role title. When prev is already a
        # merge result its bullets carry their own labels; adding another one
        # built from the combined title (e.g. "[A → B]") would be wrong.
        if group_open:
            combined_bullets = list(prev.bullets)
        elif prev.bullets:
            combined_bullets = [f"[{prev.title}]", *prev.bullets]
        else:
            combined_bullets = []
        if exp.bullets:
            combined_bullets.append(f"[{exp.title}]")
            combined_bullets.extend(exp.bullets)

        merged[-1] = Experience(
            title=combined_title,
            company=prev.company,
            dates=combined_dates,
            description=prev.description,
            bullets=combined_bullets,
        )
        group_open = True

    return merged


def _build_exp(data: dict) -> Experience:
    """Create an Experience from a plain dict.

    Reads the optional keys "title", "company", "dates" (default "") and
    "bullets" (default empty list). The description is the first three bullets
    joined with spaces, or "" when there are no bullets.
    """
    bullet_list = data.get("bullets", [])
    summary = " ".join(bullet_list[:3]) if bullet_list else ""
    text_fields = {key: data.get(key, "") for key in ("title", "company", "dates")}
    return Experience(description=summary, bullets=bullet_list, **text_fields)


def _parse_education(lines: list[str]) -> list[Education]:
    """Parse education lines. Format: School / Degree, Field · (2011 - 2015)

    Lines are consumed in pairs: the first is the school, the second the degree
    line. A parenthesised year or year range in the degree line becomes
    ``year`` and the text before it (minus trailing " ", "·", ",") the degree;
    without one, the whole second line is the degree. A trailing unpaired line
    yields a school with empty degree and year.
    """
    year_pattern = re.compile(r"\((\d{4}\s*[-–]\s*\d{4}|\d{4})\)")
    total = len(lines)
    results: list[Education] = []

    pos = 0
    while pos < total:
        school = lines[pos].strip()
        degree, year = "", ""

        has_detail = pos + 1 < total
        if has_detail:
            detail = lines[pos + 1].strip()
            found = year_pattern.search(detail)
            if found is None:
                degree = detail
            else:
                year = found.group(1)
                degree = detail[:found.start()].rstrip(" ·,")

        # The detail line, when present, is consumed together with the school.
        pos += 2 if has_detail else 1

        if school:
            results.append(Education(school=school, degree=degree, year=year))

    return results