Spaces:
Running
Running
| """Heuristic parser for LinkedIn PDF exports. Zero LLM calls, instant results. | |
| LinkedIn PDFs use a two-column layout where the right column (skills, languages) | |
| is interleaved line-by-line with the left column (experience). We handle this | |
| by dumping everything between Experience and Education into one stream, then | |
| filtering out right-column noise using known patterns. | |
| """ | |
| import re | |
| from app.models import Education, Experience, Profile | |
# --- Patterns ---
# One month token: a French or English month name or abbreviation, any trailing
# word characters (so "sept", "janv", "juil" also match) and an optional dot.
# English short forms (feb/apr/jul/aug/dec) and unaccented French forms
# (fev/dec) are listed explicitly; without them "Feb 2020 - Dec 2021" was not
# recognised as a date line.
_MONTH_TOKEN = (
    r"(?:jan|feb|fév|fev|fé|mar|apr|avr|mai|jun|jul|jui|aug|aoû|ao|sep|oct|nov|dec|déc|"
    r"january|february|march|april|may|june|july|august|september|october|november|december|"
    r"janvier|février|mars|avril|juin|juillet|août|septembre|octobre|novembre|décembre)"
    r"\w*\.?"
)
# "<month> <4-digit year>"
_MONTH_YEAR = _MONTH_TOKEN + r"\s+\d{4}"

# A date-range line such as "janvier 2020 - Present (3 ans 2 mois)".
#   group 1: start "<month> <year>"
#   group 2: end "<month> <year>" or a "present" keyword
#   group 3: optional trailing parenthetical (the duration)
DATE_LINE = re.compile(
    r"^(" + _MONTH_YEAR + r")"
    + r"\s*[-–—]\s*"
    + r"(" + _MONTH_YEAR + r"|present|présent|aujourd'hui|current|now)"
    + r"\s*(\(.*\))?",
    re.IGNORECASE,
)
# Right-column noise patterns
# A known language name (English or French spelling) followed by an opening
# parenthesis, i.e. a sidebar line of the form "<language> (<something>".
LANGUAGE_LINE = re.compile(
    r"^(French|English|Spanish|German|Italian|Portuguese|Chinese|Japanese|Korean|Arabic|Russian|Dutch|Hindi|Mandarin|Cantonese|"
    r"Français|Anglais|Espagnol|Allemand|Italien|Portugais|Chinois|Japonais|Coréen|Arabe|Russe|Néerlandais)"
    r"\s*\(", re.IGNORECASE
)
# A whole line of the form "Page <n> of <m>" (English wording, case-sensitive).
PAGE_MARKER = re.compile(r"^Page \d+ of \d+$")
# Loose phone matcher: optional "+", then a digit, 7-15 digits/whitespace/dashes/dots,
# and a final digit.
PHONE_RE = re.compile(r"(\+?\d[\d\s\-\.]{7,15}\d)")
# Simple e-mail matcher (local part, "@", domain with at least one dot).
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
# LinkedIn profile URL, with or without scheme and "www." prefix.
LINKEDIN_RE = re.compile(r"((?:https?://)?(?:www\.)?linkedin\.com/in/[\w-]+)")
# Lines that START with one of a fixed list of city/region names. Only a prefix
# match: anything may follow the place name.
LOCATION_LINE = re.compile(
    r"^(Paris|Lyon|Marseille|Toulouse|Nantes|Bordeaux|Lille|Strasbourg|London|Berlin|"
    r"New York|San Francisco|Remote|France|Région de Paris|Paris Area|Île-de-France)",
    re.IGNORECASE
)
# Section name -> pattern for the header line that opens it. Every pattern is
# anchored at both ends, so only a line consisting solely of the header text
# matches (English and French wordings, case-insensitive).
SECTION_HEADERS = {
    "experience": re.compile(r"^(Experience|Expérience)$", re.IGNORECASE),
    "education": re.compile(r"^(Education|Formation)$", re.IGNORECASE),
    "skills": re.compile(r"^(Skills|Compétences|Top Skills|Principales compétences)$", re.IGNORECASE),
    "languages": re.compile(r"^(Languages|Langues)$", re.IGNORECASE),
    "summary": re.compile(r"^(Summary|Résumé|About|À propos)$", re.IGNORECASE),
    "certifications": re.compile(r"^(Certifications|Licences|Licenses)$", re.IGNORECASE),
    "honors": re.compile(r"^(Honors|Distinctions|Honors & Awards)$", re.IGNORECASE),
    "volunteer": re.compile(r"^(Volunteer Experience|Bénévolat)$", re.IGNORECASE),
}
# Sections that end the experience block
# ("skills" and "languages" are deliberately absent: their headers can appear
# interleaved inside the experience block and are skipped there instead.)
END_EXPERIENCE_SECTIONS = {"education", "certifications", "honors", "volunteer"}
def _section_type(line: str) -> str | None:
    """Return the SECTION_HEADERS key whose pattern matches *line*, else None.

    The line is stripped before matching. Entries are tried in the dict's
    insertion order and the first match wins.
    """
    text = line.strip()
    return next(
        (key for key, pattern in SECTION_HEADERS.items() if pattern.match(text)),
        None,
    )
def _is_right_column_noise(line: str) -> bool:
    """Return True if *line* is sidebar/footer noise rather than experience text.

    Noise is any of:
      * a language line (matches LANGUAGE_LINE),
      * the literal labels "(LinkedIn)" or "(Mobile)",
      * a page marker (matches PAGE_MARKER).

    Duration-only lines such as "4 ans 3 mois" are intentionally NOT noise:
    they mark a multi-role company group and must reach the experience
    parser. They simply fall through to the final False; the previous
    explicit regex check for them returned the same value and was removed.
    """
    stripped = line.strip()
    if stripped in ("(LinkedIn)", "(Mobile)"):
        return True
    return bool(LANGUAGE_LINE.match(stripped) or PAGE_MARKER.match(stripped))
def _is_location_line(line: str) -> bool:
    """Return True when the stripped line begins with a place name from LOCATION_LINE."""
    return LOCATION_LINE.match(line.strip()) is not None
def parse_linkedin_heuristic(raw_text: str) -> Profile | None:
    """Parse LinkedIn PDF text into a Profile using heuristics.

    Steps:
      1. Locate section boundaries (Experience, Education, Skills, Summary).
      2. Parse the header (everything before the Experience header).
      3. Collect skills listed before the Experience header.
      4. Filter right-column noise out of the experience block and parse it.
      5. Parse the education block.

    Args:
        raw_text: Text extracted from the PDF, one layout line per text line.

    Returns:
        A Profile, or None when the text has fewer than 5 non-empty lines,
        has no Experience header, yields no name or no experiences, or fails
        the _is_coherent sanity check.
    """
    lines = [ln.strip() for ln in raw_text.split("\n") if ln.strip()]
    if len(lines) < 5:
        return None

    # --- Phase 1: Find section boundaries ---
    exp_start = None
    exp_end = None
    edu_start = None
    skills_start = None
    summary_start = None
    for i, line in enumerate(lines):
        sec = _section_type(line)
        if sec is None:
            continue
        if exp_start is None:
            # Before the Experience header only these three sections matter.
            if sec == "experience":
                exp_start = i + 1
            elif sec == "skills":
                skills_start = i + 1
            elif sec == "summary":
                summary_start = i + 1
            continue
        if sec in END_EXPERIENCE_SECTIONS:
            if exp_end is None:
                exp_end = i
            # Record Education even when another section (certifications,
            # honors, volunteer) already closed the experience block;
            # previously education was dropped in that case.
            if sec == "education" and edu_start is None:
                edu_start = i + 1

    if exp_start is None:
        return None
    if exp_end is None:
        exp_end = len(lines)

    # --- Phase 2: Parse header (everything before experience) ---
    header_end = exp_start - 1  # index of the Experience header line itself
    header_lines = lines[:header_end]
    summary_lines = lines[summary_start:header_end] if summary_start is not None else []
    name, email, phone, linkedin, location, summary = _parse_header(header_lines, summary_lines)

    # --- Phase 3: Parse skills (before experience) ---
    skills: list[str] = []
    if skills_start is not None:
        for line in lines[skills_start:header_end]:
            if _section_type(line):
                break  # next section header ends the skills list
            if not _is_right_column_noise(line):
                skills.append(line)

    # --- Phase 4: Parse experience block ---
    # Right-column content is interleaved with the experience lines: drop it,
    # keeping only genuine language lines for the profile's language list.
    languages: list[str] = []
    exp_clean: list[str] = []
    for line in lines[exp_start:exp_end]:
        if _section_type(line) in ("skills", "languages"):
            continue  # right-column section headers
        if PAGE_MARKER.match(line):
            continue
        if _is_right_column_noise(line):
            # Only language lines belong in `languages`; labels such as
            # "(LinkedIn)" / "(Mobile)" are noise and are just discarded.
            if LANGUAGE_LINE.match(line):
                languages.append(line)
            continue
        exp_clean.append(line)
    experiences = _parse_experiences(exp_clean)

    # --- Phase 5: Parse education ---
    education: list[Education] = []
    if edu_start is not None:
        edu_clean = []
        for line in lines[edu_start:]:
            sec = _section_type(line)
            if sec and sec != "education":
                break  # stop at the next section
            if not PAGE_MARKER.match(line):
                edu_clean.append(line)
        education = _parse_education(edu_clean)

    # Title = first experience title if available
    title = experiences[0].title if experiences else ""
    profile = Profile(
        name=name,
        title=title,
        location=location,
        email=email,
        phone=phone,
        linkedin=linkedin,
        summary=summary,
        experiences=experiences,
        education=education,
        skills=skills,
        languages=languages,
    )
    if profile.name and profile.experiences and _is_coherent(profile, raw_text):
        return profile
    return None
def _is_coherent(profile: Profile, raw_text: str) -> bool:
    """Sanity-check a heuristic parse; False means the result looks wrong.

    All of the following must hold:
      * the name has at least two whitespace-separated words;
      * at least 30% of the experiences have a non-empty company;
      * the experience count is at least half of max(1, len(raw_text) // 2000),
        using integer division;
      * the name occurs (case-insensitively) in raw_text.
    """
    if len(profile.name.split()) < 2:
        return False

    jobs = profile.experiences
    if jobs:
        with_company = [job for job in jobs if job.company]
        if len(with_company) / len(jobs) < 0.3:
            return False

    # One experience expected per 2000 characters; half of that is tolerated.
    minimum_jobs = max(1, len(raw_text) // 2000) // 2
    if len(jobs) < minimum_jobs:
        return False

    return profile.name.lower() in raw_text.lower()
def _parse_header(
    header_lines: list[str], summary_lines: list[str]
) -> tuple[str, str, str, str, str, str]:
    """Extract contact info from the lines preceding the Experience header.

    Args:
        header_lines: Every line before the Experience header.
        summary_lines: Lines from the Summary header up to the Experience
            header (may be empty).

    Returns:
        (name, email, phone, linkedin, location, summary); any field that is
        not found is an empty string.
    """
    all_text = "\n".join(header_lines)

    email_match = EMAIL_RE.search(all_text)
    email = email_match.group() if email_match else ""
    phone_match = PHONE_RE.search(all_text)
    phone = phone_match.group().strip() if phone_match else ""
    linkedin_match = LINKEDIN_RE.search(all_text)
    linkedin = linkedin_match.group() if linkedin_match else ""

    phone_separators = str.maketrans("", "", " +-.")
    name = ""
    location = ""
    for line in header_lines:
        stripped = line.strip()
        if stripped in ("(LinkedIn)", "(Mobile)") or PAGE_MARKER.match(stripped):
            continue
        if EMAIL_RE.search(stripped) or LINKEDIN_RE.search(stripped):
            continue

        # Drop a leading "Coordonnées " / "Contact " label fused onto the line.
        cleaned = stripped
        for prefix in ("Coordonnées ", "Contact "):
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):]

        # Phone-only line. A trailing parenthetical label such as "(Mobile)"
        # is ignored for this test; otherwise "06 12 34 56 78 (Mobile)" would
        # fail the digits-only check and could be taken as the person's name.
        without_label = re.sub(r"\s*\([^()]*\)\s*$", "", cleaned)
        if PHONE_RE.match(without_label) and without_label.translate(phone_separators).isdigit():
            continue

        # The first remaining short line is the name; after that, the first
        # line starting with a known place is the location.
        if not name and cleaned and len(cleaned) < 60:
            name = cleaned
        elif not location and _is_location_line(cleaned):
            location = cleaned

    summary = ""
    if summary_lines:
        summary = " ".join(
            ln.strip() for ln in summary_lines if ln.strip() and not _section_type(ln)
        )
    return name, email, phone, linkedin, location, summary
# Line starting with "<number> an|mois|year|month…", e.g. "4 ans 3 mois".
# The experience parser uses it as the marker of a multi-role company group
# and skips such lines when collecting bullets.
DUR_PATTERN = re.compile(r"^\d+\s+(an|mois|year|month)", re.IGNORECASE)
def _find_title_company_before_date(lines: list[str], date_idx: int) -> tuple[str, str, bool]:
    """Scan upwards from a date line to recover the role's company and title.

    At most the four lines directly above ``date_idx`` are inspected; the scan
    stops early at a previous date line. Location lines and bullet lines are
    ignored.

    Returns:
        (company, title, is_multi_role). ``is_multi_role`` is True when a
        duration line (DUR_PATTERN) with a line above it was found, i.e. the
        layout Company / Duration / Title / Date.
    """
    bullet_prefixes = ("- ", "* ", "• ", "· ")
    lowest = max(date_idx - 5, -1)  # exclusive lower bound of the scan

    # Gather candidates nearest-first, then flip them into document order.
    found = []
    for pos in range(date_idx - 1, lowest, -1):
        text = lines[pos].strip()
        if not text:
            continue
        if DATE_LINE.match(text):
            break  # reached the previous experience's date line
        if _is_location_line(text) or text.startswith(bullet_prefixes):
            continue  # bullets belong to the previous experience
        found.append(text)
    found.reverse()

    # The first duration line, if any, marks a multi-role group.
    dur_pos = next(
        (k for k, item in enumerate(found) if DUR_PATTERN.match(item)),
        None,
    )
    if dur_pos is not None and dur_pos > 0:
        after_duration = found[dur_pos + 1:]
        role = after_duration[0] if after_duration else ""
        return found[dur_pos - 1], role, True

    if len(found) >= 2:
        return found[-2], found[-1], False
    if found:
        return found[0], "", False
    return "", "", False
def _parse_experiences(lines: list[str]) -> list[Experience]:
    """Two-pass experience parser.

    Pass 1 finds every date line and looks backwards for company/title.
    Pass 2 collects the lines between consecutive date lines as bullets.
    Consecutive entries at the same company are merged at the end.

    Args:
        lines: Experience-block lines with right-column noise already removed.

    Returns:
        The parsed experiences, in document order, after same-company merging.
    """
    # Every marker is exactly two characters: a symbol plus one space.
    bullet_markers = ("- ", "* ", "• ", "· ")

    # Pass 1: find all date lines and extract company/title by looking backwards
    entries: list[dict] = []
    for i, line in enumerate(lines):
        text = line.strip()
        if not DATE_LINE.match(text):
            continue
        company, title, is_multi = _find_title_company_before_date(lines, i)
        entries.append({
            "idx": i,
            "company": company,
            "title": title,
            "is_multi": is_multi,
            # Drop the trailing "(3 ans 2 mois)"-style duration.
            "dates": re.sub(r"\s*\(.*?\)\s*$", "", text),
        })

    # Propagate the group company across multi-role entries.
    # When LinkedIn shows several roles at one company, only the first has
    # Company + Duration; later roles show just Title + Date, so the lookback
    # reports the title as a lone "company".
    group_company = None
    for entry in entries:
        if entry["is_multi"]:
            group_company = entry["company"]
        elif group_company:
            if entry["company"] and not entry["title"]:
                # Lone candidate: it is really the title of a follow-up role.
                entry["title"] = entry["company"]
                entry["company"] = group_company
            else:
                group_company = None

    # Company/title strings must not be re-collected as bullets.
    meta_lines = {
        entry[key] for entry in entries for key in ("company", "title") if entry[key]
    }

    # Pass 2: collect bullets between consecutive dates
    experiences: list[Experience] = []
    for di, entry in enumerate(entries):
        start = entry["idx"] + 1
        end = entries[di + 1]["idx"] if di + 1 < len(entries) else len(lines)
        bullets = []
        for raw in lines[start:end]:
            line = raw.strip()
            if not line or _is_location_line(line) or DUR_PATTERN.match(line):
                continue
            if line in meta_lines:
                continue
            if line.startswith(bullet_markers):
                # Remove only the two-character marker. lstrip("-*•· ") strips
                # a character SET and would also eat leading content such as
                # the minus sign in "• -15% churn".
                bullets.append(line[2:].strip())
            elif len(line) > 10:
                bullets.append(line)  # unmarked description text
        experiences.append(Experience(
            title=entry["title"],
            company=entry["company"],
            dates=entry["dates"],
            description=" ".join(bullets[:3]),
            bullets=bullets,
        ))
    return _merge_same_company(experiences)
def _merge_same_company(experiences: list[Experience]) -> list[Experience]:
    """Merge consecutive experiences at the same company into a single entry.

    Companies are compared case-insensitively; entries with an empty company
    are never merged. For a merged entry:
      * title: titles joined with " → " (a repeated title is not re-added);
      * dates: "<later entry's dates> / <earlier entry's dates>" in list
        order, or the single value when they are equal;
      * description: taken from the first entry of the run;
      * bullets: each role's bullets preceded by a "[<title>]" header line.

    Fixes over the previous version: merging a third (or later) role no
    longer puts a "[A → B]" header in front of bullets that already carry
    per-role headers, and empty titles no longer yield " → B" or "[]".
    """
    if not experiences:
        return experiences

    merged: list[Experience] = []
    last_is_merged = False  # True while merged[-1] already has per-role headers
    for exp in experiences:
        prev = merged[-1] if merged else None
        same_company = (
            prev is not None
            and bool(exp.company)
            and bool(prev.company)
            and exp.company.lower() == prev.company.lower()
        )
        if not same_company:
            merged.append(exp)
            last_is_merged = False
            continue

        # Combine titles
        if exp.title and exp.title.lower() != prev.title.lower():
            combined_title = f"{prev.title} → {exp.title}" if prev.title else exp.title
        else:
            combined_title = prev.title

        # Combine dates (plain concatenation; no date arithmetic is done)
        combined_dates = f"{exp.dates} / {prev.dates}" if exp.dates != prev.dates else prev.dates

        # Combine bullets, each role's bullets prefixed with its title
        combined_bullets = []
        if prev.bullets:
            # An already merged entry carries its own headers; don't add one.
            if not last_is_merged and prev.title:
                combined_bullets.append(f"[{prev.title}]")
            combined_bullets.extend(prev.bullets)
        if exp.bullets:
            if exp.title:
                combined_bullets.append(f"[{exp.title}]")
            combined_bullets.extend(exp.bullets)

        merged[-1] = Experience(
            title=combined_title,
            company=prev.company,
            dates=combined_dates,
            description=prev.description,
            bullets=combined_bullets,
        )
        last_is_merged = True
    return merged
def _build_exp(data: dict) -> Experience:
    """Build an Experience from a dict with optional title/company/dates/bullets keys.

    Missing text fields default to "", missing bullets to an empty list. The
    description is the first three bullets joined with spaces.
    """
    text_fields = {key: data.get(key, "") for key in ("title", "company", "dates")}
    bullets = data.get("bullets", [])
    summary = ""
    if bullets:
        summary = " ".join(bullets[:3])
    return Experience(description=summary, bullets=bullets, **text_fields)
def _parse_education(lines: list[str]) -> list[Education]:
    """Parse education lines laid out as School / "Degree, Field · (2011 - 2015)".

    Lines are consumed in pairs: the first is the school, the second the
    degree line. A parenthesised year or year range on the degree line is
    split off into ``year``; without one the whole line is the degree. A
    trailing unpaired line yields a school with empty degree and year.
    """
    year_in_parens = r"\((\d{4}\s*[-–]\s*\d{4}|\d{4})\)"
    total = len(lines)
    entries: list[Education] = []
    for pos in range(0, total, 2):
        school = lines[pos].strip()
        degree = ""
        year = ""
        if pos + 1 < total:
            detail = lines[pos + 1].strip()
            found = re.search(year_in_parens, detail)
            if found is None:
                degree = detail
            else:
                year = found.group(1)
                degree = detail[:found.start()].rstrip(" ·,")
        if school:
            entries.append(Education(school=school, degree=degree, year=year))
    return entries