bored-cv-api / app /services /linkedin_parser.py
Aramente's picture
feat: two-pass LinkedIn parser handles multi-role companies correctly
ce64eb7
"""Heuristic parser for LinkedIn PDF exports. Zero LLM calls, instant results.
LinkedIn PDFs use a two-column layout where the right column (skills, languages)
is interleaved line-by-line with the left column (experience). We handle this
by dumping everything between Experience and Education into one stream, then
filtering out right-column noise using known patterns.
"""
import re
from app.models import Education, Experience, Profile
# --- Patterns ---
DATE_LINE = re.compile(
r"^((?:jan|fév|fé|mar|avr|mai|jun|jui|aoû|ao|sep|oct|nov|déc|"
r"january|february|march|april|may|june|july|august|september|october|november|december|"
r"janvier|février|mars|avril|juin|juillet|août|septembre|octobre|novembre|décembre)"
r"\w*\.?\s+\d{4})"
r"\s*[-–—]\s*"
r"((?:jan|fév|fé|mar|avr|mai|jun|jui|aoû|ao|sep|oct|nov|déc|"
r"january|february|march|april|may|june|july|august|september|october|november|december|"
r"janvier|février|mars|avril|juin|juillet|août|septembre|octobre|novembre|décembre)"
r"\w*\.?\s+\d{4}|present|présent|aujourd'hui|current|now)"
r"\s*(\(.*\))?",
re.IGNORECASE,
)
# Right-column noise patterns
LANGUAGE_LINE = re.compile(
r"^(French|English|Spanish|German|Italian|Portuguese|Chinese|Japanese|Korean|Arabic|Russian|Dutch|Hindi|Mandarin|Cantonese|"
r"Français|Anglais|Espagnol|Allemand|Italien|Portugais|Chinois|Japonais|Coréen|Arabe|Russe|Néerlandais)"
r"\s*\(", re.IGNORECASE
)
PAGE_MARKER = re.compile(r"^Page \d+ of \d+$")
PHONE_RE = re.compile(r"(\+?\d[\d\s\-\.]{7,15}\d)")
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
LINKEDIN_RE = re.compile(r"((?:https?://)?(?:www\.)?linkedin\.com/in/[\w-]+)")
LOCATION_LINE = re.compile(
r"^(Paris|Lyon|Marseille|Toulouse|Nantes|Bordeaux|Lille|Strasbourg|London|Berlin|"
r"New York|San Francisco|Remote|France|Région de Paris|Paris Area|Île-de-France)",
re.IGNORECASE
)
SECTION_HEADERS = {
"experience": re.compile(r"^(Experience|Expérience)$", re.IGNORECASE),
"education": re.compile(r"^(Education|Formation)$", re.IGNORECASE),
"skills": re.compile(r"^(Skills|Compétences|Top Skills|Principales compétences)$", re.IGNORECASE),
"languages": re.compile(r"^(Languages|Langues)$", re.IGNORECASE),
"summary": re.compile(r"^(Summary|Résumé|About|À propos)$", re.IGNORECASE),
"certifications": re.compile(r"^(Certifications|Licences|Licenses)$", re.IGNORECASE),
"honors": re.compile(r"^(Honors|Distinctions|Honors & Awards)$", re.IGNORECASE),
"volunteer": re.compile(r"^(Volunteer Experience|Bénévolat)$", re.IGNORECASE),
}
# Sections that end the experience block
END_EXPERIENCE_SECTIONS = {"education", "certifications", "honors", "volunteer"}
def _section_type(line: str) -> str | None:
for name, pat in SECTION_HEADERS.items():
if pat.match(line.strip()):
return name
return None
def _is_right_column_noise(line: str) -> bool:
"""Check if a line is right-column noise (language proficiency, etc.)."""
stripped = line.strip()
if LANGUAGE_LINE.match(stripped):
return True
if stripped in ("(LinkedIn)", "(Mobile)"):
return True
if PAGE_MARKER.match(stripped):
return True
# Duration-only line like "4 ans 3 mois" (multi-role company header)
if re.match(r"^\d+\s+(an|mois|year|month)", stripped, re.IGNORECASE):
return False # Not noise — it's a company group marker
return False
def _is_location_line(line: str) -> bool:
return bool(LOCATION_LINE.match(line.strip()))
def parse_linkedin_heuristic(raw_text: str) -> Profile | None:
"""Parse LinkedIn PDF text into a Profile using heuristics.
Returns None if it doesn't look like a LinkedIn PDF.
"""
lines = [l.strip() for l in raw_text.split("\n") if l.strip()]
if len(lines) < 5:
return None
# --- Phase 1: Find section boundaries ---
exp_start = None
exp_end = None
edu_start = None
skills_start = None
summary_start = None
for i, line in enumerate(lines):
sec = _section_type(line)
if sec == "experience" and exp_start is None:
exp_start = i + 1
elif sec in END_EXPERIENCE_SECTIONS and exp_start is not None and exp_end is None:
exp_end = i
if sec == "education":
edu_start = i + 1
elif sec == "skills" and exp_start is None:
skills_start = i + 1
elif sec == "summary" and exp_start is None:
summary_start = i + 1
if exp_start is None:
return None
if exp_end is None:
exp_end = len(lines)
# --- Phase 2: Parse header (everything before experience) ---
header_lines = lines[:exp_start - 1]
name, email, phone, linkedin, location, summary = _parse_header(
header_lines, lines[summary_start:exp_start - 1] if summary_start else []
)
# --- Phase 3: Parse skills (before experience) ---
skills: list[str] = []
if skills_start:
for line in lines[skills_start:exp_start - 1]:
sec = _section_type(line)
if sec:
break
stripped = line.strip()
if stripped and not _is_right_column_noise(stripped):
skills.append(stripped)
# --- Phase 4: Parse experience block ---
# Filter right-column noise, then extract experiences
exp_raw = lines[exp_start:exp_end]
# Collect languages from interleaved right-column
languages: list[str] = []
exp_clean: list[str] = []
for line in exp_raw:
stripped = line.strip()
sec = _section_type(stripped)
if sec in ("skills", "languages"):
continue # Skip right-column section headers
if PAGE_MARKER.match(stripped):
continue
if _is_right_column_noise(stripped):
languages.append(stripped)
continue
exp_clean.append(stripped)
experiences = _parse_experiences(exp_clean)
# --- Phase 5: Parse education ---
education: list[Education] = []
if edu_start:
edu_lines = lines[edu_start:]
# Stop at next section or end
edu_clean = []
for line in edu_lines:
sec = _section_type(line)
if sec and sec != "education":
break
if not PAGE_MARKER.match(line.strip()):
edu_clean.append(line.strip())
education = _parse_education(edu_clean)
# Title = first experience title if available
title = experiences[0].title if experiences else ""
profile = Profile(
name=name,
title=title,
location=location,
email=email,
phone=phone,
linkedin=linkedin,
summary=summary,
experiences=experiences,
education=education,
skills=skills,
languages=languages,
)
if profile.name and len(profile.experiences) > 0 and _is_coherent(profile, raw_text):
return profile
return None
def _is_coherent(profile: Profile, raw_text: str) -> bool:
"""Quick sanity check — if the parse looks off, return False to trigger LLM fallback."""
# Name should be 2+ words (not a section header or label)
if len(profile.name.split()) < 2:
return False
# At least 30% of experiences should have a company name
if profile.experiences:
with_company = sum(1 for e in profile.experiences if e.company)
if with_company / len(profile.experiences) < 0.3:
return False
# Experiences count should be plausible relative to PDF size
# A typical LinkedIn PDF has ~500 chars per experience
expected_min = max(1, len(raw_text) // 2000)
if len(profile.experiences) < expected_min // 2:
return False
# Name should appear somewhere in the raw text
if profile.name.lower() not in raw_text.lower():
return False
return True
def _parse_header(header_lines: list[str], summary_lines: list[str]) -> tuple:
"""Extract contact info from header."""
name = ""
email = ""
phone = ""
linkedin = ""
location = ""
summary = ""
all_text = "\n".join(header_lines)
m = EMAIL_RE.search(all_text)
if m:
email = m.group()
m = PHONE_RE.search(all_text)
if m:
phone = m.group().strip()
m = LINKEDIN_RE.search(all_text)
if m:
linkedin = m.group()
for line in header_lines:
stripped = line.strip()
if stripped in ("(LinkedIn)", "(Mobile)") or PAGE_MARKER.match(stripped):
continue
if EMAIL_RE.search(stripped) or LINKEDIN_RE.search(stripped):
continue
# Clean "Coordonnées" prefix
cleaned = stripped
for prefix in ("Coordonnées ", "Contact "):
if cleaned.startswith(prefix):
cleaned = cleaned[len(prefix):]
# Phone-only line
if PHONE_RE.match(cleaned) and cleaned.replace(" ", "").replace("+", "").replace("-", "").replace(".", "").isdigit():
continue
if not name and cleaned and len(cleaned) < 60:
name = cleaned
elif not location and _is_location_line(cleaned):
location = cleaned
if summary_lines:
summary = " ".join(l.strip() for l in summary_lines if l.strip() and not _section_type(l))
return name, email, phone, linkedin, location, summary
DUR_PATTERN = re.compile(r"^\d+\s+(an|mois|year|month)", re.IGNORECASE)
def _find_title_company_before_date(lines: list[str], date_idx: int) -> tuple[str, str, bool]:
"""Look backwards from a date line to find the title, company, and whether this is a multi-role group.
Returns (company, title, is_multi_role).
"""
# Collect non-location, non-bullet lines going backwards from the date
candidates = []
for i in range(date_idx - 1, max(date_idx - 5, -1), -1):
line = lines[i].strip()
if not line:
continue
if DATE_LINE.match(line):
break # Hit previous date — stop
if _is_location_line(line):
continue
if line.startswith(("- ", "* ", "• ", "· ")):
continue # Skip bullets — they belong to the previous experience
candidates.insert(0, line)
# Check for duration line (multi-role indicator)
dur_idx = None
for ci, c in enumerate(candidates):
if DUR_PATTERN.match(c):
dur_idx = ci
break
if dur_idx is not None and dur_idx > 0:
# Multi-role: Company / Duration / Title / Date
company = candidates[dur_idx - 1]
title_candidates = candidates[dur_idx + 1:]
title = title_candidates[0] if title_candidates else ""
return company, title, True
if len(candidates) >= 2:
return candidates[-2], candidates[-1], False
if len(candidates) == 1:
return candidates[0], "", False
return "", "", False
def _parse_experiences(lines: list[str]) -> list[Experience]:
"""Two-pass parser: find dates first, then look backwards for metadata and forwards for bullets."""
# Pass 1: Find all date lines and extract company/title by looking backwards
entries: list[dict] = []
for i, line in enumerate(lines):
if DATE_LINE.match(line.strip()):
company, title, is_multi = _find_title_company_before_date(lines, i)
entries.append({
"idx": i,
"company": company,
"title": title,
"is_multi": is_multi,
"dates": re.sub(r"\s*\(.*?\)\s*$", "", line.strip()),
})
# Propagate group_company for multi-role entries
# When LinkedIn shows multiple roles at one company, only the first has
# Company + Duration. Subsequent roles only show Title + Date.
# The lookback will mistake the title for a company (single candidate).
group_company = None
for e in entries:
if e["is_multi"]:
group_company = e["company"]
elif group_company:
# The detected "company" is likely a job title (only 1 candidate in lookback)
# Swap: what we thought was company is actually the title
if e["company"] and not e["title"]:
e["title"] = e["company"]
e["company"] = group_company
else:
group_company = None
# Pass 2: Collect bullets between consecutive dates
# Build a set of "metadata lines" (company/title) to exclude from bullets
meta_lines = set()
for e in entries:
if e["company"]:
meta_lines.add(e["company"])
if e["title"]:
meta_lines.add(e["title"])
experiences: list[Experience] = []
for di, e in enumerate(entries):
start = e["idx"] + 1
end = entries[di + 1]["idx"] if di + 1 < len(entries) else len(lines)
bullets = []
for i in range(start, end):
line = lines[i].strip()
if not line or _is_location_line(line) or DUR_PATTERN.match(line):
continue
if line in meta_lines:
continue
if line.startswith(("- ", "* ", "• ", "· ")):
bullets.append(line.lstrip("-*•· ").strip())
elif len(line) > 10:
bullets.append(line)
experiences.append(Experience(
title=e["title"],
company=e["company"],
dates=e["dates"],
description=" ".join(bullets[:3]),
bullets=bullets,
))
return _merge_same_company(experiences)
def _merge_same_company(experiences: list[Experience]) -> list[Experience]:
"""Merge consecutive experiences at the same company into a single entry with combined bullets."""
if not experiences:
return experiences
merged: list[Experience] = []
for exp in experiences:
if (merged
and exp.company
and merged[-1].company
and exp.company.lower() == merged[-1].company.lower()):
# Same company — merge into previous
prev = merged[-1]
# Combine titles
if exp.title and exp.title.lower() != prev.title.lower():
combined_title = f"{prev.title}{exp.title}"
else:
combined_title = prev.title
# Combine dates (earliest start - latest end)
combined_dates = f"{exp.dates} / {prev.dates}" if exp.dates != prev.dates else prev.dates
# Combine bullets, prefixed with role title
combined_bullets = []
if prev.bullets:
combined_bullets.append(f"[{prev.title}]")
combined_bullets.extend(prev.bullets)
if exp.bullets:
combined_bullets.append(f"[{exp.title}]")
combined_bullets.extend(exp.bullets)
merged[-1] = Experience(
title=combined_title,
company=prev.company,
dates=combined_dates,
description=prev.description,
bullets=combined_bullets if combined_bullets else prev.bullets + exp.bullets,
)
else:
merged.append(exp)
return merged
def _build_exp(data: dict) -> Experience:
bullets = data.get("bullets", [])
desc = " ".join(bullets[:3]) if bullets else ""
return Experience(
title=data.get("title", ""),
company=data.get("company", ""),
dates=data.get("dates", ""),
description=desc,
bullets=bullets,
)
def _parse_education(lines: list[str]) -> list[Education]:
"""Parse education lines. Format: School / Degree, Field · (2011 - 2015)"""
entries: list[Education] = []
i = 0
while i < len(lines):
school = lines[i].strip()
degree = ""
year = ""
if i + 1 < len(lines):
next_line = lines[i + 1].strip()
year_match = re.search(r"\((\d{4}\s*[-–]\s*\d{4}|\d{4})\)", next_line)
if year_match:
year = year_match.group(1)
degree = next_line[:year_match.start()].rstrip(" ·,")
i += 2
else:
degree = next_line
i += 2
else:
i += 1
if school:
entries.append(Education(school=school, degree=degree, year=year))
return entries