Spaces:

Aramente
/

bored-cv-api

Running

App Files Files Community

bored-cv-api / app /services /linkedin_parser.py

Aramente

feat: two-pass LinkedIn parser handles multi-role companies correctly

ce64eb7 about 1 month ago

raw

history blame contribute delete

16.4 kB

	"""Heuristic parser for LinkedIn PDF exports. Zero LLM calls, instant results.

	LinkedIn PDFs use a two-column layout where the right column (skills, languages)
	is interleaved line-by-line with the left column (experience). We handle this
	by dumping everything between Experience and Education into one stream, then
	filtering out right-column noise using known patterns.
	"""

	import re
	from app.models import Education, Experience, Profile


	# --- Patterns ---

	# One month token: French/English abbreviations plus full month names.
	# Longer spellings ("févr.", "sept.") are absorbed by the \w*\.? that
	# follows the token inside DATE_LINE.
	_MONTH = (
	    r"(?:jan|fév|fé|mar|avr|mai|jun|jui|aoû|ao|sep|oct|nov|déc|"
	    r"january|february|march|april|may|june|july|august|september|october|november|december|"
	    r"janvier|février|mars|avril|juin|juillet|août|septembre|octobre|novembre|décembre)"
	)

	# Matches an experience date range such as
	#   "janvier 2020 - mars 2022 (2 ans 3 mois)"  or  "May 2019 - Present".
	# group(1): start "<month> <year>"
	# group(2): end "<month> <year>" or a "present" keyword
	# group(3): optional trailing parenthesised duration, parentheses included
	DATE_LINE = re.compile(
	    "".join((
	        r"^(", _MONTH, r"\w*\.?\s+\d{4})",
	        r"\s*[-–—]\s*",
	        r"(", _MONTH, r"\w*\.?\s+\d{4}|present|présent|aujourd['’]hui|current|now)",
	        r"\s*(\(.*\))?",
	    )),
	    re.IGNORECASE,
	)

	# Right-column noise patterns
	# A language-proficiency line from the right column: a language name
	# followed by an opening parenthesis, e.g. "Français (Native or Bilingual)".
	LANGUAGE_LINE = re.compile(
	    r"^(French|English|Spanish|German|Italian|Portuguese|Chinese|Japanese|Korean|Arabic|Russian|Dutch|Hindi|Mandarin|Cantonese|"
	    r"Français|Anglais|Espagnol|Allemand|Italien|Portugais|Chinois|Japonais|Coréen|Arabe|Russe|Néerlandais)"
	    r"\s*\(",
	    re.IGNORECASE,
	)
	# PDF pagination footer, e.g. "Page 2 of 5".
	PAGE_MARKER = re.compile(r"^Page \d+ of \d+$")
	# Contact details searched for in the header block.
	PHONE_RE = re.compile(r"(\+?\d[\d\s\-\.]{7,15}\d)")
	EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
	LINKEDIN_RE = re.compile(r"((?:https?://)?(?:www\.)?linkedin\.com/in/[\w-]+)")
	# A line that *starts with* a known place name. The trailing \b keeps words
	# that merely begin with a place name ("Parisian ...") from being treated
	# as locations.
	LOCATION_LINE = re.compile(
	    r"^(Paris|Lyon|Marseille|Toulouse|Nantes|Bordeaux|Lille|Strasbourg|London|Berlin|"
	    r"New York|San Francisco|Remote|France|Région de Paris|Paris Area|Île-de-France)\b",
	    re.IGNORECASE,
	)

	# Section name -> pattern matching the *entire* header line, in English or
	# French, case-insensitively.
	SECTION_HEADERS = {
	    "experience": re.compile(r"^(Experience|Expérience)$", re.IGNORECASE),
	    "education": re.compile(r"^(Education|Formation)$", re.IGNORECASE),
	    "skills": re.compile(r"^(Skills|Compétences|Top Skills|Principales compétences)$", re.IGNORECASE),
	    "languages": re.compile(r"^(Languages|Langues)$", re.IGNORECASE),
	    "summary": re.compile(r"^(Summary|Résumé|About|À propos)$", re.IGNORECASE),
	    "certifications": re.compile(r"^(Certifications|Licences|Licenses)$", re.IGNORECASE),
	    "honors": re.compile(r"^(Honors|Distinctions|Honors & Awards)$", re.IGNORECASE),
	    "volunteer": re.compile(r"^(Volunteer Experience|Bénévolat)$", re.IGNORECASE),
	}

	# Sections that end the experience block
	END_EXPERIENCE_SECTIONS = {"education", "certifications", "honors", "volunteer"}


	def _section_type(line: str) -> str | None:
	    """Return the SECTION_HEADERS key whose pattern matches *line*, or None.

	    The line is stripped before matching; the patterns are anchored, so only
	    a line consisting solely of a header label is recognised.
	    """
	    stripped = line.strip()
	    for name, pat in SECTION_HEADERS.items():
	        if pat.match(stripped):
	            return name
	    return None


	def _is_right_column_noise(line: str) -> bool:
	    """Check if a line is right-column noise (language proficiency, etc.).

	    Noise means: a language-proficiency line, the literal "(LinkedIn)" or
	    "(Mobile)" markers, or a "Page X of Y" footer.
	    """
	    stripped = line.strip()
	    if LANGUAGE_LINE.match(stripped):
	        return True
	    if stripped in ("(LinkedIn)", "(Mobile)"):
	        return True
	    # Duration-only lines like "4 ans 3 mois" are deliberately NOT treated as
	    # noise: they are the company-group marker for multi-role entries, so
	    # they fall through to the final result like any other content line.
	    return bool(PAGE_MARKER.match(stripped))


	def _is_location_line(line: str) -> bool:
	    """Tell whether the stripped line begins with a place name known to LOCATION_LINE."""
	    return LOCATION_LINE.match(line.strip()) is not None


	def parse_linkedin_heuristic(raw_text: str) -> Profile | None:
	    """Parse LinkedIn PDF text into a Profile using heuristics.

	    Args:
	        raw_text: Text extracted from the PDF, one layout line per "\\n".

	    Returns:
	        A Profile, or None if it doesn't look like a LinkedIn PDF: fewer than
	        five non-empty lines, no Experience header, no name, no experience
	        found, or a result that fails the `_is_coherent` sanity check.
	    """
	    lines = [raw_line.strip() for raw_line in raw_text.split("\n") if raw_line.strip()]
	    if len(lines) < 5:
	        return None

	    # --- Phase 1: Find section boundaries ---
	    exp_start = None
	    exp_end = None
	    edu_start = None
	    skills_start = None
	    summary_start = None

	    for i, line in enumerate(lines):
	        sec = _section_type(line)
	        if sec is None:
	            continue
	        if exp_start is None:
	            # Before the Experience header only these three sections matter.
	            if sec == "experience":
	                exp_start = i + 1
	            elif sec == "skills":
	                skills_start = i + 1
	            elif sec == "summary":
	                summary_start = i + 1
	            continue
	        if sec in END_EXPERIENCE_SECTIONS and exp_end is None:
	            exp_end = i
	        # Tracked independently of exp_end: Education may come after another
	        # closing section (Certifications, Honors, Volunteer) and must still
	        # be found.
	        if sec == "education" and edu_start is None:
	            edu_start = i + 1

	    if exp_start is None:
	        return None

	    if exp_end is None:
	        exp_end = len(lines)

	    # --- Phase 2: Parse header (everything before experience) ---
	    header_end = exp_start - 1  # index of the Experience header line itself
	    header_lines = lines[:header_end]
	    summary_lines = lines[summary_start:header_end] if summary_start is not None else []
	    name, email, phone, linkedin, location, summary = _parse_header(header_lines, summary_lines)

	    # --- Phase 3: Parse skills (before experience) ---
	    skills: list[str] = []
	    if skills_start is not None:
	        for line in lines[skills_start:header_end]:
	            if _section_type(line):
	                break
	            if not _is_right_column_noise(line):
	                skills.append(line)

	    # --- Phase 4: Parse experience block ---
	    # Filter right-column noise, then extract experiences. Language lines
	    # interleaved from the right column are collected along the way.
	    languages: list[str] = []
	    exp_clean: list[str] = []

	    for line in lines[exp_start:exp_end]:
	        if _section_type(line) in ("skills", "languages"):
	            continue  # Skip right-column section headers
	        if PAGE_MARKER.match(line):
	            continue
	        if _is_right_column_noise(line):
	            # Only real language lines belong in `languages`; other noise
	            # such as "(LinkedIn)" / "(Mobile)" is simply dropped.
	            if LANGUAGE_LINE.match(line):
	                languages.append(line)
	            continue
	        exp_clean.append(line)

	    experiences = _parse_experiences(exp_clean)

	    # --- Phase 5: Parse education ---
	    education: list[Education] = []
	    if edu_start is not None:
	        edu_clean = []
	        for line in lines[edu_start:]:
	            sec = _section_type(line)
	            if sec and sec != "education":
	                break  # Stop at next section
	            if not PAGE_MARKER.match(line):
	                edu_clean.append(line)
	        education = _parse_education(edu_clean)

	    # Title = first experience title if available
	    title = experiences[0].title if experiences else ""

	    profile = Profile(
	        name=name,
	        title=title,
	        location=location,
	        email=email,
	        phone=phone,
	        linkedin=linkedin,
	        summary=summary,
	        experiences=experiences,
	        education=education,
	        skills=skills,
	        languages=languages,
	    )

	    if profile.name and len(profile.experiences) > 0 and _is_coherent(profile, raw_text):
	        return profile
	    return None


	def _is_coherent(profile: Profile, raw_text: str) -> bool:
	    """Sanity-check a parsed profile; False means the parse looks unreliable.

	    Every one of these must hold:
	    * the name has at least two whitespace-separated words;
	    * at least 30% of the experiences carry a company name;
	    * the experience count is not below half of ``max(1, len(raw_text) // 2000)``;
	    * the name occurs (case-insensitively) in the raw text.
	    """
	    name_words = profile.name.split()
	    if len(name_words) < 2:
	        return False

	    experiences = profile.experiences
	    if experiences:
	        named = [entry for entry in experiences if entry.company]
	        if len(named) / len(experiences) < 0.3:
	            return False

	    # Plausibility floor derived from the size of the source text.
	    floor = max(1, len(raw_text) // 2000) // 2
	    if len(experiences) < floor:
	        return False

	    return profile.name.lower() in raw_text.lower()


	def _parse_header(header_lines: list[str], summary_lines: list[str]) -> tuple:
	    """Pull contact details, name, location and summary out of the header block.

	    Returns a 6-tuple ``(name, email, phone, linkedin, location, summary)``;
	    any field that is not found is an empty string.
	    """
	    header_text = "\n".join(header_lines)

	    email_hit = EMAIL_RE.search(header_text)
	    phone_hit = PHONE_RE.search(header_text)
	    linkedin_hit = LINKEDIN_RE.search(header_text)
	    email = email_hit.group() if email_hit else ""
	    phone = phone_hit.group().strip() if phone_hit else ""
	    linkedin = linkedin_hit.group() if linkedin_hit else ""

	    name = ""
	    location = ""
	    # Characters ignored when deciding whether a line is nothing but a phone number.
	    phone_punctuation = str.maketrans("", "", " +-.")
	    for raw in header_lines:
	        text = raw.strip()
	        if text in ("(LinkedIn)", "(Mobile)") or PAGE_MARKER.match(text):
	            continue
	        if EMAIL_RE.search(text) or LINKEDIN_RE.search(text):
	            continue

	        # Drop a leading contact-section label that shares the line.
	        text = text.removeprefix("Coordonnées ").removeprefix("Contact ")

	        # Phone-only line
	        if PHONE_RE.match(text) and text.translate(phone_punctuation).isdigit():
	            continue

	        # First short remaining line is taken as the name; after that, the
	        # first line recognised as a location.
	        if not name and text and len(text) < 60:
	            name = text
	        elif not location and _is_location_line(text):
	            location = text

	    summary = ""
	    if summary_lines:
	        summary = " ".join(
	            part.strip()
	            for part in summary_lines
	            if part.strip() and not _section_type(part)
	        )

	    return name, email, phone, linkedin, location, summary


	# A line that starts with a duration ("4 ans 3 mois", "1 year 2 months").
	# On its own line it marks the header of a multi-role company group.
	DUR_PATTERN = re.compile(r"^\d+\s+(an|mois|year|month)", re.IGNORECASE)


	def _find_title_company_before_date(lines: list[str], date_idx: int) -> tuple[str, str, bool]:
	    """Look backwards from a date line to find the company, title, and whether this is a multi-role group.

	    At most the four lines before ``lines[date_idx]`` are inspected; the scan
	    stops early at a previous date line. Location lines and bullet lines are
	    ignored.

	    Args:
	        lines: Cleaned experience-block lines.
	        date_idx: Index in *lines* of the date line being resolved.

	    Returns:
	        ``(company, title, is_multi_role)``. Note the order: company first.
	        Unknown parts are empty strings.
	    """
	    # Collect non-location, non-bullet lines going backwards from the date
	    candidates: list[str] = []
	    for i in range(date_idx - 1, max(date_idx - 5, -1), -1):
	        line = lines[i].strip()
	        if not line:
	            continue
	        if DATE_LINE.match(line):
	            break  # Hit previous date — stop
	        if _is_location_line(line):
	            continue
	        if line.startswith(("- ", "* ", "• ", "· ")):
	            continue  # Skip bullets — they belong to the previous experience
	        candidates.append(line)
	    candidates.reverse()  # restore document order

	    # Check for duration line (multi-role indicator)
	    dur_idx = next(
	        (ci for ci, c in enumerate(candidates) if DUR_PATTERN.match(c)),
	        None,
	    )

	    if dur_idx is not None and dur_idx > 0:
	        # Multi-role: Company / Duration / Title / Date
	        after_duration = candidates[dur_idx + 1:]
	        title = after_duration[0] if after_duration else ""
	        return candidates[dur_idx - 1], title, True

	    if dur_idx == 0:
	        # The company line above the duration marker is missing (outside the
	        # window, or filtered out). The marker itself is never a company or a
	        # title, so drop it rather than let the fallback below return it.
	        candidates = candidates[1:]
	        if len(candidates) < 2:
	            # Duration / Title / Date with an unknown company.
	            return "", (candidates[0] if candidates else ""), True

	    if len(candidates) >= 2:
	        return candidates[-2], candidates[-1], False
	    if len(candidates) == 1:
	        return candidates[0], "", False
	    return "", "", False


	def _parse_experiences(lines: list[str]) -> list[Experience]:
	    """Two-pass parser: find dates first, then look backwards for metadata and forwards for bullets.

	    Args:
	        lines: Experience-block lines with right-column noise already removed.

	    Returns:
	        One Experience per date line, in document order, after consecutive
	        entries at the same company have been merged by `_merge_same_company`.
	    """
	    bullet_prefixes = ("- ", "* ", "• ", "· ")

	    # Pass 1: Find all date lines and extract company/title by looking backwards
	    entries: list[dict] = []
	    for i, line in enumerate(lines):
	        stripped = line.strip()
	        if not DATE_LINE.match(stripped):
	            continue
	        company, title, is_multi = _find_title_company_before_date(lines, i)
	        entries.append({
	            "idx": i,
	            "company": company,
	            "title": title,
	            "is_multi": is_multi,
	            # Keep the date range only: drop the trailing "(2 ans 3 mois)".
	            "dates": re.sub(r"\s*\(.*?\)\s*$", "", stripped),
	        })

	    # Propagate group_company for multi-role entries
	    # When LinkedIn shows multiple roles at one company, only the first has
	    # Company + Duration. Subsequent roles only show Title + Date.
	    # The lookback will mistake the title for a company (single candidate).
	    group_company = None
	    for e in entries:
	        if e["is_multi"]:
	            group_company = e["company"]
	        elif group_company:
	            if e["company"] and not e["title"]:
	                # The detected "company" is really the job title: swap it in
	                # and attach the group's company.
	                e["title"] = e["company"]
	                e["company"] = group_company
	            else:
	                # A full company + title pair means the group has ended.
	                group_company = None

	    # Pass 2: Collect bullets between consecutive dates
	    # Build a set of "metadata lines" (company/title) to exclude from bullets
	    meta_lines = set()
	    for e in entries:
	        if e["company"]:
	            meta_lines.add(e["company"])
	        if e["title"]:
	            meta_lines.add(e["title"])

	    experiences: list[Experience] = []
	    for di, e in enumerate(entries):
	        start = e["idx"] + 1
	        end = entries[di + 1]["idx"] if di + 1 < len(entries) else len(lines)

	        bullets = []
	        for raw in lines[start:end]:
	            line = raw.strip()
	            if not line or _is_location_line(line) or DUR_PATTERN.match(line):
	                continue
	            if line in meta_lines:
	                continue
	            if line.startswith(bullet_prefixes):
	                bullets.append(line.lstrip("-*•· ").strip())
	            elif len(line) > 10:
	                # Unmarked text is kept only when long enough to be prose.
	                bullets.append(line)

	        experiences.append(Experience(
	            title=e["title"],
	            company=e["company"],
	            dates=e["dates"],
	            description=" ".join(bullets[:3]),
	            bullets=bullets,
	        ))

	    return _merge_same_company(experiences)


	def _merge_same_company(experiences: list[Experience]) -> list[Experience]:
	    """Merge consecutive experiences at the same company into a single entry with combined bullets.

	    Company names are compared case-insensitively; entries without a company
	    are never merged. For a merged run of roles:

	    * title: the distinct consecutive role titles joined with " → ";
	    * dates: each role's distinct date range joined with " / ", last-listed
	      role first;
	    * description: the first role's description;
	    * bullets: each role's bullets, preceded by a "[<title>]" header line
	      when that role has both a title and bullets.

	    A single, unmerged entry is returned unchanged.
	    """
	    if not experiences:
	        return experiences

	    # Group consecutive entries that share a company, then build each merged
	    # entry from its original roles. Building from the roles (rather than
	    # folding into the previous merged entry) keeps headers and titles
	    # correct for runs of three or more roles.
	    groups: list[list[Experience]] = []
	    for exp in experiences:
	        if (
	            groups
	            and exp.company
	            and groups[-1][0].company
	            and exp.company.lower() == groups[-1][0].company.lower()
	        ):
	            groups[-1].append(exp)
	        else:
	            groups.append([exp])

	    merged: list[Experience] = []
	    for roles in groups:
	        if len(roles) == 1:
	            merged.append(roles[0])
	            continue

	        titles: list[str] = []
	        for role in roles:
	            if role.title and (not titles or role.title.lower() != titles[-1].lower()):
	                titles.append(role.title)

	        dates: list[str] = []
	        for role in reversed(roles):
	            if role.dates and role.dates not in dates:
	                dates.append(role.dates)

	        bullets: list[str] = []
	        for role in roles:
	            if not role.bullets:
	                continue
	            if role.title:
	                bullets.append(f"[{role.title}]")
	            bullets.extend(role.bullets)

	        first = roles[0]
	        merged.append(Experience(
	            title=" → ".join(titles),
	            company=first.company,
	            dates=" / ".join(dates),
	            description=first.description,
	            bullets=bullets,
	        ))

	    return merged


	def _build_exp(data: dict) -> Experience:
	    """Create an Experience from a dict with optional title/company/dates/bullets keys.

	    Missing text fields default to "". The description is the first three
	    bullets joined by single spaces, or "" when there are no bullets.
	    """
	    bullet_list = data.get("bullets", [])
	    summary_text = ""
	    if bullet_list:
	        summary_text = " ".join(bullet_list[:3])
	    fields = {
	        "title": data.get("title", ""),
	        "company": data.get("company", ""),
	        "dates": data.get("dates", ""),
	        "description": summary_text,
	        "bullets": bullet_list,
	    }
	    return Experience(**fields)


	def _parse_education(lines: list[str]) -> list[Education]:
	    """Parse education lines. Format: School / Degree, Field · (2011 - 2015)

	    Lines are consumed in pairs: a school line followed by a degree line. A
	    parenthesised year or year range on the degree line becomes ``year`` and
	    the text before it becomes ``degree``; without one, the whole line is the
	    degree. A trailing unpaired line yields a school with no degree or year.
	    """
	    entries: list[Education] = []
	    i = 0
	    while i < len(lines):
	        school = lines[i].strip()
	        degree = ""
	        year = ""

	        if i + 1 < len(lines):
	            next_line = lines[i + 1].strip()
	            # "(2011 - 2015)" or "(2015)"; group(1) excludes the parentheses.
	            year_match = re.search(r"\((\d{4}\s*[-–]\s*\d{4}|\d{4})\)", next_line)
	            if year_match:
	                year = year_match.group(1)
	                # Text before the year, minus the " · " / "," separator.
	                degree = next_line[:year_match.start()].rstrip(" ·,")
	            else:
	                degree = next_line
	            i += 2
	        else:
	            i += 1

	        if school:
	            entries.append(Education(school=school, degree=degree, year=year))

	    return entries