Spaces:
Running
Running
feat: two-pass LinkedIn parser handles multi-role companies correctly
Browse files
LinkedIn PDFs show multiple roles at one company as:
Company / Duration / Role1 / dates / ... / Role2 / dates / ...
New parser finds all dates first, looks backwards for metadata,
propagates company name across sub-roles, then merges into one
experience. Toucan Toco "Lead Com → Growth Marketer" now parsed
correctly instead of split into two separate companies.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- app/services/linkedin_parser.py +144 -56
app/services/linkedin_parser.py
CHANGED
|
@@ -268,70 +268,158 @@ def _parse_header(header_lines: list[str], summary_lines: list[str]) -> tuple:
|
|
| 268 |
return name, email, phone, linkedin, location, summary
|
| 269 |
|
| 270 |
|
| 271 |
-
|
| 272 |
-
"""Parse experience lines into structured experiences."""
|
| 273 |
-
experiences: list[Experience] = []
|
| 274 |
-
pending: list[str] = [] # Lines before current date (company/title candidates)
|
| 275 |
-
current: dict | None = None
|
| 276 |
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
exp_title = ""
|
| 293 |
-
if len(candidates) >= 2:
|
| 294 |
-
company = candidates[-2]
|
| 295 |
-
exp_title = candidates[-1]
|
| 296 |
-
elif len(candidates) == 1:
|
| 297 |
-
# Could be company or title — treat as company
|
| 298 |
-
company = candidates[0]
|
| 299 |
-
|
| 300 |
-
# Check if there's a duration-only line like "4 ans 3 mois" in candidates
|
| 301 |
-
# That indicates a multi-role company group
|
| 302 |
-
dur_pattern = re.compile(r"^\d+\s+(an|mois|year|month)", re.IGNORECASE)
|
| 303 |
-
if company and dur_pattern.match(company):
|
| 304 |
-
# Duration line — the company is the line before it
|
| 305 |
-
company = candidates[-3] if len(candidates) >= 3 else ""
|
| 306 |
-
exp_title = candidates[-1]
|
| 307 |
-
|
| 308 |
-
current = {
|
| 309 |
-
"title": exp_title,
|
| 310 |
"company": company,
|
| 311 |
-
"
|
| 312 |
-
"
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
else:
|
| 326 |
-
|
| 327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
else:
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
if current:
|
| 332 |
-
experiences.append(_build_exp(current))
|
| 333 |
|
| 334 |
-
return
|
| 335 |
|
| 336 |
|
| 337 |
def _build_exp(data: dict) -> Experience:
|
|
|
|
| 268 |
return name, email, phone, linkedin, location, summary
|
| 269 |
|
| 270 |
|
| 271 |
+
# Matches a line that begins with a number followed by a French or English
# duration unit (e.g. "4 ans 3 mois"), case-insensitively. Only the start of
# the line is anchored. Used below as the marker of a multi-role company group
# and to keep such lines out of the collected bullets.
DUR_PATTERN = re.compile(r"^\d+\s+(an|mois|year|month)", re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
+
|
| 274 |
+
def _find_title_company_before_date(lines: list[str], date_idx: int) -> tuple[str, str, bool]:
    """Look backwards from a date line for the company and title that precede it.

    At most the four lines directly above ``date_idx`` are inspected. Blank
    lines, location lines and bullet lines are skipped, and the scan stops as
    soon as another date line is reached.

    Args:
        lines: Lines of the experience section.
        date_idx: Index in ``lines`` of the date line being resolved.

    Returns:
        ``(company, title, is_multi_role)``. ``is_multi_role`` is True when a
        duration line (``Company / Duration / Title / Date`` layout) was found
        with a company line in front of it. Missing values are empty strings.
    """
    # Collect non-location, non-bullet lines going backwards from the date.
    candidates: list[str] = []
    for i in range(date_idx - 1, max(date_idx - 5, -1), -1):
        line = lines[i].strip()
        if not line:
            continue
        if DATE_LINE.match(line):
            break  # Hit previous date — stop
        if _is_location_line(line):
            continue
        if line.startswith(("- ", "* ", "• ", "· ")):
            continue  # Skip bullets — they belong to the previous experience
        candidates.append(line)
    candidates.reverse()  # restore document order (top to bottom)

    # Check for duration line (multi-role indicator): first match wins.
    dur_idx = next(
        (ci for ci, c in enumerate(candidates) if DUR_PATTERN.match(c)),
        None,
    )

    if dur_idx is not None and dur_idx > 0:
        # Multi-role: Company / Duration / Title / Date
        company = candidates[dur_idx - 1]
        following = candidates[dur_idx + 1:]
        title = following[0] if following else ""
        return company, title, True

    if dur_idx == 0:
        # The duration line is the first candidate, so the company line fell
        # outside the lookback window. A duration such as "4 ans 3 mois" is
        # never a company or a title: drop it before applying the fallback.
        candidates = [c for c in candidates if not DUR_PATTERN.match(c)]

    if len(candidates) >= 2:
        return candidates[-2], candidates[-1], False
    if len(candidates) == 1:
        # A lone line could be company or title — report it as the company;
        # the caller may reinterpret it.
        return candidates[0], "", False
    return "", "", False
|
| 312 |
|
| 313 |
+
|
| 314 |
+
def _parse_experiences(lines: list[str]) -> list[Experience]:
    """Parse experience lines in two passes.

    Pass 1 locates every date line and resolves its company/title by looking
    backwards. Pass 2 gathers the text between consecutive date lines as the
    bullets of the earlier entry. Consecutive entries at the same company are
    merged before returning.
    """
    bullet_prefixes = ("- ", "* ", "• ", "· ")

    # Pass 1: one entry per date line, with metadata taken from the lines above it.
    entries: list[dict] = []
    for idx, raw in enumerate(lines):
        stripped = raw.strip()
        if not DATE_LINE.match(stripped):
            continue
        company, title, is_multi = _find_title_company_before_date(lines, idx)
        entries.append({
            "idx": idx,
            "company": company,
            "title": title,
            "is_multi": is_multi,
            # Drop a trailing parenthesised part from the date text.
            "dates": re.sub(r"\s*\(.*?\)\s*$", "", stripped),
        })

    # Propagate the group company across sub-roles. Only the first role of a
    # multi-role group shows Company + Duration; later roles show Title + Date,
    # so the lookback reports their title as a lone "company".
    group_company = None
    for entry in entries:
        if entry["is_multi"]:
            group_company = entry["company"]
            continue
        if not group_company:
            continue
        if entry["company"] and not entry["title"]:
            # What was taken for the company is really the title.
            entry["title"], entry["company"] = entry["company"], group_company
        else:
            # NOTE(review): indentation was lost in the source; the reset is
            # assumed to belong to this inner branch — confirm against the repo.
            group_company = None

    # Pass 2: company/title strings must never be collected as bullets.
    meta_lines = {
        value
        for entry in entries
        for value in (entry["company"], entry["title"])
        if value
    }

    # Each entry's text runs up to the next date line (or the end of input).
    stops = [entry["idx"] for entry in entries[1:]] + [len(lines)]

    experiences: list[Experience] = []
    for entry, stop in zip(entries, stops):
        bullets = []
        for raw in lines[entry["idx"] + 1:stop]:
            text = raw.strip()
            if not text or _is_location_line(text) or DUR_PATTERN.match(text):
                continue
            if text in meta_lines:
                continue
            if text.startswith(bullet_prefixes):
                bullets.append(text.lstrip("-*•· ").strip())
            elif len(text) > 10:
                # Unmarked lines are kept only when longer than 10 characters.
                bullets.append(text)

        experiences.append(Experience(
            title=entry["title"],
            company=entry["company"],
            dates=entry["dates"],
            description=" ".join(bullets[:3]),
            bullets=bullets,
        ))

    return _merge_same_company(experiences)
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
def _merge_same_company(experiences: list[Experience]) -> list[Experience]:
    """Merge consecutive experiences at the same company into a single entry.

    Company names are compared case-insensitively; entries with an empty
    company are never merged. The merged entry keeps the first entry's company
    and description, chains the distinct titles with " → ", joins differing
    date strings with " / ", and concatenates the bullets, each role's bullets
    being preceded by a "[title]" header line.

    Args:
        experiences: Experiences in document order.

    Returns:
        A new list with runs of same-company entries collapsed. An empty input
        is returned unchanged.
    """
    if not experiences:
        return experiences

    merged: list[Experience] = []
    # True while merged[-1] is itself the result of a merge. Its bullets then
    # already carry per-role headers and must not be re-labelled with the
    # combined title when a third (or later) role is folded in.
    tail_is_merged = False

    for exp in experiences:
        prev = merged[-1] if merged else None
        same_company = (
            prev is not None
            and exp.company
            and prev.company
            and exp.company.lower() == prev.company.lower()
        )
        if not same_company:
            merged.append(exp)
            tail_is_merged = False
            continue

        # Combine titles, avoiding a dangling " → X" when the accumulated
        # title is empty.
        if not prev.title:
            combined_title = exp.title
        elif exp.title and exp.title.lower() != prev.title.lower():
            combined_title = f"{prev.title} → {exp.title}"
        else:
            combined_title = prev.title

        # Combine dates: this role's dates first, then the accumulated ones.
        if exp.dates != prev.dates:
            combined_dates = f"{exp.dates} / {prev.dates}"
        else:
            combined_dates = prev.dates

        # Combine bullets, prefixed with role title.
        combined_bullets: list[str] = []
        if prev.bullets:
            if not tail_is_merged:
                combined_bullets.append(f"[{prev.title}]")
            combined_bullets.extend(prev.bullets)
        if exp.bullets:
            combined_bullets.append(f"[{exp.title}]")
            combined_bullets.extend(exp.bullets)

        merged[-1] = Experience(
            title=combined_title,
            company=prev.company,
            dates=combined_dates,
            description=prev.description,
            bullets=combined_bullets,
        )
        tail_is_merged = True

    return merged
|
| 423 |
|
| 424 |
|
| 425 |
def _build_exp(data: dict) -> Experience:
|