Aramente Claude Opus 4.6 (1M context) committed on
Commit
ce64eb7
·
1 Parent(s): a21020c

feat: two-pass LinkedIn parser handles multi-role companies correctly

Browse files

LinkedIn PDFs show multiple roles at one company as:
Company / Duration / Role1 / dates / ... / Role2 / dates / ...

New parser finds all dates first, looks backwards for metadata,
propagates company name across sub-roles, then merges into one
experience. Toucan Toco "Lead Com → Growth Marketer" now parsed
correctly instead of split into two separate companies.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. app/services/linkedin_parser.py +144 -56
app/services/linkedin_parser.py CHANGED
@@ -268,70 +268,158 @@ def _parse_header(header_lines: list[str], summary_lines: list[str]) -> tuple:
268
  return name, email, phone, linkedin, location, summary
269
 
270
 
271
- def _parse_experiences(lines: list[str]) -> list[Experience]:
272
- """Parse experience lines into structured experiences."""
273
- experiences: list[Experience] = []
274
- pending: list[str] = [] # Lines before current date (company/title candidates)
275
- current: dict | None = None
276
 
277
- for line in lines:
278
- stripped = line.strip()
279
- if not stripped:
 
 
 
 
 
 
 
 
 
 
 
 
280
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
- date_match = DATE_LINE.match(stripped)
283
- if date_match:
284
- # Save previous experience
285
- if current:
286
- experiences.append(_build_exp(current))
287
-
288
- # Lines before date = company + title (last 2 non-location lines)
289
- candidates = [p for p in pending if not _is_location_line(p) and len(p) > 1]
290
-
291
- company = ""
292
- exp_title = ""
293
- if len(candidates) >= 2:
294
- company = candidates[-2]
295
- exp_title = candidates[-1]
296
- elif len(candidates) == 1:
297
- # Could be company or title — treat as company
298
- company = candidates[0]
299
-
300
- # Check if there's a duration-only line like "4 ans 3 mois" in candidates
301
- # That indicates a multi-role company group
302
- dur_pattern = re.compile(r"^\d+\s+(an|mois|year|month)", re.IGNORECASE)
303
- if company and dur_pattern.match(company):
304
- # Duration line — the company is the line before it
305
- company = candidates[-3] if len(candidates) >= 3 else ""
306
- exp_title = candidates[-1]
307
-
308
- current = {
309
- "title": exp_title,
310
  "company": company,
311
- "dates": re.sub(r"\s*\(.*?\)\s*$", "", stripped),
312
- "bullets": [],
313
- }
314
- pending = []
315
- elif current is not None:
316
- # Check if this is a location line (skip it as description)
317
- if _is_location_line(stripped):
318
- pending.append(stripped)
319
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- if stripped.startswith(("- ", "* ", "• ", "· ")):
322
- current["bullets"].append(stripped.lstrip("-*•· ").strip())
323
- elif len(stripped) > 15:
324
- current["bullets"].append(stripped)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  else:
326
- # Short line — buffer as potential next company/title
327
- pending.append(stripped)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  else:
329
- pending.append(stripped)
330
-
331
- if current:
332
- experiences.append(_build_exp(current))
333
 
334
- return experiences
335
 
336
 
337
  def _build_exp(data: dict) -> Experience:
 
268
  return name, email, phone, linkedin, location, summary
269
 
270
 
271
+ DUR_PATTERN = re.compile(r"^\d+\s+(an|mois|year|month)", re.IGNORECASE)
 
 
 
 
272
 
273
+
274
def _find_title_company_before_date(
    lines: list[str], date_idx: int, lookback: int = 4
) -> tuple[str, str, bool]:
    """Look backwards from a date line for the company and title above it.

    Scans at most ``lookback`` raw lines immediately above ``lines[date_idx]``.
    Blank lines, location lines and bullet lines are ignored (but still count
    towards the window); the scan stops early at a previous date line.

    Args:
        lines: All lines of the experience section.
        date_idx: Index in ``lines`` of the date line being resolved.
        lookback: Maximum number of raw lines to inspect above the date line.

    Returns:
        ``(company, title, is_multi_role)``. ``is_multi_role`` is True when a
        duration-only line (matched by ``DUR_PATTERN``) sits among the
        candidates, i.e. the layout is Company / Duration / Title / Date.
        ``company`` is ``""`` when the duration line is the first candidate
        found (the company line fell outside the window); either string is
        ``""`` when no candidate exists for it.
    """
    bullet_prefixes = ("- ", "* ", "• ", "· ")

    # Walk upwards from the line just above the date; collected nearest-first.
    stop = max(date_idx - 1 - lookback, -1)
    nearest_first: list[str] = []
    for i in range(date_idx - 1, stop, -1):
        text = lines[i].strip()
        if not text:
            continue
        if DATE_LINE.match(text):
            break  # Reached the previous entry's date line
        if _is_location_line(text):
            continue
        if text.startswith(bullet_prefixes):
            continue  # Bullets belong to the previous experience
        nearest_first.append(text)

    # Restore document order: top-most candidate first.
    candidates = nearest_first[::-1]

    # A duration-only line marks a multi-role company group.
    dur_idx = next(
        (ci for ci, c in enumerate(candidates) if DUR_PATTERN.match(c)),
        None,
    )
    if dur_idx is not None:
        # The company is the line directly above the duration. If the duration
        # is the first candidate the company is unknown; never fall through to
        # the generic branch, which would return the duration text itself as
        # the company or title.
        company = candidates[dur_idx - 1] if dur_idx > 0 else ""
        after_duration = candidates[dur_idx + 1:]
        title = after_duration[0] if after_duration else ""
        return company, title, True

    if len(candidates) >= 2:
        # Single role: the two lines closest to the date are Company / Title.
        return candidates[-2], candidates[-1], False
    if candidates:
        # Only one line: could be either; reported as the company.
        return candidates[0], "", False
    return "", "", False
312
 
313
+
314
def _parse_experiences(lines: list[str]) -> list[Experience]:
    """Parse experience-section lines into ``Experience`` objects.

    Works in two passes:

    1. Every line matching ``DATE_LINE`` starts an entry; its company and
       title are resolved by looking backwards from that line. The company of
       a multi-role group is then propagated to the following sub-roles.
    2. The lines between one date line and the next are collected as that
       entry's bullets.

    The resulting list is passed through ``_merge_same_company`` before being
    returned.

    Args:
        lines: Raw lines of the experience section.

    Returns:
        The parsed experiences, in document order, after same-company merging.
    """
    bullet_prefixes = ("- ", "* ", "• ", "· ")

    # Pass 1: one entry per date line, with metadata found by looking backwards.
    entries: list[dict] = []
    for idx, raw in enumerate(lines):
        text = raw.strip()
        if not DATE_LINE.match(text):
            continue
        company, title, is_multi = _find_title_company_before_date(lines, idx)
        entries.append({
            "idx": idx,
            "company": company,
            "title": title,
            "is_multi": is_multi,
            # Drop a trailing parenthesised suffix from the date line.
            "dates": re.sub(r"\s*\(.*?\)\s*$", "", text),
        })

    # Propagate the group company across sub-roles. Only the first role of a
    # multi-role group carries Company + Duration; later roles show just
    # Title + Date, so the lookback reports their title as a lone "company".
    group_company = None
    for entry in entries:
        if entry["is_multi"]:
            group_company = entry["company"]
        elif group_company:
            if entry["company"] and not entry["title"]:
                # Lone candidate inside a group: it is really the title.
                entry["title"] = entry["company"]
                entry["company"] = group_company
            else:
                # Anything else ends the current group.
                group_company = None

    # Company/title strings must not be re-collected as bullets of the
    # preceding entry (they sit between its date line and the next one).
    meta_lines = {
        entry[key]
        for entry in entries
        for key in ("company", "title")
        if entry[key]
    }

    # Pass 2: collect bullets between consecutive date lines.
    experiences: list[Experience] = []
    for pos, entry in enumerate(entries):
        start = entry["idx"] + 1
        end = entries[pos + 1]["idx"] if pos + 1 < len(entries) else len(lines)

        bullets: list[str] = []
        for raw in lines[start:end]:
            text = raw.strip()
            if not text or _is_location_line(text) or DUR_PATTERN.match(text):
                continue
            if text in meta_lines:
                continue
            if text.startswith(bullet_prefixes):
                # Remove only the two-character marker. str.lstrip() with a
                # character set would also eat leading characters of the text
                # itself (e.g. the minus sign of "- -20% churn").
                bullets.append(text[2:].strip())
            elif len(text) > 10:
                # Unmarked lines are kept only when long enough to be prose.
                bullets.append(text)

        experiences.append(Experience(
            title=entry["title"],
            company=entry["company"],
            dates=entry["dates"],
            description=" ".join(bullets[:3]),
            bullets=bullets,
        ))

    return _merge_same_company(experiences)
381
+
382
+
383
def _merge_same_company(experiences: list[Experience]) -> list[Experience]:
    """Merge runs of consecutive experiences at the same company.

    Company names are compared case-insensitively; entries with an empty
    company are never merged. An entry that is not part of a run is returned
    unchanged. For each run of two or more roles a single ``Experience`` is
    built with:

    * title: the role titles in input order joined by " → ", skipping empty
      titles and a title equal (case-insensitively) to the one before it;
    * company: the company of the first role in the run;
    * dates: the distinct consecutive date strings joined by " / ", in
      reverse input order (last role in the run first);
    * description: the first non-empty description in the run;
    * bullets: each role's bullets in input order, preceded by a "[title]"
      header when that role has both bullets and a title.

    Args:
        experiences: Parsed experiences in document order.

    Returns:
        A list with every same-company run collapsed to one entry.
    """
    if not experiences:
        return experiences

    # Group first, merge once per group. Merging pairwise against an already
    # merged entry would re-label its bullets with the combined title and
    # compare new titles against the combined string.
    groups: list[list[Experience]] = []
    for exp in experiences:
        anchor = groups[-1][0] if groups else None
        if (anchor is not None
                and exp.company
                and anchor.company
                and exp.company.lower() == anchor.company.lower()):
            groups[-1].append(exp)
        else:
            groups.append([exp])

    merged: list[Experience] = []
    for roles in groups:
        if len(roles) == 1:
            merged.append(roles[0])
            continue

        titles: list[str] = []
        dates: list[str] = []
        bullets: list[str] = []
        for role in roles:
            if role.title and (not titles or role.title.lower() != titles[-1].lower()):
                titles.append(role.title)
            if role.dates and (not dates or role.dates != dates[-1]):
                dates.append(role.dates)
            if role.bullets:
                if role.title:
                    # Label each role's bullets so they stay attributable.
                    bullets.append(f"[{role.title}]")
                bullets.extend(role.bullets)

        description = next(
            (role.description for role in roles if role.description),
            roles[0].description,
        )

        merged.append(Experience(
            title=" → ".join(titles),
            company=roles[0].company,
            dates=" / ".join(reversed(dates)),
            description=description,
            bullets=bullets,
        ))

    return merged
423
 
424
 
425
  def _build_exp(data: dict) -> Experience: