Fix LinkedIn title parser: strip both | LinkedIn and - LinkedIn suffixes
Browse files
- src/web_search.py +8 -3
src/web_search.py
CHANGED
|
@@ -130,13 +130,18 @@ def _parse_linkedin_title(title: str) -> dict:
|
|
| 130 |
|
| 131 |
Returns dict with name, title, company (all strings, may be empty).
|
| 132 |
"""
|
| 133 |
-
# Strip
|
| 134 |
-
cleaned = re.sub(r"\s*
|
| 135 |
|
| 136 |
parts = [p.strip() for p in cleaned.split(" - ")]
|
| 137 |
|
| 138 |
if len(parts) >= 3:
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
elif len(parts) == 2:
|
| 141 |
return {"name": parts[0], "title": parts[1], "company": ""}
|
| 142 |
else:
|
|
|
|
| 130 |
|
| 131 |
Returns dict with name, title, company (all strings, may be empty).
|
| 132 |
"""
|
| 133 |
+
# Strip "| LinkedIn" or "- LinkedIn" suffix (both patterns appear in Google results)
|
| 134 |
+
cleaned = re.sub(r"\s*[-|]\s*LinkedIn\s*$", "", title, flags=re.IGNORECASE).strip()
|
| 135 |
|
| 136 |
parts = [p.strip() for p in cleaned.split(" - ")]
|
| 137 |
|
| 138 |
if len(parts) >= 3:
|
| 139 |
+
# Use the third segment as company (any further segments are ignored)
|
| 140 |
+
company = parts[2]
|
| 141 |
+
# Guard: if company is still "LinkedIn" somehow, clear it
|
| 142 |
+
if company.lower() == "linkedin":
|
| 143 |
+
company = ""
|
| 144 |
+
return {"name": parts[0], "title": parts[1], "company": company}
|
| 145 |
elif len(parts) == 2:
|
| 146 |
return {"name": parts[0], "title": parts[1], "company": ""}
|
| 147 |
else:
|