# Commit 9baa8d5 — Fix LinkedIn title parser: strip both "| LinkedIn" and "- LinkedIn" suffixes.
"""
Web Search — Serper.dev Google Search API client + LinkedIn profile parser.
Runs Google X-ray queries via Serper.dev and parses LinkedIn profile results
into structured candidate data (name, title, company, URL).
Requires SERPER_API_KEY environment variable (free tier: 2,500 searches).
"""
import os
import re
from typing import Optional
from urllib.parse import urlparse
import requests
class LinkedInSearcher:
    """Searches Google via Serper.dev and parses LinkedIn profile results."""

    API_URL = "https://google.serper.dev/search"

    def __init__(self, api_key: Optional[str] = None):
        # Fall back to the environment so callers may construct with no args.
        self.api_key = api_key or os.environ.get("SERPER_API_KEY", "")

    @property
    def is_configured(self) -> bool:
        """True when an API key is available (constructor arg or env var)."""
        return bool(self.api_key)

    def search(self, query: str, num_results: int = 10) -> list[dict]:
        """Run a single Google search via Serper.dev.

        Args:
            query: Raw Google query string (e.g. an X-ray query).
            num_results: Number of organic results to request.

        Returns:
            The raw "organic" results list from the API (may be empty).

        Raises:
            RuntimeError: If no API key is configured.
            requests.RequestException: On network or HTTP-status errors.
        """
        if not self.api_key:
            raise RuntimeError("SERPER_API_KEY is not set.")
        resp = requests.post(
            self.API_URL,
            json={"q": query, "num": num_results},
            headers={
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            },
            timeout=15,
        )
        resp.raise_for_status()
        return resp.json().get("organic", [])

    def search_candidates(
        self,
        queries: list[str],
        max_per_query: int = 10,
        max_queries: int = 3,
    ) -> list[dict]:
        """Run top N queries and return deduplicated LinkedIn candidate profiles.

        Best-effort: queries that fail (network error, HTTP error, malformed
        JSON) are skipped rather than aborting the whole batch.

        Returns list of dicts:
        {name, title, company, linkedin_url, snippet, source_query, matched_queries}
        """
        # No API key: every query would fail anyway; return empty up front
        # instead of raising RuntimeError per query and swallowing it.
        if not self.is_configured:
            return []
        # Track candidates by normalized URL for deduplication.
        seen: dict[str, dict] = {}
        for query in queries[:max_queries]:
            try:
                results = self.search(query, num_results=max_per_query)
            except (requests.RequestException, ValueError):
                # Network/HTTP failure or JSON decode error: skip this query.
                continue
            for item in results:
                link = item.get("link", "")
                if not _is_linkedin_profile(link):
                    continue
                norm_url = _normalize_linkedin_url(link)
                if norm_url in seen:
                    # Same profile surfaced again: bump its relevance count.
                    seen[norm_url]["matched_queries"] += 1
                    continue
                parsed = _parse_linkedin_title(item.get("title", ""))
                seen[norm_url] = {
                    "name": parsed["name"],
                    "title": parsed["title"],
                    "company": parsed["company"],
                    "linkedin_url": link,
                    "snippet": (item.get("snippet") or "")[:200],
                    "source_query": query,
                    "matched_queries": 1,
                }
        # Sort: most query matches first, then alphabetically by name.
        return sorted(
            seen.values(),
            key=lambda c: (-c["matched_queries"], c["name"].lower()),
        )
# ── Helpers ──────────────────────────────────────
def _is_linkedin_profile(url: str) -> bool:
"""Return True only for linkedin.com/in/ profile URLs."""
try:
parsed = urlparse(url)
host = parsed.hostname or ""
return (
"linkedin.com" in host
and parsed.path.startswith("/in/")
)
except Exception:
return False
def _normalize_linkedin_url(url: str) -> str:
"""Normalize a LinkedIn profile URL for deduplication."""
try:
parsed = urlparse(url)
# Strip query params and trailing slashes, lowercase
path = parsed.path.rstrip("/").lower()
return f"linkedin.com{path}"
except Exception:
return url.lower()
def _parse_linkedin_title(title: str) -> dict:
"""Parse a Google result title for a LinkedIn profile.
Typical format: "Rahul Sharma - Senior Consultant - Deloitte India | LinkedIn"
Returns dict with name, title, company (all strings, may be empty).
"""
# Strip "| LinkedIn" or "- LinkedIn" suffix (both patterns appear in Google results)
cleaned = re.sub(r"\s*[-|]\s*LinkedIn\s*$", "", title, flags=re.IGNORECASE).strip()
parts = [p.strip() for p in cleaned.split(" - ")]
if len(parts) >= 3:
# Take last non-empty part as company (skip any extra segments)
company = parts[2]
# Guard: if company is still "LinkedIn" somehow, clear it
if company.lower() == "linkedin":
company = ""
return {"name": parts[0], "title": parts[1], "company": company}
elif len(parts) == 2:
return {"name": parts[0], "title": parts[1], "company": ""}
else:
return {"name": cleaned, "title": "", "company": ""}