| | """ |
| | Web Search — Serper.dev Google Search API client + LinkedIn profile parser. |
| | |
| | Runs Google X-ray queries via Serper.dev and parses LinkedIn profile results |
| | into structured candidate data (name, title, company, URL). |
| | |
| | Requires SERPER_API_KEY environment variable (free tier: 2,500 searches). |
| | """ |
| |
|
| | import os |
| | import re |
| | from typing import Optional |
| | from urllib.parse import urlparse |
| |
|
| | import requests |
| |
|
| |
|
class LinkedInSearcher:
    """Search Google through Serper.dev and extract LinkedIn profile hits.

    The API key is taken from the ``api_key`` constructor argument or,
    when that is empty, from the ``SERPER_API_KEY`` environment variable.
    """

    API_URL = "https://google.serper.dev/search"

    def __init__(self, api_key: Optional[str] = None):
        # An explicit key wins; otherwise fall back to the environment.
        self.api_key = api_key or os.environ.get("SERPER_API_KEY", "")

    @property
    def is_configured(self) -> bool:
        """Return True when a non-empty API key is available."""
        return bool(self.api_key)

    def search(self, query: str, num_results: int = 10) -> list[dict]:
        """Run a single Google search via Serper.dev.

        Args:
            query: The search string sent as ``q``.
            num_results: Number of results requested (sent as ``num``).

        Returns:
            The ``organic`` results list from the JSON response, or an empty
            list when that field is missing or null.

        Raises:
            RuntimeError: If no API key is configured.
            requests.HTTPError: If the response status indicates an error.
        """
        if not self.api_key:
            raise RuntimeError("SERPER_API_KEY is not set.")

        resp = requests.post(
            self.API_URL,
            json={"q": query, "num": num_results},
            headers={
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            },
            timeout=15,
        )
        resp.raise_for_status()
        # ``or []`` also covers an explicit null, so callers can always iterate.
        return resp.json().get("organic") or []

    def search_candidates(
        self,
        queries: list[str],
        max_per_query: int = 10,
        max_queries: int = 3,
    ) -> list[dict]:
        """Run the first ``max_queries`` queries and collect LinkedIn profiles.

        Results are deduplicated by normalized profile URL. A query that
        fails is logged and skipped so the remaining queries still run.

        Args:
            queries: Search strings; only the first ``max_queries`` are used.
            max_per_query: Number of results requested per query.
            max_queries: Maximum number of queries to execute.

        Returns:
            A list of dicts with keys ``name``, ``title``, ``company``,
            ``linkedin_url``, ``snippet``, ``source_query`` and
            ``matched_queries``, sorted by descending ``matched_queries``
            and then by case-insensitive name.
        """
        # Imported locally so this class does not depend on a module-level
        # logger being defined elsewhere in the file.
        import logging

        log = logging.getLogger(__name__)
        seen: dict[str, dict] = {}

        for query in queries[:max_queries]:
            try:
                results = self.search(query, num_results=max_per_query)
            except Exception as exc:
                # Best-effort: one failing query must not abort the others,
                # but the failure should be visible rather than swallowed.
                log.warning("Search failed for query %r: %s", query, exc)
                continue

            # Profiles already counted for THIS query. The same profile can
            # show up twice in one result page (e.g. under two subdomains)
            # and must not bump ``matched_queries`` more than once per query.
            counted: set[str] = set()

            for item in results:
                link = item.get("link") or ""
                if not _is_linkedin_profile(link):
                    continue

                norm_url = _normalize_linkedin_url(link)
                if norm_url in counted:
                    continue
                counted.add(norm_url)

                if norm_url in seen:
                    seen[norm_url]["matched_queries"] += 1
                    continue

                # ``or ""`` guards against a present-but-null title.
                parsed = _parse_linkedin_title(item.get("title") or "")
                seen[norm_url] = {
                    "name": parsed["name"],
                    "title": parsed["title"],
                    "company": parsed["company"],
                    "linkedin_url": link,
                    "snippet": (item.get("snippet") or "")[:200],
                    "source_query": query,
                    "matched_queries": 1,
                }

        # Profiles matched by more queries come first; ties break by name.
        return sorted(
            seen.values(),
            key=lambda c: (-c["matched_queries"], c["name"].lower()),
        )
| |
|
| |
|
| | |
| |
|
| |
|
def _is_linkedin_profile(url: str) -> bool:
    """Return True only for ``linkedin.com/in/<slug>`` profile URLs.

    The host must be ``linkedin.com`` itself or one of its subdomains
    (``www.``, ``in.``, ``de.`` ...). A plain substring test would also
    accept look-alike hosts such as ``notlinkedin.com`` or
    ``linkedin.com.evil.org``, so the domain boundary is checked explicitly.

    Args:
        url: The result link to classify.

    Returns:
        True when the URL points at a profile path with a non-empty slug;
        False for any other URL, for unparseable input and for non-strings.
    """
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
        # ``hostname`` is already lower-cased by urlparse.
        host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. unbalanced brackets).
        return False

    on_linkedin = host == "linkedin.com" or host.endswith(".linkedin.com")
    if not on_linkedin or not parsed.path.startswith("/in/"):
        return False

    # Require something after "/in/": a bare "/in/" is not a profile.
    slug = parsed.path[len("/in/"):].strip("/")
    return bool(slug)
| |
|
| |
|
def _normalize_linkedin_url(url: str) -> str:
    """Normalize a LinkedIn profile URL into a deduplication key.

    Scheme, subdomain, query string and fragment are dropped and the path
    is lower-cased. For profile paths only ``/in/<slug>`` is kept, so that
    variants with trailing segments (e.g. ``/in/<slug>/en``) map to the
    same key as the bare profile URL.

    Args:
        url: The profile URL as returned by the search API.

    Returns:
        ``"linkedin.com/in/<slug>"`` for profile URLs,
        ``"linkedin.com<path>"`` (lower-cased, no trailing slash) for other
        paths, or the lower-cased input when the URL cannot be parsed.
    """
    try:
        parsed = urlparse(url)
        # Empty segments come from leading, trailing or doubled slashes.
        segments = [seg for seg in parsed.path.lower().split("/") if seg]
        if len(segments) >= 2 and segments[0] == "in":
            return f"linkedin.com/in/{segments[1]}"

        path = parsed.path.rstrip("/").lower()
        return f"linkedin.com{path}"
    except (ValueError, TypeError):
        # Unparseable input: fall back to a case-insensitive raw key.
        return url.lower()
| |
|
| |
|
def _parse_linkedin_title(title: str) -> dict:
    """Parse a Google result title for a LinkedIn profile.

    Typical format: "Rahul Sharma - Senior Consultant - Deloitte India | LinkedIn"

    A trailing "| LinkedIn" / "- LinkedIn" suffix is removed first, then the
    remainder is split on whitespace-surrounded dashes. Hyphen, en dash and
    em dash are all accepted as separators; hyphens inside a word (no
    surrounding whitespace, e.g. "Jean-Luc") are left alone.

    Args:
        title: The raw result title; ``None`` is treated as empty.

    Returns:
        Dict with ``name``, ``title`` and ``company`` (all strings, possibly
        empty). With more than three parts, the third part is the company
        and the rest is ignored.
    """
    # \u2013 is the en dash, \u2014 the em dash.
    cleaned = re.sub(
        r"\s*[-\u2013\u2014|]\s*LinkedIn\s*$",
        "",
        title or "",
        flags=re.IGNORECASE,
    ).strip()

    parts = [p.strip() for p in re.split(r"\s+[-\u2013\u2014]\s+", cleaned)]

    if len(parts) >= 3:
        company = parts[2]
        # Only one suffix is stripped above, so a leftover "LinkedIn" in the
        # company slot is treated as the site name, not an employer.
        if company.lower() == "linkedin":
            company = ""
        return {"name": parts[0], "title": parts[1], "company": company}

    if len(parts) == 2:
        return {"name": parts[0], "title": parts[1], "company": ""}

    return {"name": cleaned, "title": "", "company": ""}
| |
|