""" Web Search — Serper.dev Google Search API client + LinkedIn profile parser. Runs Google X-ray queries via Serper.dev and parses LinkedIn profile results into structured candidate data (name, title, company, URL). Requires SERPER_API_KEY environment variable (free tier: 2,500 searches). """ import os import re from typing import Optional from urllib.parse import urlparse import requests class LinkedInSearcher: """Searches Google via Serper.dev and parses LinkedIn profile results.""" API_URL = "https://google.serper.dev/search" def __init__(self, api_key: Optional[str] = None): self.api_key = api_key or os.environ.get("SERPER_API_KEY", "") @property def is_configured(self) -> bool: return bool(self.api_key) def search(self, query: str, num_results: int = 10) -> list[dict]: """Run a single Google search via Serper.dev. Returns raw organic results list. """ if not self.api_key: raise RuntimeError("SERPER_API_KEY is not set.") resp = requests.post( self.API_URL, json={"q": query, "num": num_results}, headers={ "X-API-KEY": self.api_key, "Content-Type": "application/json", }, timeout=15, ) resp.raise_for_status() return resp.json().get("organic", []) def search_candidates( self, queries: list[str], max_per_query: int = 10, max_queries: int = 3, ) -> list[dict]: """Run top N queries and return deduplicated LinkedIn candidate profiles. Returns list of dicts: {name, title, company, linkedin_url, snippet, source_query, matched_queries} """ # Track candidates by normalized URL for deduplication seen: dict[str, dict] = {} for query in queries[:max_queries]: try: results = self.search(query, num_results=max_per_query) except Exception: continue for item in results: link = item.get("link", "") if not _is_linkedin_profile(link): continue norm_url = _normalize_linkedin_url(link) if norm_url in seen: seen[norm_url]["matched_queries"] += 1 continue parsed = _parse_linkedin_title(item.get("title", "")) seen[norm_url] = { "name": parsed["name"], "title": parsed["title"], "company": parsed["company"], "linkedin_url": link, "snippet": (item.get("snippet") or "")[:200], "source_query": query, "matched_queries": 1, } # Sort: most query matches first, then alphabetically candidates = sorted( seen.values(), key=lambda c: (-c["matched_queries"], c["name"].lower()), ) return candidates # ── Helpers ────────────────────────────────────── def _is_linkedin_profile(url: str) -> bool: """Return True only for linkedin.com/in/ profile URLs.""" try: parsed = urlparse(url) host = parsed.hostname or "" return ( "linkedin.com" in host and parsed.path.startswith("/in/") ) except Exception: return False def _normalize_linkedin_url(url: str) -> str: """Normalize a LinkedIn profile URL for deduplication.""" try: parsed = urlparse(url) # Strip query params and trailing slashes, lowercase path = parsed.path.rstrip("/").lower() return f"linkedin.com{path}" except Exception: return url.lower() def _parse_linkedin_title(title: str) -> dict: """Parse a Google result title for a LinkedIn profile. Typical format: "Rahul Sharma - Senior Consultant - Deloitte India | LinkedIn" Returns dict with name, title, company (all strings, may be empty). """ # Strip "| LinkedIn" or "- LinkedIn" suffix (both patterns appear in Google results) cleaned = re.sub(r"\s*[-|]\s*LinkedIn\s*$", "", title, flags=re.IGNORECASE).strip() parts = [p.strip() for p in cleaned.split(" - ")] if len(parts) >= 3: # Take last non-empty part as company (skip any extra segments) company = parts[2] # Guard: if company is still "LinkedIn" somehow, clear it if company.lower() == "linkedin": company = "" return {"name": parts[0], "title": parts[1], "company": company} elif len(parts) == 2: return {"name": parts[0], "title": parts[1], "company": ""} else: return {"name": cleaned, "title": "", "company": ""}