"""
Web Search — Serper.dev Google Search API client + LinkedIn profile parser.
Runs Google X-ray queries via Serper.dev and parses LinkedIn profile results
into structured candidate data (name, title, company, URL).
Requires SERPER_API_KEY environment variable (free tier: 2,500 searches).
"""
import os
import re
from typing import Optional
from urllib.parse import urlparse
import requests
class LinkedInSearcher:
    """Searches Google via Serper.dev and parses LinkedIn profile results."""

    API_URL = "https://google.serper.dev/search"

    def __init__(self, api_key: Optional[str] = None):
        # Fall back to the SERPER_API_KEY environment variable when no key
        # is passed explicitly.
        self.api_key = api_key or os.environ.get("SERPER_API_KEY", "")

    @property
    def is_configured(self) -> bool:
        """True when an API key is available (constructor arg or environment)."""
        return bool(self.api_key)

    def search(self, query: str, num_results: int = 10) -> list[dict]:
        """Run a single Google search via Serper.dev.

        Args:
            query: Google query string (may contain X-ray operators).
            num_results: Maximum number of organic results to request.

        Returns:
            The raw "organic" results list from the Serper response
            (empty list when the key is absent from the payload).

        Raises:
            RuntimeError: If no API key is configured.
            requests.HTTPError: On a non-2xx response from the API.
        """
        if not self.api_key:
            raise RuntimeError("SERPER_API_KEY is not set.")
        resp = requests.post(
            self.API_URL,
            json={"q": query, "num": num_results},
            headers={
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            },
            timeout=15,
        )
        resp.raise_for_status()
        return resp.json().get("organic", [])

    def search_candidates(
        self,
        queries: list[str],
        max_per_query: int = 10,
        max_queries: int = 3,
    ) -> list[dict]:
        """Run the top *max_queries* queries and return deduplicated
        LinkedIn candidate profiles.

        Returns a list of dicts with keys:
            name, title, company, linkedin_url, snippet, source_query,
            matched_queries

        A profile surfaced by several queries appears once, with
        ``matched_queries`` counting how many queries hit it.

        Raises:
            RuntimeError: If no API key is configured (propagated from
                :meth:`search` — misconfiguration should be visible, not
                silently produce an empty result list).
        """
        # Candidates keyed by normalized profile URL for deduplication.
        seen: dict[str, dict] = {}
        for query in queries[:max_queries]:
            try:
                results = self.search(query, num_results=max_per_query)
            except (requests.RequestException, ValueError):
                # Best-effort: a network/API/JSON failure on one query
                # skips that query rather than aborting the whole batch.
                # (Was a blanket `except Exception`, which also hid
                # programming errors and the missing-key RuntimeError.)
                continue
            for item in results:
                link = item.get("link", "")
                if not _is_linkedin_profile(link):
                    continue
                norm_url = _normalize_linkedin_url(link)
                if norm_url in seen:
                    seen[norm_url]["matched_queries"] += 1
                    continue
                parsed = _parse_linkedin_title(item.get("title", ""))
                seen[norm_url] = {
                    "name": parsed["name"],
                    "title": parsed["title"],
                    "company": parsed["company"],
                    "linkedin_url": link,
                    "snippet": (item.get("snippet") or "")[:200],
                    "source_query": query,
                    "matched_queries": 1,
                }
        # Most widely-matched profiles first, then alphabetically by name.
        return sorted(
            seen.values(),
            key=lambda c: (-c["matched_queries"], c["name"].lower()),
        )
# ── Helpers ──────────────────────────────────────
def _is_linkedin_profile(url: str) -> bool:
"""Return True only for linkedin.com/in/ profile URLs."""
try:
parsed = urlparse(url)
host = parsed.hostname or ""
return (
"linkedin.com" in host
and parsed.path.startswith("/in/")
)
except Exception:
return False
def _normalize_linkedin_url(url: str) -> str:
"""Normalize a LinkedIn profile URL for deduplication."""
try:
parsed = urlparse(url)
# Strip query params and trailing slashes, lowercase
path = parsed.path.rstrip("/").lower()
return f"linkedin.com{path}"
except Exception:
return url.lower()
def _parse_linkedin_title(title: str) -> dict:
"""Parse a Google result title for a LinkedIn profile.
Typical format: "Rahul Sharma - Senior Consultant - Deloitte India | LinkedIn"
Returns dict with name, title, company (all strings, may be empty).
"""
# Strip "| LinkedIn" or "- LinkedIn" suffix (both patterns appear in Google results)
cleaned = re.sub(r"\s*[-|]\s*LinkedIn\s*$", "", title, flags=re.IGNORECASE).strip()
parts = [p.strip() for p in cleaned.split(" - ")]
if len(parts) >= 3:
# Take last non-empty part as company (skip any extra segments)
company = parts[2]
# Guard: if company is still "LinkedIn" somehow, clear it
if company.lower() == "linkedin":
company = ""
return {"name": parts[0], "title": parts[1], "company": company}
elif len(parts) == 2:
return {"name": parts[0], "title": parts[1], "company": ""}
else:
return {"name": cleaned, "title": "", "company": ""}