# Commit 9baa8d5 — Fix LinkedIn title parser: strip both "| LinkedIn" and "- LinkedIn" suffixes.
"""
Web Search — Serper.dev Google Search API client + LinkedIn profile parser.
Runs Google X-ray queries via Serper.dev and parses LinkedIn profile results
into structured candidate data (name, title, company, URL).
Requires SERPER_API_KEY environment variable (free tier: 2,500 searches).
"""
import os
import re
from typing import Optional
from urllib.parse import urlparse
import requests
class LinkedInSearcher:
    """Searches Google via Serper.dev and parses LinkedIn profile results."""

    API_URL = "https://google.serper.dev/search"

    def __init__(self, api_key: Optional[str] = None):
        # Fall back to the environment so callers may construct with no args.
        self.api_key = api_key or os.environ.get("SERPER_API_KEY", "")

    @property
    def is_configured(self) -> bool:
        """True when an API key is available (constructor arg or env var)."""
        return bool(self.api_key)

    def search(self, query: str, num_results: int = 10) -> list[dict]:
        """Run a single Google search via Serper.dev.

        Args:
            query: Raw Google query string (e.g. an X-ray query).
            num_results: Number of organic results to request.

        Returns:
            The raw "organic" results list from the API (may be empty).

        Raises:
            RuntimeError: If no API key is configured.
            requests.RequestException: On network or HTTP-status errors.
        """
        if not self.api_key:
            raise RuntimeError("SERPER_API_KEY is not set.")
        resp = requests.post(
            self.API_URL,
            json={"q": query, "num": num_results},
            headers={
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            },
            timeout=15,
        )
        resp.raise_for_status()
        return resp.json().get("organic", [])

    def search_candidates(
        self,
        queries: list[str],
        max_per_query: int = 10,
        max_queries: int = 3,
    ) -> list[dict]:
        """Run top N queries and return deduplicated LinkedIn candidate profiles.

        Best-effort: queries that fail (network error, HTTP error, malformed
        JSON) are skipped rather than aborting the whole batch.

        Returns list of dicts:
        {name, title, company, linkedin_url, snippet, source_query, matched_queries}
        """
        # No API key: every query would fail anyway; return empty up front
        # instead of raising RuntimeError per query and swallowing it.
        if not self.is_configured:
            return []
        # Track candidates by normalized URL for deduplication.
        seen: dict[str, dict] = {}
        for query in queries[:max_queries]:
            try:
                results = self.search(query, num_results=max_per_query)
            except (requests.RequestException, ValueError):
                # Network/HTTP failure or JSON decode error: skip this query.
                continue
            for item in results:
                link = item.get("link", "")
                if not _is_linkedin_profile(link):
                    continue
                norm_url = _normalize_linkedin_url(link)
                if norm_url in seen:
                    # Same profile surfaced again: bump its relevance count.
                    seen[norm_url]["matched_queries"] += 1
                    continue
                parsed = _parse_linkedin_title(item.get("title", ""))
                seen[norm_url] = {
                    "name": parsed["name"],
                    "title": parsed["title"],
                    "company": parsed["company"],
                    "linkedin_url": link,
                    "snippet": (item.get("snippet") or "")[:200],
                    "source_query": query,
                    "matched_queries": 1,
                }
        # Sort: most query matches first, then alphabetically by name.
        return sorted(
            seen.values(),
            key=lambda c: (-c["matched_queries"], c["name"].lower()),
        )
# ── Helpers ──────────────────────────────────────
def _is_linkedin_profile(url: str) -> bool:
"""Return True only for linkedin.com/in/ profile URLs."""
try:
parsed = urlparse(url)
host = parsed.hostname or ""
return (
"linkedin.com" in host
and parsed.path.startswith("/in/")
)
except Exception:
return False
def _normalize_linkedin_url(url: str) -> str:
"""Normalize a LinkedIn profile URL for deduplication."""
try:
parsed = urlparse(url)
# Strip query params and trailing slashes, lowercase
path = parsed.path.rstrip("/").lower()
return f"linkedin.com{path}"
except Exception:
return url.lower()
def _parse_linkedin_title(title: str) -> dict:
"""Parse a Google result title for a LinkedIn profile.
Typical format: "Rahul Sharma - Senior Consultant - Deloitte India | LinkedIn"
Returns dict with name, title, company (all strings, may be empty).
"""
# Strip "| LinkedIn" or "- LinkedIn" suffix (both patterns appear in Google results)
cleaned = re.sub(r"\s*[-|]\s*LinkedIn\s*$", "", title, flags=re.IGNORECASE).strip()
parts = [p.strip() for p in cleaned.split(" - ")]
if len(parts) >= 3:
# Take last non-empty part as company (skip any extra segments)
company = parts[2]
# Guard: if company is still "LinkedIn" somehow, clear it
if company.lower() == "linkedin":
company = ""
return {"name": parts[0], "title": parts[1], "company": company}
elif len(parts) == 2:
return {"name": parts[0], "title": parts[1], "company": ""}
else:
return {"name": cleaned, "title": "", "company": ""}