File size: 4,883 Bytes
955e73e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9baa8d5
 
955e73e
 
 
 
9baa8d5
 
 
 
 
 
955e73e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Web Search — Serper.dev Google Search API client + LinkedIn profile parser.

Runs Google X-ray queries via Serper.dev and parses LinkedIn profile results
into structured candidate data (name, title, company, URL).

Requires SERPER_API_KEY environment variable (free tier: 2,500 searches).
"""

import os
import re
from typing import Optional
from urllib.parse import urlparse

import requests


class LinkedInSearcher:
    """Searches Google via Serper.dev and parses LinkedIn profile results."""

    API_URL = "https://google.serper.dev/search"

    def __init__(self, api_key: Optional[str] = None):
        # Explicit key wins; otherwise fall back to the environment.
        self.api_key = api_key or os.environ.get("SERPER_API_KEY", "")

    @property
    def is_configured(self) -> bool:
        """True when an API key is available (explicit or from the environment)."""
        return bool(self.api_key)

    def search(self, query: str, num_results: int = 10) -> list[dict]:
        """Run a single Google search via Serper.dev.

        Returns the raw "organic" results list from the API response.

        Raises:
            RuntimeError: if no API key is configured.
            requests.HTTPError: on a non-2xx API response.
        """
        if not self.api_key:
            raise RuntimeError("SERPER_API_KEY is not set.")

        payload = {"q": query, "num": num_results}
        headers = {
            "X-API-KEY": self.api_key,
            "Content-Type": "application/json",
        }
        response = requests.post(
            self.API_URL, json=payload, headers=headers, timeout=15
        )
        response.raise_for_status()
        return response.json().get("organic", [])

    def search_candidates(
        self,
        queries: list[str],
        max_per_query: int = 10,
        max_queries: int = 3,
    ) -> list[dict]:
        """Run top N queries and return deduplicated LinkedIn candidate profiles.

        Returns list of dicts:
            {name, title, company, linkedin_url, snippet, source_query, matched_queries}
        """
        # Candidates keyed by normalized profile URL, so the same person
        # surfaced by several queries is counted once.
        by_url: dict[str, dict] = {}

        for q in queries[:max_queries]:
            try:
                organic = self.search(q, num_results=max_per_query)
            except Exception:
                # Best-effort: one failing query must not sink the batch.
                continue

            for result in organic:
                url = result.get("link", "")
                if not _is_linkedin_profile(url):
                    continue

                key = _normalize_linkedin_url(url)
                existing = by_url.get(key)
                if existing is not None:
                    # Already seen — just bump the cross-query match count.
                    existing["matched_queries"] += 1
                    continue

                fields = _parse_linkedin_title(result.get("title", ""))
                by_url[key] = {
                    "name": fields["name"],
                    "title": fields["title"],
                    "company": fields["company"],
                    "linkedin_url": url,
                    "snippet": (result.get("snippet") or "")[:200],
                    "source_query": q,
                    "matched_queries": 1,
                }

        # Most query matches first; ties broken alphabetically by name.
        return sorted(
            by_url.values(),
            key=lambda c: (-c["matched_queries"], c["name"].lower()),
        )


# ── Helpers ──────────────────────────────────────


def _is_linkedin_profile(url: str) -> bool:
    """Return True only for linkedin.com/in/ profile URLs."""
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
        return (
            "linkedin.com" in host
            and parsed.path.startswith("/in/")
        )
    except Exception:
        return False


def _normalize_linkedin_url(url: str) -> str:
    """Normalize a LinkedIn profile URL for deduplication."""
    try:
        parsed = urlparse(url)
        # Strip query params and trailing slashes, lowercase
        path = parsed.path.rstrip("/").lower()
        return f"linkedin.com{path}"
    except Exception:
        return url.lower()


def _parse_linkedin_title(title: str) -> dict:
    """Parse a Google result title for a LinkedIn profile.

    Typical format: "Rahul Sharma - Senior Consultant - Deloitte India | LinkedIn"

    Returns dict with name, title, company (all strings, may be empty).
    """
    # Strip "| LinkedIn" or "- LinkedIn" suffix (both patterns appear in Google results)
    cleaned = re.sub(r"\s*[-|]\s*LinkedIn\s*$", "", title, flags=re.IGNORECASE).strip()

    parts = [p.strip() for p in cleaned.split(" - ")]

    if len(parts) >= 3:
        # Take last non-empty part as company (skip any extra segments)
        company = parts[2]
        # Guard: if company is still "LinkedIn" somehow, clear it
        if company.lower() == "linkedin":
            company = ""
        return {"name": parts[0], "title": parts[1], "company": company}
    elif len(parts) == 2:
        return {"name": parts[0], "title": parts[1], "company": ""}
    else:
        return {"name": cleaned, "title": "", "company": ""}