File size: 4,883 Bytes
955e73e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9baa8d5
 
955e73e
 
 
 
9baa8d5
 
 
 
 
 
955e73e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Web Search — Serper.dev Google Search API client + LinkedIn profile parser.

Runs Google X-ray queries via Serper.dev and parses LinkedIn profile results
into structured candidate data (name, title, company, URL).

Requires SERPER_API_KEY environment variable (free tier: 2,500 searches).
"""

import os
import re
from typing import Optional
from urllib.parse import urlparse

import requests


class LinkedInSearcher:
    """Searches Google via Serper.dev and parses LinkedIn profile results."""

    API_URL = "https://google.serper.dev/search"

    def __init__(self, api_key: Optional[str] = None):
        # Explicit key wins; otherwise fall back to the environment.
        self.api_key = api_key or os.environ.get("SERPER_API_KEY", "")

    @property
    def is_configured(self) -> bool:
        """True when an API key is available (explicit or from the environment)."""
        return bool(self.api_key)

    def search(self, query: str, num_results: int = 10) -> list[dict]:
        """Run a single Google search via Serper.dev.

        Returns the raw "organic" results list from the API response.

        Raises:
            RuntimeError: if no API key is configured.
            requests.HTTPError: on a non-2xx API response.
        """
        if not self.api_key:
            raise RuntimeError("SERPER_API_KEY is not set.")

        payload = {"q": query, "num": num_results}
        headers = {
            "X-API-KEY": self.api_key,
            "Content-Type": "application/json",
        }
        response = requests.post(
            self.API_URL, json=payload, headers=headers, timeout=15
        )
        response.raise_for_status()
        return response.json().get("organic", [])

    def search_candidates(
        self,
        queries: list[str],
        max_per_query: int = 10,
        max_queries: int = 3,
    ) -> list[dict]:
        """Run top N queries and return deduplicated LinkedIn candidate profiles.

        Returns list of dicts:
            {name, title, company, linkedin_url, snippet, source_query, matched_queries}
        """
        # Candidates keyed by normalized profile URL, so the same person
        # surfaced by several queries is counted once.
        by_url: dict[str, dict] = {}

        for q in queries[:max_queries]:
            try:
                organic = self.search(q, num_results=max_per_query)
            except Exception:
                # Best-effort: one failing query must not sink the batch.
                continue

            for result in organic:
                url = result.get("link", "")
                if not _is_linkedin_profile(url):
                    continue

                key = _normalize_linkedin_url(url)
                existing = by_url.get(key)
                if existing is not None:
                    # Already seen — just bump the cross-query match count.
                    existing["matched_queries"] += 1
                    continue

                fields = _parse_linkedin_title(result.get("title", ""))
                by_url[key] = {
                    "name": fields["name"],
                    "title": fields["title"],
                    "company": fields["company"],
                    "linkedin_url": url,
                    "snippet": (result.get("snippet") or "")[:200],
                    "source_query": q,
                    "matched_queries": 1,
                }

        # Most query matches first; ties broken alphabetically by name.
        return sorted(
            by_url.values(),
            key=lambda c: (-c["matched_queries"], c["name"].lower()),
        )


# ── Helpers ──────────────────────────────────────


def _is_linkedin_profile(url: str) -> bool:
    """Return True only for linkedin.com/in/ profile URLs."""
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
        return (
            "linkedin.com" in host
            and parsed.path.startswith("/in/")
        )
    except Exception:
        return False


def _normalize_linkedin_url(url: str) -> str:
    """Normalize a LinkedIn profile URL for deduplication."""
    try:
        parsed = urlparse(url)
        # Strip query params and trailing slashes, lowercase
        path = parsed.path.rstrip("/").lower()
        return f"linkedin.com{path}"
    except Exception:
        return url.lower()


def _parse_linkedin_title(title: str) -> dict:
    """Parse a Google result title for a LinkedIn profile.

    Typical format: "Rahul Sharma - Senior Consultant - Deloitte India | LinkedIn"

    Returns dict with name, title, company (all strings, may be empty).
    """
    # Strip "| LinkedIn" or "- LinkedIn" suffix (both patterns appear in Google results)
    cleaned = re.sub(r"\s*[-|]\s*LinkedIn\s*$", "", title, flags=re.IGNORECASE).strip()

    parts = [p.strip() for p in cleaned.split(" - ")]

    if len(parts) >= 3:
        # Take last non-empty part as company (skip any extra segments)
        company = parts[2]
        # Guard: if company is still "LinkedIn" somehow, clear it
        if company.lower() == "linkedin":
            company = ""
        return {"name": parts[0], "title": parts[1], "company": company}
    elif len(parts) == 2:
        return {"name": parts[0], "title": parts[1], "company": ""}
    else:
        return {"name": cleaned, "title": "", "company": ""}