Spaces:
No application file
No application file
fixed data loading
Browse files
serpapi search working (lnkd, gh),
rapidapi fetch working (lnkd),
github fetch working,
url extraction working,
tests passing,
added sample test data to the test/data dir to prevent unnecessary API quota usage. TODO: add a test flag so the tests run against this sample data.
Signed-off-by: pranshu-raj-211 <pranshuraj65536@gmail.com>
- app/config.py +1 -1
- app/services/api_clients.py +327 -221
- test/data/github_profile.html +0 -0
- test/data/github_search.json +121 -0
- test/data/github_urls.txt +1 -0
- test/data/linkedin_profile.json +173 -0
- test/data/linkedin_search.json +448 -0
- test/data/linkedin_urls.txt +19 -0
app/config.py
CHANGED
|
@@ -13,7 +13,7 @@ class Settings(BaseSettings):
|
|
| 13 |
database_name: str = "linkedin_sourcing"
|
| 14 |
|
| 15 |
max_concurrent_requests: int = 5
|
| 16 |
-
request_delay: float =
|
| 17 |
max_retries: int = 3
|
| 18 |
|
| 19 |
bloom_filter_capacity: int = 100000
|
|
|
|
| 13 |
database_name: str = "linkedin_sourcing"
|
| 14 |
|
| 15 |
max_concurrent_requests: int = 5
|
| 16 |
+
request_delay: float = 5.0
|
| 17 |
max_retries: int = 3
|
| 18 |
|
| 19 |
bloom_filter_capacity: int = 100000
|
app/services/api_clients.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
import asyncio
|
| 2 |
import httpx
|
|
|
|
|
|
|
|
|
|
| 3 |
from typing import Any, Optional
|
| 4 |
from fake_useragent import UserAgent
|
| 5 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 6 |
-
from serpapi import GoogleSearch
|
| 7 |
|
| 8 |
from app.config import settings
|
| 9 |
from app.models.schemas import SearchQuery
|
|
@@ -46,75 +48,145 @@ class SerpAPIClient:
|
|
| 46 |
"""Client for SerpAPI Google search"""
|
| 47 |
|
| 48 |
def __init__(self):
|
| 49 |
-
self.
|
| 50 |
self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
@retry(
|
| 53 |
stop=stop_after_attempt(settings.max_retries),
|
| 54 |
wait=wait_exponential(multiplier=1, min=4, max=10),
|
|
|
|
| 55 |
)
|
| 56 |
-
async def
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
async with self.semaphore:
|
| 59 |
try:
|
| 60 |
-
api_key =
|
| 61 |
-
|
| 62 |
-
params = {
|
| 63 |
-
"q": query.query,
|
| 64 |
-
"api_key": api_key,
|
| 65 |
-
"engine": "google",
|
| 66 |
-
"num": query.max_results,
|
| 67 |
-
"start": 0,
|
| 68 |
-
"gl": "us", # Country
|
| 69 |
-
"hl": "en", # Language
|
| 70 |
-
}
|
| 71 |
|
| 72 |
logger.info(
|
| 73 |
"serpapi_request",
|
| 74 |
-
extra={
|
|
|
|
|
|
|
|
|
|
| 75 |
)
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
result = await asyncio.get_event_loop().run_in_executor(
|
| 80 |
-
None, search.get_dict
|
| 81 |
)
|
|
|
|
| 82 |
|
|
|
|
| 83 |
organic_results = result.get("organic_results", [])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
logger.info(
|
| 86 |
"serpapi_response",
|
| 87 |
-
extra={
|
|
|
|
|
|
|
|
|
|
| 88 |
)
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
except Exception as e:
|
| 93 |
-
logger.
|
| 94 |
-
"
|
|
|
|
| 95 |
)
|
| 96 |
raise
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
async def batch_search(
|
| 99 |
self, queries: list[SearchQuery]
|
| 100 |
) -> dict[str, list[dict[str, Any]]]:
|
| 101 |
"""Execute multiple search queries concurrently"""
|
| 102 |
tasks = []
|
| 103 |
-
for
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# Add delay between requests to avoid rate limiting
|
|
|
|
| 108 |
await asyncio.sleep(settings.request_delay)
|
| 109 |
|
| 110 |
results = {}
|
| 111 |
for query_text, task in tasks:
|
| 112 |
try:
|
| 113 |
-
|
| 114 |
-
results[query_text] =
|
| 115 |
except Exception as e:
|
| 116 |
-
logger.
|
| 117 |
-
"
|
|
|
|
| 118 |
)
|
| 119 |
results[query_text] = []
|
| 120 |
|
|
@@ -122,56 +194,99 @@ class SerpAPIClient:
|
|
| 122 |
|
| 123 |
|
| 124 |
class LinkedInProfileClient:
|
| 125 |
-
"""Client for fetching LinkedIn profile data"""
|
| 126 |
|
| 127 |
def __init__(self):
|
| 128 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
@retry(
|
| 132 |
stop=stop_after_attempt(settings.max_retries),
|
| 133 |
wait=wait_exponential(multiplier=1, min=2, max=8),
|
|
|
|
| 134 |
)
|
| 135 |
async def fetch_profile(self, linkedin_url: str) -> Optional[dict[str, Any]]:
|
| 136 |
-
"""Fetch LinkedIn profile data"""
|
| 137 |
async with self.semaphore:
|
| 138 |
try:
|
| 139 |
-
|
| 140 |
-
"
|
| 141 |
-
"
|
| 142 |
-
"
|
| 143 |
-
"
|
| 144 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
}
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
)
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
except Exception:
|
| 173 |
logger.exception(
|
| 174 |
-
"Failed to fetch
|
|
|
|
| 175 |
)
|
| 176 |
raise
|
| 177 |
|
|
@@ -184,8 +299,7 @@ class LinkedInProfileClient:
|
|
| 184 |
task = asyncio.create_task(self.fetch_profile(url))
|
| 185 |
tasks.append((url, task))
|
| 186 |
|
| 187 |
-
|
| 188 |
-
await asyncio.sleep(0.5)
|
| 189 |
|
| 190 |
results = {}
|
| 191 |
for url, task in tasks:
|
|
@@ -194,7 +308,8 @@ class LinkedInProfileClient:
|
|
| 194 |
results[url] = result
|
| 195 |
except Exception:
|
| 196 |
logger.exception(
|
| 197 |
-
"Batch
|
|
|
|
| 198 |
)
|
| 199 |
results[url] = None
|
| 200 |
|
|
@@ -202,106 +317,82 @@ class LinkedInProfileClient:
|
|
| 202 |
|
| 203 |
|
| 204 |
class GitHubClient:
|
| 205 |
-
"""Client for fetching GitHub profile
|
| 206 |
|
| 207 |
def __init__(self):
|
| 208 |
-
self.
|
| 209 |
self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
)
|
| 214 |
-
async def fetch_github_profile(self, github_url: str) -> Optional[dict[str, Any]]:
|
| 215 |
-
"""Fetch GitHub profile data"""
|
| 216 |
-
async with self.semaphore:
|
| 217 |
-
try:
|
| 218 |
-
headers = {
|
| 219 |
-
"User-Agent": self.ua_rotator.get_random_ua(),
|
| 220 |
-
"Accept": "application/vnd.github.v3+json",
|
| 221 |
-
}
|
| 222 |
-
|
| 223 |
-
async with httpx.AsyncClient(timeout=20.0, headers=headers) as client:
|
| 224 |
-
logger.debug(
|
| 225 |
-
"Got req to fetch github profile", extra={"gh_url": github_url}
|
| 226 |
-
)
|
| 227 |
-
|
| 228 |
-
response = await client.get(github_url)
|
| 229 |
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
"url": github_url,
|
| 233 |
-
"html": response.text,
|
| 234 |
-
"status": "success",
|
| 235 |
-
}
|
| 236 |
-
else:
|
| 237 |
-
logger.warning(
|
| 238 |
-
"Failed to fetch gh profile", extra={"gh_url": github_url}
|
| 239 |
-
)
|
| 240 |
-
return None
|
| 241 |
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
)
|
| 246 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
|
| 249 |
class URLExtractor:
|
| 250 |
-
"""Extracts URLs from search results"""
|
| 251 |
-
|
| 252 |
-
def
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
for result in search_results:
|
| 257 |
link = result.get("link", "")
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
if self._is_valid_linkedin_url(link):
|
| 262 |
-
urls.append(link)
|
| 263 |
-
|
| 264 |
-
# Remove duplicates while preserving order
|
| 265 |
-
unique_urls = []
|
| 266 |
-
seen = set()
|
| 267 |
-
for url in urls:
|
| 268 |
-
if url not in seen:
|
| 269 |
-
unique_urls.append(url)
|
| 270 |
-
seen.add(url)
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
def extract_github_urls(self, search_results: list[dict[str, Any]]) -> list[str]:
|
| 276 |
"""Extract GitHub profile URLs from search results"""
|
| 277 |
-
urls =
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
link = result.get("link", "")
|
| 281 |
-
|
| 282 |
-
if self._is_valid_github_url(link):
|
| 283 |
-
urls.append(link)
|
| 284 |
-
|
| 285 |
-
unique_urls = list(dict.fromkeys(urls)) # Remove duplicates
|
| 286 |
-
logger.info("extracted_github_urls", extra={"count": len(unique_urls)})
|
| 287 |
-
return unique_urls
|
| 288 |
|
| 289 |
def _is_valid_linkedin_url(self, url: str) -> bool:
|
| 290 |
"""Validate LinkedIn profile URL"""
|
| 291 |
if not url:
|
| 292 |
return False
|
| 293 |
|
| 294 |
-
#
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
"www.linkedin.com/in/",
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
# Must contain a LinkedIn profile pattern
|
| 301 |
-
if not any(pattern in url.lower() for pattern in linkedin_patterns):
|
| 302 |
-
return False
|
| 303 |
|
| 304 |
-
# Exclude
|
| 305 |
exclude_patterns = [
|
| 306 |
"/company/",
|
| 307 |
"/school/",
|
|
@@ -309,109 +400,124 @@ class URLExtractor:
|
|
| 309 |
"/jobs/",
|
| 310 |
"/directory/",
|
| 311 |
"/pub/dir/",
|
| 312 |
-
"linkedin.com/pub/",
|
| 313 |
"/learning/",
|
| 314 |
"/events/",
|
| 315 |
"/posts/",
|
| 316 |
]
|
| 317 |
|
|
|
|
| 318 |
if any(pattern in url.lower() for pattern in exclude_patterns):
|
| 319 |
return False
|
| 320 |
|
| 321 |
-
|
|
|
|
| 322 |
|
| 323 |
def _is_valid_github_url(self, url: str) -> bool:
|
| 324 |
"""Validate GitHub profile URL"""
|
| 325 |
if not url:
|
| 326 |
return False
|
| 327 |
|
| 328 |
-
#
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
-
|
| 332 |
-
return False
|
| 333 |
-
|
| 334 |
-
# Exclude repository pages, organization pages, etc.
|
| 335 |
exclude_patterns = [
|
| 336 |
-
"/repos/",
|
| 337 |
-
"/orgs/",
|
| 338 |
-
"/gist/",
|
| 339 |
-
"/issues/",
|
| 340 |
-
"/pulls/",
|
| 341 |
-
"/wiki/",
|
| 342 |
-
"/releases/",
|
| 343 |
"/tree/",
|
| 344 |
"/blob/",
|
| 345 |
"/commit/",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
"/compare/",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
]
|
| 348 |
-
|
| 349 |
if any(pattern in url.lower() for pattern in exclude_patterns):
|
| 350 |
return False
|
|
|
|
| 351 |
|
| 352 |
-
# Simple check: should be github.com/username format
|
| 353 |
-
parts = url.replace("https://", "").replace("http://", "").split("/")
|
| 354 |
-
if len(parts) >= 2 and parts[0] in ["github.com", "www.github.com"]:
|
| 355 |
-
username = parts[1]
|
| 356 |
-
# Basic username validation
|
| 357 |
-
if (
|
| 358 |
-
len(username) > 0
|
| 359 |
-
and not username.startswith("-")
|
| 360 |
-
and not username.endswith("-")
|
| 361 |
-
):
|
| 362 |
-
return True
|
| 363 |
-
|
| 364 |
-
return False
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
# Client factory for easy instantiation
|
| 368 |
-
class APIClientFactory:
|
| 369 |
-
"""Factory for creating API clients"""
|
| 370 |
-
|
| 371 |
-
@staticmethod
|
| 372 |
-
def create_serpapi_client() -> SerpAPIClient:
|
| 373 |
-
return SerpAPIClient()
|
| 374 |
|
| 375 |
-
@staticmethod
|
| 376 |
-
def create_linkedin_client() -> LinkedInProfileClient:
|
| 377 |
-
return LinkedInProfileClient()
|
| 378 |
-
|
| 379 |
-
@staticmethod
|
| 380 |
-
def create_github_client() -> GitHubClient:
|
| 381 |
-
return GitHubClient()
|
| 382 |
-
|
| 383 |
-
@staticmethod
|
| 384 |
-
def create_url_extractor() -> URLExtractor:
|
| 385 |
-
return URLExtractor()
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
# Example usage
|
| 389 |
async def test_clients():
|
| 390 |
"""Test function for API clients"""
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
|
| 416 |
|
| 417 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import httpx
|
| 3 |
+
import json
|
| 4 |
+
import re
|
| 5 |
+
import traceback
|
| 6 |
from typing import Any, Optional
|
| 7 |
from fake_useragent import UserAgent
|
| 8 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
|
|
| 9 |
|
| 10 |
from app.config import settings
|
| 11 |
from app.models.schemas import SearchQuery
|
|
|
|
| 48 |
"""Client for SerpAPI Google search"""
|
| 49 |
|
| 50 |
def __init__(self):
|
| 51 |
+
self._client = httpx.AsyncClient(base_url="https://serpapi.com/search")
|
| 52 |
self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
|
| 53 |
|
| 54 |
+
# Context manager to ensure httpx client is closed properly
|
| 55 |
+
async def __aenter__(self):
|
| 56 |
+
return self
|
| 57 |
+
|
| 58 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
| 59 |
+
await self._client.aclose()
|
| 60 |
+
|
| 61 |
@retry(
|
| 62 |
stop=stop_after_attempt(settings.max_retries),
|
| 63 |
wait=wait_exponential(multiplier=1, min=4, max=10),
|
| 64 |
+
reraise=True, # Ensure exceptions are re-raised after retries
|
| 65 |
)
|
| 66 |
+
async def _execute_search_request(
|
| 67 |
+
self, params: dict[str, Any]
|
| 68 |
+
) -> list[dict[str, Any]]:
|
| 69 |
+
"""
|
| 70 |
+
Internal method to execute a single SerpAPI request with retries.
|
| 71 |
+
It takes a dictionary of parameters and returns the JSON response.
|
| 72 |
+
"""
|
| 73 |
async with self.semaphore:
|
| 74 |
try:
|
| 75 |
+
params["api_key"] = settings.serpapi_key
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
logger.info(
|
| 78 |
"serpapi_request",
|
| 79 |
+
extra={
|
| 80 |
+
"query": params.get("q"),
|
| 81 |
+
"platform": params.get("platform"),
|
| 82 |
+
},
|
| 83 |
)
|
| 84 |
|
| 85 |
+
response = await self._client.get(
|
| 86 |
+
url="https://serpapi.com/search", params=params
|
|
|
|
|
|
|
| 87 |
)
|
| 88 |
+
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
|
| 89 |
|
| 90 |
+
result = response.json()
|
| 91 |
organic_results = result.get("organic_results", [])
|
| 92 |
+
if not organic_results:
|
| 93 |
+
logger.warning(
|
| 94 |
+
f"Did not get any organic results for {params.get('q')}"
|
| 95 |
+
)
|
| 96 |
|
| 97 |
logger.info(
|
| 98 |
"serpapi_response",
|
| 99 |
+
extra={
|
| 100 |
+
"results_count": len(result.get("organic_results", [])),
|
| 101 |
+
"query": params.get("q"),
|
| 102 |
+
},
|
| 103 |
)
|
| 104 |
+
return result
|
| 105 |
|
| 106 |
+
except httpx.HTTPStatusError as e:
|
| 107 |
+
logger.exception(
|
| 108 |
+
"serpapi_http_error",
|
| 109 |
+
extra={
|
| 110 |
+
"error": str(e),
|
| 111 |
+
"query": params.get("q"),
|
| 112 |
+
"status_code": e.response.status_code,
|
| 113 |
+
},
|
| 114 |
+
)
|
| 115 |
+
raise
|
| 116 |
+
except httpx.RequestError as e:
|
| 117 |
+
logger.exception(
|
| 118 |
+
"serpapi_request_error",
|
| 119 |
+
extra={"error": str(e), "query": params.get("q")},
|
| 120 |
+
)
|
| 121 |
+
raise
|
| 122 |
+
except json.JSONDecodeError as e:
|
| 123 |
+
logger.exception(
|
| 124 |
+
"serpapi_json_decode_error",
|
| 125 |
+
extra={
|
| 126 |
+
"error": str(e),
|
| 127 |
+
"query": params.get("q"),
|
| 128 |
+
"response_text": (
|
| 129 |
+
response.text if "response" in locals() else "N/A"
|
| 130 |
+
),
|
| 131 |
+
},
|
| 132 |
+
)
|
| 133 |
+
raise
|
| 134 |
except Exception as e:
|
| 135 |
+
logger.exception(
|
| 136 |
+
"serpapi_unexpected_error",
|
| 137 |
+
extra={"error": str(e), "query": params.get("q")},
|
| 138 |
)
|
| 139 |
raise
|
| 140 |
|
| 141 |
+
async def search(
|
| 142 |
+
self, query_string: str, max_results: int = 20
|
| 143 |
+
) -> list[dict[str, Any]]:
|
| 144 |
+
"""
|
| 145 |
+
Execute a single search query via SerpAPI and return the raw JSON response.
|
| 146 |
+
This method directly takes a string query - expected google dork query.
|
| 147 |
+
"""
|
| 148 |
+
params = {
|
| 149 |
+
"q": query_string,
|
| 150 |
+
"engine": "google",
|
| 151 |
+
"num": max_results,
|
| 152 |
+
"start": 0,
|
| 153 |
+
"gl": "us", # Country
|
| 154 |
+
"hl": "en", # Language
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
full_result = await self._execute_search_request(params)
|
| 158 |
+
return full_result
|
| 159 |
+
|
| 160 |
async def batch_search(
|
| 161 |
self, queries: list[SearchQuery]
|
| 162 |
) -> dict[str, list[dict[str, Any]]]:
|
| 163 |
"""Execute multiple search queries concurrently"""
|
| 164 |
tasks = []
|
| 165 |
+
for query_obj in queries:
|
| 166 |
+
params = {
|
| 167 |
+
"q": query_obj.query,
|
| 168 |
+
"engine": "google",
|
| 169 |
+
"num": query_obj.max_results,
|
| 170 |
+
"start": 0,
|
| 171 |
+
"gl": "us",
|
| 172 |
+
"hl": "en",
|
| 173 |
+
}
|
| 174 |
+
task = asyncio.create_task(self._execute_search_request(params))
|
| 175 |
+
tasks.append((query_obj.query, task))
|
| 176 |
|
| 177 |
# Add delay between requests to avoid rate limiting
|
| 178 |
+
# This applies to the *initiation* of requests, not their completion.
|
| 179 |
await asyncio.sleep(settings.request_delay)
|
| 180 |
|
| 181 |
results = {}
|
| 182 |
for query_text, task in tasks:
|
| 183 |
try:
|
| 184 |
+
full_result = await task
|
| 185 |
+
results[query_text] = full_result
|
| 186 |
except Exception as e:
|
| 187 |
+
logger.exception(
|
| 188 |
+
"batch_search_item_error",
|
| 189 |
+
extra={"query": query_text, "error": str(e)},
|
| 190 |
)
|
| 191 |
results[query_text] = []
|
| 192 |
|
|
|
|
| 194 |
|
| 195 |
|
| 196 |
class LinkedInProfileClient:
|
| 197 |
+
"""Client for fetching LinkedIn profile data from RapidAPI"""
|
| 198 |
|
| 199 |
def __init__(self):
|
| 200 |
+
self._client = httpx.AsyncClient(
|
| 201 |
+
base_url="https://fresh-linkedin-profile-data.p.rapidapi.com",
|
| 202 |
+
headers={
|
| 203 |
+
"x-rapidapi-host": "fresh-linkedin-profile-data.p.rapidapi.com",
|
| 204 |
+
"x-rapidapi-key": settings.rapidapi_api_key,
|
| 205 |
+
},
|
| 206 |
+
timeout=30.0,
|
| 207 |
+
)
|
| 208 |
self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
|
| 209 |
|
| 210 |
+
async def __aenter__(self):
|
| 211 |
+
return self
|
| 212 |
+
|
| 213 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
| 214 |
+
await self._client.aclose()
|
| 215 |
+
|
| 216 |
@retry(
|
| 217 |
stop=stop_after_attempt(settings.max_retries),
|
| 218 |
wait=wait_exponential(multiplier=1, min=2, max=8),
|
| 219 |
+
reraise=True,
|
| 220 |
)
|
| 221 |
async def fetch_profile(self, linkedin_url: str) -> Optional[dict[str, Any]]:
|
| 222 |
+
"""Fetch LinkedIn profile data from RapidAPI"""
|
| 223 |
async with self.semaphore:
|
| 224 |
try:
|
| 225 |
+
params = {
|
| 226 |
+
"linkedin_url": linkedin_url,
|
| 227 |
+
"include_skills": "false",
|
| 228 |
+
"include_certifications": "false",
|
| 229 |
+
"include_publications": "false",
|
| 230 |
+
"include_honors": "false",
|
| 231 |
+
"include_volunteers": "false",
|
| 232 |
+
"include_projects": "false",
|
| 233 |
+
"include_patents": "false",
|
| 234 |
+
"include_courses": "false",
|
| 235 |
+
"include_organizations": "false",
|
| 236 |
+
"include_profile_status": "false",
|
| 237 |
+
"include_company_public_url": "true",
|
| 238 |
}
|
| 239 |
|
| 240 |
+
logger.debug("Starting RapidAPI LinkedIn profile fetch.")
|
| 241 |
+
|
| 242 |
+
response = await self._client.get(
|
| 243 |
+
url="/get-linkedin-profile", params=params
|
| 244 |
+
)
|
| 245 |
+
response.raise_for_status()
|
| 246 |
+
|
| 247 |
+
profile_data = response.json()
|
| 248 |
+
logger.debug(
|
| 249 |
+
"Got RapidAPI LinkedIn profile response",
|
| 250 |
+
extra={"status_code": response.status_code},
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
return profile_data
|
| 254 |
+
|
| 255 |
+
except httpx.HTTPStatusError as e:
|
| 256 |
+
logger.exception(
|
| 257 |
+
"RapidAPI LinkedIn fetch HTTP error",
|
| 258 |
+
extra={
|
| 259 |
+
"linkedin_url": linkedin_url,
|
| 260 |
+
"status_code": e.response.status_code,
|
| 261 |
+
"response_text": e.response.text,
|
| 262 |
+
"error": str(e),
|
| 263 |
+
},
|
| 264 |
+
)
|
| 265 |
+
raise
|
| 266 |
+
|
| 267 |
+
except httpx.RequestError as e:
|
| 268 |
+
logger.exception(
|
| 269 |
+
"RapidAPI LinkedIn fetch request error",
|
| 270 |
+
extra={"linkedin_url": linkedin_url, "error": str(e)},
|
| 271 |
+
)
|
| 272 |
+
raise
|
| 273 |
|
| 274 |
+
except json.JSONDecodeError as e:
|
| 275 |
+
logger.exception(
|
| 276 |
+
"RapidAPI LinkedIn JSON decode error",
|
| 277 |
+
extra={
|
| 278 |
+
"linkedin_url": linkedin_url,
|
| 279 |
+
"error": str(e),
|
| 280 |
+
"response_text": (
|
| 281 |
+
response.text if "response" in locals() else "N/A"
|
| 282 |
+
),
|
| 283 |
+
},
|
| 284 |
+
)
|
| 285 |
+
raise
|
| 286 |
except Exception:
|
| 287 |
logger.exception(
|
| 288 |
+
"Failed to fetch LinkedIn profile (unexpected error)",
|
| 289 |
+
extra={"linkedin_url": linkedin_url},
|
| 290 |
)
|
| 291 |
raise
|
| 292 |
|
|
|
|
| 299 |
task = asyncio.create_task(self.fetch_profile(url))
|
| 300 |
tasks.append((url, task))
|
| 301 |
|
| 302 |
+
await asyncio.sleep(5.0)
|
|
|
|
| 303 |
|
| 304 |
results = {}
|
| 305 |
for url, task in tasks:
|
|
|
|
| 308 |
results[url] = result
|
| 309 |
except Exception:
|
| 310 |
logger.exception(
|
| 311 |
+
"Batch LinkedIn profile fetch failed for URL",
|
| 312 |
+
extra={"linkedin_url": url},
|
| 313 |
)
|
| 314 |
results[url] = None
|
| 315 |
|
|
|
|
| 317 |
|
| 318 |
|
| 319 |
class GitHubClient:
|
| 320 |
+
"""Client for fetching GitHub profile HTML"""
|
| 321 |
|
| 322 |
def __init__(self):
|
| 323 |
+
self._client = httpx.AsyncClient(timeout=20.0)
|
| 324 |
self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
|
| 325 |
|
| 326 |
+
async def __aenter__(self):
|
| 327 |
+
return self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
|
| 329 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
| 330 |
+
await self._client.aclose()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
+
async def fetch_github_profile_html(self, github_url: str) -> Optional[str]:
|
| 333 |
+
"""Fetch GitHub profile page HTML"""
|
| 334 |
+
async with self.semaphore:
|
| 335 |
+
try:
|
| 336 |
+
response = await self._client.get(github_url)
|
| 337 |
+
response.raise_for_status()
|
| 338 |
+
return response.text
|
| 339 |
+
except httpx.HTTPStatusError as e:
|
| 340 |
+
logger.warning(
|
| 341 |
+
f"Failed to fetch {github_url}: HTTP {e.response.status_code}"
|
| 342 |
)
|
| 343 |
return None
|
| 344 |
+
except httpx.RequestError as e:
|
| 345 |
+
logger.warning(f"Request error for {github_url}: {e}")
|
| 346 |
+
return None
|
| 347 |
+
except Exception:
|
| 348 |
+
logger.exception(f"Unexpected error fetching {github_url}")
|
| 349 |
+
return None
|
| 350 |
|
| 351 |
|
| 352 |
class URLExtractor:
|
| 353 |
+
"""Extracts specific profile URLs from search results"""
|
| 354 |
+
|
| 355 |
+
def _extract_and_filter_urls(
|
| 356 |
+
self, search_results: list[dict[str, Any]], validator_func
|
| 357 |
+
) -> list[str]:
|
| 358 |
+
"""
|
| 359 |
+
Generic method to extract and filter URLs based on a validator function.
|
| 360 |
+
Handles deduplication.
|
| 361 |
+
"""
|
| 362 |
+
unique_urls: set[str] = set()
|
| 363 |
for result in search_results:
|
| 364 |
link = result.get("link", "")
|
| 365 |
+
if validator_func(link):
|
| 366 |
+
unique_urls.add(link)
|
| 367 |
+
return list(unique_urls)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
|
| 369 |
+
def extract_linkedin_urls(self, search_results: list[dict[str, Any]]) -> list[str]:
|
| 370 |
+
"""Extract LinkedIn profile URLs from search results"""
|
| 371 |
+
urls = self._extract_and_filter_urls(
|
| 372 |
+
search_results, self._is_valid_linkedin_url
|
| 373 |
+
)
|
| 374 |
+
logger.info("extracted_linkedin_urls", extra={"count": len(urls)})
|
| 375 |
+
return urls
|
| 376 |
|
| 377 |
def extract_github_urls(self, search_results: list[dict[str, Any]]) -> list[str]:
|
| 378 |
"""Extract GitHub profile URLs from search results"""
|
| 379 |
+
urls = self._extract_and_filter_urls(search_results, self._is_valid_github_url)
|
| 380 |
+
logger.info("extracted_github_urls", extra={"count": len(urls)})
|
| 381 |
+
return urls
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
def _is_valid_linkedin_url(self, url: str) -> bool:
|
| 384 |
"""Validate LinkedIn profile URL"""
|
| 385 |
if not url:
|
| 386 |
return False
|
| 387 |
|
| 388 |
+
# Use regex for more precise validation of /in/ or /pub/ type profiles
|
| 389 |
+
# Allowing for optional trailing slash or query parameters
|
| 390 |
+
linkedin_profile_pattern = re.compile(
|
| 391 |
+
r"https?:\/\/(?:www\.)?linkedin\.com\/(?:in|pub)\/[a-zA-Z0-9_-]+\/?(?:[?#].*)?$",
|
| 392 |
+
re.IGNORECASE,
|
| 393 |
+
)
|
|
|
|
|
|
|
|
|
|
| 394 |
|
| 395 |
+
# Exclude common non-profile pages
|
| 396 |
exclude_patterns = [
|
| 397 |
"/company/",
|
| 398 |
"/school/",
|
|
|
|
| 400 |
"/jobs/",
|
| 401 |
"/directory/",
|
| 402 |
"/pub/dir/",
|
|
|
|
| 403 |
"/learning/",
|
| 404 |
"/events/",
|
| 405 |
"/posts/",
|
| 406 |
]
|
| 407 |
|
| 408 |
+
# Check against exclude patterns first (case-insensitive)
|
| 409 |
if any(pattern in url.lower() for pattern in exclude_patterns):
|
| 410 |
return False
|
| 411 |
|
| 412 |
+
# Final check using the profile pattern
|
| 413 |
+
return bool(linkedin_profile_pattern.match(url))
|
| 414 |
|
| 415 |
def _is_valid_github_url(self, url: str) -> bool:
|
| 416 |
"""Validate GitHub profile URL"""
|
| 417 |
if not url:
|
| 418 |
return False
|
| 419 |
|
| 420 |
+
# Regex to match github.com/username, excluding common repo/org paths.
|
| 421 |
+
# This regex tries to ensure it's a root-level profile path.
|
| 422 |
+
github_profile_pattern = re.compile(
|
| 423 |
+
r"https?:\/\/(?:www\.)?github\.com\/[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,37}[a-zA-Z0-9])?\/?(?:[?#].*)?$",
|
| 424 |
+
re.IGNORECASE,
|
| 425 |
+
)
|
| 426 |
|
| 427 |
+
# Exclude common non-profile pages more broadly to catch variations
|
|
|
|
|
|
|
|
|
|
| 428 |
exclude_patterns = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
"/tree/",
|
| 430 |
"/blob/",
|
| 431 |
"/commit/",
|
| 432 |
+
"/pull/",
|
| 433 |
+
"/issues/",
|
| 434 |
+
"/wiki/",
|
| 435 |
+
"/releases/",
|
| 436 |
+
"/actions/",
|
| 437 |
+
"/projects/",
|
| 438 |
+
"/settings/",
|
| 439 |
+
"/new/",
|
| 440 |
+
"/organizations/",
|
| 441 |
+
"/users/",
|
| 442 |
+
"/stars/",
|
| 443 |
+
"/forks/",
|
| 444 |
"/compare/",
|
| 445 |
+
"/marketplace/",
|
| 446 |
+
"/topics/",
|
| 447 |
+
"/collections/",
|
| 448 |
+
"/trending/",
|
| 449 |
+
"/explore/",
|
| 450 |
+
"/codespaces/",
|
| 451 |
+
"/sponsors/",
|
| 452 |
+
"/notifications/",
|
| 453 |
]
|
|
|
|
| 454 |
if any(pattern in url.lower() for pattern in exclude_patterns):
|
| 455 |
return False
|
| 456 |
+
return bool(github_profile_pattern.match(url))
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
async def test_clients():
|
| 460 |
"""Test function for API clients"""
|
| 461 |
+
async with SerpAPIClient() as serpapi:
|
| 462 |
+
try:
|
| 463 |
+
linkedin_search = await serpapi.search(
|
| 464 |
+
'site:linkedin.com/in "ml research" "llm" "mountain view" -student -intern -"recent graduate"'
|
| 465 |
+
)
|
| 466 |
+
github_search = await serpapi.search(
|
| 467 |
+
'site:github.com "pytorch" "transformers" "nlp" followers:>10 repos:>5'
|
| 468 |
+
)
|
| 469 |
+
|
| 470 |
+
with open("test/data/linkedin_search.json", "w") as f:
|
| 471 |
+
json.dump(linkedin_search, f, indent=4)
|
| 472 |
+
with open("test/data/github_search.json", "w") as f:
|
| 473 |
+
json.dump(github_search, f, indent=4)
|
| 474 |
+
|
| 475 |
+
# Extract organic_results from the full response
|
| 476 |
+
linkedin_results = linkedin_search.get("organic_results", [])
|
| 477 |
+
github_results = github_search.get("organic_results", [])
|
| 478 |
+
|
| 479 |
+
print(f"Found {len(linkedin_results)} lnkd search results")
|
| 480 |
+
print(f"Found {len(github_results)} gh search results")
|
| 481 |
+
|
| 482 |
+
extractor = URLExtractor()
|
| 483 |
+
linkedin_urls = extractor.extract_linkedin_urls(linkedin_results)
|
| 484 |
+
github_urls = extractor.extract_github_urls(github_results)
|
| 485 |
+
|
| 486 |
+
# Store extracted URLs
|
| 487 |
+
with open("test/data/linkedin_urls.txt", "w") as f:
|
| 488 |
+
for url in linkedin_urls:
|
| 489 |
+
f.write(url + "\n")
|
| 490 |
+
with open("test/data/github_urls.txt", "w") as f:
|
| 491 |
+
for url in github_urls:
|
| 492 |
+
f.write(url + "\n")
|
| 493 |
+
|
| 494 |
+
print(f"Extracted {len(linkedin_urls)} LinkedIn URLs")
|
| 495 |
+
print(f"Extracted {len(github_urls)} GH URLs")
|
| 496 |
+
|
| 497 |
+
if linkedin_urls:
|
| 498 |
+
async with LinkedInProfileClient() as linkedin_client:
|
| 499 |
+
profile_data = await linkedin_client.fetch_profile(linkedin_urls[0])
|
| 500 |
+
# Store fetched LinkedIn profile data
|
| 501 |
+
with open("test/data/linkedin_profile.json", "w") as f:
|
| 502 |
+
json.dump(profile_data, f, indent=4)
|
| 503 |
+
print(f"Profile fetch result: {profile_data is not None}")
|
| 504 |
+
|
| 505 |
+
if github_urls:
|
| 506 |
+
async with GitHubClient() as github_client:
|
| 507 |
+
github_data = await github_client.fetch_github_profile_html(
|
| 508 |
+
github_urls[0]
|
| 509 |
+
)
|
| 510 |
+
# Store fetched GitHub profile HTML
|
| 511 |
+
if github_data is not None:
|
| 512 |
+
with open("test/data/github_profile.html", "w") as f:
|
| 513 |
+
f.write(github_data)
|
| 514 |
+
print(f"GH profile fetched: {github_data is not None}")
|
| 515 |
+
if github_data:
|
| 516 |
+
print(f"gh profile html len {len(github_data)}")
|
| 517 |
+
|
| 518 |
+
except Exception as e:
|
| 519 |
+
print(f"Test failed: {e}")
|
| 520 |
+
traceback.print_exc()
|
| 521 |
|
| 522 |
|
| 523 |
if __name__ == "__main__":
|
test/data/github_profile.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
test/data/github_search.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"search_metadata": {
|
| 3 |
+
"id": "6862ba316adf27aef7fa1728",
|
| 4 |
+
"status": "Success",
|
| 5 |
+
"json_endpoint": "https://serpapi.com/searches/19d6b92708c05940/6862ba316adf27aef7fa1728.json",
|
| 6 |
+
"created_at": "2025-06-30 16:24:17 UTC",
|
| 7 |
+
"processed_at": "2025-06-30 16:24:17 UTC",
|
| 8 |
+
"google_url": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=0&sourceid=chrome&ie=UTF-8",
|
| 9 |
+
"raw_html_file": "https://serpapi.com/searches/19d6b92708c05940/6862ba316adf27aef7fa1728.html",
|
| 10 |
+
"total_time_taken": 4.53
|
| 11 |
+
},
|
| 12 |
+
"search_parameters": {
|
| 13 |
+
"engine": "google",
|
| 14 |
+
"q": "site:github.com \"pytorch\" \"transformers\" \"nlp\" followers:>10 repos:>5",
|
| 15 |
+
"google_domain": "google.com",
|
| 16 |
+
"hl": "en",
|
| 17 |
+
"gl": "us",
|
| 18 |
+
"start": 0,
|
| 19 |
+
"num": "20",
|
| 20 |
+
"device": "desktop"
|
| 21 |
+
},
|
| 22 |
+
"search_information": {
|
| 23 |
+
"query_displayed": "site:github.com \"pytorch\" \"transformers\" \"nlp\" followers:>10 repos:>5",
|
| 24 |
+
"total_results": 2,
|
| 25 |
+
"time_taken_displayed": 0.25,
|
| 26 |
+
"organic_results_state": "Results for exact spelling"
|
| 27 |
+
},
|
| 28 |
+
"organic_results": [
|
| 29 |
+
{
|
| 30 |
+
"position": 1,
|
| 31 |
+
"title": "Awesome Transformer & Transfer Learning in NLP Awesome",
|
| 32 |
+
"link": "https://github.com/cedrickchee/awesome-transformer-nlp",
|
| 33 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://github.com/cedrickchee/awesome-transformer-nlp&ved=2ahUKEwjR9Ouux5mOAxXLRTABHRUfMfAQFnoECBoQAQ",
|
| 34 |
+
"displayed_link": "https://github.com \u203a cedrickchee \u203a awesome-transforme...",
|
| 35 |
+
"favicon": "https://serpapi.com/searches/6862ba316adf27aef7fa1728/images/f4c4dd624bf17ff4927fea0004812b7ae14156863c39ff22ad6d01ea9993e985.png",
|
| 36 |
+
"snippet": "How the Transformers broke NLP leaderboards by Anna Rogers. ... PyTorch and TensorFlow. Hugging Face Transformers (formerly known as pytorch-transformers ...",
|
| 37 |
+
"snippet_highlighted_words": [
|
| 38 |
+
"Transformers",
|
| 39 |
+
"NLP",
|
| 40 |
+
"PyTorch",
|
| 41 |
+
"Transformers",
|
| 42 |
+
"pytorch",
|
| 43 |
+
"transformers"
|
| 44 |
+
],
|
| 45 |
+
"source": "GitHub"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"position": 2,
|
| 49 |
+
"title": "Hugging Face",
|
| 50 |
+
"link": "https://github.com/huggingface",
|
| 51 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://github.com/huggingface&ved=2ahUKEwjR9Ouux5mOAxXLRTABHRUfMfAQFnoECB0QAQ",
|
| 52 |
+
"displayed_link": "https://github.com \u203a huggingface",
|
| 53 |
+
"favicon": "https://serpapi.com/searches/6862ba316adf27aef7fa1728/images/f4c4dd624bf17ff4927fea0004812b7a1c6fc5dbd006969ca023f9f5e48fb80d.png",
|
| 54 |
+
"snippet": "Showing 10 of 327 repositories. transformers Public. Transformers: the ... pytorch machine-learning nlp transformers deep-learning \u00b7 GitHub Sponsor ...",
|
| 55 |
+
"snippet_highlighted_words": [
|
| 56 |
+
"10",
|
| 57 |
+
"repositories",
|
| 58 |
+
"transformers",
|
| 59 |
+
"Transformers",
|
| 60 |
+
"pytorch",
|
| 61 |
+
"nlp transformers"
|
| 62 |
+
],
|
| 63 |
+
"sitelinks": {
|
| 64 |
+
"inline": [
|
| 65 |
+
{
|
| 66 |
+
"title": "Transformers",
|
| 67 |
+
"link": "https://github.com/huggingface/transformers"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"title": "Repositories",
|
| 71 |
+
"link": "https://github.com/orgs/huggingface/repositories"
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"title": "Huggingface/datasets",
|
| 75 |
+
"link": "https://github.com/huggingface/datasets"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"title": "Diffusers",
|
| 79 |
+
"link": "https://github.com/huggingface/diffusers"
|
| 80 |
+
}
|
| 81 |
+
]
|
| 82 |
+
},
|
| 83 |
+
"source": "GitHub"
|
| 84 |
+
}
|
| 85 |
+
],
|
| 86 |
+
"related_searches": [
|
| 87 |
+
{
|
| 88 |
+
"block_position": 1,
|
| 89 |
+
"query": "Pytorch transformers nlp followers :> 10 repos :> 5 github",
|
| 90 |
+
"link": "https://www.google.com/search?num=20&sca_esv=6d27a280cc9bfc7a&hl=en&gl=us&q=Pytorch+transformers+nlp+followers+:%3E+10+repos+:%3E+5+github&sa=X&ved=2ahUKEwjR9Ouux5mOAxXLRTABHRUfMfAQ1QJ6BAgSEAE",
|
| 91 |
+
"serpapi_link": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=Pytorch+transformers+nlp+followers+%3A%3E+10+repos+%3A%3E+5+github&start=0"
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"block_position": 1,
|
| 95 |
+
"query": "Pytorch transformers nlp followers :> 10 repos :> 5 example",
|
| 96 |
+
"link": "https://www.google.com/search?num=20&sca_esv=6d27a280cc9bfc7a&hl=en&gl=us&q=Pytorch+transformers+nlp+followers+:%3E+10+repos+:%3E+5+example&sa=X&ved=2ahUKEwjR9Ouux5mOAxXLRTABHRUfMfAQ1QJ6BAgREAE",
|
| 97 |
+
"serpapi_link": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=Pytorch+transformers+nlp+followers+%3A%3E+10+repos+%3A%3E+5+example&start=0"
|
| 98 |
+
}
|
| 99 |
+
],
|
| 100 |
+
"pagination": {
|
| 101 |
+
"current": 1,
|
| 102 |
+
"next": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=20&sourceid=chrome&ie=UTF-8",
|
| 103 |
+
"other_pages": {
|
| 104 |
+
"2": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=20&sourceid=chrome&ie=UTF-8",
|
| 105 |
+
"3": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=40&sourceid=chrome&ie=UTF-8",
|
| 106 |
+
"4": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=60&sourceid=chrome&ie=UTF-8",
|
| 107 |
+
"5": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=80&sourceid=chrome&ie=UTF-8"
|
| 108 |
+
}
|
| 109 |
+
},
|
| 110 |
+
"serpapi_pagination": {
|
| 111 |
+
"current": 1,
|
| 112 |
+
"next_link": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=20",
|
| 113 |
+
"next": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=20",
|
| 114 |
+
"other_pages": {
|
| 115 |
+
"2": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=20",
|
| 116 |
+
"3": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=40",
|
| 117 |
+
"4": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=60",
|
| 118 |
+
"5": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=80"
|
| 119 |
+
}
|
| 120 |
+
}
|
| 121 |
+
}
|
test/data/github_urls.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://github.com/huggingface
|
test/data/linkedin_profile.json
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"data": {
|
| 3 |
+
"about": "Experienced applied scientist with demonstrated success developing and leveraging cutting-edge AI technology with a Gen AI focus to solve real-world problems. Adept at quickly building proof-of-concepts of latest AI/ML research and transitioning them into deployable solutions to deliver business value. Proven track record in applied AI research and innovation with publications in top-tier AI journals, best paper award, and over 1000 citations. Proficient in communicating complex technical concepts and results to diverse audiences.",
|
| 4 |
+
"city": "Houston",
|
| 5 |
+
"company": "NVIDIA",
|
| 6 |
+
"company_description": "Since its founding in 1993, NVIDIA (NASDAQ: NVDA) has been a pioneer in accelerated computing. The company\u2019s invention of the GPU in 1999 sparked the growth of the PC gaming market, redefined computer graphics, ignited the era of modern AI and is fueling the creation of the metaverse. NVIDIA is now a full-stack computing company with data-center-scale offerings that are reshaping industry.",
|
| 7 |
+
"company_domain": "nvidia.com",
|
| 8 |
+
"company_employee_count": 42856,
|
| 9 |
+
"company_employee_range": "10001+",
|
| 10 |
+
"company_industry": "Computer Hardware Manufacturing",
|
| 11 |
+
"company_linkedin_url": "https://www.linkedin.com/company/nvidia",
|
| 12 |
+
"company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQGV36q2EowSyw/company-logo_400_400/company-logo_400_400/0/1724881581208/nvidia_logo?e=1756944000&v=beta&t=hWYnbN29CDwsfzVnmcC87Jh2-TqH_oIPymyMkRXfPJk",
|
| 13 |
+
"company_website": "http://www.nvidia.com",
|
| 14 |
+
"company_year_founded": 1993,
|
| 15 |
+
"connection_count": 544,
|
| 16 |
+
"country": "United States",
|
| 17 |
+
"current_company_join_month": 4,
|
| 18 |
+
"current_company_join_year": 2024,
|
| 19 |
+
"current_job_duration": "1 yr 3 mos",
|
| 20 |
+
"educations": [
|
| 21 |
+
{
|
| 22 |
+
"activities": "",
|
| 23 |
+
"date_range": "2014 - 2017",
|
| 24 |
+
"degree": "Doctor of Philosophy (PhD)",
|
| 25 |
+
"end_month": "",
|
| 26 |
+
"end_year": 2017,
|
| 27 |
+
"field_of_study": "Civil/Structural Engineering",
|
| 28 |
+
"school": "Louisiana State University",
|
| 29 |
+
"school_id": "164771",
|
| 30 |
+
"school_linkedin_url": "https://www.linkedin.com/company/164771/",
|
| 31 |
+
"school_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQEhy54pBgGE9g/company-logo_200_200/company-logo_200_200/0/1681160348853/louisiana_state_university_logo?e=1756944000&v=beta&t=Uz-JKcul07L8O2pEk1nV7ImIGLtw3Dr8thHISsfqSJA",
|
| 32 |
+
"start_month": "",
|
| 33 |
+
"start_year": 2014
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"activities": "",
|
| 37 |
+
"date_range": "2010 - 2014",
|
| 38 |
+
"degree": "Bachelor's degree",
|
| 39 |
+
"end_month": "",
|
| 40 |
+
"end_year": 2014,
|
| 41 |
+
"field_of_study": "Civil Engineering",
|
| 42 |
+
"school": "Hunan University",
|
| 43 |
+
"school_id": "91038",
|
| 44 |
+
"school_linkedin_url": "https://www.linkedin.com/company/91038/",
|
| 45 |
+
"school_logo_url": "https://media.licdn.com/dms/image/v2/C510BAQHo9p4G79briw/company-logo_200_200/company-logo_200_200/0/1630564627893/hunan_university_logo?e=1756944000&v=beta&t=mtLHSeTKy8TSfjLFaL5RuXMXzhbJ4U8bTkKPdO_XGrk",
|
| 46 |
+
"start_month": "",
|
| 47 |
+
"start_year": 2010
|
| 48 |
+
}
|
| 49 |
+
],
|
| 50 |
+
"email": "",
|
| 51 |
+
"experiences": [
|
| 52 |
+
{
|
| 53 |
+
"company": "NVIDIA",
|
| 54 |
+
"company_id": "3608",
|
| 55 |
+
"company_linkedin_url": "https://www.linkedin.com/company/3608",
|
| 56 |
+
"company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQGV36q2EowSyw/company-logo_200_200/company-logo_200_200/0/1724881581208/nvidia_logo?e=1756944000&v=beta&t=6JH-MY7mJHJkjhcK60TGYlKcL0qKpSvGS_R2l6mE-Kw",
|
| 57 |
+
"company_public_url": "https://www.linkedin.com/company/nvidia",
|
| 58 |
+
"date_range": "Apr 2024 - Present",
|
| 59 |
+
"description": "",
|
| 60 |
+
"duration": "1 yr 3 mos",
|
| 61 |
+
"end_month": "",
|
| 62 |
+
"end_year": "",
|
| 63 |
+
"is_current": true,
|
| 64 |
+
"job_type": "Full-time",
|
| 65 |
+
"location": "Remote",
|
| 66 |
+
"skills": "",
|
| 67 |
+
"start_month": 4,
|
| 68 |
+
"start_year": 2024,
|
| 69 |
+
"title": "Senior Data Scientist, Gen AI"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"company": "Raytheon Technologies",
|
| 73 |
+
"company_id": "40653509",
|
| 74 |
+
"company_linkedin_url": "https://www.linkedin.com/company/40653509",
|
| 75 |
+
"company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQFITBtnCzFO9w/company-logo_200_200/company-logo_200_200/0/1687169432565/raytheontechnologies_logo?e=1756944000&v=beta&t=eEmqdsWL-7uKO6qOuFCxhUKQea4KCv4P4c3BorEP4-c",
|
| 76 |
+
"company_public_url": "https://www.linkedin.com/company/rtx",
|
| 77 |
+
"date_range": "Aug 2022 - Apr 2024",
|
| 78 |
+
"description": "At Applied AI team in Enterprise Data Services. Lead the technical development of key Generative AI projects to improve productivity and enhance efficiency across organizations, including:\n\nBuilt and optimized retrieval-augmented generation (RAG) pipeline for RTX's enterprise-level ChatGPT using Azure and OpenAI services to enable interacting with company internal data via chat.\n\nDeveloped Gen AI-powered tools to streamline document review process using open-source LLMs hosted on-premise and optimized LLM inference.\n\nCreated rapidly deployable computer vision solution for automating visual inspection of industrial components using multi-modal (vision-language) models without task-specific training, reducing onboarding time for new inspection programs.\n\nDeveloped novel deep learning framework for data-driven augmentation of physics-based models using generative adversarial network (GAN) for improving jet engine design.",
|
| 79 |
+
"duration": "1 yr 9 mos",
|
| 80 |
+
"end_month": 4,
|
| 81 |
+
"end_year": 2024,
|
| 82 |
+
"is_current": false,
|
| 83 |
+
"job_type": "Full-time",
|
| 84 |
+
"location": "Remote",
|
| 85 |
+
"skills": "Python (Programming Language) \u00b7 Machine Learning \u00b7 Large Language Models (LLM) \u00b7 Deep Learning \u00b7 Computer Vision \u00b7 Data Science \u00b7 PyTorch \u00b7 Natural Language Processing (NLP) \u00b7 Generative AI \u00b7 Information Retrieval \u00b7 OpenAI \u00b7 Azure Databricks \u00b7 TensorFlow \u00b7 Azure \u00b7 Prompt Engineering",
|
| 86 |
+
"start_month": 8,
|
| 87 |
+
"start_year": 2022,
|
| 88 |
+
"title": "Senior Applied Scientist, Generative AI"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"company": "Penn State University",
|
| 92 |
+
"company_id": "3657",
|
| 93 |
+
"company_linkedin_url": "https://www.linkedin.com/company/3657",
|
| 94 |
+
"company_logo_url": "https://media.licdn.com/dms/image/v2/C560BAQEFv0lqpE-gsQ/company-logo_200_200/company-logo_200_200/0/1631308876392?e=1756944000&v=beta&t=XHqHEPjKP8Y_NtwerKGr0g4oTkBBK7gbUgdUxs5m73U",
|
| 95 |
+
"company_public_url": "https://www.linkedin.com/school/penn-state-university/",
|
| 96 |
+
"date_range": "Mar 2021 - Aug 2022",
|
| 97 |
+
"description": "Developed probabilistic deep learning framework for differential equation discovery, implemented it using Tensorflow, and published research work in top-tier AI journal. Wrote grant proposals and led research team of Ph.D. students to develop novel AI/ML methods for statistical model calibration.",
|
| 98 |
+
"duration": "1 yr 6 mos",
|
| 99 |
+
"end_month": 8,
|
| 100 |
+
"end_year": 2022,
|
| 101 |
+
"is_current": false,
|
| 102 |
+
"job_type": "",
|
| 103 |
+
"location": "University Park, Pennsylvania, United States",
|
| 104 |
+
"skills": "Python (Programming Language) \u00b7 Machine Learning \u00b7 Deep Learning \u00b7 Data Science \u00b7 Statistical Modeling \u00b7 TensorFlow",
|
| 105 |
+
"start_month": 3,
|
| 106 |
+
"start_year": 2021,
|
| 107 |
+
"title": "Assistant Research Professor, Applied AI"
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"company": "Arizona State University",
|
| 111 |
+
"company_id": "4292",
|
| 112 |
+
"company_linkedin_url": "https://www.linkedin.com/company/4292",
|
| 113 |
+
"company_logo_url": "https://media.licdn.com/dms/image/v2/C560BAQHDGjY1IZJuog/company-logo_200_200/company-logo_200_200/0/1631309406468?e=1756944000&v=beta&t=5zshFEI8NfV0B7jpay03j-VLchpTh_bc-Xm_4UMFKLM",
|
| 114 |
+
"company_public_url": "https://www.linkedin.com/school/arizona-state-university/",
|
| 115 |
+
"date_range": "Jan 2018 - Mar 2021",
|
| 116 |
+
"description": "Developed hybrid physics-AI model for dynamical system modeling, simulation and identification, and published research work in top-tier AI journals. Wrote grant proposals and served as Co-PI for USDOT-funded project to develop computer-vision based solution and prototype robotic platform for gas pipeline anomaly detection.",
|
| 117 |
+
"duration": "3 yrs 3 mos",
|
| 118 |
+
"end_month": 3,
|
| 119 |
+
"end_year": 2021,
|
| 120 |
+
"is_current": false,
|
| 121 |
+
"job_type": "",
|
| 122 |
+
"location": "Tempe, Arizona, United States",
|
| 123 |
+
"skills": "Python (Programming Language) \u00b7 Deep Learning \u00b7 Computer Vision \u00b7 Predictive Modeling \u00b7 TensorFlow \u00b7 Uncertainty Quantification",
|
| 124 |
+
"start_month": 1,
|
| 125 |
+
"start_year": 2018,
|
| 126 |
+
"title": "Postdoctoral Fellow, Applied AI"
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"company": "Louisiana State University",
|
| 130 |
+
"company_id": "164771",
|
| 131 |
+
"company_linkedin_url": "https://www.linkedin.com/company/164771",
|
| 132 |
+
"company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQEhy54pBgGE9g/company-logo_200_200/company-logo_200_200/0/1681160348853/louisiana_state_university_logo?e=1756944000&v=beta&t=Uz-JKcul07L8O2pEk1nV7ImIGLtw3Dr8thHISsfqSJA",
|
| 133 |
+
"company_public_url": "https://www.linkedin.com/school/louisiana-state-university/",
|
| 134 |
+
"date_range": "Aug 2014 - Dec 2017",
|
| 135 |
+
"description": "Collected, cleaned, and analyzed time series monitoring data from over 100 sensors (strain, acceleration, etc.) on I-10 Twin Span Bridge. Developed data-driven methods for extracting traffic insights from the monitoring data and built predictive model to forecast the extreme traffic loading effect on the bridge using Bayesian regression analysis and extreme value statistics, published research in peer-reviewed journals.",
|
| 136 |
+
"duration": "3 yrs 5 mos",
|
| 137 |
+
"end_month": 12,
|
| 138 |
+
"end_year": 2017,
|
| 139 |
+
"is_current": false,
|
| 140 |
+
"job_type": "",
|
| 141 |
+
"location": "Baton Rouge Metropolitan Area",
|
| 142 |
+
"skills": "Python (Programming Language) \u00b7 Data Science \u00b7 Predictive Modeling \u00b7 Digital Signal Processing",
|
| 143 |
+
"start_month": 8,
|
| 144 |
+
"start_year": 2014,
|
| 145 |
+
"title": "Research Assistant"
|
| 146 |
+
}
|
| 147 |
+
],
|
| 148 |
+
"first_name": "Yang",
|
| 149 |
+
"follower_count": 562,
|
| 150 |
+
"full_name": "Yang Yu",
|
| 151 |
+
"headline": "Gen AI @ Nvidia",
|
| 152 |
+
"hq_city": "Santa Clara",
|
| 153 |
+
"hq_country": "US",
|
| 154 |
+
"hq_region": "CA",
|
| 155 |
+
"is_creator": false,
|
| 156 |
+
"is_influencer": false,
|
| 157 |
+
"is_premium": false,
|
| 158 |
+
"is_verified": true,
|
| 159 |
+
"job_title": "Senior Data Scientist, Gen AI",
|
| 160 |
+
"languages": [],
|
| 161 |
+
"last_name": "Yu",
|
| 162 |
+
"linkedin_url": "https://www.linkedin.com/in/yangyu190/",
|
| 163 |
+
"location": "Houston, Texas, United States",
|
| 164 |
+
"phone": "",
|
| 165 |
+
"profile_id": "375556281",
|
| 166 |
+
"profile_image_url": "https://media.licdn.com/dms/image/v2/C4E03AQHX39ssDV26rw/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1517381869648?e=1756944000&v=beta&t=uEiiGq_HmA9CFPtwyDFk5NbKYQd_AYCyjkaFZg-LEfY",
|
| 167 |
+
"public_id": "yangyu190",
|
| 168 |
+
"school": "Louisiana State University",
|
| 169 |
+
"state": "Texas",
|
| 170 |
+
"urn": "ACoAABZiiLkBi19TRzv41wjO9aXI9nrCJMofQfo"
|
| 171 |
+
},
|
| 172 |
+
"message": "ok"
|
| 173 |
+
}
|
test/data/linkedin_search.json
ADDED
|
@@ -0,0 +1,448 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"search_metadata": {
|
| 3 |
+
"id": "6862bbabeeabd394c127d48f",
|
| 4 |
+
"status": "Success",
|
| 5 |
+
"json_endpoint": "https://serpapi.com/searches/63d251104f0cfcb2/6862bbabeeabd394c127d48f.json",
|
| 6 |
+
"created_at": "2025-06-30 16:30:35 UTC",
|
| 7 |
+
"processed_at": "2025-06-30 16:30:35 UTC",
|
| 8 |
+
"google_url": "https://www.google.com/search?q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&oq=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&hl=en&gl=us&num=20&start=0&sourceid=chrome&ie=UTF-8",
|
| 9 |
+
"raw_html_file": "https://serpapi.com/searches/63d251104f0cfcb2/6862bbabeeabd394c127d48f.html",
|
| 10 |
+
"total_time_taken": 3.5
|
| 11 |
+
},
|
| 12 |
+
"search_parameters": {
|
| 13 |
+
"engine": "google",
|
| 14 |
+
"q": "site:linkedin.com/in \"ml research\" \"llm\" \"mountain view\" -student -intern -\"recent graduate\"",
|
| 15 |
+
"google_domain": "google.com",
|
| 16 |
+
"hl": "en",
|
| 17 |
+
"gl": "us",
|
| 18 |
+
"start": 0,
|
| 19 |
+
"num": "20",
|
| 20 |
+
"device": "desktop"
|
| 21 |
+
},
|
| 22 |
+
"search_information": {
|
| 23 |
+
"query_displayed": "site:linkedin.com/in \"ml research\" \"llm\" \"mountain view\" -student -intern -\"recent graduate\"",
|
| 24 |
+
"total_results": 92,
|
| 25 |
+
"time_taken_displayed": 0.22,
|
| 26 |
+
"organic_results_state": "Results for exact spelling"
|
| 27 |
+
},
|
| 28 |
+
"organic_results": [
|
| 29 |
+
{
|
| 30 |
+
"position": 1,
|
| 31 |
+
"title": "Changwan Ryu - ML/Research Engineer - Google DeepMind",
|
| 32 |
+
"link": "https://www.linkedin.com/in/changwan-ryu-6aba477",
|
| 33 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/changwan-ryu-6aba477&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBcQAQ",
|
| 34 |
+
"displayed_link": "180+ followers",
|
| 35 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab967e4ac06153bb5fc3f1fbd8c12d57c8.png",
|
| 36 |
+
"snippet": "ML/Research Engineer \u00b7 Experience: Google DeepMind \u00b7 Education: Korea Advanced Institute of Science and Technology \u00b7 Location: Mountain View \u00b7 172 ...",
|
| 37 |
+
"snippet_highlighted_words": [
|
| 38 |
+
"ML/Research",
|
| 39 |
+
"Mountain View"
|
| 40 |
+
],
|
| 41 |
+
"rich_snippet": {
|
| 42 |
+
"top": {
|
| 43 |
+
"extensions": [
|
| 44 |
+
"Mountain View, California, United States",
|
| 45 |
+
"ML/Research Engineer",
|
| 46 |
+
"Google DeepMind"
|
| 47 |
+
]
|
| 48 |
+
}
|
| 49 |
+
},
|
| 50 |
+
"source": "LinkedIn \u00b7 Changwan Ryu"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"position": 2,
|
| 54 |
+
"title": "Shobha Vasudevan - Google",
|
| 55 |
+
"link": "https://www.linkedin.com/in/shobha-vasudevan-72154b11",
|
| 56 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/shobha-vasudevan-72154b11&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBYQAQ",
|
| 57 |
+
"displayed_link": "1.1K+ followers",
|
| 58 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abdbfc1c2aa5e9602675e5e15d69d4b5d9.png",
|
| 59 |
+
"snippet": "Shobha Vasudevan is an AI/ML research scientist in Google. She has worked on LLM post\u2026 \u00b7 Experience: Google \u00b7 Education: The University of Texas at Austin ...",
|
| 60 |
+
"snippet_highlighted_words": [
|
| 61 |
+
"ML research",
|
| 62 |
+
"LLM"
|
| 63 |
+
],
|
| 64 |
+
"rich_snippet": {
|
| 65 |
+
"top": {
|
| 66 |
+
"extensions": [
|
| 67 |
+
"Palo Alto, California, United States",
|
| 68 |
+
"Google"
|
| 69 |
+
]
|
| 70 |
+
}
|
| 71 |
+
},
|
| 72 |
+
"source": "LinkedIn \u00b7 Shobha Vasudevan"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"position": 3,
|
| 76 |
+
"title": "Nitin Singhal - AI ML Engineer Architect - United IT",
|
| 77 |
+
"link": "https://www.linkedin.com/in/nitinsinghal03",
|
| 78 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/nitinsinghal03&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBgQAQ",
|
| 79 |
+
"displayed_link": "40+ followers",
|
| 80 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abe3091cbf47af0a11060fa652e5856d82.png",
|
| 81 |
+
"snippet": "... Mountain View \u00b7 40 connections on LinkedIn. View Nitin Singhal's profile on ... LLM and multimodal models. It contains sub repos by topic containing ...",
|
| 82 |
+
"snippet_highlighted_words": [
|
| 83 |
+
"Mountain View",
|
| 84 |
+
"LLM"
|
| 85 |
+
],
|
| 86 |
+
"rich_snippet": {
|
| 87 |
+
"top": {
|
| 88 |
+
"extensions": [
|
| 89 |
+
"Mountain View, California, United States",
|
| 90 |
+
"AI ML Engineer Architect",
|
| 91 |
+
"United IT"
|
| 92 |
+
]
|
| 93 |
+
}
|
| 94 |
+
},
|
| 95 |
+
"source": "LinkedIn \u00b7 Nitin Singhal"
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"position": 4,
|
| 99 |
+
"title": "Roxana Rivkani - AI/ML Research Engineer",
|
| 100 |
+
"link": "https://www.linkedin.com/in/roxanarvk",
|
| 101 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/roxanarvk&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECB0QAQ",
|
| 102 |
+
"displayed_link": "1.3K+ followers",
|
| 103 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abc790cb6a6151bdf0112f0249f2923133.png",
|
| 104 |
+
"snippet": "AI/ML Research Engineer. NASA Ames Research Center. Jun 2023 - Present 2 years 1 month. Mountain View, California, United States. Stealth Startup Graphic. Co ...",
|
| 105 |
+
"snippet_highlighted_words": [
|
| 106 |
+
"ML Research",
|
| 107 |
+
"Mountain View"
|
| 108 |
+
],
|
| 109 |
+
"rich_snippet": {
|
| 110 |
+
"top": {
|
| 111 |
+
"extensions": [
|
| 112 |
+
"San Francisco Bay Area",
|
| 113 |
+
"AI/ML Research Engineer",
|
| 114 |
+
"NASA Ames Research Center"
|
| 115 |
+
]
|
| 116 |
+
}
|
| 117 |
+
},
|
| 118 |
+
"source": "LinkedIn \u00b7 Roxana Rivkani"
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"position": 5,
|
| 122 |
+
"title": "Andy Su - ML Research - Meta",
|
| 123 |
+
"link": "https://www.linkedin.com/in/andyssu",
|
| 124 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/andyssu&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBkQAQ",
|
| 125 |
+
"displayed_link": "3.6K+ followers",
|
| 126 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab5413cd18de5a0ad5eed1827f03b46b90.png",
|
| 127 |
+
"snippet": "LLM researcher | andysu.org \u00b7 Experience: Meta \u00b7 Education: Princeton ... Experience. Meta Graphic. ML Research. Meta. 2024 - Present 1 year. Apple Graphic ...",
|
| 128 |
+
"snippet_highlighted_words": [
|
| 129 |
+
"LLM",
|
| 130 |
+
"ML Research"
|
| 131 |
+
],
|
| 132 |
+
"rich_snippet": {
|
| 133 |
+
"top": {
|
| 134 |
+
"extensions": [
|
| 135 |
+
"New York City Metropolitan Area",
|
| 136 |
+
"ML Research",
|
| 137 |
+
"Meta"
|
| 138 |
+
]
|
| 139 |
+
}
|
| 140 |
+
},
|
| 141 |
+
"source": "LinkedIn \u00b7 Andy Su"
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"position": 6,
|
| 145 |
+
"title": "Vaskar Nath - Machine Learning Researcher - Scale AI",
|
| 146 |
+
"link": "https://www.linkedin.com/in/vaskarnath",
|
| 147 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/vaskarnath&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBoQAQ",
|
| 148 |
+
"displayed_link": "1K+ followers",
|
| 149 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab42774fef2b747d799adfefb958279c14.png",
|
| 150 |
+
"snippet": "ML Research Engineer. Nuro. May 2022 - Aug 2022 4 months. Mountain View, California, United States. Localization and Mapping. Scale AI Graphic. Software ...",
|
| 151 |
+
"snippet_highlighted_words": [
|
| 152 |
+
"ML Research",
|
| 153 |
+
"Mountain View"
|
| 154 |
+
],
|
| 155 |
+
"rich_snippet": {
|
| 156 |
+
"top": {
|
| 157 |
+
"extensions": [
|
| 158 |
+
"New York, New York, United States",
|
| 159 |
+
"Machine Learning Researcher",
|
| 160 |
+
"Scale AI"
|
| 161 |
+
]
|
| 162 |
+
}
|
| 163 |
+
},
|
| 164 |
+
"source": "LinkedIn \u00b7 Vaskar Nath"
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"position": 7,
|
| 168 |
+
"title": "Ming Yin - Engineering Lead - AI Agent - Cresta",
|
| 169 |
+
"link": "https://www.linkedin.com/in/myin8",
|
| 170 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/myin8&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCgQAQ",
|
| 171 |
+
"displayed_link": "1.5K+ followers",
|
| 172 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab7d58d32b3b3d8d90697271a460e8938a.png",
|
| 173 |
+
"snippet": "AI Agent Engineering Lead @ Cresta | Building LLM & Gen AI ... Leading engineering and ML research teams to push the boundaries of conversational AI.",
|
| 174 |
+
"snippet_highlighted_words": [
|
| 175 |
+
"LLM",
|
| 176 |
+
"ML research"
|
| 177 |
+
],
|
| 178 |
+
"rich_snippet": {
|
| 179 |
+
"top": {
|
| 180 |
+
"extensions": [
|
| 181 |
+
"Greater Seattle Area",
|
| 182 |
+
"Engineering Lead",
|
| 183 |
+
"AI Agent",
|
| 184 |
+
"Cresta"
|
| 185 |
+
]
|
| 186 |
+
}
|
| 187 |
+
},
|
| 188 |
+
"source": "LinkedIn \u00b7 Ming Yin"
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"position": 8,
|
| 192 |
+
"title": "Di Zhou - San Francisco Bay Area | Professional Profile",
|
| 193 |
+
"link": "https://www.linkedin.com/in/di-zhou-57b031ab",
|
| 194 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/di-zhou-57b031ab&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECC4QAQ",
|
| 195 |
+
"displayed_link": "40+ followers",
|
| 196 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abda086b98580a3f10f8974db19a523e44.png",
|
| 197 |
+
"snippet": "Experience ; Software Engineer. Lymba. Feb 2016 \u00b7 Aug 2018 2 years 7 months ; Software Engineer. Baidu, Inc. Nov 2012 \u00b7 Aug 2014 1 year 10 months ; Software ...",
|
| 198 |
+
"source": "LinkedIn \u00b7 Di Zhou"
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"position": 9,
|
| 202 |
+
"title": "Kevin H. - AI & Engineering Manager at Bluestone Analytics",
|
| 203 |
+
"link": "https://www.linkedin.com/in/kevin-h-a6044a18a",
|
| 204 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/kevin-h-a6044a18a&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCkQAQ",
|
| 205 |
+
"displayed_link": "200+ followers",
|
| 206 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abdfcf6959a576ecaa030c613812861c8c.png",
|
| 207 |
+
"snippet": "AI & Engineering Manager at Bluestone Analytics \u00b7 Experience: Bluestone Analytics, a CACI Company \u00b7 Education: University of Virginia \u00b7 Location: Greater ...",
|
| 208 |
+
"source": "LinkedIn \u00b7 Kevin H."
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"position": 10,
|
| 212 |
+
"title": "Jing Z. - Staff GenAI ML DS - Meta",
|
| 213 |
+
"link": "https://www.linkedin.com/in/zhujing",
|
| 214 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/zhujing&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECC0QAQ",
|
| 215 |
+
"displayed_link": "730+ followers",
|
| 216 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab80cc4f5683489ab16d774ebb7de2a845.png",
|
| 217 |
+
"snippet": "... LLM modeling and evaluation, including designing data flywheel systems and ... ML research and business impact, translating complex technical concepts ...",
|
| 218 |
+
"snippet_highlighted_words": [
|
| 219 |
+
"LLM",
|
| 220 |
+
"ML research"
|
| 221 |
+
],
|
| 222 |
+
"rich_snippet": {
|
| 223 |
+
"top": {
|
| 224 |
+
"extensions": [
|
| 225 |
+
"Fremont, California, United States",
|
| 226 |
+
"Staff GenAI ML DS",
|
| 227 |
+
"Meta"
|
| 228 |
+
]
|
| 229 |
+
}
|
| 230 |
+
},
|
| 231 |
+
"source": "LinkedIn \u00b7 Jing Z."
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
"position": 11,
|
| 235 |
+
"title": "\u5f20\u667a\u98a2 - Algorithmic Engineer - \u65af\u5c14\u987f\u79d1\u6280\u6709\u9650\u516c\u53f8",
|
| 236 |
+
"link": "https://www.linkedin.com/in/%E6%99%BA%E9%A2%A2-%E5%BC%A0-a373b113b",
|
| 237 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/%25E6%2599%25BA%25E9%25A2%25A2-%25E5%25BC%25A0-a373b113b&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECC8QAQ",
|
| 238 |
+
"displayed_link": "40+ followers",
|
| 239 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab4710c706ab9c1c7d4de30394af62859d.png",
|
| 240 |
+
"snippet": "\u65af\u5c14\u987f\u79d1\u6280\u6709\u9650\u516c\u53f8 - \u7b97\u6cd5\u5de5\u7a0b\u5e08 \u00b7 Experience: \u65af\u5c14\u987f\u79d1\u6280\u6709\u9650\u516c\u53f8 \u00b7 Education: University of Connecticut \u00b7 Location: Mansfield Center \u00b7 44 connections on ...",
|
| 241 |
+
"rich_snippet": {
|
| 242 |
+
"top": {
|
| 243 |
+
"extensions": [
|
| 244 |
+
"Mansfield Center, Connecticut, United States",
|
| 245 |
+
"Algorithmic Engineer",
|
| 246 |
+
"\u65af\u5c14\u987f\u79d1\u6280\u6709\u9650\u516c\u53f8"
|
| 247 |
+
]
|
| 248 |
+
}
|
| 249 |
+
},
|
| 250 |
+
"source": "LinkedIn \u00b7 \u5f20\u667a\u98a2"
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"position": 12,
|
| 254 |
+
"title": "Yang Yu - Senior Data Scientist, Gen AI - NVIDIA",
|
| 255 |
+
"link": "https://www.linkedin.com/in/yangyu190",
|
| 256 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/yangyu190&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEoQAQ",
|
| 257 |
+
"displayed_link": "560+ followers",
|
| 258 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab58c1b0c2e2293e13fd8d88913fa661cb.png",
|
| 259 |
+
"snippet": "Adept at quickly building proof-of-concepts of latest AI/ML research and transitioning them into deployable solutions to deliver business value. ... LLM inference ...",
|
| 260 |
+
"snippet_highlighted_words": [
|
| 261 |
+
"ML research",
|
| 262 |
+
"LLM"
|
| 263 |
+
],
|
| 264 |
+
"rich_snippet": {
|
| 265 |
+
"top": {
|
| 266 |
+
"extensions": [
|
| 267 |
+
"Houston, Texas, United States",
|
| 268 |
+
"Senior Data Scientist, Gen AI",
|
| 269 |
+
"NVIDIA"
|
| 270 |
+
]
|
| 271 |
+
}
|
| 272 |
+
},
|
| 273 |
+
"source": "LinkedIn \u00b7 Yang Yu"
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"position": 13,
|
| 277 |
+
"title": "Ravi Raagav Srinivas - U.S. Bank",
|
| 278 |
+
"link": "https://www.linkedin.com/in/raviraagav",
|
| 279 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/raviraagav&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCoQAQ",
|
| 280 |
+
"displayed_link": "500+ followers",
|
| 281 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abb3dcb76c3b3b80ef500f0ffd5b9747a0.png",
|
| 282 |
+
"snippet": "Innovative, energetic software developer with five years of experience in the design and\u2026 \u00b7 Experience: U.S. Bank \u00b7 Education: Rochester Institute of ...",
|
| 283 |
+
"rich_snippet": {
|
| 284 |
+
"top": {
|
| 285 |
+
"extensions": [
|
| 286 |
+
"San Francisco Bay Area",
|
| 287 |
+
"U.S. Bank"
|
| 288 |
+
]
|
| 289 |
+
}
|
| 290 |
+
},
|
| 291 |
+
"source": "LinkedIn \u00b7 Ravi Raagav Srinivas"
|
| 292 |
+
},
|
| 293 |
+
{
|
| 294 |
+
"position": 14,
|
| 295 |
+
"title": "Jing Lu - Google",
|
| 296 |
+
"link": "https://www.linkedin.com/in/jing-lu-a0b07123",
|
| 297 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/jing-lu-a0b07123&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCsQAQ",
|
| 298 |
+
"displayed_link": "1.2K+ followers",
|
| 299 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab4c0a0b511dbaffae0af0fd8fa9cc8584.png",
|
| 300 |
+
"snippet": "Experience \u00b7 Google Graphic \u00b7 Google. Mountain View, California, United States \u00b7 -. San Jose, California, United States \u00b7 -. United States \u00b7 - \u00b7 -. 428 Church Street ...",
|
| 301 |
+
"snippet_highlighted_words": [
|
| 302 |
+
"Mountain View"
|
| 303 |
+
],
|
| 304 |
+
"rich_snippet": {
|
| 305 |
+
"top": {
|
| 306 |
+
"extensions": [
|
| 307 |
+
"San Francisco Bay Area",
|
| 308 |
+
"Google"
|
| 309 |
+
]
|
| 310 |
+
}
|
| 311 |
+
},
|
| 312 |
+
"source": "LinkedIn \u00b7 Jing Lu"
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"position": 15,
|
| 316 |
+
"title": "Zhi (Kevin) P. - Technical consultant, Partner - Empire 55 LLC",
|
| 317 |
+
"link": "https://www.linkedin.com/in/zhi-kevin-p-3718934",
|
| 318 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/zhi-kevin-p-3718934&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECDAQAQ",
|
| 319 |
+
"displayed_link": "150+ followers",
|
| 320 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab3672252d2b16e2613378ac9aa7881af2.png",
|
| 321 |
+
"snippet": "Full Stack AI/ML Software Engineer (AI/ML, E2E, Cloud, AWS, NLP, Deep Learning, LLM, RAG, Data modeling and processing, model training ... AI/ML Research and ...",
|
| 322 |
+
"snippet_highlighted_words": [
|
| 323 |
+
"LLM",
|
| 324 |
+
"ML Research"
|
| 325 |
+
],
|
| 326 |
+
"rich_snippet": {
|
| 327 |
+
"top": {
|
| 328 |
+
"detected_extensions": {
|
| 329 |
+
"empire_llc": 55
|
| 330 |
+
},
|
| 331 |
+
"extensions": [
|
| 332 |
+
"San Jose, California, United States",
|
| 333 |
+
"Technical consultant, Partner",
|
| 334 |
+
"Empire 55 LLC"
|
| 335 |
+
]
|
| 336 |
+
}
|
| 337 |
+
},
|
| 338 |
+
"source": "LinkedIn \u00b7 Zhi (Kevin) P."
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"position": 16,
|
| 342 |
+
"title": "Zhe Su - Evanston, Illinois, United States",
|
| 343 |
+
"link": "https://www.linkedin.com/in/zhe-su-86b389b3",
|
| 344 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/zhe-su-86b389b3&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCwQAQ",
|
| 345 |
+
"displayed_link": "400+ followers",
|
| 346 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab0e94b092cc68821a9d339dc28fab567e.png",
|
| 347 |
+
"snippet": "ML Research Scientist with expertise in machine learning and data analysis. \u00b7 Experience: Upstart \u00b7 Education: Northwestern University \u00b7 Location: Evanston ...",
|
| 348 |
+
"snippet_highlighted_words": [
|
| 349 |
+
"ML Research"
|
| 350 |
+
],
|
| 351 |
+
"source": "LinkedIn \u00b7 Zhe Su"
|
| 352 |
+
},
|
| 353 |
+
{
|
| 354 |
+
"position": 17,
|
| 355 |
+
"title": "Husain Al-Mohssen - Chrome Bird",
|
| 356 |
+
"link": "https://www.linkedin.com/in/husainalmohssen",
|
| 357 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/husainalmohssen&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEcQAQ",
|
| 358 |
+
"displayed_link": "1.4K+ followers",
|
| 359 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab33a55fb264eed8720356e465394ec4f7.png",
|
| 360 |
+
"snippet": "I am an Executive AI Leader, specializing in translating cutting-edge AI/ML research\u2026 \u00b7 ... (LLM Based Error Action Recommender). Filed October 21, 2024 #63 ...",
|
| 361 |
+
"snippet_highlighted_words": [
|
| 362 |
+
"ML research",
|
| 363 |
+
"LLM"
|
| 364 |
+
],
|
| 365 |
+
"rich_snippet": {
|
| 366 |
+
"top": {
|
| 367 |
+
"extensions": [
|
| 368 |
+
"Cambridge, Massachusetts, United States",
|
| 369 |
+
"Chrome Bird"
|
| 370 |
+
]
|
| 371 |
+
}
|
| 372 |
+
},
|
| 373 |
+
"source": "LinkedIn \u00b7 Husain Al-Mohssen"
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"position": 18,
|
| 377 |
+
"title": "Yang Tian - Tencent Americas",
|
| 378 |
+
"link": "https://www.linkedin.com/in/yang-tian-5a0719b0",
|
| 379 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/yang-tian-5a0719b0&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEgQAQ",
|
| 380 |
+
"displayed_link": "730+ followers",
|
| 381 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab80e4705eefebd31bdb2d6005ee831582.png",
|
| 382 |
+
"snippet": "Yang is an excellent data engineer. He's a great mentor who is patient and knowledgeable in his field and is always willing and able to assist the Junior ...",
|
| 383 |
+
"rich_snippet": {
|
| 384 |
+
"top": {
|
| 385 |
+
"extensions": [
|
| 386 |
+
"Sunnyvale, California, United States",
|
| 387 |
+
"Tencent Americas"
|
| 388 |
+
]
|
| 389 |
+
}
|
| 390 |
+
},
|
| 391 |
+
"source": "LinkedIn \u00b7 Yang Tian"
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"position": 19,
|
| 395 |
+
"title": "Peter Mains - Senior Data Engineer",
|
| 396 |
+
"link": "https://www.linkedin.com/in/peter-mains-aa8674336",
|
| 397 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/peter-mains-aa8674336&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEUQAQ",
|
| 398 |
+
"displayed_link": "6 followers",
|
| 399 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293aba29b014e345ad182be26fc3a794a2b19.png",
|
| 400 |
+
"snippet": "Senior Data Engineer with 12 years of expertise in designing scalable data architectures, optimizing ETL workflows, and building real-time data solutions.",
|
| 401 |
+
"rich_snippet": {
|
| 402 |
+
"top": {
|
| 403 |
+
"extensions": [
|
| 404 |
+
"Gilbert, Arizona, United States",
|
| 405 |
+
"Senior Data Engineer",
|
| 406 |
+
"Luxevision Consulting LLC"
|
| 407 |
+
]
|
| 408 |
+
}
|
| 409 |
+
},
|
| 410 |
+
"source": "LinkedIn \u00b7 Peter Mains"
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"position": 20,
|
| 414 |
+
"title": "Krishan K. - CodeSignal",
|
| 415 |
+
"link": "https://www.linkedin.com/in/krishan-kumar38",
|
| 416 |
+
"redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/krishan-kumar38&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEYQAQ",
|
| 417 |
+
"displayed_link": "740+ followers",
|
| 418 |
+
"favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab4508b5b4dfbd6acccd1770f2c8abee37.png",
|
| 419 |
+
"snippet": "I am an engineer with 6+ years experience in the industry. I love solving real world\u2026 \u00b7 CodeSignal \u00b7 The University of Texas at Dallas \u00b7 Nashville \u00b7 500+ ...",
|
| 420 |
+
"rich_snippet": {
|
| 421 |
+
"top": {
|
| 422 |
+
"extensions": [
|
| 423 |
+
"Nashville, Tennessee, United States",
|
| 424 |
+
"CodeSignal"
|
| 425 |
+
]
|
| 426 |
+
}
|
| 427 |
+
},
|
| 428 |
+
"source": "LinkedIn \u00b7 Krishan K."
|
| 429 |
+
}
|
| 430 |
+
],
|
| 431 |
+
"pagination": {
|
| 432 |
+
"current": 1,
|
| 433 |
+
"next": "https://www.google.com/search?q=site:linkedin.com/in+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&num=20&sca_esv=6d27a280cc9bfc7a&gl=us&hl=en&ei=rrtiaI3eB7u8juMP17PPiA8&start=20&sa=N&sstk=Ac65TH6KgSWXzfjLxTxsWJDmqXpCNcLB5EhPkbehJWRoqTOswsTJXIfLU0u1V9OX6jXgr3D_qdQhoUwxwhPVsVBnGER9y4lieXQLsw&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQ8NMDegQIChAI",
|
| 434 |
+
"other_pages": {
|
| 435 |
+
"2": "https://www.google.com/search?q=site:linkedin.com/in+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&num=20&sca_esv=6d27a280cc9bfc7a&gl=us&hl=en&ei=rrtiaI3eB7u8juMP17PPiA8&start=20&sa=N&sstk=Ac65TH6KgSWXzfjLxTxsWJDmqXpCNcLB5EhPkbehJWRoqTOswsTJXIfLU0u1V9OX6jXgr3D_qdQhoUwxwhPVsVBnGER9y4lieXQLsw&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQ8tMDegQIChAE",
|
| 436 |
+
"3": "https://www.google.com/search?q=site:linkedin.com/in+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&num=20&sca_esv=6d27a280cc9bfc7a&gl=us&hl=en&ei=rrtiaI3eB7u8juMP17PPiA8&start=40&sa=N&sstk=Ac65TH6KgSWXzfjLxTxsWJDmqXpCNcLB5EhPkbehJWRoqTOswsTJXIfLU0u1V9OX6jXgr3D_qdQhoUwxwhPVsVBnGER9y4lieXQLsw&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQ8tMDegQIChAG"
|
| 437 |
+
}
|
| 438 |
+
},
|
| 439 |
+
"serpapi_pagination": {
|
| 440 |
+
"current": 1,
|
| 441 |
+
"next_link": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&start=20",
|
| 442 |
+
"next": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&start=20",
|
| 443 |
+
"other_pages": {
|
| 444 |
+
"2": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&start=20",
|
| 445 |
+
"3": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&start=40"
|
| 446 |
+
}
|
| 447 |
+
}
|
| 448 |
+
}
|
test/data/linkedin_urls.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
https://www.linkedin.com/in/yangyu190
|
| 2 |
+
https://www.linkedin.com/in/raviraagav
|
| 3 |
+
https://www.linkedin.com/in/shobha-vasudevan-72154b11
|
| 4 |
+
https://www.linkedin.com/in/kevin-h-a6044a18a
|
| 5 |
+
https://www.linkedin.com/in/vaskarnath
|
| 6 |
+
https://www.linkedin.com/in/jing-lu-a0b07123
|
| 7 |
+
https://www.linkedin.com/in/myin8
|
| 8 |
+
https://www.linkedin.com/in/zhe-su-86b389b3
|
| 9 |
+
https://www.linkedin.com/in/zhujing
|
| 10 |
+
https://www.linkedin.com/in/nitinsinghal03
|
| 11 |
+
https://www.linkedin.com/in/andyssu
|
| 12 |
+
https://www.linkedin.com/in/krishan-kumar38
|
| 13 |
+
https://www.linkedin.com/in/changwan-ryu-6aba477
|
| 14 |
+
https://www.linkedin.com/in/roxanarvk
|
| 15 |
+
https://www.linkedin.com/in/husainalmohssen
|
| 16 |
+
https://www.linkedin.com/in/zhi-kevin-p-3718934
|
| 17 |
+
https://www.linkedin.com/in/di-zhou-57b031ab
|
| 18 |
+
https://www.linkedin.com/in/peter-mains-aa8674336
|
| 19 |
+
https://www.linkedin.com/in/yang-tian-5a0719b0
|