Kaito117 committed on
Commit
7aa0cea
·
1 Parent(s): 8db7cd5

fixed data loading

Browse files

SerpAPI search working (LinkedIn, GitHub),
RapidAPI fetch working (LinkedIn),
GitHub fetch working,
URL extraction working,
tests passing,
added sample test data to the test/data dir to prevent unnecessary API quota usage. TODO: add a test flag so tests can run against this sample data.

Signed-off-by: pranshu-raj-211 <pranshuraj65536@gmail.com>

app/config.py CHANGED
@@ -13,7 +13,7 @@ class Settings(BaseSettings):
13
  database_name: str = "linkedin_sourcing"
14
 
15
  max_concurrent_requests: int = 5
16
- request_delay: float = 1.0
17
  max_retries: int = 3
18
 
19
  bloom_filter_capacity: int = 100000
 
13
  database_name: str = "linkedin_sourcing"
14
 
15
  max_concurrent_requests: int = 5
16
+ request_delay: float = 5.0
17
  max_retries: int = 3
18
 
19
  bloom_filter_capacity: int = 100000
app/services/api_clients.py CHANGED
@@ -1,9 +1,11 @@
1
  import asyncio
2
  import httpx
 
 
 
3
  from typing import Any, Optional
4
  from fake_useragent import UserAgent
5
  from tenacity import retry, stop_after_attempt, wait_exponential
6
- from serpapi import GoogleSearch
7
 
8
  from app.config import settings
9
  from app.models.schemas import SearchQuery
@@ -46,75 +48,145 @@ class SerpAPIClient:
46
  """Client for SerpAPI Google search"""
47
 
48
  def __init__(self):
49
- self.api_key_rotator = APIKeyRotator(settings.serpapi_keys)
50
  self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
51
 
 
 
 
 
 
 
 
52
  @retry(
53
  stop=stop_after_attempt(settings.max_retries),
54
  wait=wait_exponential(multiplier=1, min=4, max=10),
 
55
  )
56
- async def search(self, query: SearchQuery) -> list[dict[str, Any]]:
57
- """Execute search query via SerpAPI"""
 
 
 
 
 
58
  async with self.semaphore:
59
  try:
60
- api_key = self.api_key_rotator.get_next_key()
61
-
62
- params = {
63
- "q": query.query,
64
- "api_key": api_key,
65
- "engine": "google",
66
- "num": query.max_results,
67
- "start": 0,
68
- "gl": "us", # Country
69
- "hl": "en", # Language
70
- }
71
 
72
  logger.info(
73
  "serpapi_request",
74
- extra={"query": query.query, "platform": query.platform},
 
 
 
75
  )
76
 
77
- # Use sync SerpAPI client in async context
78
- search = GoogleSearch(params)
79
- result = await asyncio.get_event_loop().run_in_executor(
80
- None, search.get_dict
81
  )
 
82
 
 
83
  organic_results = result.get("organic_results", [])
 
 
 
 
84
 
85
  logger.info(
86
  "serpapi_response",
87
- extra={"results_count": len(organic_results), "query": query.query},
 
 
 
88
  )
 
89
 
90
- return organic_results
91
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  except Exception as e:
93
- logger.error(
94
- "serpapi_error", extra={"error": str(e), "query": query.query}
 
95
  )
96
  raise
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  async def batch_search(
99
  self, queries: list[SearchQuery]
100
  ) -> dict[str, list[dict[str, Any]]]:
101
  """Execute multiple search queries concurrently"""
102
  tasks = []
103
- for query in queries:
104
- task = asyncio.create_task(self.search(query))
105
- tasks.append((query.query, task))
 
 
 
 
 
 
 
 
106
 
107
  # Add delay between requests to avoid rate limiting
 
108
  await asyncio.sleep(settings.request_delay)
109
 
110
  results = {}
111
  for query_text, task in tasks:
112
  try:
113
- result = await task
114
- results[query_text] = result
115
  except Exception as e:
116
- logger.error(
117
- "batch_search_error", extra={"query": query_text, "error": str(e)}
 
118
  )
119
  results[query_text] = []
120
 
@@ -122,56 +194,99 @@ class SerpAPIClient:
122
 
123
 
124
  class LinkedInProfileClient:
125
- """Client for fetching LinkedIn profile data"""
126
 
127
  def __init__(self):
128
- self.ua_rotator = UserAgentRotator()
 
 
 
 
 
 
 
129
  self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
130
 
 
 
 
 
 
 
131
  @retry(
132
  stop=stop_after_attempt(settings.max_retries),
133
  wait=wait_exponential(multiplier=1, min=2, max=8),
 
134
  )
135
  async def fetch_profile(self, linkedin_url: str) -> Optional[dict[str, Any]]:
136
- """Fetch LinkedIn profile data"""
137
  async with self.semaphore:
138
  try:
139
- headers = {
140
- "User-Agent": self.ua_rotator.get_random_ua(),
141
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
142
- "Accept-Language": "en-US,en;q=0.5",
143
- "Accept-Encoding": "gzip, deflate",
144
- "Connection": "keep-alive",
 
 
 
 
 
 
 
145
  }
146
 
147
- async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
148
- logger.debug("Started fetch linkedin profile.")
149
-
150
- response = await client.get(linkedin_url)
151
- logger.debug("Got linkedin profile response", response.status_code)
152
- if response.status_code == 200:
153
- return {
154
- "url": linkedin_url,
155
- "html": response.text,
156
- "status": "success",
157
- }
158
- elif response.status_code == 429:
159
- # Rate limited - wait longer
160
- await asyncio.sleep(10)
161
- raise Exception(f"Rate limited: {response.status_code}")
162
- else:
163
- logger.warning(
164
- "Linkedin fetch failed",
165
- extra={
166
- "lnkd_url": linkedin_url,
167
- "status_code": response.status_code,
168
- },
169
- )
170
- return None
 
 
 
 
 
 
 
 
 
171
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  except Exception:
173
  logger.exception(
174
- "Failed to fetch linkedin profile", extra={"lnkd_url": linkedin_url}
 
175
  )
176
  raise
177
 
@@ -184,8 +299,7 @@ class LinkedInProfileClient:
184
  task = asyncio.create_task(self.fetch_profile(url))
185
  tasks.append((url, task))
186
 
187
- # Stagger requests to avoid overwhelming the server
188
- await asyncio.sleep(0.5)
189
 
190
  results = {}
191
  for url, task in tasks:
@@ -194,7 +308,8 @@ class LinkedInProfileClient:
194
  results[url] = result
195
  except Exception:
196
  logger.exception(
197
- "Batch lnkd profile fetch failed", extra={"lnkd_url": url}
 
198
  )
199
  results[url] = None
200
 
@@ -202,106 +317,82 @@ class LinkedInProfileClient:
202
 
203
 
204
  class GitHubClient:
205
- """Client for fetching GitHub profile data"""
206
 
207
  def __init__(self):
208
- self.ua_rotator = UserAgentRotator()
209
  self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)
210
 
211
- @retry(
212
- stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=5)
213
- )
214
- async def fetch_github_profile(self, github_url: str) -> Optional[dict[str, Any]]:
215
- """Fetch GitHub profile data"""
216
- async with self.semaphore:
217
- try:
218
- headers = {
219
- "User-Agent": self.ua_rotator.get_random_ua(),
220
- "Accept": "application/vnd.github.v3+json",
221
- }
222
-
223
- async with httpx.AsyncClient(timeout=20.0, headers=headers) as client:
224
- logger.debug(
225
- "Got req to fetch github profile", extra={"gh_url": github_url}
226
- )
227
-
228
- response = await client.get(github_url)
229
 
230
- if response.status_code == 200:
231
- return {
232
- "url": github_url,
233
- "html": response.text,
234
- "status": "success",
235
- }
236
- else:
237
- logger.warning(
238
- "Failed to fetch gh profile", extra={"gh_url": github_url}
239
- )
240
- return None
241
 
242
- except Exception:
243
- logger.exception(
244
- "Failed to fetch gh profile", extra={"gh_url": github_url}
 
 
 
 
 
 
 
245
  )
246
  return None
 
 
 
 
 
 
247
 
248
 
249
  class URLExtractor:
250
- """Extracts URLs from search results"""
251
-
252
- def extract_linkedin_urls(self, search_results: list[dict[str, Any]]) -> list[str]:
253
- """Extract LinkedIn profile URLs from search results"""
254
- urls = []
255
-
 
 
 
 
256
  for result in search_results:
257
  link = result.get("link", "")
258
- title = result.get("title", "")
259
-
260
- # Validate LinkedIn profile URL
261
- if self._is_valid_linkedin_url(link):
262
- urls.append(link)
263
-
264
- # Remove duplicates while preserving order
265
- unique_urls = []
266
- seen = set()
267
- for url in urls:
268
- if url not in seen:
269
- unique_urls.append(url)
270
- seen.add(url)
271
 
272
- logger.info("extracted_linkedin_urls", extra={"count": len(unique_urls)})
273
- return unique_urls
 
 
 
 
 
274
 
275
  def extract_github_urls(self, search_results: list[dict[str, Any]]) -> list[str]:
276
  """Extract GitHub profile URLs from search results"""
277
- urls = []
278
-
279
- for result in search_results:
280
- link = result.get("link", "")
281
-
282
- if self._is_valid_github_url(link):
283
- urls.append(link)
284
-
285
- unique_urls = list(dict.fromkeys(urls)) # Remove duplicates
286
- logger.info("extracted_github_urls", extra={"count": len(unique_urls)})
287
- return unique_urls
288
 
289
  def _is_valid_linkedin_url(self, url: str) -> bool:
290
  """Validate LinkedIn profile URL"""
291
  if not url:
292
  return False
293
 
294
- # Check for LinkedIn profile patterns
295
- linkedin_patterns = [
296
- "linkedin.com/in/",
297
- "www.linkedin.com/in/",
298
- ]
299
-
300
- # Must contain a LinkedIn profile pattern
301
- if not any(pattern in url.lower() for pattern in linkedin_patterns):
302
- return False
303
 
304
- # Exclude directory pages, company pages, etc.
305
  exclude_patterns = [
306
  "/company/",
307
  "/school/",
@@ -309,109 +400,124 @@ class URLExtractor:
309
  "/jobs/",
310
  "/directory/",
311
  "/pub/dir/",
312
- "linkedin.com/pub/",
313
  "/learning/",
314
  "/events/",
315
  "/posts/",
316
  ]
317
 
 
318
  if any(pattern in url.lower() for pattern in exclude_patterns):
319
  return False
320
 
321
- return True
 
322
 
323
  def _is_valid_github_url(self, url: str) -> bool:
324
  """Validate GitHub profile URL"""
325
  if not url:
326
  return False
327
 
328
- # Check for GitHub profile patterns
329
- github_patterns = ["github.com/", "www.github.com/"]
 
 
 
 
330
 
331
- if not any(pattern in url.lower() for pattern in github_patterns):
332
- return False
333
-
334
- # Exclude repository pages, organization pages, etc.
335
  exclude_patterns = [
336
- "/repos/",
337
- "/orgs/",
338
- "/gist/",
339
- "/issues/",
340
- "/pulls/",
341
- "/wiki/",
342
- "/releases/",
343
  "/tree/",
344
  "/blob/",
345
  "/commit/",
 
 
 
 
 
 
 
 
 
 
 
 
346
  "/compare/",
 
 
 
 
 
 
 
 
347
  ]
348
-
349
  if any(pattern in url.lower() for pattern in exclude_patterns):
350
  return False
 
351
 
352
- # Simple check: should be github.com/username format
353
- parts = url.replace("https://", "").replace("http://", "").split("/")
354
- if len(parts) >= 2 and parts[0] in ["github.com", "www.github.com"]:
355
- username = parts[1]
356
- # Basic username validation
357
- if (
358
- len(username) > 0
359
- and not username.startswith("-")
360
- and not username.endswith("-")
361
- ):
362
- return True
363
-
364
- return False
365
-
366
-
367
- # Client factory for easy instantiation
368
- class APIClientFactory:
369
- """Factory for creating API clients"""
370
-
371
- @staticmethod
372
- def create_serpapi_client() -> SerpAPIClient:
373
- return SerpAPIClient()
374
 
375
- @staticmethod
376
- def create_linkedin_client() -> LinkedInProfileClient:
377
- return LinkedInProfileClient()
378
-
379
- @staticmethod
380
- def create_github_client() -> GitHubClient:
381
- return GitHubClient()
382
-
383
- @staticmethod
384
- def create_url_extractor() -> URLExtractor:
385
- return URLExtractor()
386
-
387
-
388
- # Example usage
389
  async def test_clients():
390
  """Test function for API clients"""
391
-
392
- # Test SerpAPI
393
- serpapi = APIClientFactory.create_serpapi_client()
394
- query = SearchQuery(
395
- query='site:linkedin.com/in "software engineer" python', platform="linkedin"
396
- )
397
-
398
- try:
399
- results = await serpapi.search(query)
400
- print(f"Found {len(results)} search results")
401
-
402
- # Test URL extraction
403
- extractor = APIClientFactory.create_url_extractor()
404
- linkedin_urls = extractor.extract_linkedin_urls(results)
405
- print(f"Extracted {len(linkedin_urls)} LinkedIn URLs")
406
-
407
- # Test profile fetching
408
- if linkedin_urls:
409
- linkedin_client = APIClientFactory.create_linkedin_client()
410
- profile_data = await linkedin_client.fetch_profile(linkedin_urls[0])
411
- print(f"Profile fetch result: {profile_data is not None}")
412
-
413
- except Exception as e:
414
- print(f"Test failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
 
417
  if __name__ == "__main__":
 
1
  import asyncio
2
  import httpx
3
+ import json
4
+ import re
5
+ import traceback
6
  from typing import Any, Optional
7
  from fake_useragent import UserAgent
8
  from tenacity import retry, stop_after_attempt, wait_exponential
 
9
 
10
  from app.config import settings
11
  from app.models.schemas import SearchQuery
 
48
  """Client for SerpAPI Google search"""
49
 
def __init__(self):
    """Create the shared HTTP client and the concurrency limiter."""
    # httpx applies a 5 second timeout unless told otherwise, and a search
    # can take close to that long. Use the same 30 s budget as the
    # LinkedIn client so slow searches are not cut off and retried.
    self._client = httpx.AsyncClient(
        base_url="https://serpapi.com/search",
        timeout=30.0,
    )
    # Caps the number of requests this client has in flight at once.
    self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)

# Async context manager support so the httpx client is always closed.
async def __aenter__(self):
    """Return the client itself for use inside ``async with``."""
    return self

async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the underlying httpx client when the ``async with`` block ends."""
    await self._client.aclose()
60
+
@retry(
    stop=stop_after_attempt(settings.max_retries),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    reraise=True,  # Re-raise the underlying exception once retries run out
)
async def _execute_search_request(
    self, params: dict[str, Any]
) -> dict[str, Any]:
    """
    Execute a single SerpAPI request, retrying on failure.

    Args:
        params: SerpAPI query parameters (``q``, ``engine``, ``num``, ...).
            The API key is added to a copy, so the caller's dict is left
            untouched.

    Returns:
        The full decoded JSON response. Search hits, when present, are
        under the ``organic_results`` key.

    Raises:
        httpx.HTTPStatusError: the API answered with a 4xx/5xx status.
        httpx.RequestError: the request could not be completed.
        json.JSONDecodeError: the response body was not valid JSON.
    """
    async with self.semaphore:
        query = params.get("q")
        # Pre-initialised so the JSON error handler can refer to it safely.
        response: Optional[httpx.Response] = None
        try:
            # Copy instead of mutating: the key must not leak into the
            # caller's parameter dict.
            request_params = {**params, "api_key": settings.serpapi_key}

            logger.info(
                "serpapi_request",
                extra={
                    "query": query,
                    "platform": params.get("platform"),
                },
            )

            response = await self._client.get(
                url="https://serpapi.com/search", params=request_params
            )
            response.raise_for_status()  # Turn 4xx/5xx into HTTPStatusError

            result = response.json()
            organic_results = result.get("organic_results", [])
            if not organic_results:
                logger.warning(
                    f"Did not get any organic results for {query}"
                )

            logger.info(
                "serpapi_response",
                extra={
                    "results_count": len(organic_results),
                    "query": query,
                },
            )
            return result

        except httpx.HTTPStatusError as e:
            logger.exception(
                "serpapi_http_error",
                extra={
                    "error": str(e),
                    "query": query,
                    "status_code": e.response.status_code,
                },
            )
            raise
        except httpx.RequestError as e:
            logger.exception(
                "serpapi_request_error",
                extra={"error": str(e), "query": query},
            )
            raise
        except json.JSONDecodeError as e:
            logger.exception(
                "serpapi_json_decode_error",
                extra={
                    "error": str(e),
                    "query": query,
                    "response_text": (
                        response.text if response is not None else "N/A"
                    ),
                },
            )
            raise
        except Exception as e:
            logger.exception(
                "serpapi_unexpected_error",
                extra={"error": str(e), "query": query},
            )
            raise
140
 
async def search(
    self, query_string: str, max_results: int = 20
) -> dict[str, Any]:
    """
    Execute a single search query via SerpAPI.

    Args:
        query_string: The raw query text, expected to be a Google dork query.
        max_results: Number of results requested from SerpAPI (``num``).

    Returns:
        The full JSON response as a dict, not just the hits. Callers read
        the hits from its ``organic_results`` key.
    """
    params = {
        "q": query_string,
        "engine": "google",
        "num": max_results,
        "start": 0,
        "gl": "us",  # Country
        "hl": "en",  # Language
    }
    return await self._execute_search_request(params)
159
+
160
  async def batch_search(
161
  self, queries: list[SearchQuery]
162
  ) -> dict[str, list[dict[str, Any]]]:
163
  """Execute multiple search queries concurrently"""
164
  tasks = []
165
+ for query_obj in queries:
166
+ params = {
167
+ "q": query_obj.query,
168
+ "engine": "google",
169
+ "num": query_obj.max_results,
170
+ "start": 0,
171
+ "gl": "us",
172
+ "hl": "en",
173
+ }
174
+ task = asyncio.create_task(self._execute_search_request(params))
175
+ tasks.append((query_obj.query, task))
176
 
177
  # Add delay between requests to avoid rate limiting
178
+ # This applies to the *initiation* of requests, not their completion.
179
  await asyncio.sleep(settings.request_delay)
180
 
181
  results = {}
182
  for query_text, task in tasks:
183
  try:
184
+ full_result = await task
185
+ results[query_text] = full_result
186
  except Exception as e:
187
+ logger.exception(
188
+ "batch_search_item_error",
189
+ extra={"query": query_text, "error": str(e)},
190
  )
191
  results[query_text] = []
192
 
 
194
 
195
 
196
  class LinkedInProfileClient:
197
+ """Client for fetching LinkedIn profile data from RapidAPI"""
198
 
def __init__(self):
    """Set up the RapidAPI HTTP client and the concurrency limiter."""
    # Headers sent with every request made through this client.
    rapidapi_headers = {
        "x-rapidapi-host": "fresh-linkedin-profile-data.p.rapidapi.com",
        "x-rapidapi-key": settings.rapidapi_api_key,
    }
    self._client = httpx.AsyncClient(
        base_url="https://fresh-linkedin-profile-data.p.rapidapi.com",
        headers=rapidapi_headers,
        timeout=30.0,
    )
    # Caps the number of profile fetches in flight at once.
    self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)

async def __aenter__(self):
    """Enter ``async with``; the client object itself is the resource."""
    return self

async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave ``async with`` by closing the underlying httpx client."""
    await self._client.aclose()
215
+
@retry(
    stop=stop_after_attempt(settings.max_retries),
    wait=wait_exponential(multiplier=1, min=2, max=8),
    reraise=True,  # Re-raise the underlying exception once retries run out
)
async def fetch_profile(self, linkedin_url: str) -> Optional[dict[str, Any]]:
    """
    Fetch LinkedIn profile data from RapidAPI.

    Args:
        linkedin_url: Profile URL, sent as the ``linkedin_url`` parameter.

    Returns:
        The decoded JSON body of the ``/get-linkedin-profile`` response.

    Raises:
        httpx.HTTPStatusError: the API answered with a 4xx/5xx status.
        httpx.RequestError: the request could not be completed.
        json.JSONDecodeError: the response body was not valid JSON.
    """
    async with self.semaphore:
        # Pre-initialised so the JSON error handler can refer to it safely.
        response: Optional[httpx.Response] = None
        try:
            # Only the company public URL is requested; every other
            # optional section is switched off.
            params = {
                "linkedin_url": linkedin_url,
                "include_skills": "false",
                "include_certifications": "false",
                "include_publications": "false",
                "include_honors": "false",
                "include_volunteers": "false",
                "include_projects": "false",
                "include_patents": "false",
                "include_courses": "false",
                "include_organizations": "false",
                "include_profile_status": "false",
                "include_company_public_url": "true",
            }

            logger.debug("Starting RapidAPI LinkedIn profile fetch.")

            response = await self._client.get(
                url="/get-linkedin-profile", params=params
            )
            response.raise_for_status()

            profile_data = response.json()
            logger.debug(
                "Got RapidAPI LinkedIn profile response",
                extra={"status_code": response.status_code},
            )

            return profile_data

        except httpx.HTTPStatusError as e:
            logger.exception(
                "RapidAPI LinkedIn fetch HTTP error",
                extra={
                    "linkedin_url": linkedin_url,
                    "status_code": e.response.status_code,
                    "response_text": e.response.text,
                    "error": str(e),
                },
            )
            raise

        except httpx.RequestError as e:
            logger.exception(
                "RapidAPI LinkedIn fetch request error",
                extra={"linkedin_url": linkedin_url, "error": str(e)},
            )
            raise

        except json.JSONDecodeError as e:
            logger.exception(
                "RapidAPI LinkedIn JSON decode error",
                extra={
                    "linkedin_url": linkedin_url,
                    "error": str(e),
                    "response_text": (
                        response.text if response is not None else "N/A"
                    ),
                },
            )
            raise
        except Exception:
            logger.exception(
                "Failed to fetch LinkedIn profile (unexpected error)",
                extra={"linkedin_url": linkedin_url},
            )
            raise
292
 
 
299
  task = asyncio.create_task(self.fetch_profile(url))
300
  tasks.append((url, task))
301
 
302
+ await asyncio.sleep(5.0)
 
303
 
304
  results = {}
305
  for url, task in tasks:
 
308
  results[url] = result
309
  except Exception:
310
  logger.exception(
311
+ "Batch LinkedIn profile fetch failed for URL",
312
+ extra={"linkedin_url": url},
313
  )
314
  results[url] = None
315
 
 
317
 
318
 
class GitHubClient:
    """Client for fetching GitHub profile HTML"""

    def __init__(self):
        """Create the HTTP client (20 s timeout) and the concurrency limiter."""
        self._client = httpx.AsyncClient(timeout=20.0)
        self.semaphore = asyncio.Semaphore(settings.max_concurrent_requests)

    async def __aenter__(self):
        """Enter ``async with``; the client object itself is the resource."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Leave ``async with`` by closing the underlying httpx client."""
        await self._client.aclose()

    async def fetch_github_profile_html(self, github_url: str) -> Optional[str]:
        """
        Download the page at *github_url* and return its HTML text.

        Best effort: any failure is logged and reported as ``None``
        rather than raised.
        """
        async with self.semaphore:
            try:
                page = await self._client.get(github_url)
                page.raise_for_status()
                html = page.text
                return html
            except httpx.HTTPStatusError as status_error:
                # The server answered, but with a 4xx/5xx status.
                status = status_error.response.status_code
                logger.warning(f"Failed to fetch {github_url}: HTTP {status}")
                return None
            except httpx.RequestError as request_error:
                # The request itself could not be completed.
                logger.warning(f"Request error for {github_url}: {request_error}")
                return None
            except Exception:
                logger.exception(f"Unexpected error fetching {github_url}")
                return None
350
 
351
 
352
  class URLExtractor:
353
+ """Extracts specific profile URLs from search results"""
354
+
def _extract_and_filter_urls(
    self, search_results: list[dict[str, Any]], validator_func
) -> list[str]:
    """
    Collect the ``link`` of every search result accepted by *validator_func*.

    Args:
        search_results: Result dicts; a missing ``link`` key is treated as
            an empty string.
        validator_func: Callable taking a URL and returning a truthy value
            when the URL should be kept.

    Returns:
        The accepted URLs without duplicates, in the order they first
        appear in *search_results*.
    """
    candidate_links = (result.get("link", "") for result in search_results)
    # dict.fromkeys de-duplicates while keeping first-seen order. A set
    # would return the URLs in arbitrary order, so callers that take the
    # first URL would get an unpredictable profile.
    return list(
        dict.fromkeys(link for link in candidate_links if validator_func(link))
    )
 
 
 
 
 
 
 
 
 
 
368
 
def extract_linkedin_urls(self, search_results: list[dict[str, Any]]) -> list[str]:
    """Return the LinkedIn profile URLs found in *search_results*."""
    is_profile = self._is_valid_linkedin_url
    profile_urls = self._extract_and_filter_urls(search_results, is_profile)
    # Record how many profile URLs survived filtering.
    logger.info("extracted_linkedin_urls", extra={"count": len(profile_urls)})
    return profile_urls
376
 
def extract_github_urls(self, search_results: list[dict[str, Any]]) -> list[str]:
    """Return the GitHub profile URLs found in *search_results*."""
    is_profile = self._is_valid_github_url
    profile_urls = self._extract_and_filter_urls(search_results, is_profile)
    # Record how many profile URLs survived filtering.
    logger.info("extracted_github_urls", extra={"count": len(profile_urls)})
    return profile_urls
 
 
 
 
 
 
 
 
382
 
383
  def _is_valid_linkedin_url(self, url: str) -> bool:
384
  """Validate LinkedIn profile URL"""
385
  if not url:
386
  return False
387
 
388
+ # Use regex for more precise validation of /in/ or /pub/ type profiles
389
+ # Allowing for optional trailing slash or query parameters
390
+ linkedin_profile_pattern = re.compile(
391
+ r"https?:\/\/(?:www\.)?linkedin\.com\/(?:in|pub)\/[a-zA-Z0-9_-]+\/?(?:[?#].*)?$",
392
+ re.IGNORECASE,
393
+ )
 
 
 
394
 
395
+ # Exclude common non-profile pages
396
  exclude_patterns = [
397
  "/company/",
398
  "/school/",
 
400
  "/jobs/",
401
  "/directory/",
402
  "/pub/dir/",
 
403
  "/learning/",
404
  "/events/",
405
  "/posts/",
406
  ]
407
 
408
+ # Check against exclude patterns first (case-insensitive)
409
  if any(pattern in url.lower() for pattern in exclude_patterns):
410
  return False
411
 
412
+ # Final check using the profile pattern
413
+ return bool(linkedin_profile_pattern.match(url))
414
 
def _is_valid_github_url(self, url: str) -> bool:
    """
    Validate GitHub profile URL.

    A URL is accepted when it is ``http(s)://[www.]github.com/<name>``
    with an optional trailing slash, query string or fragment, where
    ``<name>`` is 1-39 alphanumeric/hyphen characters that neither starts
    nor ends with a hyphen and is not a site-level route such as
    ``topics`` or ``explore``.

    Args:
        url: Candidate URL; empty or ``None`` is rejected.

    Returns:
        True if *url* looks like a root-level profile page, else False.
    """
    if not url:
        return False

    # Path fragments that only occur on repository or site pages. Matched
    # anywhere in the URL (query string included), case-insensitively.
    exclude_patterns = (
        "/tree/",
        "/blob/",
        "/commit/",
        "/pull/",
        "/issues/",
        "/wiki/",
        "/releases/",
        "/actions/",
        "/projects/",
        "/settings/",
        "/new/",
        "/organizations/",
        "/users/",
        "/stars/",
        "/forks/",
        "/compare/",
        "/marketplace/",
        "/topics/",
        "/collections/",
        "/trending/",
        "/explore/",
        "/codespaces/",
        "/sponsors/",
        "/notifications/",
    )
    lowered = url.lower()  # computed once instead of once per pattern
    if any(pattern in lowered for pattern in exclude_patterns):
        return False

    # Single path segment only, so repository URLs (owner/repo) never match.
    profile_match = re.match(
        r"https?://(?:www\.)?github\.com/"
        r"([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,37}[a-zA-Z0-9])?)"
        r"/?(?:[?#].*)?$",
        url,
        re.IGNORECASE,
    )
    if profile_match is None:
        return False

    # Site-level routes have the same shape as a profile URL when written
    # without a trailing slash (e.g. https://github.com/topics), which the
    # substring check above cannot catch.
    reserved_names = {
        "settings",
        "new",
        "orgs",
        "organizations",
        "users",
        "stars",
        "marketplace",
        "topics",
        "collections",
        "trending",
        "explore",
        "codespaces",
        "sponsors",
        "notifications",
    }
    return profile_match.group(1).lower() not in reserved_names
457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
async def test_clients():
    """
    Smoke-test the API clients end to end and save their output as fixtures.

    Runs one LinkedIn and one GitHub search through SerpAPI, extracts the
    profile URLs, fetches the first profile of each kind, and writes every
    intermediate result under ``test/data``. Any failure is printed with
    its traceback instead of being raised.
    """
    from pathlib import Path  # local import: only this manual test needs it

    data_dir = Path("test/data")

    def save_text(name: str, text: str) -> None:
        # Explicit UTF-8: scraped HTML routinely contains characters that
        # the platform default encoding cannot represent.
        (data_dir / name).write_text(text, encoding="utf-8")

    def save_json(name: str, payload) -> None:
        save_text(name, json.dumps(payload, indent=4))

    async with SerpAPIClient() as serpapi:
        try:
            # Make sure the fixture directory exists on a fresh checkout.
            data_dir.mkdir(parents=True, exist_ok=True)

            linkedin_search = await serpapi.search(
                'site:linkedin.com/in "ml research" "llm" "mountain view" -student -intern -"recent graduate"'
            )
            github_search = await serpapi.search(
                'site:github.com "pytorch" "transformers" "nlp" followers:>10 repos:>5'
            )

            save_json("linkedin_search.json", linkedin_search)
            save_json("github_search.json", github_search)

            # Extract organic_results from the full response
            linkedin_results = linkedin_search.get("organic_results", [])
            github_results = github_search.get("organic_results", [])

            print(f"Found {len(linkedin_results)} lnkd search results")
            print(f"Found {len(github_results)} gh search results")

            extractor = URLExtractor()
            linkedin_urls = extractor.extract_linkedin_urls(linkedin_results)
            github_urls = extractor.extract_github_urls(github_results)

            # Store extracted URLs, one per line
            save_text(
                "linkedin_urls.txt", "".join(f"{url}\n" for url in linkedin_urls)
            )
            save_text(
                "github_urls.txt", "".join(f"{url}\n" for url in github_urls)
            )

            print(f"Extracted {len(linkedin_urls)} LinkedIn URLs")
            print(f"Extracted {len(github_urls)} GH URLs")

            if linkedin_urls:
                async with LinkedInProfileClient() as linkedin_client:
                    profile_data = await linkedin_client.fetch_profile(linkedin_urls[0])
                    # Store fetched LinkedIn profile data
                    save_json("linkedin_profile.json", profile_data)
                    print(f"Profile fetch result: {profile_data is not None}")

            if github_urls:
                async with GitHubClient() as github_client:
                    github_data = await github_client.fetch_github_profile_html(
                        github_urls[0]
                    )
                    # Store fetched GitHub profile HTML
                    if github_data is not None:
                        save_text("github_profile.html", github_data)
                    print(f"GH profile fetched: {github_data is not None}")
                    if github_data:
                        print(f"gh profile html len {len(github_data)}")

        except Exception as e:
            print(f"Test failed: {e}")
            traceback.print_exc()
521
 
522
 
523
  if __name__ == "__main__":
test/data/github_profile.html ADDED
The diff for this file is too large to render. See raw diff
 
test/data/github_search.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "search_metadata": {
3
+ "id": "6862ba316adf27aef7fa1728",
4
+ "status": "Success",
5
+ "json_endpoint": "https://serpapi.com/searches/19d6b92708c05940/6862ba316adf27aef7fa1728.json",
6
+ "created_at": "2025-06-30 16:24:17 UTC",
7
+ "processed_at": "2025-06-30 16:24:17 UTC",
8
+ "google_url": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=0&sourceid=chrome&ie=UTF-8",
9
+ "raw_html_file": "https://serpapi.com/searches/19d6b92708c05940/6862ba316adf27aef7fa1728.html",
10
+ "total_time_taken": 4.53
11
+ },
12
+ "search_parameters": {
13
+ "engine": "google",
14
+ "q": "site:github.com \"pytorch\" \"transformers\" \"nlp\" followers:>10 repos:>5",
15
+ "google_domain": "google.com",
16
+ "hl": "en",
17
+ "gl": "us",
18
+ "start": 0,
19
+ "num": "20",
20
+ "device": "desktop"
21
+ },
22
+ "search_information": {
23
+ "query_displayed": "site:github.com \"pytorch\" \"transformers\" \"nlp\" followers:>10 repos:>5",
24
+ "total_results": 2,
25
+ "time_taken_displayed": 0.25,
26
+ "organic_results_state": "Results for exact spelling"
27
+ },
28
+ "organic_results": [
29
+ {
30
+ "position": 1,
31
+ "title": "Awesome Transformer & Transfer Learning in NLP Awesome",
32
+ "link": "https://github.com/cedrickchee/awesome-transformer-nlp",
33
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://github.com/cedrickchee/awesome-transformer-nlp&ved=2ahUKEwjR9Ouux5mOAxXLRTABHRUfMfAQFnoECBoQAQ",
34
+ "displayed_link": "https://github.com \u203a cedrickchee \u203a awesome-transforme...",
35
+ "favicon": "https://serpapi.com/searches/6862ba316adf27aef7fa1728/images/f4c4dd624bf17ff4927fea0004812b7ae14156863c39ff22ad6d01ea9993e985.png",
36
+ "snippet": "How the Transformers broke NLP leaderboards by Anna Rogers. ... PyTorch and TensorFlow. Hugging Face Transformers (formerly known as pytorch-transformers ...",
37
+ "snippet_highlighted_words": [
38
+ "Transformers",
39
+ "NLP",
40
+ "PyTorch",
41
+ "Transformers",
42
+ "pytorch",
43
+ "transformers"
44
+ ],
45
+ "source": "GitHub"
46
+ },
47
+ {
48
+ "position": 2,
49
+ "title": "Hugging Face",
50
+ "link": "https://github.com/huggingface",
51
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://github.com/huggingface&ved=2ahUKEwjR9Ouux5mOAxXLRTABHRUfMfAQFnoECB0QAQ",
52
+ "displayed_link": "https://github.com \u203a huggingface",
53
+ "favicon": "https://serpapi.com/searches/6862ba316adf27aef7fa1728/images/f4c4dd624bf17ff4927fea0004812b7a1c6fc5dbd006969ca023f9f5e48fb80d.png",
54
+ "snippet": "Showing 10 of 327 repositories. transformers Public. Transformers: the ... pytorch machine-learning nlp transformers deep-learning \u00b7 GitHub Sponsor ...",
55
+ "snippet_highlighted_words": [
56
+ "10",
57
+ "repositories",
58
+ "transformers",
59
+ "Transformers",
60
+ "pytorch",
61
+ "nlp transformers"
62
+ ],
63
+ "sitelinks": {
64
+ "inline": [
65
+ {
66
+ "title": "Transformers",
67
+ "link": "https://github.com/huggingface/transformers"
68
+ },
69
+ {
70
+ "title": "Repositories",
71
+ "link": "https://github.com/orgs/huggingface/repositories"
72
+ },
73
+ {
74
+ "title": "Huggingface/datasets",
75
+ "link": "https://github.com/huggingface/datasets"
76
+ },
77
+ {
78
+ "title": "Diffusers",
79
+ "link": "https://github.com/huggingface/diffusers"
80
+ }
81
+ ]
82
+ },
83
+ "source": "GitHub"
84
+ }
85
+ ],
86
+ "related_searches": [
87
+ {
88
+ "block_position": 1,
89
+ "query": "Pytorch transformers nlp followers :> 10 repos :> 5 github",
90
+ "link": "https://www.google.com/search?num=20&sca_esv=6d27a280cc9bfc7a&hl=en&gl=us&q=Pytorch+transformers+nlp+followers+:%3E+10+repos+:%3E+5+github&sa=X&ved=2ahUKEwjR9Ouux5mOAxXLRTABHRUfMfAQ1QJ6BAgSEAE",
91
+ "serpapi_link": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=Pytorch+transformers+nlp+followers+%3A%3E+10+repos+%3A%3E+5+github&start=0"
92
+ },
93
+ {
94
+ "block_position": 1,
95
+ "query": "Pytorch transformers nlp followers :> 10 repos :> 5 example",
96
+ "link": "https://www.google.com/search?num=20&sca_esv=6d27a280cc9bfc7a&hl=en&gl=us&q=Pytorch+transformers+nlp+followers+:%3E+10+repos+:%3E+5+example&sa=X&ved=2ahUKEwjR9Ouux5mOAxXLRTABHRUfMfAQ1QJ6BAgREAE",
97
+ "serpapi_link": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=Pytorch+transformers+nlp+followers+%3A%3E+10+repos+%3A%3E+5+example&start=0"
98
+ }
99
+ ],
100
+ "pagination": {
101
+ "current": 1,
102
+ "next": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=20&sourceid=chrome&ie=UTF-8",
103
+ "other_pages": {
104
+ "2": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=20&sourceid=chrome&ie=UTF-8",
105
+ "3": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=40&sourceid=chrome&ie=UTF-8",
106
+ "4": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=60&sourceid=chrome&ie=UTF-8",
107
+ "5": "https://www.google.com/search?q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&oq=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&hl=en&gl=us&num=20&start=80&sourceid=chrome&ie=UTF-8"
108
+ }
109
+ },
110
+ "serpapi_pagination": {
111
+ "current": 1,
112
+ "next_link": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=20",
113
+ "next": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=20",
114
+ "other_pages": {
115
+ "2": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=20",
116
+ "3": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=40",
117
+ "4": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=60",
118
+ "5": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Agithub.com+%22pytorch%22+%22transformers%22+%22nlp%22+followers%3A%3E10+repos%3A%3E5&start=80"
119
+ }
120
+ }
121
+ }
test/data/github_urls.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://github.com/huggingface
test/data/linkedin_profile.json ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "about": "Experienced applied scientist with demonstrated success developing and leveraging cutting-edge AI technology with a Gen AI focus to solve real-world problems. Adept at quickly building proof-of-concepts of latest AI/ML research and transitioning them into deployable solutions to deliver business value. Proven track record in applied AI research and innovation with publications in top-tier AI journals, best paper award, and over 1000 citations. Proficient in communicating complex technical concepts and results to diverse audiences.",
4
+ "city": "Houston",
5
+ "company": "NVIDIA",
6
+ "company_description": "Since its founding in 1993, NVIDIA (NASDAQ: NVDA) has been a pioneer in accelerated computing. The company\u2019s invention of the GPU in 1999 sparked the growth of the PC gaming market, redefined computer graphics, ignited the era of modern AI and is fueling the creation of the metaverse. NVIDIA is now a full-stack computing company with data-center-scale offerings that are reshaping industry.",
7
+ "company_domain": "nvidia.com",
8
+ "company_employee_count": 42856,
9
+ "company_employee_range": "10001+",
10
+ "company_industry": "Computer Hardware Manufacturing",
11
+ "company_linkedin_url": "https://www.linkedin.com/company/nvidia",
12
+ "company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQGV36q2EowSyw/company-logo_400_400/company-logo_400_400/0/1724881581208/nvidia_logo?e=1756944000&v=beta&t=hWYnbN29CDwsfzVnmcC87Jh2-TqH_oIPymyMkRXfPJk",
13
+ "company_website": "http://www.nvidia.com",
14
+ "company_year_founded": 1993,
15
+ "connection_count": 544,
16
+ "country": "United States",
17
+ "current_company_join_month": 4,
18
+ "current_company_join_year": 2024,
19
+ "current_job_duration": "1 yr 3 mos",
20
+ "educations": [
21
+ {
22
+ "activities": "",
23
+ "date_range": "2014 - 2017",
24
+ "degree": "Doctor of Philosophy (PhD)",
25
+ "end_month": "",
26
+ "end_year": 2017,
27
+ "field_of_study": "Civil/Structural Engineering",
28
+ "school": "Louisiana State University",
29
+ "school_id": "164771",
30
+ "school_linkedin_url": "https://www.linkedin.com/company/164771/",
31
+ "school_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQEhy54pBgGE9g/company-logo_200_200/company-logo_200_200/0/1681160348853/louisiana_state_university_logo?e=1756944000&v=beta&t=Uz-JKcul07L8O2pEk1nV7ImIGLtw3Dr8thHISsfqSJA",
32
+ "start_month": "",
33
+ "start_year": 2014
34
+ },
35
+ {
36
+ "activities": "",
37
+ "date_range": "2010 - 2014",
38
+ "degree": "Bachelor's degree",
39
+ "end_month": "",
40
+ "end_year": 2014,
41
+ "field_of_study": "Civil Engineering",
42
+ "school": "Hunan University",
43
+ "school_id": "91038",
44
+ "school_linkedin_url": "https://www.linkedin.com/company/91038/",
45
+ "school_logo_url": "https://media.licdn.com/dms/image/v2/C510BAQHo9p4G79briw/company-logo_200_200/company-logo_200_200/0/1630564627893/hunan_university_logo?e=1756944000&v=beta&t=mtLHSeTKy8TSfjLFaL5RuXMXzhbJ4U8bTkKPdO_XGrk",
46
+ "start_month": "",
47
+ "start_year": 2010
48
+ }
49
+ ],
50
+ "email": "",
51
+ "experiences": [
52
+ {
53
+ "company": "NVIDIA",
54
+ "company_id": "3608",
55
+ "company_linkedin_url": "https://www.linkedin.com/company/3608",
56
+ "company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQGV36q2EowSyw/company-logo_200_200/company-logo_200_200/0/1724881581208/nvidia_logo?e=1756944000&v=beta&t=6JH-MY7mJHJkjhcK60TGYlKcL0qKpSvGS_R2l6mE-Kw",
57
+ "company_public_url": "https://www.linkedin.com/company/nvidia",
58
+ "date_range": "Apr 2024 - Present",
59
+ "description": "",
60
+ "duration": "1 yr 3 mos",
61
+ "end_month": "",
62
+ "end_year": "",
63
+ "is_current": true,
64
+ "job_type": "Full-time",
65
+ "location": "Remote",
66
+ "skills": "",
67
+ "start_month": 4,
68
+ "start_year": 2024,
69
+ "title": "Senior Data Scientist, Gen AI"
70
+ },
71
+ {
72
+ "company": "Raytheon Technologies",
73
+ "company_id": "40653509",
74
+ "company_linkedin_url": "https://www.linkedin.com/company/40653509",
75
+ "company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQFITBtnCzFO9w/company-logo_200_200/company-logo_200_200/0/1687169432565/raytheontechnologies_logo?e=1756944000&v=beta&t=eEmqdsWL-7uKO6qOuFCxhUKQea4KCv4P4c3BorEP4-c",
76
+ "company_public_url": "https://www.linkedin.com/company/rtx",
77
+ "date_range": "Aug 2022 - Apr 2024",
78
+ "description": "At Applied AI team in Enterprise Data Services. Lead the technical development of key Generative AI projects to improve productivity and enhance efficiency across organizations, including:\n\nBuilt and optimized retrieval-augmented generation (RAG) pipeline for RTX's enterprise-level ChatGPT using Azure and OpenAI services to enable interacting with company internal data via chat.\n\nDeveloped Gen AI-powered tools to streamline document review process using open-source LLMs hosted on-premise and optimized LLM inference.\n\nCreated rapidly deployable computer vision solution for automating visual inspection of industrial components using multi-modal (vision-language) models without task-specific training, reducing onboarding time for new inspection programs.\n\nDeveloped novel deep learning framework for data-driven augmentation of physics-based models using generative adversarial network (GAN) for improving jet engine design.",
79
+ "duration": "1 yr 9 mos",
80
+ "end_month": 4,
81
+ "end_year": 2024,
82
+ "is_current": false,
83
+ "job_type": "Full-time",
84
+ "location": "Remote",
85
+ "skills": "Python (Programming Language) \u00b7 Machine Learning \u00b7 Large Language Models (LLM) \u00b7 Deep Learning \u00b7 Computer Vision \u00b7 Data Science \u00b7 PyTorch \u00b7 Natural Language Processing (NLP) \u00b7 Generative AI \u00b7 Information Retrieval \u00b7 OpenAI \u00b7 Azure Databricks \u00b7 TensorFlow \u00b7 Azure \u00b7 Prompt Engineering",
86
+ "start_month": 8,
87
+ "start_year": 2022,
88
+ "title": "Senior Applied Scientist, Generative AI"
89
+ },
90
+ {
91
+ "company": "Penn State University",
92
+ "company_id": "3657",
93
+ "company_linkedin_url": "https://www.linkedin.com/company/3657",
94
+ "company_logo_url": "https://media.licdn.com/dms/image/v2/C560BAQEFv0lqpE-gsQ/company-logo_200_200/company-logo_200_200/0/1631308876392?e=1756944000&v=beta&t=XHqHEPjKP8Y_NtwerKGr0g4oTkBBK7gbUgdUxs5m73U",
95
+ "company_public_url": "https://www.linkedin.com/school/penn-state-university/",
96
+ "date_range": "Mar 2021 - Aug 2022",
97
+ "description": "Developed probabilistic deep learning framework for differential equation discovery, implemented it using Tensorflow, and published research work in top-tier AI journal. Wrote grant proposals and led research team of Ph.D. students to develop novel AI/ML methods for statistical model calibration.",
98
+ "duration": "1 yr 6 mos",
99
+ "end_month": 8,
100
+ "end_year": 2022,
101
+ "is_current": false,
102
+ "job_type": "",
103
+ "location": "University Park, Pennsylvania, United States",
104
+ "skills": "Python (Programming Language) \u00b7 Machine Learning \u00b7 Deep Learning \u00b7 Data Science \u00b7 Statistical Modeling \u00b7 TensorFlow",
105
+ "start_month": 3,
106
+ "start_year": 2021,
107
+ "title": "Assistant Research Professor, Applied AI"
108
+ },
109
+ {
110
+ "company": "Arizona State University",
111
+ "company_id": "4292",
112
+ "company_linkedin_url": "https://www.linkedin.com/company/4292",
113
+ "company_logo_url": "https://media.licdn.com/dms/image/v2/C560BAQHDGjY1IZJuog/company-logo_200_200/company-logo_200_200/0/1631309406468?e=1756944000&v=beta&t=5zshFEI8NfV0B7jpay03j-VLchpTh_bc-Xm_4UMFKLM",
114
+ "company_public_url": "https://www.linkedin.com/school/arizona-state-university/",
115
+ "date_range": "Jan 2018 - Mar 2021",
116
+ "description": "Developed hybrid physics-AI model for dynamical system modeling, simulation and identification, and published research work in top-tier AI journals. Wrote grant proposals and served as Co-PI for USDOT-funded project to develop computer-vision based solution and prototype robotic platform for gas pipeline anomaly detection.",
117
+ "duration": "3 yrs 3 mos",
118
+ "end_month": 3,
119
+ "end_year": 2021,
120
+ "is_current": false,
121
+ "job_type": "",
122
+ "location": "Tempe, Arizona, United States",
123
+ "skills": "Python (Programming Language) \u00b7 Deep Learning \u00b7 Computer Vision \u00b7 Predictive Modeling \u00b7 TensorFlow \u00b7 Uncertainty Quantification",
124
+ "start_month": 1,
125
+ "start_year": 2018,
126
+ "title": "Postdoctoral Fellow, Applied AI"
127
+ },
128
+ {
129
+ "company": "Louisiana State University",
130
+ "company_id": "164771",
131
+ "company_linkedin_url": "https://www.linkedin.com/company/164771",
132
+ "company_logo_url": "https://media.licdn.com/dms/image/v2/D560BAQEhy54pBgGE9g/company-logo_200_200/company-logo_200_200/0/1681160348853/louisiana_state_university_logo?e=1756944000&v=beta&t=Uz-JKcul07L8O2pEk1nV7ImIGLtw3Dr8thHISsfqSJA",
133
+ "company_public_url": "https://www.linkedin.com/school/louisiana-state-university/",
134
+ "date_range": "Aug 2014 - Dec 2017",
135
+ "description": "Collected, cleaned, and analyzed time series monitoring data from over 100 sensors (strain, acceleration, etc.) on I-10 Twin Span Bridge. Developed data-driven methods for extracting traffic insights from the monitoring data and built predictive model to forecast the extreme traffic loading effect on the bridge using Bayesian regression analysis and extreme value statistics, published research in peer-reviewed journals.",
136
+ "duration": "3 yrs 5 mos",
137
+ "end_month": 12,
138
+ "end_year": 2017,
139
+ "is_current": false,
140
+ "job_type": "",
141
+ "location": "Baton Rouge Metropolitan Area",
142
+ "skills": "Python (Programming Language) \u00b7 Data Science \u00b7 Predictive Modeling \u00b7 Digital Signal Processing",
143
+ "start_month": 8,
144
+ "start_year": 2014,
145
+ "title": "Research Assistant"
146
+ }
147
+ ],
148
+ "first_name": "Yang",
149
+ "follower_count": 562,
150
+ "full_name": "Yang Yu",
151
+ "headline": "Gen AI @ Nvidia",
152
+ "hq_city": "Santa Clara",
153
+ "hq_country": "US",
154
+ "hq_region": "CA",
155
+ "is_creator": false,
156
+ "is_influencer": false,
157
+ "is_premium": false,
158
+ "is_verified": true,
159
+ "job_title": "Senior Data Scientist, Gen AI",
160
+ "languages": [],
161
+ "last_name": "Yu",
162
+ "linkedin_url": "https://www.linkedin.com/in/yangyu190/",
163
+ "location": "Houston, Texas, United States",
164
+ "phone": "",
165
+ "profile_id": "375556281",
166
+ "profile_image_url": "https://media.licdn.com/dms/image/v2/C4E03AQHX39ssDV26rw/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1517381869648?e=1756944000&v=beta&t=uEiiGq_HmA9CFPtwyDFk5NbKYQd_AYCyjkaFZg-LEfY",
167
+ "public_id": "yangyu190",
168
+ "school": "Louisiana State University",
169
+ "state": "Texas",
170
+ "urn": "ACoAABZiiLkBi19TRzv41wjO9aXI9nrCJMofQfo"
171
+ },
172
+ "message": "ok"
173
+ }
test/data/linkedin_search.json ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "search_metadata": {
3
+ "id": "6862bbabeeabd394c127d48f",
4
+ "status": "Success",
5
+ "json_endpoint": "https://serpapi.com/searches/63d251104f0cfcb2/6862bbabeeabd394c127d48f.json",
6
+ "created_at": "2025-06-30 16:30:35 UTC",
7
+ "processed_at": "2025-06-30 16:30:35 UTC",
8
+ "google_url": "https://www.google.com/search?q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&oq=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&hl=en&gl=us&num=20&start=0&sourceid=chrome&ie=UTF-8",
9
+ "raw_html_file": "https://serpapi.com/searches/63d251104f0cfcb2/6862bbabeeabd394c127d48f.html",
10
+ "total_time_taken": 3.5
11
+ },
12
+ "search_parameters": {
13
+ "engine": "google",
14
+ "q": "site:linkedin.com/in \"ml research\" \"llm\" \"mountain view\" -student -intern -\"recent graduate\"",
15
+ "google_domain": "google.com",
16
+ "hl": "en",
17
+ "gl": "us",
18
+ "start": 0,
19
+ "num": "20",
20
+ "device": "desktop"
21
+ },
22
+ "search_information": {
23
+ "query_displayed": "site:linkedin.com/in \"ml research\" \"llm\" \"mountain view\" -student -intern -\"recent graduate\"",
24
+ "total_results": 92,
25
+ "time_taken_displayed": 0.22,
26
+ "organic_results_state": "Results for exact spelling"
27
+ },
28
+ "organic_results": [
29
+ {
30
+ "position": 1,
31
+ "title": "Changwan Ryu - ML/Research Engineer - Google DeepMind",
32
+ "link": "https://www.linkedin.com/in/changwan-ryu-6aba477",
33
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/changwan-ryu-6aba477&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBcQAQ",
34
+ "displayed_link": "180+ followers",
35
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab967e4ac06153bb5fc3f1fbd8c12d57c8.png",
36
+ "snippet": "ML/Research Engineer \u00b7 Experience: Google DeepMind \u00b7 Education: Korea Advanced Institute of Science and Technology \u00b7 Location: Mountain View \u00b7 172 ...",
37
+ "snippet_highlighted_words": [
38
+ "ML/Research",
39
+ "Mountain View"
40
+ ],
41
+ "rich_snippet": {
42
+ "top": {
43
+ "extensions": [
44
+ "Mountain View, California, United States",
45
+ "ML/Research Engineer",
46
+ "Google DeepMind"
47
+ ]
48
+ }
49
+ },
50
+ "source": "LinkedIn \u00b7 Changwan Ryu"
51
+ },
52
+ {
53
+ "position": 2,
54
+ "title": "Shobha Vasudevan - Google",
55
+ "link": "https://www.linkedin.com/in/shobha-vasudevan-72154b11",
56
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/shobha-vasudevan-72154b11&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBYQAQ",
57
+ "displayed_link": "1.1K+ followers",
58
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abdbfc1c2aa5e9602675e5e15d69d4b5d9.png",
59
+ "snippet": "Shobha Vasudevan is an AI/ML research scientist in Google. She has worked on LLM post\u2026 \u00b7 Experience: Google \u00b7 Education: The University of Texas at Austin ...",
60
+ "snippet_highlighted_words": [
61
+ "ML research",
62
+ "LLM"
63
+ ],
64
+ "rich_snippet": {
65
+ "top": {
66
+ "extensions": [
67
+ "Palo Alto, California, United States",
68
+ "Google"
69
+ ]
70
+ }
71
+ },
72
+ "source": "LinkedIn \u00b7 Shobha Vasudevan"
73
+ },
74
+ {
75
+ "position": 3,
76
+ "title": "Nitin Singhal - AI ML Engineer Architect - United IT",
77
+ "link": "https://www.linkedin.com/in/nitinsinghal03",
78
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/nitinsinghal03&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBgQAQ",
79
+ "displayed_link": "40+ followers",
80
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abe3091cbf47af0a11060fa652e5856d82.png",
81
+ "snippet": "... Mountain View \u00b7 40 connections on LinkedIn. View Nitin Singhal's profile on ... LLM and multimodal models. It contains sub repos by topic containing ...",
82
+ "snippet_highlighted_words": [
83
+ "Mountain View",
84
+ "LLM"
85
+ ],
86
+ "rich_snippet": {
87
+ "top": {
88
+ "extensions": [
89
+ "Mountain View, California, United States",
90
+ "AI ML Engineer Architect",
91
+ "United IT"
92
+ ]
93
+ }
94
+ },
95
+ "source": "LinkedIn \u00b7 Nitin Singhal"
96
+ },
97
+ {
98
+ "position": 4,
99
+ "title": "Roxana Rivkani - AI/ML Research Engineer",
100
+ "link": "https://www.linkedin.com/in/roxanarvk",
101
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/roxanarvk&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECB0QAQ",
102
+ "displayed_link": "1.3K+ followers",
103
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abc790cb6a6151bdf0112f0249f2923133.png",
104
+ "snippet": "AI/ML Research Engineer. NASA Ames Research Center. Jun 2023 - Present 2 years 1 month. Mountain View, California, United States. Stealth Startup Graphic. Co ...",
105
+ "snippet_highlighted_words": [
106
+ "ML Research",
107
+ "Mountain View"
108
+ ],
109
+ "rich_snippet": {
110
+ "top": {
111
+ "extensions": [
112
+ "San Francisco Bay Area",
113
+ "AI/ML Research Engineer",
114
+ "NASA Ames Research Center"
115
+ ]
116
+ }
117
+ },
118
+ "source": "LinkedIn \u00b7 Roxana Rivkani"
119
+ },
120
+ {
121
+ "position": 5,
122
+ "title": "Andy Su - ML Research - Meta",
123
+ "link": "https://www.linkedin.com/in/andyssu",
124
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/andyssu&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBkQAQ",
125
+ "displayed_link": "3.6K+ followers",
126
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab5413cd18de5a0ad5eed1827f03b46b90.png",
127
+ "snippet": "LLM researcher | andysu.org \u00b7 Experience: Meta \u00b7 Education: Princeton ... Experience. Meta Graphic. ML Research. Meta. 2024 - Present 1 year. Apple Graphic ...",
128
+ "snippet_highlighted_words": [
129
+ "LLM",
130
+ "ML Research"
131
+ ],
132
+ "rich_snippet": {
133
+ "top": {
134
+ "extensions": [
135
+ "New York City Metropolitan Area",
136
+ "ML Research",
137
+ "Meta"
138
+ ]
139
+ }
140
+ },
141
+ "source": "LinkedIn \u00b7 Andy Su"
142
+ },
143
+ {
144
+ "position": 6,
145
+ "title": "Vaskar Nath - Machine Learning Researcher - Scale AI",
146
+ "link": "https://www.linkedin.com/in/vaskarnath",
147
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/vaskarnath&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECBoQAQ",
148
+ "displayed_link": "1K+ followers",
149
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab42774fef2b747d799adfefb958279c14.png",
150
+ "snippet": "ML Research Engineer. Nuro. May 2022 - Aug 2022 4 months. Mountain View, California, United States. Localization and Mapping. Scale AI Graphic. Software ...",
151
+ "snippet_highlighted_words": [
152
+ "ML Research",
153
+ "Mountain View"
154
+ ],
155
+ "rich_snippet": {
156
+ "top": {
157
+ "extensions": [
158
+ "New York, New York, United States",
159
+ "Machine Learning Researcher",
160
+ "Scale AI"
161
+ ]
162
+ }
163
+ },
164
+ "source": "LinkedIn \u00b7 Vaskar Nath"
165
+ },
166
+ {
167
+ "position": 7,
168
+ "title": "Ming Yin - Engineering Lead - AI Agent - Cresta",
169
+ "link": "https://www.linkedin.com/in/myin8",
170
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/myin8&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCgQAQ",
171
+ "displayed_link": "1.5K+ followers",
172
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab7d58d32b3b3d8d90697271a460e8938a.png",
173
+ "snippet": "AI Agent Engineering Lead @ Cresta | Building LLM & Gen AI ... Leading engineering and ML research teams to push the boundaries of conversational AI.",
174
+ "snippet_highlighted_words": [
175
+ "LLM",
176
+ "ML research"
177
+ ],
178
+ "rich_snippet": {
179
+ "top": {
180
+ "extensions": [
181
+ "Greater Seattle Area",
182
+ "Engineering Lead",
183
+ "AI Agent",
184
+ "Cresta"
185
+ ]
186
+ }
187
+ },
188
+ "source": "LinkedIn \u00b7 Ming Yin"
189
+ },
190
+ {
191
+ "position": 8,
192
+ "title": "Di Zhou - San Francisco Bay Area | Professional Profile",
193
+ "link": "https://www.linkedin.com/in/di-zhou-57b031ab",
194
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/di-zhou-57b031ab&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECC4QAQ",
195
+ "displayed_link": "40+ followers",
196
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abda086b98580a3f10f8974db19a523e44.png",
197
+ "snippet": "Experience ; Software Engineer. Lymba. Feb 2016 \u00b7 Aug 2018 2 years 7 months ; Software Engineer. Baidu, Inc. Nov 2012 \u00b7 Aug 2014 1 year 10 months ; Software ...",
198
+ "source": "LinkedIn \u00b7 Di Zhou"
199
+ },
200
+ {
201
+ "position": 9,
202
+ "title": "Kevin H. - AI & Engineering Manager at Bluestone Analytics",
203
+ "link": "https://www.linkedin.com/in/kevin-h-a6044a18a",
204
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/kevin-h-a6044a18a&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCkQAQ",
205
+ "displayed_link": "200+ followers",
206
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abdfcf6959a576ecaa030c613812861c8c.png",
207
+ "snippet": "AI & Engineering Manager at Bluestone Analytics \u00b7 Experience: Bluestone Analytics, a CACI Company \u00b7 Education: University of Virginia \u00b7 Location: Greater ...",
208
+ "source": "LinkedIn \u00b7 Kevin H."
209
+ },
210
+ {
211
+ "position": 10,
212
+ "title": "Jing Z. - Staff GenAI ML DS - Meta",
213
+ "link": "https://www.linkedin.com/in/zhujing",
214
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/zhujing&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECC0QAQ",
215
+ "displayed_link": "730+ followers",
216
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab80cc4f5683489ab16d774ebb7de2a845.png",
217
+ "snippet": "... LLM modeling and evaluation, including designing data flywheel systems and ... ML research and business impact, translating complex technical concepts ...",
218
+ "snippet_highlighted_words": [
219
+ "LLM",
220
+ "ML research"
221
+ ],
222
+ "rich_snippet": {
223
+ "top": {
224
+ "extensions": [
225
+ "Fremont, California, United States",
226
+ "Staff GenAI ML DS",
227
+ "Meta"
228
+ ]
229
+ }
230
+ },
231
+ "source": "LinkedIn \u00b7 Jing Z."
232
+ },
233
+ {
234
+ "position": 11,
235
+ "title": "\u5f20\u667a\u98a2 - Algorithmic Engineer - \u65af\u5c14\u987f\u79d1\u6280\u6709\u9650\u516c\u53f8",
236
+ "link": "https://www.linkedin.com/in/%E6%99%BA%E9%A2%A2-%E5%BC%A0-a373b113b",
237
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/%25E6%2599%25BA%25E9%25A2%25A2-%25E5%25BC%25A0-a373b113b&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECC8QAQ",
238
+ "displayed_link": "40+ followers",
239
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab4710c706ab9c1c7d4de30394af62859d.png",
240
+ "snippet": "\u65af\u5c14\u987f\u79d1\u6280\u6709\u9650\u516c\u53f8 - \u7b97\u6cd5\u5de5\u7a0b\u5e08 \u00b7 Experience: \u65af\u5c14\u987f\u79d1\u6280\u6709\u9650\u516c\u53f8 \u00b7 Education: University of Connecticut \u00b7 Location: Mansfield Center \u00b7 44 connections on ...",
241
+ "rich_snippet": {
242
+ "top": {
243
+ "extensions": [
244
+ "Mansfield Center, Connecticut, United States",
245
+ "Algorithmic Engineer",
246
+ "\u65af\u5c14\u987f\u79d1\u6280\u6709\u9650\u516c\u53f8"
247
+ ]
248
+ }
249
+ },
250
+ "source": "LinkedIn \u00b7 \u5f20\u667a\u98a2"
251
+ },
252
+ {
253
+ "position": 12,
254
+ "title": "Yang Yu - Senior Data Scientist, Gen AI - NVIDIA",
255
+ "link": "https://www.linkedin.com/in/yangyu190",
256
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/yangyu190&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEoQAQ",
257
+ "displayed_link": "560+ followers",
258
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab58c1b0c2e2293e13fd8d88913fa661cb.png",
259
+ "snippet": "Adept at quickly building proof-of-concepts of latest AI/ML research and transitioning them into deployable solutions to deliver business value. ... LLM inference ...",
260
+ "snippet_highlighted_words": [
261
+ "ML research",
262
+ "LLM"
263
+ ],
264
+ "rich_snippet": {
265
+ "top": {
266
+ "extensions": [
267
+ "Houston, Texas, United States",
268
+ "Senior Data Scientist, Gen AI",
269
+ "NVIDIA"
270
+ ]
271
+ }
272
+ },
273
+ "source": "LinkedIn \u00b7 Yang Yu"
274
+ },
275
+ {
276
+ "position": 13,
277
+ "title": "Ravi Raagav Srinivas - U.S. Bank",
278
+ "link": "https://www.linkedin.com/in/raviraagav",
279
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/raviraagav&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCoQAQ",
280
+ "displayed_link": "500+ followers",
281
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293abb3dcb76c3b3b80ef500f0ffd5b9747a0.png",
282
+ "snippet": "Innovative, energetic software developer with five years of experience in the design and\u2026 \u00b7 Experience: U.S. Bank \u00b7 Education: Rochester Institute of ...",
283
+ "rich_snippet": {
284
+ "top": {
285
+ "extensions": [
286
+ "San Francisco Bay Area",
287
+ "U.S. Bank"
288
+ ]
289
+ }
290
+ },
291
+ "source": "LinkedIn \u00b7 Ravi Raagav Srinivas"
292
+ },
293
+ {
294
+ "position": 14,
295
+ "title": "Jing Lu - Google",
296
+ "link": "https://www.linkedin.com/in/jing-lu-a0b07123",
297
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/jing-lu-a0b07123&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCsQAQ",
298
+ "displayed_link": "1.2K+ followers",
299
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab4c0a0b511dbaffae0af0fd8fa9cc8584.png",
300
+ "snippet": "Experience \u00b7 Google Graphic \u00b7 Google. Mountain View, California, United States \u00b7 -. San Jose, California, United States \u00b7 -. United States \u00b7 - \u00b7 -. 428 Church Street ...",
301
+ "snippet_highlighted_words": [
302
+ "Mountain View"
303
+ ],
304
+ "rich_snippet": {
305
+ "top": {
306
+ "extensions": [
307
+ "San Francisco Bay Area",
308
+ "Google"
309
+ ]
310
+ }
311
+ },
312
+ "source": "LinkedIn \u00b7 Jing Lu"
313
+ },
314
+ {
315
+ "position": 15,
316
+ "title": "Zhi (Kevin) P. - Technical consultant, Partner - Empire 55 LLC",
317
+ "link": "https://www.linkedin.com/in/zhi-kevin-p-3718934",
318
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/zhi-kevin-p-3718934&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECDAQAQ",
319
+ "displayed_link": "150+ followers",
320
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab3672252d2b16e2613378ac9aa7881af2.png",
321
+ "snippet": "Full Stack AI/ML Software Engineer (AI/ML, E2E, Cloud, AWS, NLP, Deep Learning, LLM, RAG, Data modeling and processing, model training ... AI/ML Research and ...",
322
+ "snippet_highlighted_words": [
323
+ "LLM",
324
+ "ML Research"
325
+ ],
326
+ "rich_snippet": {
327
+ "top": {
328
+ "detected_extensions": {
329
+ "empire_llc": 55
330
+ },
331
+ "extensions": [
332
+ "San Jose, California, United States",
333
+ "Technical consultant, Partner",
334
+ "Empire 55 LLC"
335
+ ]
336
+ }
337
+ },
338
+ "source": "LinkedIn \u00b7 Zhi (Kevin) P."
339
+ },
340
+ {
341
+ "position": 16,
342
+ "title": "Zhe Su - Evanston, Illinois, United States",
343
+ "link": "https://www.linkedin.com/in/zhe-su-86b389b3",
344
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/zhe-su-86b389b3&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECCwQAQ",
345
+ "displayed_link": "400+ followers",
346
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab0e94b092cc68821a9d339dc28fab567e.png",
347
+ "snippet": "ML Research Scientist with expertise in machine learning and data analysis. \u00b7 Experience: Upstart \u00b7 Education: Northwestern University \u00b7 Location: Evanston ...",
348
+ "snippet_highlighted_words": [
349
+ "ML Research"
350
+ ],
351
+ "source": "LinkedIn \u00b7 Zhe Su"
352
+ },
353
+ {
354
+ "position": 17,
355
+ "title": "Husain Al-Mohssen - Chrome Bird",
356
+ "link": "https://www.linkedin.com/in/husainalmohssen",
357
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/husainalmohssen&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEcQAQ",
358
+ "displayed_link": "1.4K+ followers",
359
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab33a55fb264eed8720356e465394ec4f7.png",
360
+ "snippet": "I am an Executive AI Leader, specializing in translating cutting-edge AI/ML research\u2026 \u00b7 ... (LLM Based Error Action Recommender). Filed October 21, 2024 #63 ...",
361
+ "snippet_highlighted_words": [
362
+ "ML research",
363
+ "LLM"
364
+ ],
365
+ "rich_snippet": {
366
+ "top": {
367
+ "extensions": [
368
+ "Cambridge, Massachusetts, United States",
369
+ "Chrome Bird"
370
+ ]
371
+ }
372
+ },
373
+ "source": "LinkedIn \u00b7 Husain Al-Mohssen"
374
+ },
375
+ {
376
+ "position": 18,
377
+ "title": "Yang Tian - Tencent Americas",
378
+ "link": "https://www.linkedin.com/in/yang-tian-5a0719b0",
379
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/yang-tian-5a0719b0&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEgQAQ",
380
+ "displayed_link": "730+ followers",
381
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab80e4705eefebd31bdb2d6005ee831582.png",
382
+ "snippet": "Yang is an excellent data engineer. He's a great mentor who is patient and knowledgeable in his field and is always willing and able to assist the Junior ...",
383
+ "rich_snippet": {
384
+ "top": {
385
+ "extensions": [
386
+ "Sunnyvale, California, United States",
387
+ "Tencent Americas"
388
+ ]
389
+ }
390
+ },
391
+ "source": "LinkedIn \u00b7 Yang Tian"
392
+ },
393
+ {
394
+ "position": 19,
395
+ "title": "Peter Mains - Senior Data Engineer",
396
+ "link": "https://www.linkedin.com/in/peter-mains-aa8674336",
397
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/peter-mains-aa8674336&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEUQAQ",
398
+ "displayed_link": "6 followers",
399
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293aba29b014e345ad182be26fc3a794a2b19.png",
400
+ "snippet": "Senior Data Engineer with 12 years of expertise in designing scalable data architectures, optimizing ETL workflows, and building real-time data solutions.",
401
+ "rich_snippet": {
402
+ "top": {
403
+ "extensions": [
404
+ "Gilbert, Arizona, United States",
405
+ "Senior Data Engineer",
406
+ "Luxevision Consulting LLC"
407
+ ]
408
+ }
409
+ },
410
+ "source": "LinkedIn \u00b7 Peter Mains"
411
+ },
412
+ {
413
+ "position": 20,
414
+ "title": "Krishan K. - CodeSignal",
415
+ "link": "https://www.linkedin.com/in/krishan-kumar38",
416
+ "redirect_link": "https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.linkedin.com/in/krishan-kumar38&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQFnoECEYQAQ",
417
+ "displayed_link": "740+ followers",
418
+ "favicon": "https://serpapi.com/searches/6862bbabeeabd394c127d48f/images/ce63d946354500b0cb0994ea88f293ab4508b5b4dfbd6acccd1770f2c8abee37.png",
419
+ "snippet": "I am an engineer with 6+ years experience in the industry. I love solving real world\u2026 \u00b7 CodeSignal \u00b7 The University of Texas at Dallas \u00b7 Nashville \u00b7 500+ ...",
420
+ "rich_snippet": {
421
+ "top": {
422
+ "extensions": [
423
+ "Nashville, Tennessee, United States",
424
+ "CodeSignal"
425
+ ]
426
+ }
427
+ },
428
+ "source": "LinkedIn \u00b7 Krishan K."
429
+ }
430
+ ],
431
+ "pagination": {
432
+ "current": 1,
433
+ "next": "https://www.google.com/search?q=site:linkedin.com/in+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&num=20&sca_esv=6d27a280cc9bfc7a&gl=us&hl=en&ei=rrtiaI3eB7u8juMP17PPiA8&start=20&sa=N&sstk=Ac65TH6KgSWXzfjLxTxsWJDmqXpCNcLB5EhPkbehJWRoqTOswsTJXIfLU0u1V9OX6jXgr3D_qdQhoUwxwhPVsVBnGER9y4lieXQLsw&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQ8NMDegQIChAI",
434
+ "other_pages": {
435
+ "2": "https://www.google.com/search?q=site:linkedin.com/in+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&num=20&sca_esv=6d27a280cc9bfc7a&gl=us&hl=en&ei=rrtiaI3eB7u8juMP17PPiA8&start=20&sa=N&sstk=Ac65TH6KgSWXzfjLxTxsWJDmqXpCNcLB5EhPkbehJWRoqTOswsTJXIfLU0u1V9OX6jXgr3D_qdQhoUwxwhPVsVBnGER9y4lieXQLsw&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQ8tMDegQIChAE",
436
+ "3": "https://www.google.com/search?q=site:linkedin.com/in+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&num=20&sca_esv=6d27a280cc9bfc7a&gl=us&hl=en&ei=rrtiaI3eB7u8juMP17PPiA8&start=40&sa=N&sstk=Ac65TH6KgSWXzfjLxTxsWJDmqXpCNcLB5EhPkbehJWRoqTOswsTJXIfLU0u1V9OX6jXgr3D_qdQhoUwxwhPVsVBnGER9y4lieXQLsw&ved=2ahUKEwiN7ZbiyJmOAxU7nmMGHdfZE_EQ8tMDegQIChAG"
437
+ }
438
+ },
439
+ "serpapi_pagination": {
440
+ "current": 1,
441
+ "next_link": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&start=20",
442
+ "next": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&start=20",
443
+ "other_pages": {
444
+ "2": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&start=20",
445
+ "3": "https://serpapi.com/search.json?device=desktop&engine=google&gl=us&google_domain=google.com&hl=en&num=20&q=site%3Alinkedin.com%2Fin+%22ml+research%22+%22llm%22+%22mountain+view%22+-student+-intern+-%22recent+graduate%22&start=40"
446
+ }
447
+ }
448
+ }
test/data/linkedin_urls.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://www.linkedin.com/in/yangyu190
2
+ https://www.linkedin.com/in/raviraagav
3
+ https://www.linkedin.com/in/shobha-vasudevan-72154b11
4
+ https://www.linkedin.com/in/kevin-h-a6044a18a
5
+ https://www.linkedin.com/in/vaskarnath
6
+ https://www.linkedin.com/in/jing-lu-a0b07123
7
+ https://www.linkedin.com/in/myin8
8
+ https://www.linkedin.com/in/zhe-su-86b389b3
9
+ https://www.linkedin.com/in/zhujing
10
+ https://www.linkedin.com/in/nitinsinghal03
11
+ https://www.linkedin.com/in/andyssu
12
+ https://www.linkedin.com/in/krishan-kumar38
13
+ https://www.linkedin.com/in/changwan-ryu-6aba477
14
+ https://www.linkedin.com/in/roxanarvk
15
+ https://www.linkedin.com/in/husainalmohssen
16
+ https://www.linkedin.com/in/zhi-kevin-p-3718934
17
+ https://www.linkedin.com/in/di-zhou-57b031ab
18
+ https://www.linkedin.com/in/peter-mains-aa8674336
19
+ https://www.linkedin.com/in/yang-tian-5a0719b0