""" LinkedIn Profile Scraper API Integration Uses the Relevance API to scrape LinkedIn profile data """ import requests import json from typing import Dict, Any, Optional from urllib.parse import urlparse import time class LinkedInScraper: """ A Python wrapper for the Relevance LinkedIn scraping API """ def __init__(self): # Primary API (original) self.primary_api = { "url": "https://api-f1db6c.stack.tryrelevance.com/latest/studios/11116e42-9be9-4837-8753-c46a80458318/trigger_webhook", "project_id": "f56ec267-8285-4bef-b8ab-4dce36204e5d" } # Fallback API (new account) self.fallback_api = { "url": "https://api-f1db6c.stack.tryrelevance.com/latest/studios/a1a00cf9-4102-4d76-99e5-8ce9b922b51c/trigger_webhook", "project_id": "e5f9ef92-aa24-4626-a145-3fb746186504" } self.headers = { "Content-Type": "application/json" } def is_valid_linkedin_url(self, url: str) -> bool: """ Validate if the URL is a LinkedIn profile URL """ try: parsed = urlparse(url) return ( parsed.netloc in ['www.linkedin.com', 'linkedin.com'] and '/in/' in parsed.path ) except: return False def _try_api(self, api_config: Dict[str, str], linkedin_url: str, api_name: str) -> Dict[str, Any]: """ Try scraping with a specific API configuration """ try: payload = {"url": linkedin_url} full_url = f"{api_config['url']}?project={api_config['project_id']}" print(f"[{api_name}] Trying to scrape: {linkedin_url}") start_time = time.time() response = requests.post( full_url, headers=self.headers, data=json.dumps(payload), timeout=60 ) end_time = time.time() duration = round(end_time - start_time, 2) if response.status_code == 200: data = response.json() # Handle different response formats from different APIs profile_data = None if 'linkedin_full_data' in data: profile_data = data['linkedin_full_data'] elif 'data' in data: profile_data = data['data'] if profile_data: print(f"[{api_name}] SUCCESS in {duration}s") print(f" Name: {profile_data.get('full_name', 'N/A')}") print(f" Headline: {profile_data.get('headline', 'N/A')}") print(f" Location: {profile_data.get('location', 'N/A')}") return { "success": True, "data": profile_data, "scrape_time": duration, "url": linkedin_url, "api_used": api_name } else: return { "success": False, "error": f"{api_name}: No profile data returned", "raw_response": data, "url": linkedin_url } else: return { "success": False, "error": f"{api_name}: API returned status {response.status_code}", "response_text": response.text, "url": linkedin_url } except requests.exceptions.Timeout: return { "success": False, "error": f"{api_name}: Request timed out after 60 seconds", "url": linkedin_url } except Exception as e: return { "success": False, "error": f"{api_name}: {str(e)}", "url": linkedin_url } def scrape_profile(self, linkedin_url: str) -> Dict[str, Any]: """ Scrape a LinkedIn profile using primary API with fallback Args: linkedin_url (str): LinkedIn profile URL Returns: Dict containing profile data or error information """ # Validate URL if not self.is_valid_linkedin_url(linkedin_url): return { "success": False, "error": "Invalid LinkedIn URL format", "url": linkedin_url } print(f"[SCRAPING] LinkedIn profile: {linkedin_url}") # Try primary API first result = self._try_api(self.primary_api, linkedin_url, "PRIMARY") if result["success"]: return result print(f"[FALLBACK] Primary API failed: {result['error']}") print(f"[FALLBACK] Trying secondary API...") # Try fallback API result = self._try_api(self.fallback_api, linkedin_url, "FALLBACK") if result["success"]: return result # Both APIs failed print(f"[FAILED] Both APIs failed!") return { "success": False, "error": "Both primary and fallback APIs failed", "primary_error": result.get('error', 'Unknown error'), "url": linkedin_url } def extract_key_info(self, profile_data: Dict[str, Any]) -> Dict[str, Any]: """ Extract key information from the scraped profile data Args: profile_data: Raw profile data from the API Returns: Simplified profile data with key fields """ if not profile_data.get('success', False): return profile_data data = profile_data['data'] # Extract education info educations = [] for edu in data.get('educations', []): educations.append({ 'school': edu.get('school', ''), 'degree': edu.get('degree', ''), 'field': edu.get('field_of_study', ''), 'date_range': edu.get('date_range', ''), 'grade': edu.get('activities', '') # Grade is often in activities }) # Extract experience info experiences = [] for exp in data.get('experiences', []): experiences.append({ 'title': exp.get('title', ''), 'company': exp.get('company', ''), 'date_range': exp.get('date_range', ''), 'description': exp.get('description', '') }) return { 'success': True, 'profile': { 'name': data.get('full_name', ''), 'headline': data.get('headline', ''), 'location': data.get('location', ''), 'about': data.get('about', ''), 'profile_url': data.get('linkedin_url', ''), 'profile_image': data.get('profile_image_url', ''), 'connections': data.get('connection_count', 0), 'is_verified': data.get('is_verified', False), 'current_company': data.get('company', ''), 'current_title': data.get('job_title', ''), 'educations': educations, 'experiences': experiences }, 'scrape_time': profile_data.get('scrape_time', 0), 'url': profile_data.get('url', '') } # This module is integrated with the Enhance LinkedIn application # No standalone test code needed