Spaces:
Sleeping
Sleeping
| """ | |
| LinkedIn Profile Scraper API Integration | |
| Uses the Relevance API to scrape LinkedIn profile data | |
| """ | |
| import requests | |
| import json | |
| from typing import Dict, Any, Optional | |
| from urllib.parse import urlparse | |
| import time | |
class LinkedInScraper:
    """
    A Python wrapper for the Relevance LinkedIn scraping API.

    Two webhook endpoints are configured: a primary one and a fallback one
    that is only tried when the primary attempt does not succeed. All public
    methods report failures through a ``{"success": False, "error": ...}``
    dictionary instead of raising.
    """

    # Seconds to wait for a single webhook call. Kept in one place so the
    # request timeout and the timeout error message cannot drift apart.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        # Primary API (original)
        self.primary_api = {
            "url": "https://api-f1db6c.stack.tryrelevance.com/latest/studios/11116e42-9be9-4837-8753-c46a80458318/trigger_webhook",
            "project_id": "f56ec267-8285-4bef-b8ab-4dce36204e5d"
        }
        # Fallback API (new account)
        self.fallback_api = {
            "url": "https://api-f1db6c.stack.tryrelevance.com/latest/studios/a1a00cf9-4102-4d76-99e5-8ce9b922b51c/trigger_webhook",
            "project_id": "e5f9ef92-aa24-4626-a145-3fb746186504"
        }
        self.headers = {
            "Content-Type": "application/json"
        }

    def is_valid_linkedin_url(self, url: str) -> bool:
        """
        Validate if the URL is a LinkedIn profile URL.

        Args:
            url: Candidate URL.

        Returns:
            True when the host is ``linkedin.com`` or ``www.linkedin.com``
            (compared case-insensitively) and the path contains ``/in/``;
            False otherwise, including for non-string or unparsable input.
        """
        if not isinstance(url, str):
            return False
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
            return False
        # Host names are case-insensitive, so normalise before comparing.
        host = parsed.netloc.lower()
        return host in ('www.linkedin.com', 'linkedin.com') and '/in/' in parsed.path

    def _try_api(self, api_config: Dict[str, str], linkedin_url: str, api_name: str) -> Dict[str, Any]:
        """
        Try scraping with a specific API configuration.

        Args:
            api_config: Mapping with the webhook ``url`` and ``project_id``.
            linkedin_url: Profile URL sent to the webhook as ``{"url": ...}``.
            api_name: Label used in log lines and error messages.

        Returns:
            On success, a dict with ``success``, ``data``, ``scrape_time``,
            ``url`` and ``api_used``. On failure, a dict with
            ``success=False``, ``error``, ``url`` and, where available, the
            raw response under ``raw_response`` or ``response_text``.
        """
        try:
            payload = {"url": linkedin_url}
            full_url = f"{api_config['url']}?project={api_config['project_id']}"
            print(f"[{api_name}] Trying to scrape: {linkedin_url}")

            start_time = time.time()
            response = requests.post(
                full_url,
                headers=self.headers,
                data=json.dumps(payload),
                timeout=self.REQUEST_TIMEOUT
            )
            duration = round(time.time() - start_time, 2)

            if response.status_code != 200:
                return {
                    "success": False,
                    "error": f"{api_name}: API returned status {response.status_code}",
                    "response_text": response.text,
                    "url": linkedin_url
                }

            data = response.json()

            # Handle different response formats from different APIs
            profile_data = None
            if isinstance(data, dict):
                if 'linkedin_full_data' in data:
                    profile_data = data['linkedin_full_data']
                elif 'data' in data:
                    profile_data = data['data']

            # Only a non-empty mapping counts as profile data; any other
            # payload shape is reported as a clear "no data" failure rather
            # than crashing on the .get() calls below.
            if not profile_data or not isinstance(profile_data, dict):
                return {
                    "success": False,
                    "error": f"{api_name}: No profile data returned",
                    "raw_response": data,
                    "url": linkedin_url
                }

            print(f"[{api_name}] SUCCESS in {duration}s")
            print(f" Name: {profile_data.get('full_name', 'N/A')}")
            print(f" Headline: {profile_data.get('headline', 'N/A')}")
            print(f" Location: {profile_data.get('location', 'N/A')}")
            return {
                "success": True,
                "data": profile_data,
                "scrape_time": duration,
                "url": linkedin_url,
                "api_used": api_name
            }
        except requests.exceptions.Timeout:
            return {
                "success": False,
                "error": f"{api_name}: Request timed out after {self.REQUEST_TIMEOUT} seconds",
                "url": linkedin_url
            }
        except Exception as e:
            # Deliberate catch-all boundary: callers rely on always getting a
            # result dict so that the fallback API can be attempted.
            return {
                "success": False,
                "error": f"{api_name}: {str(e)}",
                "url": linkedin_url
            }

    def scrape_profile(self, linkedin_url: str) -> Dict[str, Any]:
        """
        Scrape a LinkedIn profile using primary API with fallback.

        Args:
            linkedin_url (str): LinkedIn profile URL

        Returns:
            Dict containing profile data or error information. When both
            APIs fail, the dict carries ``primary_error`` and
            ``fallback_error`` with the message from each attempt.
        """
        # Validate URL
        if not self.is_valid_linkedin_url(linkedin_url):
            return {
                "success": False,
                "error": "Invalid LinkedIn URL format",
                "url": linkedin_url
            }

        print(f"[SCRAPING] LinkedIn profile: {linkedin_url}")

        # Try primary API first
        primary_result = self._try_api(self.primary_api, linkedin_url, "PRIMARY")
        if primary_result["success"]:
            return primary_result

        print(f"[FALLBACK] Primary API failed: {primary_result['error']}")
        print("[FALLBACK] Trying secondary API...")

        # Try fallback API
        fallback_result = self._try_api(self.fallback_api, linkedin_url, "FALLBACK")
        if fallback_result["success"]:
            return fallback_result

        # Both APIs failed. The two results are kept in separate variables so
        # each error is reported under its own key.
        print("[FAILED] Both APIs failed!")
        return {
            "success": False,
            "error": "Both primary and fallback APIs failed",
            "primary_error": primary_result.get('error', 'Unknown error'),
            "fallback_error": fallback_result.get('error', 'Unknown error'),
            "url": linkedin_url
        }

    def extract_key_info(self, profile_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract key information from the scraped profile data.

        Args:
            profile_data: Result dict as returned by ``scrape_profile``.

        Returns:
            ``profile_data`` unchanged when it is not marked successful;
            otherwise a simplified dict with ``success``, ``profile``,
            ``scrape_time`` and ``url``.
        """
        if not profile_data.get('success', False):
            return profile_data

        data = profile_data['data']

        # `or []` also covers keys that are present with a null value, and
        # non-dict entries are skipped so one bad item cannot abort extraction.
        educations = [
            {
                'school': edu.get('school', ''),
                'degree': edu.get('degree', ''),
                'field': edu.get('field_of_study', ''),
                'date_range': edu.get('date_range', ''),
                'grade': edu.get('activities', '')  # Grade is often in activities
            }
            for edu in (data.get('educations') or [])
            if isinstance(edu, dict)
        ]

        experiences = [
            {
                'title': exp.get('title', ''),
                'company': exp.get('company', ''),
                'date_range': exp.get('date_range', ''),
                'description': exp.get('description', '')
            }
            for exp in (data.get('experiences') or [])
            if isinstance(exp, dict)
        ]

        return {
            'success': True,
            'profile': {
                'name': data.get('full_name', ''),
                'headline': data.get('headline', ''),
                'location': data.get('location', ''),
                'about': data.get('about', ''),
                'profile_url': data.get('linkedin_url', ''),
                'profile_image': data.get('profile_image_url', ''),
                'connections': data.get('connection_count', 0),
                'is_verified': data.get('is_verified', False),
                'current_company': data.get('company', ''),
                'current_title': data.get('job_title', ''),
                'educations': educations,
                'experiences': experiences
            },
            'scrape_time': profile_data.get('scrape_time', 0),
            'url': profile_data.get('url', '')
        }
| # This module is integrated with the Enhance LinkedIn application | |
| # No standalone test code needed |