Spaces:

yashgori20
/

Inhance

Sleeping

App Files Files Community

Inhance / linkedin_scraper.py

yashgori20

done

492af8a 10 months ago

raw

history blame contribute delete

7.88 kB

	"""
	LinkedIn Profile Scraper API Integration
	Uses the Relevance API to scrape LinkedIn profile data
	"""

	import requests
	import json
	from typing import Dict, Any, Optional
	from urllib.parse import urlparse
	import time

	class LinkedInScraper:
	    """
	    A Python wrapper for the Relevance LinkedIn scraping API.

	    Two webhook configurations are kept: a primary one and a fallback that
	    is only tried when the primary attempt does not succeed. Scrape results
	    are plain dictionaries carrying a boolean ``success`` key.
	    """

	    # Hosts accepted by is_valid_linkedin_url (compared case-insensitively).
	    _LINKEDIN_HOSTS = ('www.linkedin.com', 'linkedin.com')

	    def __init__(self, timeout: float = 60):
	        """
	        Store the webhook configurations and the shared request headers.

	        Args:
	            timeout: Seconds to wait for each webhook call (default 60).
	        """
	        self.timeout = timeout

	        # Primary API (original)
	        self.primary_api = {
	            "url": "https://api-f1db6c.stack.tryrelevance.com/latest/studios/11116e42-9be9-4837-8753-c46a80458318/trigger_webhook",
	            "project_id": "f56ec267-8285-4bef-b8ab-4dce36204e5d"
	        }

	        # Fallback API (new account)
	        self.fallback_api = {
	            "url": "https://api-f1db6c.stack.tryrelevance.com/latest/studios/a1a00cf9-4102-4d76-99e5-8ce9b922b51c/trigger_webhook",
	            "project_id": "e5f9ef92-aa24-4626-a145-3fb746186504"
	        }

	        self.headers = {
	            "Content-Type": "application/json"
	        }

	    def is_valid_linkedin_url(self, url: str) -> bool:
	        """
	        Validate if the URL is a LinkedIn profile URL.

	        A URL is accepted when its network location is ``linkedin.com`` or
	        ``www.linkedin.com`` (in any letter case) and its path contains
	        ``/in/``.

	        Args:
	            url: Candidate URL.

	        Returns:
	            True when the URL matches; False otherwise, including for
	            non-string input and for strings ``urlparse`` rejects.
	        """
	        if not isinstance(url, str):
	            return False

	        try:
	            parsed = urlparse(url)
	        except ValueError:
	            # urlparse raises ValueError for malformed input such as an
	            # unterminated IPv6 literal.
	            return False

	        # Host names are case-insensitive, so normalise before comparing.
	        return (
	            parsed.netloc.lower() in self._LINKEDIN_HOSTS and
	            '/in/' in parsed.path
	        )

	    def _try_api(self, api_config: Dict[str, str], linkedin_url: str, api_name: str) -> Dict[str, Any]:
	        """
	        Try scraping with a specific API configuration.

	        Args:
	            api_config: Mapping with the webhook ``url`` and ``project_id``.
	            linkedin_url: Profile URL sent to the webhook as ``{"url": ...}``.
	            api_name: Label used in console output and error messages.

	        Returns:
	            On success: ``success=True`` plus ``data``, ``scrape_time``,
	            ``url`` and ``api_used``. On failure: ``success=False`` plus
	            ``error`` and ``url`` (and ``raw_response`` or ``response_text``
	            when a response was received).
	        """
	        try:
	            payload = {"url": linkedin_url}
	            full_url = f"{api_config['url']}?project={api_config['project_id']}"

	            print(f"[{api_name}] Trying to scrape: {linkedin_url}")

	            # perf_counter is monotonic, so the measured duration cannot be
	            # skewed by wall-clock adjustments.
	            start_time = time.perf_counter()

	            response = requests.post(
	                full_url,
	                headers=self.headers,
	                data=json.dumps(payload),
	                timeout=self.timeout
	            )

	            duration = round(time.perf_counter() - start_time, 2)

	            if response.status_code != 200:
	                return {
	                    "success": False,
	                    "error": f"{api_name}: API returned status {response.status_code}",
	                    "response_text": response.text,
	                    "url": linkedin_url
	                }

	            data = response.json()

	            # Handle different response formats from different APIs.
	            # 'linkedin_full_data' takes precedence over 'data' whenever the
	            # key is present, even if its value is empty.
	            profile_data = None
	            if isinstance(data, dict):
	                if 'linkedin_full_data' in data:
	                    profile_data = data['linkedin_full_data']
	                elif 'data' in data:
	                    profile_data = data['data']

	            # Anything other than a non-empty mapping cannot be read with
	            # .get() below, so report it as missing profile data.
	            if not profile_data or not isinstance(profile_data, dict):
	                return {
	                    "success": False,
	                    "error": f"{api_name}: No profile data returned",
	                    "raw_response": data,
	                    "url": linkedin_url
	                }

	            print(f"[{api_name}] SUCCESS in {duration}s")
	            print(f" Name: {profile_data.get('full_name', 'N/A')}")
	            print(f" Headline: {profile_data.get('headline', 'N/A')}")
	            print(f" Location: {profile_data.get('location', 'N/A')}")

	            return {
	                "success": True,
	                "data": profile_data,
	                "scrape_time": duration,
	                "url": linkedin_url,
	                "api_used": api_name
	            }

	        except requests.exceptions.Timeout:
	            return {
	                "success": False,
	                "error": f"{api_name}: Request timed out after {self.timeout} seconds",
	                "url": linkedin_url
	            }
	        except Exception as e:
	            # Deliberate catch-all: the caller decides whether to fall back
	            # based on the returned dictionary, not on an exception.
	            return {
	                "success": False,
	                "error": f"{api_name}: {str(e)}",
	                "url": linkedin_url
	            }

	    def scrape_profile(self, linkedin_url: str) -> Dict[str, Any]:
	        """
	        Scrape a LinkedIn profile using primary API with fallback.

	        Args:
	            linkedin_url (str): LinkedIn profile URL

	        Returns:
	            Dict containing profile data or error information. When both
	            attempts fail, ``primary_error`` and ``fallback_error`` hold the
	            error message of the respective attempt.
	        """

	        # Validate URL
	        if not self.is_valid_linkedin_url(linkedin_url):
	            return {
	                "success": False,
	                "error": "Invalid LinkedIn URL format",
	                "url": linkedin_url
	            }

	        print(f"[SCRAPING] LinkedIn profile: {linkedin_url}")

	        # Try primary API first
	        primary_result = self._try_api(self.primary_api, linkedin_url, "PRIMARY")

	        if primary_result["success"]:
	            return primary_result

	        print(f"[FALLBACK] Primary API failed: {primary_result['error']}")
	        print("[FALLBACK] Trying secondary API...")

	        # Try fallback API; the primary result is kept in its own variable so
	        # its error message is still available for the final report.
	        fallback_result = self._try_api(self.fallback_api, linkedin_url, "FALLBACK")

	        if fallback_result["success"]:
	            return fallback_result

	        # Both APIs failed
	        print("[FAILED] Both APIs failed!")
	        return {
	            "success": False,
	            "error": "Both primary and fallback APIs failed",
	            "primary_error": primary_result.get('error', 'Unknown error'),
	            "fallback_error": fallback_result.get('error', 'Unknown error'),
	            "url": linkedin_url
	        }

	    def extract_key_info(self, profile_data: Dict[str, Any]) -> Dict[str, Any]:
	        """
	        Extract key information from the scraped profile data.

	        Args:
	            profile_data: Result dictionary as returned by scrape_profile.

	        Returns:
	            ``profile_data`` itself, unchanged, when its ``success`` flag is
	            missing or falsy; otherwise a simplified dictionary with
	            ``success``, ``profile``, ``scrape_time`` and ``url``.
	        """

	        if not profile_data.get('success', False):
	            return profile_data

	        data = profile_data['data']

	        # "or []" also covers a key that is present with a None value, which
	        # a plain .get(key, []) default would not.
	        educations = [
	            {
	                'school': edu.get('school', ''),
	                'degree': edu.get('degree', ''),
	                'field': edu.get('field_of_study', ''),
	                'date_range': edu.get('date_range', ''),
	                'grade': edu.get('activities', '')  # Grade is often in activities
	            }
	            for edu in (data.get('educations') or [])
	        ]

	        experiences = [
	            {
	                'title': exp.get('title', ''),
	                'company': exp.get('company', ''),
	                'date_range': exp.get('date_range', ''),
	                'description': exp.get('description', '')
	            }
	            for exp in (data.get('experiences') or [])
	        ]

	        return {
	            'success': True,
	            'profile': {
	                'name': data.get('full_name', ''),
	                'headline': data.get('headline', ''),
	                'location': data.get('location', ''),
	                'about': data.get('about', ''),
	                'profile_url': data.get('linkedin_url', ''),
	                'profile_image': data.get('profile_image_url', ''),
	                'connections': data.get('connection_count', 0),
	                'is_verified': data.get('is_verified', False),
	                'current_company': data.get('company', ''),
	                'current_title': data.get('job_title', ''),
	                'educations': educations,
	                'experiences': experiences
	            },
	            'scrape_time': profile_data.get('scrape_time', 0),
	            'url': profile_data.get('url', '')
	        }

	# This module is integrated with the Enhance LinkedIn application
	# No standalone test code needed