# app/services/linkedin_search.py
# Repo: LinkedinAgent — branch: Hydra-Bolt — commit: 3856f78 ("add")
import requests
import time
import re
from typing import List, Dict, Optional, Any
from urllib.parse import urlparse, parse_qs
import logging
from bs4 import BeautifulSoup
import json
from app.utils.config import Config
from app.services.cache_service import CacheService
logger = logging.getLogger(__name__)
class LinkedInSearchService:
"""Service for searching LinkedIn profiles using Google Custom Search API with fallbacks"""
def __init__(self):
    """Initialize API config, response cache, and a browser-like HTTP session."""
    # Google Custom Search credentials and endpoint.
    self.api_key = Config.GOOGLE_API_KEY
    self.cse_id = Config.GOOGLE_CSE_ID
    self.base_url = "https://www.googleapis.com/customsearch/v1"
    # Seconds to pause between consecutive searches / API requests.
    self.delay = Config.SEARCH_DELAY
    # Cache layer shared by search, per-query, and per-profile lookups.
    self.cache_service = CacheService()
    # Reusable session whose headers mimic desktop Chrome so that profile
    # scraping is less likely to be rejected outright.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    self.session = requests.Session()
    self.session.headers.update(browser_headers)
def search_linkedin_profiles(self, job_description: str, location: Optional[str] = None, max_results: int = 10) -> List[Dict]:
    """
    Search for LinkedIn profiles based on job description and location.

    Strategy: serve from cache when possible; otherwise run several Google
    Custom Search queries (rate-limited by self.delay between queries),
    deduplicate hits by profile URL, and parse the top hits into candidate
    dicts. Every degraded path — invalid credentials, zero API results, or
    an unexpected exception — falls back to generated sample profiles, and
    every outcome (real or fallback) is written to the cache.

    Args:
        job_description: Job requirements and description
        location: Preferred location (optional)
        max_results: Maximum number of profiles to return

    Returns:
        List of candidate profile dictionaries
    """
    try:
        logger.info(f"πŸ” Starting LinkedIn profile search for: {job_description[:100]}...")
        logger.info(f"πŸ“ Location: {location or 'Any'}")
        logger.info(f"πŸ“Š Max results requested: {max_results}")
        # Cache is keyed on (job_description, location, max_results).
        cached_results = self.cache_service.get_search_results(job_description, location, max_results)
        if cached_results:
            logger.info(f"🎯 Returning {len(cached_results)} cached search results")
            return cached_results
        # Without real API credentials there is no point hitting Google.
        if not self._validate_api_credentials():
            logger.warning("⚠️ Invalid or missing API credentials. Using fallback search methods.")
            fallback_results = self._fallback_search(job_description, location, max_results)
            # Cache fallback results
            self.cache_service.set_search_results(job_description, location, max_results, fallback_results)
            return fallback_results
        # Build multiple search queries for better coverage
        logger.info("πŸ“ Building search queries...")
        search_queries = self._build_multiple_search_queries(job_description, location)
        logger.info(f"βœ… Built {len(search_queries)} search queries")
        # Perform searches with different queries
        logger.info("🌐 Performing Google searches...")
        all_search_results = []
        # Deliberately over-request per query (at least 5, or half the total)
        # instead of splitting max_results across queries; the combined list
        # is deduplicated and trimmed to max_results below.
        results_per_query = max(5, max_results // 2)  # At least 5 results per query, or half of max_results
        for i, query in enumerate(search_queries, 1):
            logger.info(f"πŸ”Ž Search {i}/{len(search_queries)}: {query[:80]}...")
            results = self._perform_google_search(query, results_per_query)
            logger.info(f"πŸ“ˆ Found {len(results)} results for query {i}")
            all_search_results.extend(results)
            if i < len(search_queries):
                logger.info(f"⏳ Waiting {self.delay}s before next search...")
                time.sleep(self.delay)  # Rate limiting between queries
        logger.info(f"πŸ“Š Total search results before deduplication: {len(all_search_results)}")
        # If no results from Google API, try fallback methods
        if not all_search_results:
            logger.warning("⚠️ No results from Google API. Trying fallback search methods...")
            fallback_results = self._fallback_search(job_description, location, max_results)
            # Cache fallback results
            self.cache_service.set_search_results(job_description, location, max_results, fallback_results)
            return fallback_results
        # Remove duplicates based on profile URL
        logger.info("πŸ”„ Deduplicating search results...")
        unique_results = self._deduplicate_search_results(all_search_results)
        logger.info(f"βœ… After deduplication: {len(unique_results)} unique profiles")
        # Parse only the first max_results unique hits (parsing may scrape).
        logger.info("πŸ”§ Extracting profile data...")
        candidates = self._extract_profile_data(unique_results[:max_results])
        # Cache the results
        self.cache_service.set_search_results(job_description, location, max_results, candidates)
        logger.info(f"πŸŽ‰ Search completed! Found {len(candidates)} LinkedIn profiles using {len(search_queries)} search queries")
        return candidates
    except Exception as e:
        # Broad catch is intentional: the service must always return a list,
        # so any unexpected failure degrades to sample profiles.
        logger.error(f"❌ Error searching LinkedIn profiles: {str(e)}")
        logger.info("πŸ”„ Trying fallback search methods...")
        fallback_results = self._fallback_search(job_description, location, max_results)
        # Cache fallback results even on error
        self.cache_service.set_search_results(job_description, location, max_results, fallback_results)
        return fallback_results
def _validate_api_credentials(self) -> bool:
"""Validate that we have proper API credentials"""
if not self.api_key or self.api_key == "test_google_api_key" or self.api_key == "your_google_api_key_here":
logger.warning("⚠️ Invalid Google API key detected")
return False
if not self.cse_id or self.cse_id == "test_search_engine_id" or self.cse_id == "your_search_engine_id_here":
logger.warning("⚠️ Invalid Google CSE ID detected")
return False
return True
def _fallback_search(self, job_description: str, location: Optional[str] = None, max_results: int = 10) -> List[Dict]:
"""Fallback search method when Google API is not available"""
logger.info("πŸ”„ Using fallback search method...")
# Create sample profiles based on the job description
# This is a temporary solution until proper API credentials are configured
sample_profiles = self._generate_sample_profiles(job_description, location, max_results)
logger.info(f"πŸ“Š Generated {len(sample_profiles)} sample profiles for demonstration")
return sample_profiles
def _generate_sample_profiles(self, job_description: str, location: Optional[str] = None, max_results: int = 10) -> List[Dict]:
    """Generate sample profiles for demonstration purposes.

    Builds up to max_results demo candidates from a fixed roster: roster
    entries are copied while they overlap the job's key terms OR while the
    quota is still unfilled, with headlines tweaked to echo the job
    description ("Senior"/"Python"/"React" cues). If the roster runs short,
    name/headline variations of roster entries are synthesized with
    deterministic fake profile URLs.

    Args:
        job_description: Used for keyword matching and headline tweaks.
        location: Overrides each sample profile's location when given.
        max_results: Upper bound on the number of profiles returned.

    Returns:
        List of candidate dicts with keys: name, headline, location,
        profile_url, company, education, experience_summary.
    """
    # Extract key terms for more relevant sample profiles
    key_terms = self._extract_key_terms(job_description)
    # Fixed roster of plausible software-engineering profiles.
    sample_data = [
        {
            "name": "Sarah Chen",
            "headline": "Senior Software Engineer at TechCorp",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/sarah-chen-123456",
            "company": "TechCorp",
            "education": "Stanford University - Master of Science, Computer Science",
            "experience_summary": "5+ years building scalable web applications with Python, React, and AWS. Led development of microservices architecture serving 1M+ users."
        },
        {
            "name": "Michael Rodriguez",
            "headline": "Full Stack Developer | Python | React | Node.js",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/michael-rodriguez-789012",
            "company": "StartupXYZ",
            "education": "UC Berkeley - Bachelor of Science, Software Engineering",
            "experience_summary": "Experienced full-stack developer with expertise in modern web technologies. Built and deployed applications using Python, React, and cloud platforms."
        },
        {
            "name": "Emily Johnson",
            "headline": "Software Engineer | Backend Development | Python",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/emily-johnson-345678",
            "company": "DataFlow Inc",
            "education": "MIT - Master of Science, Computer Science",
            "experience_summary": "Backend engineer specializing in Python development, database design, and API development. Experience with Django, Flask, and PostgreSQL."
        },
        {
            "name": "David Kim",
            "headline": "Senior Developer | React | Python | DevOps",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/david-kim-901234",
            "company": "CloudTech Solutions",
            "education": "University of Washington - Bachelor of Science, Computer Science",
            "experience_summary": "Full-stack developer with 6+ years experience in React, Python, and cloud infrastructure. Led multiple successful product launches."
        },
        {
            "name": "Lisa Wang",
            "headline": "Software Engineer | Frontend Specialist | React",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/lisa-wang-567890",
            "company": "WebFlow",
            "education": "University of Michigan - Bachelor of Engineering, Computer Engineering",
            "experience_summary": "Frontend engineer passionate about creating intuitive user experiences. Expert in React, TypeScript, and modern CSS frameworks."
        },
        {
            "name": "Alex Thompson",
            "headline": "Principal Software Engineer | System Architecture",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/alex-thompson-111111",
            "company": "EnterpriseTech",
            "education": "Carnegie Mellon University - Master of Science, Computer Science",
            "experience_summary": "Principal engineer with 8+ years designing and implementing large-scale distributed systems. Expert in microservices, cloud architecture, and performance optimization."
        },
        {
            "name": "Maria Garcia",
            "headline": "Senior Backend Engineer | Python | Go | Microservices",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/maria-garcia-222222",
            "company": "ScaleUp Inc",
            "education": "University of Texas - Bachelor of Science, Computer Science",
            "experience_summary": "Backend specialist with expertise in high-performance systems, database optimization, and API design. Led teams building services handling millions of requests daily."
        },
        {
            "name": "James Wilson",
            "headline": "Full Stack Lead Developer | React | Node.js | AWS",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/james-wilson-333333",
            "company": "Digital Solutions",
            "education": "Georgia Tech - Bachelor of Science, Computer Engineering",
            "experience_summary": "Lead developer with 7+ years experience in modern web development. Expert in React ecosystem, Node.js backend development, and AWS cloud infrastructure."
        },
        {
            "name": "Sophie Brown",
            "headline": "Software Engineer | Machine Learning | Python",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/sophie-brown-444444",
            "company": "AI Innovations",
            "education": "University of California - Master of Science, Data Science",
            "experience_summary": "ML engineer specializing in Python, TensorFlow, and PyTorch. Experience building recommendation systems and natural language processing applications."
        },
        {
            "name": "Ryan Davis",
            "headline": "DevOps Engineer | Kubernetes | Docker | CI/CD",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/ryan-davis-555555",
            "company": "CloudFirst",
            "education": "University of Illinois - Bachelor of Science, Computer Science",
            "experience_summary": "DevOps specialist with expertise in containerization, orchestration, and automation. Led infrastructure teams managing production environments."
        },
        {
            "name": "Jennifer Lee",
            "headline": "Senior Frontend Engineer | React | TypeScript | UX",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/jennifer-lee-666666",
            "company": "UserExperience Pro",
            "education": "University of Washington - Master of Science, Human Computer Interaction",
            "experience_summary": "Frontend engineer passionate about user experience and accessibility. Expert in React, TypeScript, and modern frontend architecture patterns."
        },
        {
            "name": "Carlos Martinez",
            "headline": "Software Architect | System Design | Java | Spring",
            "location": location or "San Francisco, CA",
            "profile_url": "https://linkedin.com/in/carlos-martinez-777777",
            "company": "Enterprise Systems",
            "education": "University of California - Master of Science, Software Engineering",
            "experience_summary": "Software architect with 10+ years designing enterprise systems. Expert in Java ecosystem, Spring framework, and scalable system architecture."
        }
    ]
    # Filter and customize based on job description keywords
    relevant_profiles = []
    job_desc_lower = job_description.lower()
    for profile in sample_data:
        # Relevance = count of key terms appearing in headline + summary.
        profile_text = f"{profile['headline']} {profile['experience_summary']}".lower()
        relevance_score = 0
        for term in key_terms:
            if term in profile_text:
                relevance_score += 1
        # Keep relevant profiles; also keep irrelevant ones while the quota
        # is unfilled so we never return fewer than max_results candidates
        # when the roster can cover it.
        if relevance_score > 0 or len(relevant_profiles) < max_results:
            # Customize profile based on job description
            customized_profile = profile.copy()
            # Echo seniority / stack cues from the job description in the
            # headline when the profile doesn't already mention them.
            if "senior" in job_desc_lower and "senior" not in profile['headline'].lower():
                customized_profile['headline'] = f"Senior {profile['headline']}"
            if "python" in job_desc_lower and "python" not in profile['headline'].lower():
                customized_profile['headline'] += " | Python"
            if "react" in job_desc_lower and "react" not in profile['headline'].lower():
                customized_profile['headline'] += " | React"
            relevant_profiles.append(customized_profile)
    # If we still need more profiles, create additional ones
    while len(relevant_profiles) < max_results:
        # Cycle through the roster, giving each clone a new first name,
        # a key-term-based headline, and a deterministic fake URL.
        base_profile = sample_data[len(relevant_profiles) % len(sample_data)].copy()
        # Create variations
        variations = [
            {"name": f"Alex {base_profile['name'].split()[1]}", "headline": f"Software Engineer | {key_terms[0] if key_terms else 'Development'}"},
            {"name": f"Jordan {base_profile['name'].split()[1]}", "headline": f"Full Stack Developer | {key_terms[0] if key_terms else 'Web Development'}"},
            {"name": f"Taylor {base_profile['name'].split()[1]}", "headline": f"Backend Engineer | {key_terms[0] if key_terms else 'API Development'}"},
            {"name": f"Casey {base_profile['name'].split()[1]}", "headline": f"Frontend Developer | {key_terms[0] if key_terms else 'UI/UX'}"},
            {"name": f"Riley {base_profile['name'].split()[1]}", "headline": f"DevOps Engineer | {key_terms[0] if key_terms else 'Infrastructure'}"}
        ]
        variation = variations[len(relevant_profiles) % len(variations)]
        new_profile = base_profile.copy()
        new_profile.update(variation)
        new_profile["profile_url"] = f"https://linkedin.com/in/{new_profile['name'].lower().replace(' ', '-')}-{len(relevant_profiles):06d}"
        relevant_profiles.append(new_profile)
    # Return up to max_results
    return relevant_profiles[:max_results]
def _build_multiple_search_queries(self, job_description: str, location: Optional[str] = None) -> List[str]:
"""Build multiple search queries for better coverage, targeting About section and summary."""
logger.info("πŸ”§ Extracting key terms from job description...")
key_terms = self._extract_key_terms(job_description)
logger.info(f"πŸ“‹ Extracted key terms: {key_terms}")
queries = []
# Query 1: Basic profile search
query_parts = ["site:linkedin.com/in/", "profile"] + key_terms
if location:
query_parts.append(location)
queries.append(" ".join(query_parts))
logger.info(f"πŸ“ Query 1 (Basic): {' '.join(query_parts)}")
# Query 2: Experience-focused search
query_parts = ["site:linkedin.com/in/", "experience"] + key_terms
if location:
query_parts.append(location)
queries.append(" ".join(query_parts))
logger.info(f"πŸ“ Query 2 (Experience): {' '.join(query_parts)}")
# Query 3: Company-focused search
query_parts = ["site:linkedin.com/in/", "company"] + key_terms
if location:
query_parts.append(location)
queries.append(" ".join(query_parts))
logger.info(f"πŸ“ Query 3 (Company): {' '.join(query_parts)}")
# Query 4: About section search
query_parts = ["site:linkedin.com/in/", "about"] + key_terms
if location:
query_parts.append(location)
queries.append(" ".join(query_parts))
logger.info(f"πŸ“ Query 4 (About): {' '.join(query_parts)}")
# Query 5: Summary section search
query_parts = ["site:linkedin.com/in/", "summary"] + key_terms
if location:
query_parts.append(location)
queries.append(" ".join(query_parts))
logger.info(f"πŸ“ Query 5 (Summary): {' '.join(query_parts)}")
# Query 6: Natural language query to encourage About section in snippet
nl_query = f"site:linkedin.com/in/ About section {job_description} {location or ''}"
queries.append(nl_query.strip())
logger.info(f"πŸ“ Query 6 (Natural Language): {nl_query[:80]}...")
# Query 7: Bio section search
query_parts = ["site:linkedin.com/in/", "bio"] + key_terms
if location:
query_parts.append(location)
queries.append(" ".join(query_parts))
logger.info(f"πŸ“ Query 7 (Bio): {' '.join(query_parts)}")
return queries
def _deduplicate_search_results(self, search_results: List[Dict]) -> List[Dict]:
"""Remove duplicate search results based on profile URL"""
seen_urls = set()
unique_results = []
for result in search_results:
profile_url = self._extract_linkedin_url(result.get('link', ''))
if profile_url and profile_url not in seen_urls:
seen_urls.add(profile_url)
unique_results.append(result)
return unique_results
def _extract_key_terms(self, job_description: str) -> List[str]:
"""Extract key terms from job description for search optimization"""
# Common job-related keywords to focus on
job_keywords = [
"software engineer", "developer", "programmer", "engineer",
"manager", "director", "lead", "senior", "principal",
"full stack", "frontend", "backend", "devops", "data",
"machine learning", "AI", "artificial intelligence",
"python", "javascript", "java", "react", "node.js"
]
# Extract matching keywords from job description
found_keywords = []
job_desc_lower = job_description.lower()
for keyword in job_keywords:
if keyword in job_desc_lower:
found_keywords.append(keyword)
# If no specific keywords found, use general terms
if not found_keywords:
found_keywords = ["professional", "experience"]
return found_keywords[:3] # Limit to top 3 keywords
def _perform_google_search(self, query: str, max_results: int) -> List[Dict]:
"""Perform Google Custom Search API request"""
logger.info(f"🌐 Starting Google search for: {query[:60]}...")
# Check cache first for this specific query
cached_results = self.cache_service.get_query_results(query, max_results)
if cached_results:
logger.info(f"🎯 Returning {len(cached_results)} cached query results")
return cached_results
results = []
# Ensure we always make at least one request
if max_results <= 0:
max_results = 1
# Google CSE returns max 10 results per request
# Calculate how many requests we need
num_requests = max(1, min(5, (max_results + 9) // 10)) # At least 1 request, max 5
logger.info(f"πŸ“Š Will make {num_requests} API requests (max 10 results per request)")
for i in range(num_requests):
start_index = i * 10 + 1
# Calculate how many results to request for this specific request
results_per_request = min(10, max_results - i * 10)
# Ensure we request at least 1 result
if results_per_request <= 0:
results_per_request = 1
logger.info(f"πŸ” API request {i+1}/{num_requests} (start index: {start_index}, results: {results_per_request})")
params = {
'key': self.api_key,
'cx': self.cse_id,
'q': query,
'start': start_index,
'num': results_per_request
}
try:
logger.info(f"πŸ“‘ Making API request to Google Custom Search...")
response = requests.get(self.base_url, params=params)
response.raise_for_status()
data = response.json()
if 'items' in data:
results.extend(data['items'])
logger.info(f"βœ… Request {i+1} successful: got {len(data['items'])} results")
else:
logger.warning(f"⚠️ Request {i+1} returned no items")
# Rate limiting
if i < num_requests - 1: # Don't delay after last request
logger.info(f"⏳ Rate limiting: waiting {self.delay}s before next request...")
time.sleep(self.delay)
except requests.exceptions.RequestException as e:
logger.error(f"❌ Google search request {i+1} failed: {str(e)}")
break
except Exception as e:
logger.error(f"❌ Error processing search results for request {i+1}: {str(e)}")
break
logger.info(f"πŸ“Š Google search completed: {len(results)} total results")
# Cache the results
self.cache_service.set_query_results(query, max_results, results)
return results
def _extract_profile_data(self, search_results: List[Dict]) -> List[Dict]:
"""Extract and parse LinkedIn profile data from search results"""
logger.info(f"πŸ”§ Starting profile data extraction for {len(search_results)} search results")
candidates = []
for i, result in enumerate(search_results, 1):
try:
logger.info(f"πŸ“‹ Processing result {i}/{len(search_results)}")
# Extract LinkedIn URL
profile_url = self._extract_linkedin_url(result.get('link', ''))
if not profile_url:
logger.warning(f"⚠️ Result {i}: Not a valid LinkedIn URL, skipping")
continue
logger.info(f"πŸ”— Result {i}: Valid LinkedIn URL found: {profile_url}")
# Extract profile information from snippet
snippet = result.get('snippet', '')
title = result.get('title', '')
logger.info(f"πŸ“„ Result {i}: Title: {title[:60]}...")
logger.info(f"πŸ“„ Result {i}: Snippet length: {len(snippet)} characters")
# Try to get more detailed information by scraping the profile
logger.info(f"🌐 Result {i}: Attempting to scrape profile for detailed data...")
detailed_data = self._scrape_linkedin_profile(profile_url)
if detailed_data.get('success'):
logger.info(f"βœ… Result {i}: Profile scraping successful")
else:
logger.warning(f"⚠️ Result {i}: Profile scraping failed: {detailed_data.get('error', 'Unknown error')}")
# Parse basic profile data
logger.info(f"πŸ”§ Result {i}: Parsing profile data...")
profile_data = self._parse_profile_snippet(title, snippet, profile_url, detailed_data)
if profile_data:
candidates.append(profile_data)
logger.info(f"βœ… Result {i}: Profile data extracted successfully")
logger.info(f" πŸ‘€ Name: {profile_data.get('name', 'Unknown')}")
logger.info(f" πŸ’Ό Company: {profile_data.get('company', 'Unknown')}")
logger.info(f" πŸŽ“ Education: {profile_data.get('education', 'Unknown')}")
else:
logger.warning(f"⚠️ Result {i}: Failed to extract profile data")
except Exception as e:
logger.warning(f"❌ Error parsing profile data for result {i}: {str(e)}")
continue
logger.info(f"πŸŽ‰ Profile data extraction completed: {len(candidates)} successful extractions")
return candidates
def _scrape_linkedin_profile(self, profile_url: str) -> Dict:
    """Attempt to scrape LinkedIn profile for more detailed information, including About section.

    Best effort: LinkedIn frequently blocks anonymous scraping, so a
    non-200 status or any exception yields a {'success': False, ...} dict
    rather than raising. Successful scrapes are cached per URL.

    Args:
        profile_url: Cleaned LinkedIn profile URL.

    Returns:
        On success: dict with structured_data, text_content (truncated to
        2000 chars), about_section, education, success=True.
        On failure: {'success': False} plus either 'status_code' or 'error'.
    """
    logger.info(f"🌐 Scraping LinkedIn profile: {profile_url}")
    # Check cache first for this profile
    cached_profile_data = self.cache_service.get_profile_data(profile_url)
    if cached_profile_data:
        logger.info(f"🎯 Returning cached profile data for: {profile_url}")
        return cached_profile_data
    try:
        logger.info(f"πŸ“‘ Making HTTP request to LinkedIn...")
        # Uses the browser-header session from __init__ to reduce blocking.
        response = self.session.get(profile_url, timeout=10)
        if response.status_code == 200:
            logger.info(f"βœ… HTTP request successful (status: {response.status_code})")
            soup = BeautifulSoup(response.content, 'html.parser')
            logger.info(f"πŸ”§ Extracting structured data...")
            structured_data = self._extract_structured_data(soup)
            logger.info(f"πŸ“Š Found {len(structured_data)} structured data fields")
            logger.info(f"πŸ“„ Extracting text content...")
            text_content = soup.get_text()
            logger.info(f"πŸ“Š Text content length: {len(text_content)} characters")
            # Try to extract About section
            logger.info(f"πŸ“ Attempting to extract About section...")
            about_section = self._extract_about_section(soup, text_content)
            if about_section:
                logger.info(f"βœ… About section found: {len(about_section)} characters")
            else:
                logger.warning(f"⚠️ About section not found")
            # Try to extract education information
            logger.info(f"πŸŽ“ Attempting to extract education information...")
            education = self._extract_education_from_linkedin_profile(soup)
            if education:
                logger.info(f"βœ… Education found: {education}")
                # Mirror education under the schema.org key so downstream
                # consumers of structured_data can find it too.
                structured_data['alumniOf'] = education
            else:
                logger.warning(f"⚠️ Education not found")
            profile_data = {
                'structured_data': structured_data,
                'text_content': text_content[:2000],  # Limit content length
                'about_section': about_section,
                'education': education,
                'success': True
            }
            # Cache the profile data
            self.cache_service.set_profile_data(profile_url, profile_data)
            return profile_data
        else:
            # Non-200 (commonly 999/403 from LinkedIn anti-bot) — report it.
            logger.warning(f"⚠️ HTTP request failed (status: {response.status_code})")
            return {'success': False, 'status_code': response.status_code}
    except Exception as e:
        logger.warning(f"❌ Failed to scrape LinkedIn profile {profile_url}: {str(e)}")
        return {'success': False, 'error': str(e)}
def _extract_structured_data(self, soup: BeautifulSoup) -> Dict:
    """Collect JSON-LD objects and profile-related meta tags from a page.

    Returns a flat dict merging every JSON-LD payload that parses to a
    dict, plus the description / keywords / og:title / og:description
    meta tag contents. Extraction errors are logged, not raised.
    """
    collected = {}
    try:
        # Merge every well-formed JSON-LD <script> whose payload is a dict.
        for script in soup.find_all('script', type='application/ld+json'):
            payload = getattr(script, 'string', None)
            if not payload:
                continue
            try:
                parsed = json.loads(payload)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                collected.update(parsed)
        # Pull profile-related meta tag contents.
        wanted = {
            'description': soup.find('meta', attrs={'name': 'description'}),
            'keywords': soup.find('meta', attrs={'name': 'keywords'}),
            'og:title': soup.find('meta', attrs={'property': 'og:title'}),
            'og:description': soup.find('meta', attrs={'property': 'og:description'}),
        }
        for field, tag in wanted.items():
            if tag and hasattr(tag, 'attrs'):
                tag_attrs = getattr(tag, 'attrs', {})
                if 'content' in tag_attrs:
                    collected[field] = tag_attrs['content']
    except Exception as e:
        logger.warning(f"Error extracting structured data: {str(e)}")
    return collected
def _extract_about_section(self, soup: BeautifulSoup, text_content: str) -> Optional[str]:
    """Try to extract the About section from the LinkedIn profile HTML or text.

    Heuristic cascade, first hit wins:
      1. An 'About'/'Summary'/'Bio' heading's next sibling, else its parent
         (when the parent text exceeds 30 chars).
      2. A regex over the flattened page text ("About" + 30-600 chars).
      3. The page's meta description, then og:description.

    Args:
        soup: Parsed profile page.
        text_content: soup.get_text() of the same page.

    Returns:
        The About text, or None when every heuristic fails.
    """
    logger.info(f"πŸ” Looking for About section in HTML...")
    about = None
    # Look for headings like 'About' or 'Summary'
    logger.info(f"πŸ” Searching for About/Summary headings...")
    for heading in soup.find_all(['h2', 'h3', 'span']):
        heading_text = heading.get_text(strip=True).lower()
        if heading_text in ['about', 'summary', 'bio']:
            logger.info(f"βœ… Found heading: '{heading_text}'")
            # The About section is often in the next sibling or parent
            next_elem = heading.find_next_sibling()
            if next_elem and hasattr(next_elem, 'get_text') and next_elem.get_text(strip=True):
                about = next_elem.get_text(strip=True)
                logger.info(f"βœ… Found About section in next sibling: {len(about)} characters")
                break
            # Parent fallback: >30 chars guards against grabbing just the
            # heading's own label text.
            parent = heading.parent
            if parent and hasattr(parent, 'get_text') and parent.get_text(strip=True) and len(parent.get_text(strip=True)) > 30:
                about = parent.get_text(strip=True)
                logger.info(f"βœ… Found About section in parent: {len(about)} characters")
                break
    # Fallback: look for 'About' in text content
    if not about and text_content:
        logger.info(f"πŸ” Searching for 'About' pattern in text content...")
        # Requires a capitalized sentence start; caps length at 600 chars.
        match = re.search(r'About[\s\n]+([A-Z][^\n]{30,600})', text_content)
        if match:
            about = match.group(1).strip()
            logger.info(f"βœ… Found About section via regex: {len(about)} characters")
    # Fallback: use meta description or og:description
    if not about:
        logger.info(f"πŸ” Looking for meta description...")
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and hasattr(meta_desc, 'attrs'):
            meta_attrs = getattr(meta_desc, 'attrs', {})
            if 'content' in meta_attrs:
                about = meta_attrs['content']
                logger.info(f"βœ… Found About section in meta description: {len(about) if about else 0} characters")
    if not about:
        logger.info(f"πŸ” Looking for og:description...")
        og_desc = soup.find('meta', attrs={'property': 'og:description'})
        if og_desc and hasattr(og_desc, 'attrs'):
            og_attrs = getattr(og_desc, 'attrs', {})
            if 'content' in og_attrs:
                about = og_attrs['content']
                logger.info(f"βœ… Found About section in og:description: {len(about) if about else 0} characters")
    # Meta 'content' attrs can be lists in bs4; coerce to str for callers.
    if about and not isinstance(about, str):
        about = str(about)
    if about:
        logger.info(f"βœ… About section extraction successful: {len(about)} characters")
    else:
        logger.warning(f"⚠️ About section extraction failed")
    return about if isinstance(about, str) else None
def _extract_linkedin_url(self, url: str) -> Optional[str]:
"""Extract and validate LinkedIn profile URL"""
if not url:
return None
# Check if it's a LinkedIn profile URL
if 'linkedin.com/in/' in url:
# Clean up the URL
clean_url = url.split('?')[0] # Remove query parameters
return clean_url
return None
def _parse_profile_snippet(self, title: str, snippet: str, profile_url: str, detailed_data: Optional[Dict] = None) -> Optional[Dict]:
    """Parse LinkedIn profile information from search result snippet and detailed data.

    Field precedence: company comes from the title, then the snippet, then
    the scraped detail data; education comes from the snippet, then the
    scraper's own extraction, then other detail-data heuristics. Missing
    name/headline/location fall back to 'Unknown'/'Professional'/'Unknown'.

    Args:
        title: Search-result title line.
        snippet: Search-result snippet text.
        profile_url: Cleaned LinkedIn profile URL.
        detailed_data: Optional output of _scrape_linkedin_profile; only
            consulted when its 'success' flag is truthy.

    Returns:
        Candidate dict (name, headline, location, profile_url, company,
        education, experience_summary), or None if parsing raised.
    """
    try:
        logger.info(f"πŸ”§ Parsing profile data from title and snippet...")
        # Extract name from title (usually "Name | Headline | Location")
        logger.info(f"πŸ‘€ Extracting name from title...")
        name = self._extract_name_from_title(title)
        logger.info(f"βœ… Extracted name: {name}")
        # Extract headline and location from snippet
        logger.info(f"πŸ’Ό Extracting headline and location...")
        headline, location = self._extract_headline_and_location(snippet)
        logger.info(f"βœ… Extracted headline: {headline}")
        logger.info(f"βœ… Extracted location: {location}")
        # Company: title β†’ snippet β†’ scraped detail data.
        logger.info(f"🏒 Extracting company information...")
        company = self._extract_company_from_title(title)
        if company:
            logger.info(f"βœ… Found company in title: {company}")
        else:
            logger.info(f"πŸ” Company not found in title, checking snippet...")
            company = self._extract_company_from_snippet(snippet)
            if company:
                logger.info(f"βœ… Found company in snippet: {company}")
            else:
                logger.info(f"πŸ” Company not found in snippet, checking detailed data...")
                if detailed_data and detailed_data.get('success'):
                    company = self._extract_company_from_detailed_data(detailed_data)
                    if company:
                        logger.info(f"βœ… Found company in detailed data: {company}")
                    else:
                        logger.warning(f"⚠️ Company not found in any source")
                else:
                    logger.warning(f"⚠️ Company not found in any source")
        # Education: snippet β†’ scraper's result β†’ detail-data heuristics.
        logger.info(f"πŸŽ“ Extracting education information...")
        education = self._extract_education_from_snippet(snippet)
        if education:
            logger.info(f"βœ… Found education in snippet: {education}")
        else:
            logger.info(f"πŸ” Education not found in snippet, checking detailed data...")
            if detailed_data and detailed_data.get('success'):
                # Prefer the value the scraper already extracted.
                scraped_education = detailed_data.get('education')
                if scraped_education:
                    logger.info(f"βœ… Found education from scraping: {scraped_education}")
                    education = scraped_education
                else:
                    # Fallback to other extraction methods
                    education = self._extract_education_from_detailed_data(detailed_data)
                    if education:
                        logger.info(f"βœ… Found education in detailed data: {education}")
                    else:
                        logger.warning(f"⚠️ Education not found in any source")
            else:
                logger.warning(f"⚠️ Education not found in any source")
        # Create better experience summary
        logger.info(f"πŸ“ Creating experience summary...")
        experience_summary = self._create_experience_summary(snippet, detailed_data)
        logger.info(f"βœ… Experience summary created: {len(experience_summary)} characters")
        # Create candidate profile
        candidate = {
            'name': name or 'Unknown',
            'headline': headline or 'Professional',
            'location': location or 'Unknown',
            'profile_url': profile_url,
            'company': company,
            'education': education,
            'experience_summary': experience_summary
        }
        logger.info(f"βœ… Profile parsing completed successfully")
        return candidate
    except Exception as e:
        # Any parse failure skips this candidate rather than aborting the batch.
        logger.warning(f"❌ Error parsing profile snippet: {str(e)}")
        return None
def _extract_name_from_title(self, title: str) -> str:
"""Extract name from LinkedIn profile title"""
if not title:
return 'Unknown'
# LinkedIn titles are usually "Name | Headline | Location" or "Name - Headline at Company"
# First, try to extract just the name part
name_patterns = [
r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*?)(?:\s*[-|]\s*)', # Name followed by - or |
r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*?)(?:\s+at\s+)', # Name followed by "at"
r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*?)(?:\s*,\s*)', # Name followed by comma
]
for pattern in name_patterns:
match = re.match(pattern, title)
if match:
name = match.group(1).strip()
# Clean up common LinkedIn prefixes
name = name.replace('LinkedIn', '').strip()
if name and len(name) > 2:
return name
# Fallback: take first part before any separator
parts = re.split(r'[-|,]\s*', title)
if parts:
name = parts[0].strip()
# Clean up common LinkedIn prefixes
name = name.replace('LinkedIn', '').strip()
return name if name else 'Unknown'
return 'Unknown'
def _extract_headline_and_location(self, snippet: str) -> tuple:
"""Extract headline and location from snippet"""
headline = 'Professional'
location = 'Unknown'
if not snippet:
return headline, location
# Look for location patterns (City, State or Country)
location_patterns = [
r'([A-Z][a-z]+(?:[\s,]+[A-Z][a-z]+)*)\s*,\s*([A-Z]{2})', # City, State
r'([A-Z][a-z]+(?:[\s,]+[A-Z][a-z]+)*)\s*,\s*([A-Z][a-z]+)', # City, Country
]
for pattern in location_patterns:
match = re.search(pattern, snippet)
if match:
location = f"{match.group(1)}, {match.group(2)}"
break
# Extract headline (usually contains job title) - improved logic
lines = snippet.split('.')
for line in lines:
line = line.strip()
# Look for job title patterns
if any(keyword in line.lower() for keyword in ['engineer', 'developer', 'manager', 'director', 'lead', 'senior', 'principal', 'architect']):
# Filter out lines that are likely names
if not self._is_likely_name(line) and len(line) > 5:
headline = line
break
return headline, location
def _extract_company_from_title(self, title: str) -> Optional[str]:
"""Extract company name from LinkedIn profile title"""
if not title:
return None
# LinkedIn titles are usually "Name | Headline | Location" or "Name - Headline at Company"
# Look for company patterns in title
company_patterns = [
r'at\s+([A-Z][a-zA-Z\s&\.]+?)(?:\s|$|\(|\))',
r'-\s+([A-Z][a-zA-Z\s&\.]+?)\s+at\s+',
r'\|\s+([A-Z][a-zA-Z\s&\.]+?)\s+at\s+',
r'-\s+[^-]+-\s+([A-Z][a-zA-Z\s&\.]+?)(?:\s|$|\(|\))', # Third part after two dashes
r'\|\s+[^|]+\|\s+([A-Z][a-zA-Z\s&\.]+?)(?:\s|$|\(|\))', # Third part after two pipes
]
for pattern in company_patterns:
match = re.search(pattern, title)
if match:
company = match.group(1).strip()
if (len(company) > 2 and len(company) < 50 and
company.lower() not in ['linkedin', 'profile', 'view', 'professional', 'experience'] and
not self._is_likely_name(company)):
return company
# Try to extract from the last part of the title (after last separator)
parts = re.split(r'[-|]\s*', title)
if len(parts) >= 3:
last_part = parts[-1].strip()
# Check if the last part looks like a company name
if (len(last_part) > 3 and len(last_part) < 50 and
not self._is_likely_name(last_part) and
last_part.lower() not in ['linkedin', 'profile', 'view', 'professional', 'experience']):
return last_part
return None
def _extract_company_from_snippet(self, snippet: str) -> Optional[str]:
"""Extract company name from snippet"""
if not snippet:
return None
# Look for company patterns - more comprehensive patterns
company_patterns = [
r'at\s+([A-Z][a-zA-Z\s&\.]+?)(?:\s|,|\.|$|\(|\))',
r'([A-Z][a-zA-Z\s&\.]+?)\s+β€’\s+',
r'([A-Z][a-zA-Z\s&\.]+?)\s+-\s+',
r'([A-Z][a-zA-Z\s&\.]+?)\s+\(',
r'([A-Z][a-zA-Z\s&\.]+?)\s+at\s+',
r'([A-Z][a-zA-Z\s&\.]+?)\s+Software\s+Engineer',
r'([A-Z][a-zA-Z\s&\.]+?)\s+Senior\s+',
r'([A-Z][a-zA-Z\s&\.]+?)\s+Developer',
]
for pattern in company_patterns:
match = re.search(pattern, snippet)
if match:
company = match.group(1).strip()
# Filter out common false positives and names
if (len(company) > 2 and len(company) < 50 and
company.lower() not in ['linkedin', 'profile', 'view', 'professional', 'experience'] and
not self._is_likely_name(company)):
return company
return None
def _is_likely_name(self, text: str) -> bool:
"""Check if text is likely a person's name rather than a company"""
# Common name patterns
name_indicators = [
r'^[A-Z][a-z]+\s+[A-Z][a-z]+$', # First Last
r'^[A-Z][a-z]+\s+[A-Z][a-z]+\s+[A-Z][a-z]+$', # First Middle Last
r'^[A-Z][a-z]+\.\s+[A-Z][a-z]+$', # F. Last
]
for pattern in name_indicators:
if re.match(pattern, text):
return True
# Check for common name words
common_names = ['michael', 'john', 'david', 'james', 'robert', 'mary', 'jennifer', 'lisa', 'sarah']
if text.lower() in common_names:
return True
return False
def _extract_company_from_detailed_data(self, detailed_data: Dict) -> Optional[str]:
"""Extract company from detailed scraped data"""
try:
# Check structured data first
structured_data = detailed_data.get('structured_data', {})
# Look for organization information in structured data
if 'worksFor' in structured_data:
return structured_data['worksFor']
# Check meta description for company info
description = structured_data.get('description', '')
if description:
company_match = re.search(r'at\s+([A-Z][a-zA-Z\s&]+?)(?:\s|,|\.|$)', description)
if company_match:
return company_match.group(1).strip()
# Check text content
text_content = detailed_data.get('text_content', '')
if text_content:
return self._extract_company_from_snippet(text_content)
except Exception as e:
logger.warning(f"Error extracting company from detailed data: {str(e)}")
return None
def _extract_education_from_snippet(self, snippet: str) -> Optional[str]:
"""Extract education information from snippet using improved patterns"""
if not snippet:
return None
# Look for education patterns - more comprehensive and specific to LinkedIn structure
education_patterns = [
# Pattern for "Education: University Name" format (from top card)
r'Education:\s*([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute|Academy))',
# Pattern for degree + university format
r'(?:Bachelor|Master|PhD|BSc|MSc|MBA|BS|MS)\s+(?:of|in|from)\s+([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))',
# Pattern for university name followed by degree
r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute)).*?(?:Bachelor|Master|PhD|BSc|MSc|MBA|BS|MS)',
# Pattern for "Studied at" format
r'(?:Studied|Graduated|Attended)\s+(?:from|at)\s+([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))',
# Pattern for university name with degree in parentheses
r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute)).*?\((?:Bachelor|Master|PhD|BSc|MSc|MBA|BS|MS)',
# Pattern for degree, field format
r'(?:Bachelor|Master|PhD|BSc|MSc|MBA|BS|MS),\s*[A-Za-z\s/]+(?:from|at)\s+([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))',
]
for pattern in education_patterns:
match = re.search(pattern, snippet, re.IGNORECASE)
if match:
education = match.group(1).strip()
if len(education) > 3 and len(education) < 100:
# Clean up the education string
education = re.sub(r'\s+', ' ', education) # Remove extra whitespace
education = education.strip()
return education
# Fallback to keyword-based search with better context
education_keywords = ['university', 'college', 'school', 'institute', 'bachelor', 'master', 'phd', 'degree']
for keyword in education_keywords:
if keyword in snippet.lower():
# Find the sentence containing education info
sentences = snippet.split('.')
for sentence in sentences:
if keyword in sentence.lower():
# Extract university name from the sentence
university_match = re.search(r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', sentence)
if university_match:
return university_match.group(1).strip()
# If no university found, return the sentence itself
return sentence.strip()
return None
def _extract_education_from_detailed_data(self, detailed_data: Dict) -> Optional[str]:
"""Extract education from detailed scraped data using LinkedIn-specific selectors"""
try:
# Check structured data first
structured_data = detailed_data.get('structured_data', {})
# Look for education information in structured data
if 'alumniOf' in structured_data:
return structured_data['alumniOf']
# Check meta description for education info
description = structured_data.get('description', '')
if description:
education_match = re.search(r'([A-Z][a-zA-Z\s&]+?(?:university|college|school|institute))', description, re.IGNORECASE)
if education_match:
return education_match.group(1).strip()
# Check text content for LinkedIn-specific education patterns
text_content = detailed_data.get('text_content', '')
if text_content:
# Look for education section specifically
education_section = self._extract_education_section_from_html(text_content)
if education_section:
return education_section
# Fallback to snippet extraction
return self._extract_education_from_snippet(text_content)
except Exception as e:
logger.warning(f"Error extracting education from detailed data: {str(e)}")
return None
def _extract_education_section_from_html(self, html_content: str) -> Optional[str]:
"""Extract education information from LinkedIn HTML structure"""
try:
# Look for education section using LinkedIn-specific patterns
education_patterns = [
# Pattern for education section header
r'<h2[^>]*>.*?Education.*?</h2>.*?<span[^>]*>([^<]+(?:University|College|School|Institute)[^<]*)</span>',
# Pattern for education in top card
r'aria-label="Education:\s*([^"]+(?:University|College|School|Institute)[^"]*)"',
# Pattern for education list items
r'<li[^>]*>.*?<span[^>]*>([^<]+(?:University|College|School|Institute)[^<]*)</span>',
# Pattern for education in bold text
r'<span[^>]*class="[^"]*t-bold[^"]*"[^>]*>([^<]+(?:University|College|School|Institute)[^<]*)</span>',
]
for pattern in education_patterns:
match = re.search(pattern, html_content, re.IGNORECASE | re.DOTALL)
if match:
education = match.group(1).strip()
if len(education) > 3 and len(education) < 100:
# Clean up the education string
education = re.sub(r'\s+', ' ', education) # Remove extra whitespace
education = education.strip()
return education
# Look for education keywords in the HTML
if 'education' in html_content.lower():
# Find the section containing education
lines = html_content.split('\n')
for i, line in enumerate(lines):
if 'education' in line.lower() and 'university' in line.lower():
# Extract university name from this line or nearby lines
university_match = re.search(r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', line)
if university_match:
return university_match.group(1).strip()
# Check next few lines for university name
for j in range(i+1, min(i+5, len(lines))):
university_match = re.search(r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', lines[j])
if university_match:
return university_match.group(1).strip()
except Exception as e:
logger.warning(f"Error extracting education from HTML: {str(e)}")
return None
def _create_experience_summary(self, snippet: str, detailed_data: Optional[Dict] = None) -> str:
    """Create a better experience summary from available data, prioritizing the About section.

    Source preference: (1) the scraped About section when it is present and
    longer than 30 chars; (2) keyword-relevant sentences from the snippet
    plus structured-data facts (jobTitle/worksFor/alumniOf); (3) the first
    keyword-bearing sentence of the scraped page text; (4) the first 20
    words of the snippet. Summaries are capped at 400 characters.

    Args:
        snippet: Search-result snippet text (may be empty).
        detailed_data: Optional scrape result; only used when its
            'success' flag is truthy.

    Returns:
        A non-empty summary string; a fixed placeholder when no data at
        all is available.
    """
    logger.info(f"πŸ“ Creating experience summary...")
    # Use About section if available
    if detailed_data and detailed_data.get('success'):
        about_section = detailed_data.get('about_section')
        if about_section and len(about_section) > 30:
            logger.info(f"βœ… Using About section for experience summary")
            summary = about_section.strip()
            # 397 chars + '...' keeps the total at exactly 400.
            if len(summary) > 400:
                summary = summary[:397] + '...'
                logger.info(f"πŸ“ Truncated summary to 400 characters")
            return summary
        else:
            logger.info(f"⚠️ About section not available or too short, using fallback logic")
    # Fallback to previous logic
    logger.info(f"πŸ” Extracting relevant sentences from snippet...")
    summary_parts = []
    if snippet:
        sentences = snippet.split('.')
        relevant_sentences = []
        for sentence in sentences:
            sentence = sentence.strip()
            # Keep sentences long enough to be informative that mention a
            # role or accomplishment keyword.
            if len(sentence) > 20 and any(keyword in sentence.lower() for keyword in
                ['engineer', 'developer', 'manager', 'lead', 'senior', 'experience', 'worked', 'responsible', 'developed', 'built', 'created']):
                relevant_sentences.append(sentence)
        if relevant_sentences:
            # Only the first two relevant sentences are used.
            summary_parts.extend(relevant_sentences[:2])
            logger.info(f"βœ… Found {len(relevant_sentences[:2])} relevant sentences from snippet")
        else:
            logger.info(f"⚠️ No relevant sentences found in snippet")
    if detailed_data and detailed_data.get('success'):
        logger.info(f"πŸ” Adding structured data information...")
        structured_data = detailed_data.get('structured_data', {})
        if 'jobTitle' in structured_data:
            summary_parts.append(f"Current role: {structured_data['jobTitle']}")
            logger.info(f"βœ… Added job title: {structured_data['jobTitle']}")
        if 'worksFor' in structured_data:
            summary_parts.append(f"Company: {structured_data['worksFor']}")
            logger.info(f"βœ… Added company: {structured_data['worksFor']}")
        if 'alumniOf' in structured_data:
            summary_parts.append(f"Education: {structured_data['alumniOf']}")
            logger.info(f"βœ… Added education: {structured_data['alumniOf']}")
        text_content = detailed_data.get('text_content', '')
        # Only mine the raw page text when nothing was collected above.
        if text_content and not summary_parts:
            logger.info(f"πŸ” Searching for experience keywords in text content...")
            experience_keywords = ['experience', 'worked', 'developed', 'built', 'created', 'managed']
            for keyword in experience_keywords:
                if keyword in text_content.lower():
                    sentences = text_content.split('.')
                    for sentence in sentences:
                        # First sufficiently long sentence per keyword wins.
                        if keyword in sentence.lower() and len(sentence.strip()) > 30:
                            summary_parts.append(sentence.strip())
                            logger.info(f"βœ… Found experience sentence with keyword '{keyword}'")
                            break
                    if summary_parts:
                        break
    if summary_parts:
        summary = '. '.join(summary_parts)
        if len(summary) > 400:
            summary = summary[:397] + '...'
            logger.info(f"πŸ“ Truncated summary to 400 characters")
        logger.info(f"βœ… Created summary from {len(summary_parts)} parts")
        return summary
    if snippet:
        # Nothing structured found: fall back to a 20-word snippet prefix.
        logger.info(f"πŸ” Using snippet as fallback...")
        words = snippet.split()
        if len(words) > 20:
            summary = ' '.join(words[:20]) + '...'
            logger.info(f"βœ… Created fallback summary from first 20 words")
            return summary
        logger.info(f"βœ… Using full snippet as summary")
        return snippet
    logger.warning(f"⚠️ No experience information available")
    return "Experience information not available"
def _extract_education_from_linkedin_profile(self, soup: BeautifulSoup) -> Optional[str]:
    """Extract education information from LinkedIn profile using BeautifulSoup.

    Tries four strategies in order: (1) the page's education section by
    element id; (2) the top-card education button/text; (3) 'alumniOf' in
    structured data; (4) a keyword sweep over all text nodes.

    NOTE(review): bs4's `text=` keyword is the legacy alias of `string=`;
    behavior is kept as-is here but `string=` is the modern spelling.

    Args:
        soup: Parsed LinkedIn profile page.

    Returns:
        A school name or education text, or None when nothing is found
        (errors are logged and swallowed).
    """
    try:
        # Method 1: Look for education section by ID
        education_section = soup.find('div', {'id': 'education'})
        if education_section:
            # Find the parent section that contains education information
            # (class name matches LinkedIn's profile-card markup).
            education_card = education_section.find_parent('section', class_='pv-profile-card')
            if education_card:
                # Look for university names in the education card
                university_elements = education_card.find_all(text=re.compile(r'.*University.*|.*College.*|.*School.*|.*Institute.*'))
                for element in university_elements:
                    # hasattr guard: find_all(text=...) yields NavigableString
                    # nodes, but be defensive about other node types.
                    if hasattr(element, 'strip') and element.strip() and len(element.strip()) > 3:
                        return element.strip()
        # Method 2: Look for education in the top card section
        top_card = soup.find('section', class_=re.compile(r'artdeco-card.*'))
        if top_card:
            # Look for education button/link
            education_button = top_card.find('button', attrs={'aria-label': re.compile(r'Education:.*')})
            if education_button and hasattr(education_button, 'get'):
                # Extract university name from aria-label
                aria_label = education_button.get('aria-label', '')
                if aria_label and isinstance(aria_label, str):
                    education_match = re.search(r'Education:\s*([^,]+)', aria_label)
                    if education_match:
                        return education_match.group(1).strip()
            # Look for education text in the top card
            education_text = top_card.find(text=re.compile(r'.*University.*|.*College.*|.*School.*|.*Institute.*'))
            if education_text and hasattr(education_text, 'strip') and education_text.strip():
                return education_text.strip()
        # Method 3: Look for education in structured data
        # (delegates to a sibling helper defined elsewhere in this class).
        structured_data = self._extract_structured_data(soup)
        if 'alumniOf' in structured_data:
            return structured_data['alumniOf']
        # Method 4: Look for education keywords in the entire page
        education_keywords = ['university', 'college', 'school', 'institute', 'bachelor', 'master', 'phd']
        for keyword in education_keywords:
            elements = soup.find_all(text=re.compile(keyword, re.IGNORECASE))
            for element in elements:
                if hasattr(element, 'strip'):
                    text = element.strip()
                    # Length bounds filter out nav noise and whole paragraphs.
                    if len(text) > 10 and len(text) < 200:
                        # Check if this looks like an education entry
                        if any(edu_keyword in text.lower() for edu_keyword in ['university', 'college', 'school', 'institute']):
                            # Extract just the university name
                            university_match = re.search(r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', text)
                            if university_match:
                                return university_match.group(1).strip()
                            return text
    except Exception as e:
        logger.warning(f"Error extracting education from LinkedIn profile: {str(e)}")
    return None
# Cache management methods
def get_cache_stats(self) -> Dict[str, Any]:
    """Return cache statistics (delegates to the underlying CacheService)."""
    return self.cache_service.get_cache_stats()
def clear_cache(self, cache_type: str = "all") -> None:
    """Clear the specified cache, or every cache when cache_type is "all".

    Delegates to the underlying CacheService; valid cache_type values are
    whatever CacheService.clear_cache accepts.
    """
    self.cache_service.clear_cache(cache_type)
def cleanup_expired_cache(self) -> None:
    """Remove expired cache entries (delegates to the underlying CacheService)."""
    self.cache_service.cleanup_expired_entries()