Spaces:
Sleeping
Sleeping
| import requests | |
| import time | |
| import re | |
| from typing import List, Dict, Optional, Any | |
| from urllib.parse import urlparse, parse_qs | |
| import logging | |
| from bs4 import BeautifulSoup | |
| import json | |
| from app.utils.config import Config | |
| from app.services.cache_service import CacheService | |
| logger = logging.getLogger(__name__) | |
| class LinkedInSearchService: | |
| """Service for searching LinkedIn profiles using Google Custom Search API with fallbacks""" | |
| def __init__(self): | |
| self.api_key = Config.GOOGLE_API_KEY | |
| self.cse_id = Config.GOOGLE_CSE_ID | |
| self.base_url = "https://www.googleapis.com/customsearch/v1" | |
| self.delay = Config.SEARCH_DELAY | |
| self.session = requests.Session() | |
| # Initialize cache service | |
| self.cache_service = CacheService() | |
| # Set headers to mimic a real browser | |
| self.session.headers.update({ | |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', | |
| 'Accept-Language': 'en-US,en;q=0.5', | |
| 'Accept-Encoding': 'gzip, deflate', | |
| 'Connection': 'keep-alive', | |
| 'Upgrade-Insecure-Requests': '1', | |
| }) | |
    def search_linkedin_profiles(self, job_description: str, location: Optional[str] = None, max_results: int = 10) -> List[Dict]:
        """
        Search for LinkedIn profiles based on job description and location.

        Results are served from the cache when available. When API credentials
        are invalid/missing, when Google returns nothing, or when any exception
        occurs, the method falls back to generated sample profiles; fallback
        results are cached under the same key as real ones.

        Args:
            job_description: Job requirements and description
            location: Preferred location (optional)
            max_results: Maximum number of profiles to return
        Returns:
            List of candidate profile dictionaries
        """
        try:
            logger.info(f"π Starting LinkedIn profile search for: {job_description[:100]}...")
            logger.info(f"π Location: {location or 'Any'}")
            logger.info(f"π Max results requested: {max_results}")
            # Check cache first (keyed on description + location + max_results)
            cached_results = self.cache_service.get_search_results(job_description, location, max_results)
            if cached_results:
                logger.info(f"π― Returning {len(cached_results)} cached search results")
                return cached_results
            # Check if we have valid API credentials
            if not self._validate_api_credentials():
                logger.warning("β οΈ Invalid or missing API credentials. Using fallback search methods.")
                fallback_results = self._fallback_search(job_description, location, max_results)
                # Cache fallback results
                self.cache_service.set_search_results(job_description, location, max_results, fallback_results)
                return fallback_results
            # Build multiple search queries for better coverage
            logger.info("π Building search queries...")
            search_queries = self._build_multiple_search_queries(job_description, location)
            logger.info(f"β Built {len(search_queries)} search queries")
            # Perform searches with different queries
            logger.info("π Performing Google searches...")
            all_search_results = []
            # Calculate results per query to ensure we get enough results
            # Instead of dividing max_results by number of queries, we'll request more per query
            # and then limit the total results later
            results_per_query = max(5, max_results // 2)  # At least 5 results per query, or half of max_results
            for i, query in enumerate(search_queries, 1):
                logger.info(f"π Search {i}/{len(search_queries)}: {query[:80]}...")
                results = self._perform_google_search(query, results_per_query)
                logger.info(f"π Found {len(results)} results for query {i}")
                all_search_results.extend(results)
                if i < len(search_queries):
                    logger.info(f"β³ Waiting {self.delay}s before next search...")
                    time.sleep(self.delay)  # Rate limiting between queries
            logger.info(f"π Total search results before deduplication: {len(all_search_results)}")
            # If no results from Google API, try fallback methods
            if not all_search_results:
                logger.warning("β οΈ No results from Google API. Trying fallback search methods...")
                fallback_results = self._fallback_search(job_description, location, max_results)
                # Cache fallback results
                self.cache_service.set_search_results(job_description, location, max_results, fallback_results)
                return fallback_results
            # Remove duplicates based on profile URL
            logger.info("π Deduplicating search results...")
            unique_results = self._deduplicate_search_results(all_search_results)
            logger.info(f"β After deduplication: {len(unique_results)} unique profiles")
            # Extract and parse LinkedIn profiles (truncated to max_results first,
            # since extraction may scrape each profile and is expensive)
            logger.info("π§ Extracting profile data...")
            candidates = self._extract_profile_data(unique_results[:max_results])
            # Cache the results
            self.cache_service.set_search_results(job_description, location, max_results, candidates)
            logger.info(f"π Search completed! Found {len(candidates)} LinkedIn profiles using {len(search_queries)} search queries")
            return candidates
        except Exception as e:
            # Broad catch is intentional: any failure degrades to sample profiles
            # rather than surfacing an error to the caller.
            logger.error(f"β Error searching LinkedIn profiles: {str(e)}")
            logger.info("π Trying fallback search methods...")
            fallback_results = self._fallback_search(job_description, location, max_results)
            # Cache fallback results even on error
            self.cache_service.set_search_results(job_description, location, max_results, fallback_results)
            return fallback_results
| def _validate_api_credentials(self) -> bool: | |
| """Validate that we have proper API credentials""" | |
| if not self.api_key or self.api_key == "test_google_api_key" or self.api_key == "your_google_api_key_here": | |
| logger.warning("β οΈ Invalid Google API key detected") | |
| return False | |
| if not self.cse_id or self.cse_id == "test_search_engine_id" or self.cse_id == "your_search_engine_id_here": | |
| logger.warning("β οΈ Invalid Google CSE ID detected") | |
| return False | |
| return True | |
| def _fallback_search(self, job_description: str, location: Optional[str] = None, max_results: int = 10) -> List[Dict]: | |
| """Fallback search method when Google API is not available""" | |
| logger.info("π Using fallback search method...") | |
| # Create sample profiles based on the job description | |
| # This is a temporary solution until proper API credentials are configured | |
| sample_profiles = self._generate_sample_profiles(job_description, location, max_results) | |
| logger.info(f"π Generated {len(sample_profiles)} sample profiles for demonstration") | |
| return sample_profiles | |
    def _generate_sample_profiles(self, job_description: str, location: Optional[str] = None, max_results: int = 10) -> List[Dict]:
        """Generate sample profiles for demonstration purposes.

        Starts from a fixed roster of software-engineering personas, lightly
        customizes headlines to echo the job description, and synthesizes
        extra variant profiles if the roster is too small for max_results.

        Args:
            job_description: Used for keyword matching and headline tweaks.
            location: Substituted into every profile; defaults to San Francisco.
            max_results: Exact number of profiles returned (padded if needed).
        Returns:
            List of profile dicts (name, headline, location, profile_url,
            company, education, experience_summary).
        """
        # Extract key terms for more relevant sample profiles
        key_terms = self._extract_key_terms(job_description)
        # Expanded sample data based on common software engineering profiles
        sample_data = [
            {
                "name": "Sarah Chen",
                "headline": "Senior Software Engineer at TechCorp",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/sarah-chen-123456",
                "company": "TechCorp",
                "education": "Stanford University - Master of Science, Computer Science",
                "experience_summary": "5+ years building scalable web applications with Python, React, and AWS. Led development of microservices architecture serving 1M+ users."
            },
            {
                "name": "Michael Rodriguez",
                "headline": "Full Stack Developer | Python | React | Node.js",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/michael-rodriguez-789012",
                "company": "StartupXYZ",
                "education": "UC Berkeley - Bachelor of Science, Software Engineering",
                "experience_summary": "Experienced full-stack developer with expertise in modern web technologies. Built and deployed applications using Python, React, and cloud platforms."
            },
            {
                "name": "Emily Johnson",
                "headline": "Software Engineer | Backend Development | Python",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/emily-johnson-345678",
                "company": "DataFlow Inc",
                "education": "MIT - Master of Science, Computer Science",
                "experience_summary": "Backend engineer specializing in Python development, database design, and API development. Experience with Django, Flask, and PostgreSQL."
            },
            {
                "name": "David Kim",
                "headline": "Senior Developer | React | Python | DevOps",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/david-kim-901234",
                "company": "CloudTech Solutions",
                "education": "University of Washington - Bachelor of Science, Computer Science",
                "experience_summary": "Full-stack developer with 6+ years experience in React, Python, and cloud infrastructure. Led multiple successful product launches."
            },
            {
                "name": "Lisa Wang",
                "headline": "Software Engineer | Frontend Specialist | React",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/lisa-wang-567890",
                "company": "WebFlow",
                "education": "University of Michigan - Bachelor of Engineering, Computer Engineering",
                "experience_summary": "Frontend engineer passionate about creating intuitive user experiences. Expert in React, TypeScript, and modern CSS frameworks."
            },
            {
                "name": "Alex Thompson",
                "headline": "Principal Software Engineer | System Architecture",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/alex-thompson-111111",
                "company": "EnterpriseTech",
                "education": "Carnegie Mellon University - Master of Science, Computer Science",
                "experience_summary": "Principal engineer with 8+ years designing and implementing large-scale distributed systems. Expert in microservices, cloud architecture, and performance optimization."
            },
            {
                "name": "Maria Garcia",
                "headline": "Senior Backend Engineer | Python | Go | Microservices",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/maria-garcia-222222",
                "company": "ScaleUp Inc",
                "education": "University of Texas - Bachelor of Science, Computer Science",
                "experience_summary": "Backend specialist with expertise in high-performance systems, database optimization, and API design. Led teams building services handling millions of requests daily."
            },
            {
                "name": "James Wilson",
                "headline": "Full Stack Lead Developer | React | Node.js | AWS",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/james-wilson-333333",
                "company": "Digital Solutions",
                "education": "Georgia Tech - Bachelor of Science, Computer Engineering",
                "experience_summary": "Lead developer with 7+ years experience in modern web development. Expert in React ecosystem, Node.js backend development, and AWS cloud infrastructure."
            },
            {
                "name": "Sophie Brown",
                "headline": "Software Engineer | Machine Learning | Python",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/sophie-brown-444444",
                "company": "AI Innovations",
                "education": "University of California - Master of Science, Data Science",
                "experience_summary": "ML engineer specializing in Python, TensorFlow, and PyTorch. Experience building recommendation systems and natural language processing applications."
            },
            {
                "name": "Ryan Davis",
                "headline": "DevOps Engineer | Kubernetes | Docker | CI/CD",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/ryan-davis-555555",
                "company": "CloudFirst",
                "education": "University of Illinois - Bachelor of Science, Computer Science",
                "experience_summary": "DevOps specialist with expertise in containerization, orchestration, and automation. Led infrastructure teams managing production environments."
            },
            {
                "name": "Jennifer Lee",
                "headline": "Senior Frontend Engineer | React | TypeScript | UX",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/jennifer-lee-666666",
                "company": "UserExperience Pro",
                "education": "University of Washington - Master of Science, Human Computer Interaction",
                "experience_summary": "Frontend engineer passionate about user experience and accessibility. Expert in React, TypeScript, and modern frontend architecture patterns."
            },
            {
                "name": "Carlos Martinez",
                "headline": "Software Architect | System Design | Java | Spring",
                "location": location or "San Francisco, CA",
                "profile_url": "https://linkedin.com/in/carlos-martinez-777777",
                "company": "Enterprise Systems",
                "education": "University of California - Master of Science, Software Engineering",
                "experience_summary": "Software architect with 10+ years designing enterprise systems. Expert in Java ecosystem, Spring framework, and scalable system architecture."
            }
        ]
        # Filter and customize based on job description keywords
        relevant_profiles = []
        job_desc_lower = job_description.lower()
        for profile in sample_data:
            # Check if profile matches job requirements
            profile_text = f"{profile['headline']} {profile['experience_summary']}".lower()
            # Simple relevance scoring: count of key terms present in the profile text
            relevance_score = 0
            for term in key_terms:
                if term in profile_text:
                    relevance_score += 1
            # Add profiles that have some relevance or if we need more results
            # NOTE(review): the second clause admits every profile until
            # max_results is reached, regardless of relevance — confirm intended.
            if relevance_score > 0 or len(relevant_profiles) < max_results:
                # Customize profile based on job description
                customized_profile = profile.copy()
                # Adjust headline based on job requirements (checks are against the
                # ORIGINAL headline, so tags are never duplicated)
                if "senior" in job_desc_lower and "senior" not in profile['headline'].lower():
                    customized_profile['headline'] = f"Senior {profile['headline']}"
                if "python" in job_desc_lower and "python" not in profile['headline'].lower():
                    customized_profile['headline'] += " | Python"
                if "react" in job_desc_lower and "react" not in profile['headline'].lower():
                    customized_profile['headline'] += " | React"
                relevant_profiles.append(customized_profile)
        # If we still need more profiles, create additional ones
        while len(relevant_profiles) < max_results:
            # Generate additional profiles with variations (cycle through the roster,
            # keeping the surname and swapping in a new first name + headline)
            base_profile = sample_data[len(relevant_profiles) % len(sample_data)].copy()
            # Create variations
            variations = [
                {"name": f"Alex {base_profile['name'].split()[1]}", "headline": f"Software Engineer | {key_terms[0] if key_terms else 'Development'}"},
                {"name": f"Jordan {base_profile['name'].split()[1]}", "headline": f"Full Stack Developer | {key_terms[0] if key_terms else 'Web Development'}"},
                {"name": f"Taylor {base_profile['name'].split()[1]}", "headline": f"Backend Engineer | {key_terms[0] if key_terms else 'API Development'}"},
                {"name": f"Casey {base_profile['name'].split()[1]}", "headline": f"Frontend Developer | {key_terms[0] if key_terms else 'UI/UX'}"},
                {"name": f"Riley {base_profile['name'].split()[1]}", "headline": f"DevOps Engineer | {key_terms[0] if key_terms else 'Infrastructure'}"}
            ]
            variation = variations[len(relevant_profiles) % len(variations)]
            new_profile = base_profile.copy()
            new_profile.update(variation)
            # Synthesize a unique URL from the new name plus a zero-padded counter
            new_profile["profile_url"] = f"https://linkedin.com/in/{new_profile['name'].lower().replace(' ', '-')}-{len(relevant_profiles):06d}"
            relevant_profiles.append(new_profile)
        # Return up to max_results
        return relevant_profiles[:max_results]
| def _build_multiple_search_queries(self, job_description: str, location: Optional[str] = None) -> List[str]: | |
| """Build multiple search queries for better coverage, targeting About section and summary.""" | |
| logger.info("π§ Extracting key terms from job description...") | |
| key_terms = self._extract_key_terms(job_description) | |
| logger.info(f"π Extracted key terms: {key_terms}") | |
| queries = [] | |
| # Query 1: Basic profile search | |
| query_parts = ["site:linkedin.com/in/", "profile"] + key_terms | |
| if location: | |
| query_parts.append(location) | |
| queries.append(" ".join(query_parts)) | |
| logger.info(f"π Query 1 (Basic): {' '.join(query_parts)}") | |
| # Query 2: Experience-focused search | |
| query_parts = ["site:linkedin.com/in/", "experience"] + key_terms | |
| if location: | |
| query_parts.append(location) | |
| queries.append(" ".join(query_parts)) | |
| logger.info(f"π Query 2 (Experience): {' '.join(query_parts)}") | |
| # Query 3: Company-focused search | |
| query_parts = ["site:linkedin.com/in/", "company"] + key_terms | |
| if location: | |
| query_parts.append(location) | |
| queries.append(" ".join(query_parts)) | |
| logger.info(f"π Query 3 (Company): {' '.join(query_parts)}") | |
| # Query 4: About section search | |
| query_parts = ["site:linkedin.com/in/", "about"] + key_terms | |
| if location: | |
| query_parts.append(location) | |
| queries.append(" ".join(query_parts)) | |
| logger.info(f"π Query 4 (About): {' '.join(query_parts)}") | |
| # Query 5: Summary section search | |
| query_parts = ["site:linkedin.com/in/", "summary"] + key_terms | |
| if location: | |
| query_parts.append(location) | |
| queries.append(" ".join(query_parts)) | |
| logger.info(f"π Query 5 (Summary): {' '.join(query_parts)}") | |
| # Query 6: Natural language query to encourage About section in snippet | |
| nl_query = f"site:linkedin.com/in/ About section {job_description} {location or ''}" | |
| queries.append(nl_query.strip()) | |
| logger.info(f"π Query 6 (Natural Language): {nl_query[:80]}...") | |
| # Query 7: Bio section search | |
| query_parts = ["site:linkedin.com/in/", "bio"] + key_terms | |
| if location: | |
| query_parts.append(location) | |
| queries.append(" ".join(query_parts)) | |
| logger.info(f"π Query 7 (Bio): {' '.join(query_parts)}") | |
| return queries | |
| def _deduplicate_search_results(self, search_results: List[Dict]) -> List[Dict]: | |
| """Remove duplicate search results based on profile URL""" | |
| seen_urls = set() | |
| unique_results = [] | |
| for result in search_results: | |
| profile_url = self._extract_linkedin_url(result.get('link', '')) | |
| if profile_url and profile_url not in seen_urls: | |
| seen_urls.add(profile_url) | |
| unique_results.append(result) | |
| return unique_results | |
| def _extract_key_terms(self, job_description: str) -> List[str]: | |
| """Extract key terms from job description for search optimization""" | |
| # Common job-related keywords to focus on | |
| job_keywords = [ | |
| "software engineer", "developer", "programmer", "engineer", | |
| "manager", "director", "lead", "senior", "principal", | |
| "full stack", "frontend", "backend", "devops", "data", | |
| "machine learning", "AI", "artificial intelligence", | |
| "python", "javascript", "java", "react", "node.js" | |
| ] | |
| # Extract matching keywords from job description | |
| found_keywords = [] | |
| job_desc_lower = job_description.lower() | |
| for keyword in job_keywords: | |
| if keyword in job_desc_lower: | |
| found_keywords.append(keyword) | |
| # If no specific keywords found, use general terms | |
| if not found_keywords: | |
| found_keywords = ["professional", "experience"] | |
| return found_keywords[:3] # Limit to top 3 keywords | |
| def _perform_google_search(self, query: str, max_results: int) -> List[Dict]: | |
| """Perform Google Custom Search API request""" | |
| logger.info(f"π Starting Google search for: {query[:60]}...") | |
| # Check cache first for this specific query | |
| cached_results = self.cache_service.get_query_results(query, max_results) | |
| if cached_results: | |
| logger.info(f"π― Returning {len(cached_results)} cached query results") | |
| return cached_results | |
| results = [] | |
| # Ensure we always make at least one request | |
| if max_results <= 0: | |
| max_results = 1 | |
| # Google CSE returns max 10 results per request | |
| # Calculate how many requests we need | |
| num_requests = max(1, min(5, (max_results + 9) // 10)) # At least 1 request, max 5 | |
| logger.info(f"π Will make {num_requests} API requests (max 10 results per request)") | |
| for i in range(num_requests): | |
| start_index = i * 10 + 1 | |
| # Calculate how many results to request for this specific request | |
| results_per_request = min(10, max_results - i * 10) | |
| # Ensure we request at least 1 result | |
| if results_per_request <= 0: | |
| results_per_request = 1 | |
| logger.info(f"π API request {i+1}/{num_requests} (start index: {start_index}, results: {results_per_request})") | |
| params = { | |
| 'key': self.api_key, | |
| 'cx': self.cse_id, | |
| 'q': query, | |
| 'start': start_index, | |
| 'num': results_per_request | |
| } | |
| try: | |
| logger.info(f"π‘ Making API request to Google Custom Search...") | |
| response = requests.get(self.base_url, params=params) | |
| response.raise_for_status() | |
| data = response.json() | |
| if 'items' in data: | |
| results.extend(data['items']) | |
| logger.info(f"β Request {i+1} successful: got {len(data['items'])} results") | |
| else: | |
| logger.warning(f"β οΈ Request {i+1} returned no items") | |
| # Rate limiting | |
| if i < num_requests - 1: # Don't delay after last request | |
| logger.info(f"β³ Rate limiting: waiting {self.delay}s before next request...") | |
| time.sleep(self.delay) | |
| except requests.exceptions.RequestException as e: | |
| logger.error(f"β Google search request {i+1} failed: {str(e)}") | |
| break | |
| except Exception as e: | |
| logger.error(f"β Error processing search results for request {i+1}: {str(e)}") | |
| break | |
| logger.info(f"π Google search completed: {len(results)} total results") | |
| # Cache the results | |
| self.cache_service.set_query_results(query, max_results, results) | |
| return results | |
    def _extract_profile_data(self, search_results: List[Dict]) -> List[Dict]:
        """Extract and parse LinkedIn profile data from search results.

        For each result: validate the link is a LinkedIn profile, scrape the
        profile page for detail, then parse title/snippet/detail into a
        candidate dict. Results that fail any step are skipped, not raised.

        Args:
            search_results: Raw Google search result dicts (link/title/snippet).
        Returns:
            List of parsed candidate profile dicts.
        """
        logger.info(f"π§ Starting profile data extraction for {len(search_results)} search results")
        candidates = []
        for i, result in enumerate(search_results, 1):
            try:
                logger.info(f"π Processing result {i}/{len(search_results)}")
                # Extract LinkedIn URL; skip anything that is not a profile link
                profile_url = self._extract_linkedin_url(result.get('link', ''))
                if not profile_url:
                    logger.warning(f"β οΈ Result {i}: Not a valid LinkedIn URL, skipping")
                    continue
                logger.info(f"π Result {i}: Valid LinkedIn URL found: {profile_url}")
                # Extract profile information from snippet
                snippet = result.get('snippet', '')
                title = result.get('title', '')
                logger.info(f"π Result {i}: Title: {title[:60]}...")
                logger.info(f"π Result {i}: Snippet length: {len(snippet)} characters")
                # Try to get more detailed information by scraping the profile;
                # failure here is tolerated and parsing proceeds on snippet alone
                logger.info(f"π Result {i}: Attempting to scrape profile for detailed data...")
                detailed_data = self._scrape_linkedin_profile(profile_url)
                if detailed_data.get('success'):
                    logger.info(f"β Result {i}: Profile scraping successful")
                else:
                    logger.warning(f"β οΈ Result {i}: Profile scraping failed: {detailed_data.get('error', 'Unknown error')}")
                # Parse basic profile data
                logger.info(f"π§ Result {i}: Parsing profile data...")
                profile_data = self._parse_profile_snippet(title, snippet, profile_url, detailed_data)
                if profile_data:
                    candidates.append(profile_data)
                    logger.info(f"β Result {i}: Profile data extracted successfully")
                    logger.info(f" π€ Name: {profile_data.get('name', 'Unknown')}")
                    logger.info(f" πΌ Company: {profile_data.get('company', 'Unknown')}")
                    logger.info(f" π Education: {profile_data.get('education', 'Unknown')}")
                else:
                    logger.warning(f"β οΈ Result {i}: Failed to extract profile data")
            except Exception as e:
                # Per-result failures are logged and skipped so one bad result
                # cannot abort the whole extraction pass
                logger.warning(f"β Error parsing profile data for result {i}: {str(e)}")
                continue
        logger.info(f"π Profile data extraction completed: {len(candidates)} successful extractions")
        return candidates
    def _scrape_linkedin_profile(self, profile_url: str) -> Dict:
        """Attempt to scrape a LinkedIn profile for more detailed information, including the About section.

        Responses are cached per URL. Note that LinkedIn commonly answers
        unauthenticated scrapers with non-200 status codes, in which case the
        failure dict is returned.

        Args:
            profile_url: Canonical LinkedIn profile URL.
        Returns:
            On success: dict with keys 'structured_data', 'text_content'
            (truncated to 2000 chars), 'about_section', 'education', and
            'success': True. On failure: {'success': False} plus either
            'status_code' or 'error'.
        """
        logger.info(f"π Scraping LinkedIn profile: {profile_url}")
        # Check cache first for this profile
        cached_profile_data = self.cache_service.get_profile_data(profile_url)
        if cached_profile_data:
            logger.info(f"π― Returning cached profile data for: {profile_url}")
            return cached_profile_data
        try:
            logger.info(f"π‘ Making HTTP request to LinkedIn...")
            response = self.session.get(profile_url, timeout=10)
            if response.status_code == 200:
                logger.info(f"β HTTP request successful (status: {response.status_code})")
                soup = BeautifulSoup(response.content, 'html.parser')
                logger.info(f"π§ Extracting structured data...")
                structured_data = self._extract_structured_data(soup)
                logger.info(f"π Found {len(structured_data)} structured data fields")
                logger.info(f"π Extracting text content...")
                text_content = soup.get_text()
                logger.info(f"π Text content length: {len(text_content)} characters")
                # Try to extract About section
                logger.info(f"π Attempting to extract About section...")
                about_section = self._extract_about_section(soup, text_content)
                if about_section:
                    logger.info(f"β About section found: {len(about_section)} characters")
                else:
                    logger.warning(f"β οΈ About section not found")
                # Try to extract education information
                logger.info(f"π Attempting to extract education information...")
                education = self._extract_education_from_linkedin_profile(soup)
                if education:
                    logger.info(f"β Education found: {education}")
                    # Add education to structured data for easier access
                    # ('alumniOf' mirrors the schema.org property name)
                    structured_data['alumniOf'] = education
                else:
                    logger.warning(f"β οΈ Education not found")
                profile_data = {
                    'structured_data': structured_data,
                    'text_content': text_content[:2000],  # Limit content length
                    'about_section': about_section,
                    'education': education,
                    'success': True
                }
                # Cache the profile data (only successful scrapes are cached)
                self.cache_service.set_profile_data(profile_url, profile_data)
                return profile_data
            else:
                logger.warning(f"β οΈ HTTP request failed (status: {response.status_code})")
                return {'success': False, 'status_code': response.status_code}
        except Exception as e:
            logger.warning(f"β Failed to scrape LinkedIn profile {profile_url}: {str(e)}")
            return {'success': False, 'error': str(e)}
| def _extract_structured_data(self, soup: BeautifulSoup) -> Dict: | |
| """Extract structured data from LinkedIn profile page""" | |
| structured_data = {} | |
| try: | |
| # Look for JSON-LD structured data | |
| json_ld_scripts = soup.find_all('script', type='application/ld+json') | |
| for script in json_ld_scripts: | |
| try: | |
| script_content = getattr(script, 'string', None) | |
| if script_content: # Check if string is not None | |
| data = json.loads(script_content) | |
| if isinstance(data, dict): | |
| structured_data.update(data) | |
| except json.JSONDecodeError: | |
| continue | |
| # Look for meta tags with profile information | |
| meta_tags = { | |
| 'description': soup.find('meta', attrs={'name': 'description'}), | |
| 'keywords': soup.find('meta', attrs={'name': 'keywords'}), | |
| 'og:title': soup.find('meta', attrs={'property': 'og:title'}), | |
| 'og:description': soup.find('meta', attrs={'property': 'og:description'}), | |
| } | |
| for key, tag in meta_tags.items(): | |
| if tag and hasattr(tag, 'attrs'): | |
| tag_attrs = getattr(tag, 'attrs', {}) | |
| if 'content' in tag_attrs: | |
| structured_data[key] = tag_attrs['content'] | |
| except Exception as e: | |
| logger.warning(f"Error extracting structured data: {str(e)}") | |
| return structured_data | |
    def _extract_about_section(self, soup: BeautifulSoup, text_content: str) -> Optional[str]:
        """Try to extract the About section from the LinkedIn profile HTML or text.

        Heuristics are tried in order: (1) an 'About'/'Summary'/'Bio' heading's
        next sibling or parent, (2) a regex over the raw page text, (3) the
        page's meta description, (4) og:description. First hit wins.

        Args:
            soup: Parsed profile page.
            text_content: Plain-text rendering of the same page.
        Returns:
            The About text, or None when every heuristic fails.
        """
        logger.info(f"π Looking for About section in HTML...")
        about = None
        # Look for headings like 'About' or 'Summary'
        logger.info(f"π Searching for About/Summary headings...")
        for heading in soup.find_all(['h2', 'h3', 'span']):
            heading_text = heading.get_text(strip=True).lower()
            if heading_text in ['about', 'summary', 'bio']:
                logger.info(f"β Found heading: '{heading_text}'")
                # The About section is often in the next sibling or parent
                next_elem = heading.find_next_sibling()
                if next_elem and hasattr(next_elem, 'get_text') and next_elem.get_text(strip=True):
                    about = next_elem.get_text(strip=True)
                    logger.info(f"β Found About section in next sibling: {len(about)} characters")
                    break
                # NOTE(review): the parent's text includes the heading itself;
                # the >30-char check filters out headings with no real body
                parent = heading.parent
                if parent and hasattr(parent, 'get_text') and parent.get_text(strip=True) and len(parent.get_text(strip=True)) > 30:
                    about = parent.get_text(strip=True)
                    logger.info(f"β Found About section in parent: {len(about)} characters")
                    break
        # Fallback: look for 'About' in text content (a capitalized run of
        # 30-600 chars following the word 'About')
        if not about and text_content:
            logger.info(f"π Searching for 'About' pattern in text content...")
            match = re.search(r'About[\s\n]+([A-Z][^\n]{30,600})', text_content)
            if match:
                about = match.group(1).strip()
                logger.info(f"β Found About section via regex: {len(about)} characters")
        # Fallback: use meta description or og:description
        if not about:
            logger.info(f"π Looking for meta description...")
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and hasattr(meta_desc, 'attrs'):
                meta_attrs = getattr(meta_desc, 'attrs', {})
                if 'content' in meta_attrs:
                    about = meta_attrs['content']
                    logger.info(f"β Found About section in meta description: {len(about) if about else 0} characters")
        if not about:
            logger.info(f"π Looking for og:description...")
            og_desc = soup.find('meta', attrs={'property': 'og:description'})
            if og_desc and hasattr(og_desc, 'attrs'):
                og_attrs = getattr(og_desc, 'attrs', {})
                if 'content' in og_attrs:
                    about = og_attrs['content']
                    logger.info(f"β Found About section in og:description: {len(about) if about else 0} characters")
        # Meta 'content' attributes can be non-string (e.g. a list); coerce
        if about and not isinstance(about, str):
            about = str(about)
        if about:
            logger.info(f"β About section extraction successful: {len(about)} characters")
        else:
            logger.warning(f"β οΈ About section extraction failed")
        return about if isinstance(about, str) else None
| def _extract_linkedin_url(self, url: str) -> Optional[str]: | |
| """Extract and validate LinkedIn profile URL""" | |
| if not url: | |
| return None | |
| # Check if it's a LinkedIn profile URL | |
| if 'linkedin.com/in/' in url: | |
| # Clean up the URL | |
| clean_url = url.split('?')[0] # Remove query parameters | |
| return clean_url | |
| return None | |
    def _parse_profile_snippet(self, title: str, snippet: str, profile_url: str, detailed_data: Optional[Dict] = None) -> Optional[Dict]:
        """Parse LinkedIn profile information from search result snippet and detailed data.

        Combines the Google search-result title/snippet with optionally
        scraped page data into one candidate record.

        Args:
            title: Search-result title (typically "Name | Headline | Location").
            snippet: Search-result snippet text.
            profile_url: Canonical LinkedIn profile URL for this candidate.
            detailed_data: Optional scraped-page payload; only consulted when
                its 'success' flag is truthy.

        Returns:
            Dict with keys name, headline, location, profile_url, company,
            education, experience_summary — or None if any step raises.
        """
        try:
            logger.info(f"π§ Parsing profile data from title and snippet...")
            # Extract name from title (usually "Name | Headline | Location")
            logger.info(f"π€ Extracting name from title...")
            name = self._extract_name_from_title(title)
            logger.info(f"β Extracted name: {name}")
            # Extract headline and location from snippet
            logger.info(f"πΌ Extracting headline and location...")
            headline, location = self._extract_headline_and_location(snippet)
            logger.info(f"β Extracted headline: {headline}")
            logger.info(f"β Extracted location: {location}")
            # Extract company from title first, then snippet, then detailed data
            # (each source is only consulted when the previous one failed).
            logger.info(f"π’ Extracting company information...")
            company = self._extract_company_from_title(title)
            if company:
                logger.info(f"β Found company in title: {company}")
            else:
                logger.info(f"π Company not found in title, checking snippet...")
                company = self._extract_company_from_snippet(snippet)
                if company:
                    logger.info(f"β Found company in snippet: {company}")
                else:
                    logger.info(f"π Company not found in snippet, checking detailed data...")
                    if detailed_data and detailed_data.get('success'):
                        company = self._extract_company_from_detailed_data(detailed_data)
                        if company:
                            logger.info(f"β Found company in detailed data: {company}")
                        else:
                            logger.warning(f"β οΈ Company not found in any source")
                    else:
                        logger.warning(f"β οΈ Company not found in any source")
            # Extract education from snippet and detailed data
            logger.info(f"π Extracting education information...")
            education = self._extract_education_from_snippet(snippet)
            if education:
                logger.info(f"β Found education in snippet: {education}")
            else:
                logger.info(f"π Education not found in snippet, checking detailed data...")
                if detailed_data and detailed_data.get('success'):
                    # First check if education was extracted during scraping
                    scraped_education = detailed_data.get('education')
                    if scraped_education:
                        logger.info(f"β Found education from scraping: {scraped_education}")
                        education = scraped_education
                    else:
                        # Fallback to other extraction methods
                        education = self._extract_education_from_detailed_data(detailed_data)
                        if education:
                            logger.info(f"β Found education in detailed data: {education}")
                        else:
                            logger.warning(f"β οΈ Education not found in any source")
                else:
                    logger.warning(f"β οΈ Education not found in any source")
            # Create better experience summary
            logger.info(f"π Creating experience summary...")
            experience_summary = self._create_experience_summary(snippet, detailed_data)
            logger.info(f"β Experience summary created: {len(experience_summary)} characters")
            # Create candidate profile; missing fields fall back to
            # placeholders so downstream consumers always see these keys.
            candidate = {
                'name': name or 'Unknown',
                'headline': headline or 'Professional',
                'location': location or 'Unknown',
                'profile_url': profile_url,
                'company': company,
                'education': education,
                'experience_summary': experience_summary
            }
            logger.info(f"β Profile parsing completed successfully")
            return candidate
        except Exception as e:
            logger.warning(f"β Error parsing profile snippet: {str(e)}")
            return None
| def _extract_name_from_title(self, title: str) -> str: | |
| """Extract name from LinkedIn profile title""" | |
| if not title: | |
| return 'Unknown' | |
| # LinkedIn titles are usually "Name | Headline | Location" or "Name - Headline at Company" | |
| # First, try to extract just the name part | |
| name_patterns = [ | |
| r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*?)(?:\s*[-|]\s*)', # Name followed by - or | | |
| r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*?)(?:\s+at\s+)', # Name followed by "at" | |
| r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*?)(?:\s*,\s*)', # Name followed by comma | |
| ] | |
| for pattern in name_patterns: | |
| match = re.match(pattern, title) | |
| if match: | |
| name = match.group(1).strip() | |
| # Clean up common LinkedIn prefixes | |
| name = name.replace('LinkedIn', '').strip() | |
| if name and len(name) > 2: | |
| return name | |
| # Fallback: take first part before any separator | |
| parts = re.split(r'[-|,]\s*', title) | |
| if parts: | |
| name = parts[0].strip() | |
| # Clean up common LinkedIn prefixes | |
| name = name.replace('LinkedIn', '').strip() | |
| return name if name else 'Unknown' | |
| return 'Unknown' | |
| def _extract_headline_and_location(self, snippet: str) -> tuple: | |
| """Extract headline and location from snippet""" | |
| headline = 'Professional' | |
| location = 'Unknown' | |
| if not snippet: | |
| return headline, location | |
| # Look for location patterns (City, State or Country) | |
| location_patterns = [ | |
| r'([A-Z][a-z]+(?:[\s,]+[A-Z][a-z]+)*)\s*,\s*([A-Z]{2})', # City, State | |
| r'([A-Z][a-z]+(?:[\s,]+[A-Z][a-z]+)*)\s*,\s*([A-Z][a-z]+)', # City, Country | |
| ] | |
| for pattern in location_patterns: | |
| match = re.search(pattern, snippet) | |
| if match: | |
| location = f"{match.group(1)}, {match.group(2)}" | |
| break | |
| # Extract headline (usually contains job title) - improved logic | |
| lines = snippet.split('.') | |
| for line in lines: | |
| line = line.strip() | |
| # Look for job title patterns | |
| if any(keyword in line.lower() for keyword in ['engineer', 'developer', 'manager', 'director', 'lead', 'senior', 'principal', 'architect']): | |
| # Filter out lines that are likely names | |
| if not self._is_likely_name(line) and len(line) > 5: | |
| headline = line | |
| break | |
| return headline, location | |
| def _extract_company_from_title(self, title: str) -> Optional[str]: | |
| """Extract company name from LinkedIn profile title""" | |
| if not title: | |
| return None | |
| # LinkedIn titles are usually "Name | Headline | Location" or "Name - Headline at Company" | |
| # Look for company patterns in title | |
| company_patterns = [ | |
| r'at\s+([A-Z][a-zA-Z\s&\.]+?)(?:\s|$|\(|\))', | |
| r'-\s+([A-Z][a-zA-Z\s&\.]+?)\s+at\s+', | |
| r'\|\s+([A-Z][a-zA-Z\s&\.]+?)\s+at\s+', | |
| r'-\s+[^-]+-\s+([A-Z][a-zA-Z\s&\.]+?)(?:\s|$|\(|\))', # Third part after two dashes | |
| r'\|\s+[^|]+\|\s+([A-Z][a-zA-Z\s&\.]+?)(?:\s|$|\(|\))', # Third part after two pipes | |
| ] | |
| for pattern in company_patterns: | |
| match = re.search(pattern, title) | |
| if match: | |
| company = match.group(1).strip() | |
| if (len(company) > 2 and len(company) < 50 and | |
| company.lower() not in ['linkedin', 'profile', 'view', 'professional', 'experience'] and | |
| not self._is_likely_name(company)): | |
| return company | |
| # Try to extract from the last part of the title (after last separator) | |
| parts = re.split(r'[-|]\s*', title) | |
| if len(parts) >= 3: | |
| last_part = parts[-1].strip() | |
| # Check if the last part looks like a company name | |
| if (len(last_part) > 3 and len(last_part) < 50 and | |
| not self._is_likely_name(last_part) and | |
| last_part.lower() not in ['linkedin', 'profile', 'view', 'professional', 'experience']): | |
| return last_part | |
| return None | |
| def _extract_company_from_snippet(self, snippet: str) -> Optional[str]: | |
| """Extract company name from snippet""" | |
| if not snippet: | |
| return None | |
| # Look for company patterns - more comprehensive patterns | |
| company_patterns = [ | |
| r'at\s+([A-Z][a-zA-Z\s&\.]+?)(?:\s|,|\.|$|\(|\))', | |
| r'([A-Z][a-zA-Z\s&\.]+?)\s+β’\s+', | |
| r'([A-Z][a-zA-Z\s&\.]+?)\s+-\s+', | |
| r'([A-Z][a-zA-Z\s&\.]+?)\s+\(', | |
| r'([A-Z][a-zA-Z\s&\.]+?)\s+at\s+', | |
| r'([A-Z][a-zA-Z\s&\.]+?)\s+Software\s+Engineer', | |
| r'([A-Z][a-zA-Z\s&\.]+?)\s+Senior\s+', | |
| r'([A-Z][a-zA-Z\s&\.]+?)\s+Developer', | |
| ] | |
| for pattern in company_patterns: | |
| match = re.search(pattern, snippet) | |
| if match: | |
| company = match.group(1).strip() | |
| # Filter out common false positives and names | |
| if (len(company) > 2 and len(company) < 50 and | |
| company.lower() not in ['linkedin', 'profile', 'view', 'professional', 'experience'] and | |
| not self._is_likely_name(company)): | |
| return company | |
| return None | |
| def _is_likely_name(self, text: str) -> bool: | |
| """Check if text is likely a person's name rather than a company""" | |
| # Common name patterns | |
| name_indicators = [ | |
| r'^[A-Z][a-z]+\s+[A-Z][a-z]+$', # First Last | |
| r'^[A-Z][a-z]+\s+[A-Z][a-z]+\s+[A-Z][a-z]+$', # First Middle Last | |
| r'^[A-Z][a-z]+\.\s+[A-Z][a-z]+$', # F. Last | |
| ] | |
| for pattern in name_indicators: | |
| if re.match(pattern, text): | |
| return True | |
| # Check for common name words | |
| common_names = ['michael', 'john', 'david', 'james', 'robert', 'mary', 'jennifer', 'lisa', 'sarah'] | |
| if text.lower() in common_names: | |
| return True | |
| return False | |
| def _extract_company_from_detailed_data(self, detailed_data: Dict) -> Optional[str]: | |
| """Extract company from detailed scraped data""" | |
| try: | |
| # Check structured data first | |
| structured_data = detailed_data.get('structured_data', {}) | |
| # Look for organization information in structured data | |
| if 'worksFor' in structured_data: | |
| return structured_data['worksFor'] | |
| # Check meta description for company info | |
| description = structured_data.get('description', '') | |
| if description: | |
| company_match = re.search(r'at\s+([A-Z][a-zA-Z\s&]+?)(?:\s|,|\.|$)', description) | |
| if company_match: | |
| return company_match.group(1).strip() | |
| # Check text content | |
| text_content = detailed_data.get('text_content', '') | |
| if text_content: | |
| return self._extract_company_from_snippet(text_content) | |
| except Exception as e: | |
| logger.warning(f"Error extracting company from detailed data: {str(e)}") | |
| return None | |
| def _extract_education_from_snippet(self, snippet: str) -> Optional[str]: | |
| """Extract education information from snippet using improved patterns""" | |
| if not snippet: | |
| return None | |
| # Look for education patterns - more comprehensive and specific to LinkedIn structure | |
| education_patterns = [ | |
| # Pattern for "Education: University Name" format (from top card) | |
| r'Education:\s*([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute|Academy))', | |
| # Pattern for degree + university format | |
| r'(?:Bachelor|Master|PhD|BSc|MSc|MBA|BS|MS)\s+(?:of|in|from)\s+([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', | |
| # Pattern for university name followed by degree | |
| r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute)).*?(?:Bachelor|Master|PhD|BSc|MSc|MBA|BS|MS)', | |
| # Pattern for "Studied at" format | |
| r'(?:Studied|Graduated|Attended)\s+(?:from|at)\s+([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', | |
| # Pattern for university name with degree in parentheses | |
| r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute)).*?\((?:Bachelor|Master|PhD|BSc|MSc|MBA|BS|MS)', | |
| # Pattern for degree, field format | |
| r'(?:Bachelor|Master|PhD|BSc|MSc|MBA|BS|MS),\s*[A-Za-z\s/]+(?:from|at)\s+([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', | |
| ] | |
| for pattern in education_patterns: | |
| match = re.search(pattern, snippet, re.IGNORECASE) | |
| if match: | |
| education = match.group(1).strip() | |
| if len(education) > 3 and len(education) < 100: | |
| # Clean up the education string | |
| education = re.sub(r'\s+', ' ', education) # Remove extra whitespace | |
| education = education.strip() | |
| return education | |
| # Fallback to keyword-based search with better context | |
| education_keywords = ['university', 'college', 'school', 'institute', 'bachelor', 'master', 'phd', 'degree'] | |
| for keyword in education_keywords: | |
| if keyword in snippet.lower(): | |
| # Find the sentence containing education info | |
| sentences = snippet.split('.') | |
| for sentence in sentences: | |
| if keyword in sentence.lower(): | |
| # Extract university name from the sentence | |
| university_match = re.search(r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', sentence) | |
| if university_match: | |
| return university_match.group(1).strip() | |
| # If no university found, return the sentence itself | |
| return sentence.strip() | |
| return None | |
| def _extract_education_from_detailed_data(self, detailed_data: Dict) -> Optional[str]: | |
| """Extract education from detailed scraped data using LinkedIn-specific selectors""" | |
| try: | |
| # Check structured data first | |
| structured_data = detailed_data.get('structured_data', {}) | |
| # Look for education information in structured data | |
| if 'alumniOf' in structured_data: | |
| return structured_data['alumniOf'] | |
| # Check meta description for education info | |
| description = structured_data.get('description', '') | |
| if description: | |
| education_match = re.search(r'([A-Z][a-zA-Z\s&]+?(?:university|college|school|institute))', description, re.IGNORECASE) | |
| if education_match: | |
| return education_match.group(1).strip() | |
| # Check text content for LinkedIn-specific education patterns | |
| text_content = detailed_data.get('text_content', '') | |
| if text_content: | |
| # Look for education section specifically | |
| education_section = self._extract_education_section_from_html(text_content) | |
| if education_section: | |
| return education_section | |
| # Fallback to snippet extraction | |
| return self._extract_education_from_snippet(text_content) | |
| except Exception as e: | |
| logger.warning(f"Error extracting education from detailed data: {str(e)}") | |
| return None | |
| def _extract_education_section_from_html(self, html_content: str) -> Optional[str]: | |
| """Extract education information from LinkedIn HTML structure""" | |
| try: | |
| # Look for education section using LinkedIn-specific patterns | |
| education_patterns = [ | |
| # Pattern for education section header | |
| r'<h2[^>]*>.*?Education.*?</h2>.*?<span[^>]*>([^<]+(?:University|College|School|Institute)[^<]*)</span>', | |
| # Pattern for education in top card | |
| r'aria-label="Education:\s*([^"]+(?:University|College|School|Institute)[^"]*)"', | |
| # Pattern for education list items | |
| r'<li[^>]*>.*?<span[^>]*>([^<]+(?:University|College|School|Institute)[^<]*)</span>', | |
| # Pattern for education in bold text | |
| r'<span[^>]*class="[^"]*t-bold[^"]*"[^>]*>([^<]+(?:University|College|School|Institute)[^<]*)</span>', | |
| ] | |
| for pattern in education_patterns: | |
| match = re.search(pattern, html_content, re.IGNORECASE | re.DOTALL) | |
| if match: | |
| education = match.group(1).strip() | |
| if len(education) > 3 and len(education) < 100: | |
| # Clean up the education string | |
| education = re.sub(r'\s+', ' ', education) # Remove extra whitespace | |
| education = education.strip() | |
| return education | |
| # Look for education keywords in the HTML | |
| if 'education' in html_content.lower(): | |
| # Find the section containing education | |
| lines = html_content.split('\n') | |
| for i, line in enumerate(lines): | |
| if 'education' in line.lower() and 'university' in line.lower(): | |
| # Extract university name from this line or nearby lines | |
| university_match = re.search(r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', line) | |
| if university_match: | |
| return university_match.group(1).strip() | |
| # Check next few lines for university name | |
| for j in range(i+1, min(i+5, len(lines))): | |
| university_match = re.search(r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', lines[j]) | |
| if university_match: | |
| return university_match.group(1).strip() | |
| except Exception as e: | |
| logger.warning(f"Error extracting education from HTML: {str(e)}") | |
| return None | |
| def _create_experience_summary(self, snippet: str, detailed_data: Optional[Dict] = None) -> str: | |
| """Create a better experience summary from available data, prioritizing the About section.""" | |
| logger.info(f"π Creating experience summary...") | |
| # Use About section if available | |
| if detailed_data and detailed_data.get('success'): | |
| about_section = detailed_data.get('about_section') | |
| if about_section and len(about_section) > 30: | |
| logger.info(f"β Using About section for experience summary") | |
| summary = about_section.strip() | |
| if len(summary) > 400: | |
| summary = summary[:397] + '...' | |
| logger.info(f"π Truncated summary to 400 characters") | |
| return summary | |
| else: | |
| logger.info(f"β οΈ About section not available or too short, using fallback logic") | |
| # Fallback to previous logic | |
| logger.info(f"π Extracting relevant sentences from snippet...") | |
| summary_parts = [] | |
| if snippet: | |
| sentences = snippet.split('.') | |
| relevant_sentences = [] | |
| for sentence in sentences: | |
| sentence = sentence.strip() | |
| if len(sentence) > 20 and any(keyword in sentence.lower() for keyword in | |
| ['engineer', 'developer', 'manager', 'lead', 'senior', 'experience', 'worked', 'responsible', 'developed', 'built', 'created']): | |
| relevant_sentences.append(sentence) | |
| if relevant_sentences: | |
| summary_parts.extend(relevant_sentences[:2]) | |
| logger.info(f"β Found {len(relevant_sentences[:2])} relevant sentences from snippet") | |
| else: | |
| logger.info(f"β οΈ No relevant sentences found in snippet") | |
| if detailed_data and detailed_data.get('success'): | |
| logger.info(f"π Adding structured data information...") | |
| structured_data = detailed_data.get('structured_data', {}) | |
| if 'jobTitle' in structured_data: | |
| summary_parts.append(f"Current role: {structured_data['jobTitle']}") | |
| logger.info(f"β Added job title: {structured_data['jobTitle']}") | |
| if 'worksFor' in structured_data: | |
| summary_parts.append(f"Company: {structured_data['worksFor']}") | |
| logger.info(f"β Added company: {structured_data['worksFor']}") | |
| if 'alumniOf' in structured_data: | |
| summary_parts.append(f"Education: {structured_data['alumniOf']}") | |
| logger.info(f"β Added education: {structured_data['alumniOf']}") | |
| text_content = detailed_data.get('text_content', '') | |
| if text_content and not summary_parts: | |
| logger.info(f"π Searching for experience keywords in text content...") | |
| experience_keywords = ['experience', 'worked', 'developed', 'built', 'created', 'managed'] | |
| for keyword in experience_keywords: | |
| if keyword in text_content.lower(): | |
| sentences = text_content.split('.') | |
| for sentence in sentences: | |
| if keyword in sentence.lower() and len(sentence.strip()) > 30: | |
| summary_parts.append(sentence.strip()) | |
| logger.info(f"β Found experience sentence with keyword '{keyword}'") | |
| break | |
| if summary_parts: | |
| break | |
| if summary_parts: | |
| summary = '. '.join(summary_parts) | |
| if len(summary) > 400: | |
| summary = summary[:397] + '...' | |
| logger.info(f"π Truncated summary to 400 characters") | |
| logger.info(f"β Created summary from {len(summary_parts)} parts") | |
| return summary | |
| if snippet: | |
| logger.info(f"π Using snippet as fallback...") | |
| words = snippet.split() | |
| if len(words) > 20: | |
| summary = ' '.join(words[:20]) + '...' | |
| logger.info(f"β Created fallback summary from first 20 words") | |
| return summary | |
| logger.info(f"β Using full snippet as summary") | |
| return snippet | |
| logger.warning(f"β οΈ No experience information available") | |
| return "Experience information not available" | |
    def _extract_education_from_linkedin_profile(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract education information from LinkedIn profile using BeautifulSoup.

        Four strategies, in order: the #education section's profile card,
        the top card's "Education:" button/aria-label, schema.org
        structured data ('alumniOf'), and a last-resort keyword scan of
        the whole page. Returns None when nothing matches or on error
        (errors are logged, not raised).

        NOTE(review): `find`/`find_all(text=...)` is the legacy bs4 keyword
        (modern name is `string=`); the matched objects are NavigableStrings,
        which is why the code probes `hasattr(element, 'strip')`.
        """
        try:
            # Method 1: Look for education section by ID
            education_section = soup.find('div', {'id': 'education'})
            if education_section:
                # Find the parent section that contains education information
                education_card = education_section.find_parent('section', class_='pv-profile-card')
                if education_card:
                    # Look for university names in the education card
                    university_elements = education_card.find_all(text=re.compile(r'.*University.*|.*College.*|.*School.*|.*Institute.*'))
                    for element in university_elements:
                        if hasattr(element, 'strip') and element.strip() and len(element.strip()) > 3:
                            return element.strip()
            # Method 2: Look for education in the top card section
            top_card = soup.find('section', class_=re.compile(r'artdeco-card.*'))
            if top_card:
                # Look for education button/link
                education_button = top_card.find('button', attrs={'aria-label': re.compile(r'Education:.*')})
                if education_button and hasattr(education_button, 'get'):
                    # Extract university name from aria-label
                    aria_label = education_button.get('aria-label', '')
                    if aria_label and isinstance(aria_label, str):
                        education_match = re.search(r'Education:\s*([^,]+)', aria_label)
                        if education_match:
                            return education_match.group(1).strip()
                # Look for education text in the top card
                education_text = top_card.find(text=re.compile(r'.*University.*|.*College.*|.*School.*|.*Institute.*'))
                if education_text and hasattr(education_text, 'strip') and education_text.strip():
                    return education_text.strip()
            # Method 3: Look for education in structured data
            # (helper defined elsewhere in this class; presumably parses
            # embedded JSON-LD — confirm against its definition)
            structured_data = self._extract_structured_data(soup)
            if 'alumniOf' in structured_data:
                return structured_data['alumniOf']
            # Method 4: Look for education keywords in the entire page
            education_keywords = ['university', 'college', 'school', 'institute', 'bachelor', 'master', 'phd']
            for keyword in education_keywords:
                elements = soup.find_all(text=re.compile(keyword, re.IGNORECASE))
                for element in elements:
                    if hasattr(element, 'strip'):
                        text = element.strip()
                        if len(text) > 10 and len(text) < 200:
                            # Check if this looks like an education entry
                            if any(edu_keyword in text.lower() for edu_keyword in ['university', 'college', 'school', 'institute']):
                                # Extract just the university name
                                university_match = re.search(r'([A-Z][a-zA-Z\s&\.]+?(?:University|College|School|Institute))', text)
                                if university_match:
                                    return university_match.group(1).strip()
                                return text
        except Exception as e:
            logger.warning(f"Error extracting education from LinkedIn profile: {str(e)}")
        return None
| # Cache management methods | |
    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics.

        Thin delegate: returns whatever stats dict
        CacheService.get_cache_stats() reports.
        """
        return self.cache_service.get_cache_stats()
    def clear_cache(self, cache_type: str = "all") -> None:
        """Clear specified cache or all caches.

        Args:
            cache_type: Which cache to clear; "all" (default) clears every
                cache. Accepted values are defined by CacheService.clear_cache.
        """
        self.cache_service.clear_cache(cache_type)
    def cleanup_expired_cache(self) -> None:
        """Clean up expired cache entries.

        Thin delegate to CacheService.cleanup_expired_entries().
        """
        self.cache_service.cleanup_expired_entries()