Spaces:
No application file
No application file
| #!/usr/bin/env python3 | |
| """ | |
| IQKiller Scrape Microservice | |
| Firecrawl integration for superior web scraping (95%+ success rate) | |
| Handles job posting extraction from URLs with fallback methods | |
| """ | |
| import asyncio | |
| import logging | |
| import time | |
| import re | |
| import requests | |
| from typing import Optional, Dict, Any, List | |
| from dataclasses import dataclass | |
| from urllib.parse import urlparse | |
| # Third-party imports (with fallbacks) | |
| try: | |
| import firecrawl | |
| from firecrawl import FirecrawlApp | |
| FIRECRAWL_AVAILABLE = True | |
| except ImportError: | |
| FIRECRAWL_AVAILABLE = False | |
| try: | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| SELENIUM_AVAILABLE = True | |
| except ImportError: | |
| SELENIUM_AVAILABLE = False | |
| # Local imports | |
| from config import get_config | |
| # Setup logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
@dataclass
class ScrapeResult:
    """Result of a single web-scraping attempt.

    The @dataclass decorator is required: every call site constructs this
    type with keyword arguments (the original declared only bare class
    annotations, which raises TypeError on construction).

    Attributes:
        success: True when usable content was extracted.
        content: Cleaned page/job text ("" on failure).
        url: The URL that was scraped.
        method: Scraper that produced this result ("firecrawl",
            "requests", "selenium", "validation", or "all_failed").
        processing_time: Wall-clock seconds the attempt took.
        error: Error description when success is False.
        metadata: Optional extra details (title, status code, ...).
    """
    success: bool
    content: str
    url: str
    method: str
    processing_time: float
    error: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None


class JobScraper:
    """Advanced job posting scraper with multiple methods.

    Tries backends in order of reliability: Firecrawl (hosted API),
    plain HTTP requests, then Selenium headless Chrome for pages that
    need JavaScript rendering.
    """

    def __init__(self, config=None):
        """Initialize scraper with configuration.

        Args:
            config: Optional settings object; defaults to get_config().
                Must expose `firecrawl_api_key` and `request_timeout`.
        """
        self.config = config or get_config()

        # Initialize Firecrawl only when both the SDK and an API key are
        # present; otherwise leave the client None so scrape_job_posting
        # skips it and falls back to the other methods.
        self.firecrawl_client = None
        if FIRECRAWL_AVAILABLE and self.config.firecrawl_api_key:
            try:
                self.firecrawl_client = FirecrawlApp(api_key=self.config.firecrawl_api_key)
                logger.info("β Firecrawl client initialized")
            except Exception as e:
                logger.warning(f"β οΈ Failed to initialize Firecrawl: {e}")

        # Browser-like headers reduce the chance of naive bot blocking
        # on the plain-requests path.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

    def _clean_content(self, content: str) -> str:
        """Clean and normalize scraped content.

        Collapses whitespace and strips common boilerplate (navigation
        labels, social-media prompts, cookie notices).
        """
        if not content:
            return ""
        # Remove excessive whitespace
        content = re.sub(r'\s+', ' ', content)
        # Remove common navigation elements
        content = re.sub(r'(Skip to main content|Navigation|Menu|Footer|Header)', '', content, flags=re.IGNORECASE)
        # Remove social media links
        content = re.sub(r'(Follow us on|Share on|Like us on) \w+', '', content, flags=re.IGNORECASE)
        # Remove cookie notices
        content = re.sub(r'(We use cookies|This site uses cookies|Cookie policy).*?(?=\.|$)', '', content, flags=re.IGNORECASE)
        return content.strip()

    def _extract_job_content(self, content: str, url: str) -> str:
        """Extract job-specific content from page text.

        Pulls out sections starting at common job-posting headings; when
        none are found, falls back to the whole page, cleaned.
        """
        # Regex fragments marking the start of a job-description section.
        job_indicators = [
            r'job description',
            r'responsibilities',
            r'requirements',
            r'qualifications',
            r'what you.{0,20}ll do',
            r'about the role',
            r'position summary',
            r'job summary'
        ]
        # Build the alternation once — the original re-joined the list on
        # every loop iteration.
        alternation = "|".join(job_indicators)
        job_content_parts = []
        for indicator in job_indicators:
            # Capture from each heading up to the next heading (or EOF).
            pattern = re.compile(rf'({indicator}.*?)(?=(?:{alternation})|$)',
                                 flags=re.IGNORECASE | re.DOTALL)
            job_content_parts.extend(pattern.findall(content))
        if job_content_parts:
            # Join all job-related sections
            return self._clean_content(" ".join(job_content_parts))
        # Fallback: return cleaned full content
        return self._clean_content(content)

    async def scrape_with_firecrawl(self, url: str) -> ScrapeResult:
        """Scrape using Firecrawl (primary method).

        Raises:
            Exception: if the Firecrawl client was never initialized.
        """
        if not self.firecrawl_client:
            raise Exception("Firecrawl client not available")

        start_time = time.time()
        try:
            # Use Firecrawl to scrape the page (v1 API format); waitFor
            # gives JS-rendered content time to appear.
            scrape_response = self.firecrawl_client.scrape_url(
                url,
                formats=['markdown', 'html'],
                includeTags=['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li'],
                excludeTags=['nav', 'footer', 'header', 'aside', 'script', 'style'],
                timeout=30000,
                waitFor=3000  # Wait for dynamic content
            )
            processing_time = time.time() - start_time

            if scrape_response and hasattr(scrape_response, 'markdown') and scrape_response.markdown:
                content = self._extract_job_content(scrape_response.markdown, url)
                metadata = {
                    'title': getattr(scrape_response, 'title', ''),
                    'description': getattr(scrape_response, 'description', ''),
                    'url': url,
                    'content_length': len(content),
                    'success': getattr(scrape_response, 'success', True)
                }
                # Merge response-level metadata when the SDK provides it.
                if hasattr(scrape_response, 'metadata') and scrape_response.metadata:
                    metadata.update(scrape_response.metadata)
                return ScrapeResult(
                    success=True,
                    content=content,
                    url=url,
                    method="firecrawl",
                    processing_time=processing_time,
                    metadata=metadata
                )
            # Falls through to the except handler below, producing a
            # failure ScrapeResult rather than propagating.
            raise Exception("No content returned from Firecrawl")
        except Exception as e:
            processing_time = time.time() - start_time
            logger.error(f"β Firecrawl scraping failed for {url}: {e}")
            return ScrapeResult(
                success=False,
                content="",
                url=url,
                method="firecrawl",
                processing_time=processing_time,
                error=str(e)
            )

    async def scrape_with_requests(self, url: str) -> ScrapeResult:
        """Scrape using simple HTTP requests (fallback method).

        NOTE(review): requests.get is synchronous; despite the async
        signature this call blocks the event loop for its duration —
        consider asyncio.to_thread if this runs inside a busy loop.
        """
        start_time = time.time()
        try:
            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.config.request_timeout,
                allow_redirects=True
            )
            response.raise_for_status()
            processing_time = time.time() - start_time

            # Basic HTML-to-text: strip tags and entities, then extract
            # the job-relevant sections.
            content = re.sub(r'<[^>]+>', ' ', response.text)
            content = re.sub(r'&[a-zA-Z0-9#]+;', ' ', content)  # HTML entities
            content = self._extract_job_content(content, url)

            metadata = {
                'status_code': response.status_code,
                'content_type': response.headers.get('content-type', ''),
                'url': url,
                'content_length': len(content)
            }
            return ScrapeResult(
                success=True,
                content=content,
                url=url,
                method="requests",
                processing_time=processing_time,
                metadata=metadata
            )
        except Exception as e:
            processing_time = time.time() - start_time
            logger.error(f"β Requests scraping failed for {url}: {e}")
            return ScrapeResult(
                success=False,
                content="",
                url=url,
                method="requests",
                processing_time=processing_time,
                error=str(e)
            )

    async def scrape_with_selenium(self, url: str) -> ScrapeResult:
        """Scrape using Selenium headless Chrome (for dynamic content).

        Raises:
            Exception: if Selenium is not installed.
        """
        if not SELENIUM_AVAILABLE:
            raise Exception("Selenium not available")

        start_time = time.time()
        driver = None
        try:
            # Headless Chrome configured for containerized environments
            # (--no-sandbox / --disable-dev-shm-usage).
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument(f'--user-agent={self.headers["User-Agent"]}')

            driver = webdriver.Chrome(options=chrome_options)
            driver.set_page_load_timeout(30)
            driver.get(url)

            # Wait until at least the <body> element exists.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            content = self._extract_job_content(
                driver.find_element(By.TAG_NAME, "body").text, url)
            processing_time = time.time() - start_time

            metadata = {
                'title': driver.title,
                'url': driver.current_url,
                'content_length': len(content)
            }
            return ScrapeResult(
                success=True,
                content=content,
                url=url,
                method="selenium",
                processing_time=processing_time,
                metadata=metadata
            )
        except Exception as e:
            processing_time = time.time() - start_time
            logger.error(f"β Selenium scraping failed for {url}: {e}")
            return ScrapeResult(
                success=False,
                content="",
                url=url,
                method="selenium",
                processing_time=processing_time,
                error=str(e)
            )
        finally:
            # Always release the browser, success or failure.
            if driver:
                driver.quit()

    async def scrape_job_posting(self, url: str, prefer_method: Optional[str] = None) -> ScrapeResult:
        """Scrape a job posting with automatic fallback between methods.

        Args:
            url: Job posting URL.
            prefer_method: Optional "firecrawl"/"requests"/"selenium" to
                try first; unavailable methods are skipped.

        Returns:
            ScrapeResult from the first method that yields non-empty
            content, or a failure result describing the last error.
        """
        overall_start = time.time()

        # Validate URL before doing any network work.
        try:
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                raise ValueError("Invalid URL format")
        except Exception as e:
            return ScrapeResult(
                success=False,
                content="",
                url=url,
                method="validation",
                processing_time=time.time() - overall_start,
                error=f"URL validation failed: {e}"
            )

        # Define scraping methods in order of preference
        if prefer_method == "firecrawl" and self.firecrawl_client:
            methods = ["firecrawl", "requests", "selenium"]
        elif prefer_method == "requests":
            methods = ["requests", "firecrawl", "selenium"]
        elif prefer_method == "selenium" and SELENIUM_AVAILABLE:
            methods = ["selenium", "firecrawl", "requests"]
        else:
            # Default order: Firecrawl first (best), then requests, then selenium
            methods = ["firecrawl", "requests", "selenium"]

        # Try each method until one returns non-empty content.
        last_error = None
        for method in methods:
            try:
                logger.info(f"π Trying {method} for {url}")
                if method == "firecrawl" and self.firecrawl_client:
                    result = await self.scrape_with_firecrawl(url)
                elif method == "requests":
                    result = await self.scrape_with_requests(url)
                elif method == "selenium" and SELENIUM_AVAILABLE:
                    result = await self.scrape_with_selenium(url)
                else:
                    logger.warning(f"β οΈ {method} not available, skipping")
                    continue
                if result.success and result.content.strip():
                    logger.info(f"β Successfully scraped with {method}: {len(result.content)} chars")
                    return result
                logger.warning(f"β οΈ {method} returned no content or failed")
                last_error = result.error
            except Exception as e:
                logger.warning(f"β οΈ {method} failed with exception: {e}")
                last_error = str(e)
                continue

        # All methods failed. Report actual elapsed time — the original
        # hard-coded 0.0 here.
        return ScrapeResult(
            success=False,
            content="",
            url=url,
            method="all_failed",
            processing_time=time.time() - overall_start,
            error=f"All scraping methods failed. Last error: {last_error}"
        )

    def get_status(self) -> Dict[str, Any]:
        """Get scraper status and capabilities."""
        return {
            "firecrawl_available": bool(self.firecrawl_client),
            "selenium_available": SELENIUM_AVAILABLE,
            "requests_available": True,
            "preferred_method": "firecrawl" if self.firecrawl_client else "requests",
            "config": {
                "request_timeout": self.config.request_timeout,
                "firecrawl_api_key_set": bool(self.config.firecrawl_api_key)
            }
        }
# Global scraper instance (lazily created singleton)
_scraper: Optional[JobScraper] = None


def get_scraper() -> JobScraper:
    """Return the shared JobScraper, constructing it on first use."""
    global _scraper
    if _scraper is None:
        _scraper = JobScraper()
    return _scraper
async def scrape_job_url(url: str, prefer_method: Optional[str] = None) -> ScrapeResult:
    """Convenience wrapper: scrape *url* through the shared JobScraper."""
    return await get_scraper().scrape_job_posting(url, prefer_method)
# Per-domain scraping hints for common job boards. 'method' is the
# backend that empirically works best for that board; 'indicators' are
# phrases marking the start of the job description on its pages.
JOB_BOARD_PATTERNS = {
    'linkedin.com': {
        'method': 'firecrawl',  # LinkedIn works best with Firecrawl
        'indicators': ['job description', 'about the job', 'show more']
    },
    'indeed.com': {
        'method': 'requests',  # Indeed works well with simple requests
        'indicators': ['job description', 'full job description']
    },
    'glassdoor.com': {
        'method': 'selenium',  # Glassdoor has dynamic content
        'indicators': ['job description', 'job details']
    },
    'lever.co': {
        'method': 'requests',  # Lever is usually simple HTML
        'indicators': ['about the role', 'responsibilities']
    },
    'greenhouse.io': {
        'method': 'requests',  # Greenhouse works with requests
        'indicators': ['job description', 'what you will do']
    }
}


def get_optimal_scraping_method(url: str) -> str:
    """Get the optimal scraping method for a URL based on its domain.

    Returns one of "firecrawl", "requests", or "selenium"; falls back to
    "requests" on any error.
    """
    try:
        domain = urlparse(url).netloc.lower()
        # Drop an explicit port so 'host:443' still matches.
        domain = domain.split(':', 1)[0]
        for pattern, config in JOB_BOARD_PATTERNS.items():
            # Match the registered domain or any of its subdomains.
            # The original used a plain substring test, which
            # false-positives (e.g. 'clever.com' contains 'lever.co').
            if domain == pattern or domain.endswith('.' + pattern):
                return config['method']
        # Default to firecrawl if available, otherwise requests
        scraper = get_scraper()
        status = scraper.get_status()
        if status['firecrawl_available']:
            return 'firecrawl'
        return 'requests'
    except Exception:
        return 'requests'  # Safe fallback
if __name__ == "__main__":
    async def test_scraper():
        """Exercise the scraper end-to-end and print a summary report."""
        scraper = JobScraper()
        print("π§ͺ Testing Job Scraper")
        print("=" * 50)

        # Report the detected capabilities first.
        print("π Scraper Status:")
        for key, value in scraper.get_status().items():
            print(f" {key}: {value}")

        # Then attempt one scrape against a placeholder URL.
        test_url = "https://www.example.com"  # Replace with actual job posting URL for testing
        print(f"\nπ Testing scraper with {test_url}")
        result = await scraper.scrape_job_posting(test_url)

        print(f"β Success: {result.success}")
        print(f"π Method: {result.method}")
        print(f"β±οΈ Time: {result.processing_time:.2f}s")
        print(f"π Content Length: {len(result.content)}")
        if result.error:
            print(f"β Error: {result.error}")
        print("=" * 50)

    # Run test
    asyncio.run(test_scraper())