#!/usr/bin/env python3
"""
IQKiller Scrape Microservice
Firecrawl integration for superior web scraping (95%+ success rate)
Handles job posting extraction from URLs with fallback methods
"""

import asyncio
import logging
import time
import re
import requests
from typing import Optional, Dict, Any, List
from dataclasses import dataclass
from urllib.parse import urlparse

# Third-party imports (with fallbacks)
try:
    import firecrawl
    from firecrawl import FirecrawlApp
    FIRECRAWL_AVAILABLE = True
except ImportError:
    FIRECRAWL_AVAILABLE = False

try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    SELENIUM_AVAILABLE = True
except ImportError:
    SELENIUM_AVAILABLE = False

# Local imports
from config import get_config

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class ScrapeResult:
    """Result from web scraping operation"""
    success: bool            # True when content was extracted
    content: str             # cleaned job-posting text (empty on failure)
    url: str                 # URL that was scraped
    method: str              # which scraper produced this result
    processing_time: float   # wall-clock seconds spent on this attempt
    error: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None


class JobScraper:
    """Advanced job posting scraper with multiple methods.

    Tries Firecrawl (best success rate) first, then plain HTTP requests,
    then Selenium for dynamic pages, falling through automatically.
    """

    def __init__(self, config=None):
        """Initialize scraper with configuration.

        Args:
            config: optional config object; defaults to ``get_config()``.
                Must expose ``firecrawl_api_key`` and ``request_timeout``.
        """
        self.config = config or get_config()

        # Initialize Firecrawl if available
        self.firecrawl_client = None
        if FIRECRAWL_AVAILABLE and self.config.firecrawl_api_key:
            try:
                self.firecrawl_client = FirecrawlApp(api_key=self.config.firecrawl_api_key)
                logger.info("✅ Firecrawl client initialized")
            except Exception as e:
                logger.warning(f"⚠️ Failed to initialize Firecrawl: {e}")

        # Common headers for requests (desktop Chrome UA to avoid bot blocks)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

    def _clean_content(self, content: str) -> str:
        """Clean and normalize scraped content.

        Collapses whitespace and strips common boilerplate (navigation
        labels, social links, cookie notices).
        """
        if not content:
            return ""

        # Remove excessive whitespace
        content = re.sub(r'\s+', ' ', content)

        # Remove common navigation elements
        content = re.sub(r'(Skip to main content|Navigation|Menu|Footer|Header)', '', content, flags=re.IGNORECASE)

        # Remove social media links
        content = re.sub(r'(Follow us on|Share on|Like us on) \w+', '', content, flags=re.IGNORECASE)

        # Remove cookie notices
        content = re.sub(r'(We use cookies|This site uses cookies|Cookie policy).*?(?=\.|$)', '', content, flags=re.IGNORECASE)

        # Clean up extra spaces
        content = content.strip()

        return content

    def _extract_job_content(self, content: str, url: str) -> str:
        """Extract job-specific content from page.

        Searches for sections headed by common job-posting phrases; when
        none are found, returns the whole page cleaned instead.
        """
        # Job posting indicators
        job_indicators = [
            r'job description',
            r'responsibilities',
            r'requirements',
            r'qualifications',
            r'what you.{0,20}ll do',
            r'about the role',
            r'position summary',
            r'job summary'
        ]

        # Find job content sections.  The alternation is loop-invariant, so
        # build it once instead of re-joining on every iteration.
        indicator_alternation = "|".join(job_indicators)
        job_content_parts = []
        for indicator in job_indicators:
            pattern = re.compile(
                rf'({indicator}.*?)(?=(?:{indicator_alternation})|$)',
                flags=re.IGNORECASE | re.DOTALL
            )
            matches = pattern.findall(content)
            job_content_parts.extend(matches)

        if job_content_parts:
            # Join all job-related sections
            job_content = " ".join(job_content_parts)
            return self._clean_content(job_content)

        # Fallback: return cleaned full content
        return self._clean_content(content)

    async def scrape_with_firecrawl(self, url: str) -> ScrapeResult:
        """Scrape using Firecrawl (primary method).

        Raises:
            Exception: if the Firecrawl client was never initialized.
        """
        if not self.firecrawl_client:
            raise Exception("Firecrawl client not available")

        start_time = time.time()

        try:
            # Use Firecrawl to scrape the page (v1 API format)
            scrape_response = self.firecrawl_client.scrape_url(
                url,
                formats=['markdown', 'html'],
                includeTags=['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li'],
                excludeTags=['nav', 'footer', 'header', 'aside', 'script', 'style'],
                timeout=30000,
                waitFor=3000  # Wait for dynamic content
            )

            processing_time = time.time() - start_time

            if scrape_response and hasattr(scrape_response, 'markdown') and scrape_response.markdown:
                content = scrape_response.markdown
                content = self._extract_job_content(content, url)

                metadata = {
                    'title': getattr(scrape_response, 'title', ''),
                    'description': getattr(scrape_response, 'description', ''),
                    'url': url,
                    'content_length': len(content),
                    'success': getattr(scrape_response, 'success', True)
                }

                # Add metadata if available
                if hasattr(scrape_response, 'metadata') and scrape_response.metadata:
                    metadata.update(scrape_response.metadata)

                return ScrapeResult(
                    success=True,
                    content=content,
                    url=url,
                    method="firecrawl",
                    processing_time=processing_time,
                    metadata=metadata
                )
            else:
                raise Exception("No content returned from Firecrawl")

        except Exception as e:
            processing_time = time.time() - start_time
            logger.error(f"❌ Firecrawl scraping failed for {url}: {e}")
            return ScrapeResult(
                success=False,
                content="",
                url=url,
                method="firecrawl",
                processing_time=processing_time,
                error=str(e)
            )

    async def scrape_with_requests(self, url: str) -> ScrapeResult:
        """Scrape using simple HTTP requests (fallback method).

        NOTE(review): ``requests.get`` blocks the event loop; acceptable for
        this microservice's single-request flow, but worth offloading to a
        thread if concurrency is added.
        """
        start_time = time.time()

        try:
            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.config.request_timeout,
                allow_redirects=True
            )
            response.raise_for_status()

            processing_time = time.time() - start_time

            # Extract text content (basic HTML parsing)
            content = response.text

            # Remove HTML tags (basic cleaning)
            content = re.sub(r'<[^>]+>', ' ', content)
            content = re.sub(r'&[a-zA-Z0-9#]+;', ' ', content)  # HTML entities

            content = self._extract_job_content(content, url)

            metadata = {
                'status_code': response.status_code,
                'content_type': response.headers.get('content-type', ''),
                'url': url,
                'content_length': len(content)
            }

            return ScrapeResult(
                success=True,
                content=content,
                url=url,
                method="requests",
                processing_time=processing_time,
                metadata=metadata
            )

        except Exception as e:
            processing_time = time.time() - start_time
            logger.error(f"❌ Requests scraping failed for {url}: {e}")
            return ScrapeResult(
                success=False,
                content="",
                url=url,
                method="requests",
                processing_time=processing_time,
                error=str(e)
            )

    async def scrape_with_selenium(self, url: str) -> ScrapeResult:
        """Scrape using Selenium (for dynamic content).

        Raises:
            Exception: if Selenium is not installed.
        """
        if not SELENIUM_AVAILABLE:
            raise Exception("Selenium not available")

        start_time = time.time()
        driver = None

        try:
            # Setup Chrome options (headless, container-friendly flags)
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument(f'--user-agent={self.headers["User-Agent"]}')

            driver = webdriver.Chrome(options=chrome_options)
            driver.set_page_load_timeout(30)

            # Load the page
            driver.get(url)

            # Wait for content to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Get page content
            content = driver.find_element(By.TAG_NAME, "body").text
            content = self._extract_job_content(content, url)

            processing_time = time.time() - start_time

            metadata = {
                'title': driver.title,
                'url': driver.current_url,
                'content_length': len(content)
            }

            return ScrapeResult(
                success=True,
                content=content,
                url=url,
                method="selenium",
                processing_time=processing_time,
                metadata=metadata
            )

        except Exception as e:
            processing_time = time.time() - start_time
            logger.error(f"❌ Selenium scraping failed for {url}: {e}")
            return ScrapeResult(
                success=False,
                content="",
                url=url,
                method="selenium",
                processing_time=processing_time,
                error=str(e)
            )
        finally:
            # Always release the browser, even on failure
            if driver:
                driver.quit()

    async def scrape_job_posting(self, url: str, prefer_method: Optional[str] = None) -> ScrapeResult:
        """Scrape job posting with automatic fallback methods.

        Args:
            url: job-posting URL to scrape.
            prefer_method: optionally try "firecrawl", "requests" or
                "selenium" first; unavailable methods are skipped.

        Returns:
            ScrapeResult from the first method that produced content, or a
            failure result carrying the last error seen.
        """
        overall_start = time.time()

        # Validate URL
        try:
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                raise ValueError("Invalid URL format")
        except Exception as e:
            return ScrapeResult(
                success=False,
                content="",
                url=url,
                method="validation",
                processing_time=0.0,
                error=f"URL validation failed: {e}"
            )

        # Define scraping methods in order of preference
        methods = []
        if prefer_method == "firecrawl" and self.firecrawl_client:
            methods = ["firecrawl", "requests", "selenium"]
        elif prefer_method == "requests":
            methods = ["requests", "firecrawl", "selenium"]
        elif prefer_method == "selenium" and SELENIUM_AVAILABLE:
            methods = ["selenium", "firecrawl", "requests"]
        else:
            # Default order: Firecrawl first (best), then requests, then selenium
            methods = ["firecrawl", "requests", "selenium"]

        # Try each method until one succeeds
        last_error = None
        for method in methods:
            try:
                logger.info(f"🔄 Trying {method} for {url}")

                if method == "firecrawl" and self.firecrawl_client:
                    result = await self.scrape_with_firecrawl(url)
                elif method == "requests":
                    result = await self.scrape_with_requests(url)
                elif method == "selenium" and SELENIUM_AVAILABLE:
                    result = await self.scrape_with_selenium(url)
                else:
                    logger.warning(f"⚠️ {method} not available, skipping")
                    continue

                if result.success and result.content.strip():
                    logger.info(f"✅ Successfully scraped with {method}: {len(result.content)} chars")
                    return result
                else:
                    logger.warning(f"⚠️ {method} returned no content or failed")
                    last_error = result.error

            except Exception as e:
                logger.warning(f"⚠️ {method} failed with exception: {e}")
                last_error = str(e)
                continue

        # All methods failed.  Report the real elapsed time instead of 0.0
        # so callers/metrics see how long the full fallback chain took.
        return ScrapeResult(
            success=False,
            content="",
            url=url,
            method="all_failed",
            processing_time=time.time() - overall_start,
            error=f"All scraping methods failed. Last error: {last_error}"
        )

    def get_status(self) -> Dict[str, Any]:
        """Get scraper status and capabilities"""
        return {
            "firecrawl_available": bool(self.firecrawl_client),
            "selenium_available": SELENIUM_AVAILABLE,
            "requests_available": True,
            "preferred_method": "firecrawl" if self.firecrawl_client else "requests",
            "config": {
                "request_timeout": self.config.request_timeout,
                "firecrawl_api_key_set": bool(self.config.firecrawl_api_key)
            }
        }


# Global scraper instance
_scraper: Optional[JobScraper] = None


def get_scraper() -> JobScraper:
    """Get global scraper instance (lazily created singleton)."""
    global _scraper
    if _scraper is None:
        _scraper = JobScraper()
    return _scraper


async def scrape_job_url(url: str, prefer_method: Optional[str] = None) -> ScrapeResult:
    """Convenience function to scrape a job URL"""
    scraper = get_scraper()
    return await scraper.scrape_job_posting(url, prefer_method)


# Common job board URL patterns for optimization
JOB_BOARD_PATTERNS = {
    'linkedin.com': {
        'method': 'firecrawl',  # LinkedIn works best with Firecrawl
        'indicators': ['job description', 'about the job', 'show more']
    },
    'indeed.com': {
        'method': 'requests',  # Indeed works well with simple requests
        'indicators': ['job description', 'full job description']
    },
    'glassdoor.com': {
        'method': 'selenium',  # Glassdoor has dynamic content
        'indicators': ['job description', 'job details']
    },
    'lever.co': {
        'method': 'requests',  # Lever is usually simple HTML
        'indicators': ['about the role', 'responsibilities']
    },
    'greenhouse.io': {
        'method': 'requests',  # Greenhouse works with requests
        'indicators': ['job description', 'what you will do']
    }
}


def get_optimal_scraping_method(url: str) -> str:
    """Get optimal scraping method based on URL domain.

    Falls back to the globally preferred method, then to 'requests' on any
    error (safe default that needs no optional dependencies).
    """
    try:
        domain = urlparse(url).netloc.lower()

        for pattern, board_config in JOB_BOARD_PATTERNS.items():
            if pattern in domain:
                return board_config['method']

        # Default to firecrawl if available, otherwise requests
        scraper = get_scraper()
        status = scraper.get_status()
        if status['firecrawl_available']:
            return 'firecrawl'
        else:
            return 'requests'

    except Exception:
        return 'requests'  # Safe fallback


if __name__ == "__main__":
    async def test_scraper():
        """Test the scraper functionality"""
        scraper = JobScraper()

        print("🧪 Testing Job Scraper")
        print("=" * 50)

        # Print status
        status = scraper.get_status()
        print("📊 Scraper Status:")
        for key, value in status.items():
            print(f"  {key}: {value}")

        # Test with a sample URL (if provided)
        test_url = "https://www.example.com"  # Replace with actual job posting URL for testing

        print(f"\n🔄 Testing scraper with {test_url}")
        result = await scraper.scrape_job_posting(test_url)

        print(f"✅ Success: {result.success}")
        print(f"📝 Method: {result.method}")
        print(f"⏱️ Time: {result.processing_time:.2f}s")
        print(f"📊 Content Length: {len(result.content)}")
        if result.error:
            print(f"❌ Error: {result.error}")

        print("=" * 50)

    # Run test
    asyncio.run(test_scraper())