import logging from selenium import webdriver from selenium.webdriver.chrome.service import Service as ChromeService from webdriver_manager.chrome import ChromeDriverManager from insucompass.config import settings from insucompass.services.database import setup_database, initialize_crawl_jobs from scripts.data_processing.crawler import crawl_with_requests, crawl_with_selenium # Configure logging logging.basicConfig(level=settings.LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) def setup_selenium_driver(): """Initializes a headless Chrome WebDriver.""" logger.info("Setting up Selenium WebDriver...") try: options = webdriver.ChromeOptions() options.add_argument("--headless") options.add_argument("--no-sandbox") options.add_argument("--disable-dev-shm-usage") options.add_argument("--disable-gpu") options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") # This will handle SSL certificate issues often seen with government sites options.add_argument('--ignore-certificate-errors') driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options) logger.info("WebDriver setup complete.") return driver except Exception as e: logger.error(f"Failed to setup Selenium driver: {e}") return None def main(): """Main function to run the data crawling jobs.""" logger.info("--- Starting InsuCompass AI Data Acquisition ---") # Setup DB and initialize starting URLs setup_database() initialize_crawl_jobs() driver = None try: # Initialize Selenium driver only if needed if any(job.get('method') == 'selenium_crawl' and job.get('status') == 'active' for job in settings.CRAWLING_JOBS): driver = setup_selenium_driver() for job in settings.CRAWLING_JOBS: if job.get('status') != 'active': logger.info(f"--- Skipping inactive job: {job['name']} ---") continue logger.info(f"--- Processing job: {job['name']} ---") if job['method'] == 'selenium_crawl': if driver: crawl_with_selenium(driver, job) else: logger.error(f"Selenium method required for {job['name']} but driver failed to initialize. Skipping.") elif job['method'] == 'requests_crawl': crawl_with_requests(job) else: logger.warning(f"Method '{job['method']}' not implemented for job {job['name']}. Skipping.") finally: if driver: driver.quit() logger.info("Selenium WebDriver closed.") logger.info("--- Data Acquisition Process Finished ---") if __name__ == "__main__": main()