# InsuCompass-API / scripts / run_crawler.py
# (Provenance: committed by nagur-shareef-shaik, commit "Add Application Code", cd6f412)
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from insucompass.config import settings
from insucompass.services.database import setup_database, initialize_crawl_jobs
from scripts.data_processing.crawler import crawl_with_requests, crawl_with_selenium
# Configure module-wide logging: level is driven by application settings,
# format prefixes each record with a timestamp and severity.
logging.basicConfig(level=settings.LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def setup_selenium_driver():
    """Create and return a headless Chrome WebDriver, or None on failure.

    Returns:
        A configured ``webdriver.Chrome`` instance ready for crawling, or
        ``None`` if driver setup raised any exception (the error is logged
        so the caller can decide how to proceed).
    """
    logger.info("Setting up Selenium WebDriver...")
    try:
        chrome_options = webdriver.ChromeOptions()
        flags = (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            # Government sites frequently present SSL certificate problems;
            # ignore them rather than abort the crawl.
            "--ignore-certificate-errors",
        )
        for flag in flags:
            chrome_options.add_argument(flag)
        chrome_driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=chrome_options,
        )
        logger.info("WebDriver setup complete.")
        return chrome_driver
    except Exception as e:
        logger.error(f"Failed to setup Selenium driver: {e}")
        return None
def main():
    """Run every active crawl job configured in ``settings.CRAWLING_JOBS``.

    Prepares the database and seed URLs, lazily starts a Selenium driver
    only when at least one active job requires it, dispatches each active
    job to the matching crawler, and always tears the driver down at the end.
    """
    logger.info("--- Starting InsuCompass AI Data Acquisition ---")

    # Ensure the schema exists and the starting URLs are queued.
    setup_database()
    initialize_crawl_jobs()

    # Pay the WebDriver startup cost only if an active job needs Selenium.
    needs_selenium = any(
        job.get('method') == 'selenium_crawl' and job.get('status') == 'active'
        for job in settings.CRAWLING_JOBS
    )
    driver = setup_selenium_driver() if needs_selenium else None

    try:
        for job in settings.CRAWLING_JOBS:
            # Guard clause: ignore anything not marked active.
            if job.get('status') != 'active':
                logger.info(f"--- Skipping inactive job: {job['name']} ---")
                continue

            logger.info(f"--- Processing job: {job['name']} ---")
            method = job['method']
            if method == 'selenium_crawl':
                if driver is None:
                    # Driver setup failed earlier (setup returns None on error).
                    logger.error(f"Selenium method required for {job['name']} but driver failed to initialize. Skipping.")
                else:
                    crawl_with_selenium(driver, job)
            elif method == 'requests_crawl':
                crawl_with_requests(job)
            else:
                logger.warning(f"Method '{job['method']}' not implemented for job {job['name']}. Skipping.")
    finally:
        # Release the browser even if a crawler raised.
        if driver:
            driver.quit()
            logger.info("Selenium WebDriver closed.")
        logger.info("--- Data Acquisition Process Finished ---")
if __name__ == "__main__":
main()