""" Utilities for creating and managing Selenium WebDriver instances. This module provides reusable functions for browser automation. """ import time import logging from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException, WebDriverException from urllib3.exceptions import ProtocolError logger = logging.getLogger(__name__) def create_chrome_driver(headless=True, load_images=False, page_load_strategy='eager'): """ Create and configure a Chrome WebDriver instance with optimized settings. Args: headless (bool): Whether to run Chrome in headless mode load_images (bool): Whether to load images page_load_strategy (str): Page load strategy ('normal', 'eager', or 'none') Returns: webdriver.Chrome: Configured Chrome WebDriver instance """ chrome_options = webdriver.ChromeOptions() if headless: chrome_options.add_argument('--headless') # Common performance optimizations chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--disable-extensions') chrome_options.add_argument('--disable-gpu') chrome_options.add_argument('--disable-infobars') chrome_options.add_argument('--disable-notifications') if not load_images: chrome_options.add_argument('--blink-settings=imagesEnabled=false') chrome_options.page_load_strategy = page_load_strategy # Performance preferences chrome_options.add_experimental_option('prefs', { 'profile.default_content_setting_values.notifications': 2, 'profile.managed_default_content_settings.images': 2 if not load_images else 0, 'disk-cache-size': 4096 }) return webdriver.Chrome(options=chrome_options) def wait_for_page_load(driver, url, timeout=10, retries=3, backoff_factor=2): """ Load a URL with retries and exponential backoff. Args: driver (webdriver.Chrome): WebDriver instance url (str): URL to load timeout (int): Page load timeout in seconds retries (int): Number of retry attempts backoff_factor (int): Factor to multiply wait time by on each retry Returns: bool: Whether page load was successful """ for attempt in range(retries): try: driver.set_page_load_timeout(timeout) driver.get(url) # Wait for DOM to be ready WebDriverWait(driver, timeout).until( lambda d: d.execute_script('return document.readyState') == 'complete' ) return True except (TimeoutException, WebDriverException, ProtocolError) as e: if attempt == retries - 1: logger.warning(f"Failed to load {url} after {retries} attempts: {str(e)}") return False else: wait_time = backoff_factor * (attempt + 1) logger.info(f"Retrying page load for {url} (attempt {attempt + 2}/{retries}) in {wait_time}s") time.sleep(wait_time) continue except Exception as e: logger.error(f"Unexpected error loading {url}: {str(e)}") return False return False def scroll_to_element(driver, element): """ Scroll the page to make an element visible. Args: driver (webdriver.Chrome): WebDriver instance element: WebElement to scroll to """ try: driver.execute_script("arguments[0].scrollIntoView(true);", element) driver.execute_script("window.scrollBy(0, -100);") # Adjust to avoid navbar overlay except Exception as e: logger.error(f"Error scrolling to element: {str(e)}") def scroll_to_bottom(driver, scroll_pause_time=1.0, num_scrolls=None): """ Scroll to the bottom of the page incrementally. Args: driver (webdriver.Chrome): WebDriver instance scroll_pause_time (float): Time to pause between scrolls num_scrolls (int, optional): Maximum number of scrolls to perform """ # Get scroll height last_height = driver.execute_script("return document.body.scrollHeight") scrolls_performed = 0 while True: # Check if we've reached the scroll limit if num_scrolls is not None and scrolls_performed >= num_scrolls: break # Scroll down to bottom driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # Wait to load page time.sleep(scroll_pause_time) # Calculate new scroll height and compare with last scroll height new_height = driver.execute_script("return document.body.scrollHeight") if new_height == last_height: break last_height = new_height scrolls_performed += 1 return scrolls_performed