Spaces:
Sleeping
Sleeping
| """ | |
| Utilities for creating and managing Selenium WebDriver instances. | |
| This module provides reusable functions for browser automation. | |
| """ | |
| import time | |
| import logging | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.service import Service | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.common.exceptions import TimeoutException, WebDriverException | |
| from urllib3.exceptions import ProtocolError | |
| logger = logging.getLogger(__name__) | |
def create_chrome_driver(headless=True, load_images=False, page_load_strategy='eager'):
    """
    Build a Chrome WebDriver with a fixed set of command-line switches and prefs.

    Args:
        headless (bool): When truthy, '--headless' is passed to Chrome
        load_images (bool): When falsy, images are disabled both through a
            Blink setting switch and through the images content-setting pref
        page_load_strategy (str): Assigned to the options' page_load_strategy
            ('normal', 'eager', or 'none')

    Returns:
        webdriver.Chrome: Chrome WebDriver instance created with these options
    """
    options = webdriver.ChromeOptions()

    # Collect the switches first; they are applied in this order.
    switches = ['--headless'] if headless else []
    switches += [
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-extensions',
        '--disable-gpu',
        '--disable-infobars',
        '--disable-notifications',
    ]
    if not load_images:
        switches.append('--blink-settings=imagesEnabled=false')
    for switch in switches:
        options.add_argument(switch)

    options.page_load_strategy = page_load_strategy

    # Images content setting: 2 when images are turned off, 0 otherwise.
    image_setting = 0 if load_images else 2
    # NOTE(review): 'disk-cache-size' looks like the name of a Chrome
    # command-line switch rather than a profile preference -- confirm that
    # this entry has any effect when passed through 'prefs'.
    preferences = {
        'profile.default_content_setting_values.notifications': 2,
        'profile.managed_default_content_settings.images': image_setting,
        'disk-cache-size': 4096,
    }
    options.add_experimental_option('prefs', preferences)

    return webdriver.Chrome(options=options)
def wait_for_page_load(driver, url, timeout=10, retries=3, backoff_factor=2):
    """
    Load a URL with retries and exponential backoff.

    Each attempt sets the page-load timeout, navigates to ``url`` and then
    waits until ``document.readyState`` reports ``'complete'``. When an
    attempt fails with a timeout, WebDriver or protocol error, the function
    sleeps ``backoff_factor ** n`` seconds before retry ``n`` (2s, 4s, 8s, ...
    with the default factor) and tries again.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        url (str): URL to load
        timeout (int): Timeout in seconds, used both as the page load timeout
            and as the limit for the readyState wait
        retries (int): Total number of attempts; values <= 0 perform no
            attempt at all and return False
        backoff_factor (int): Factor to multiply wait time by on each retry

    Returns:
        bool: True once the page reports a complete load, False if every
        attempt failed or an unexpected error occurred
    """
    for attempt in range(retries):
        try:
            driver.set_page_load_timeout(timeout)
            driver.get(url)
            # Wait until the document reports that it has finished loading
            WebDriverWait(driver, timeout).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
            return True
        except (TimeoutException, WebDriverException, ProtocolError) as e:
            if attempt == retries - 1:
                logger.warning(f"Failed to load {url} after {retries} attempts: {str(e)}")
                return False
            # The delay grows geometrically, as the docstring promises:
            # backoff_factor, backoff_factor**2, backoff_factor**3, ...
            wait_time = backoff_factor ** (attempt + 1)
            logger.info(f"Retrying page load for {url} (attempt {attempt + 2}/{retries}) in {wait_time}s")
            time.sleep(wait_time)
        except Exception as e:
            # Anything else is not considered retryable
            logger.error(f"Unexpected error loading {url}: {str(e)}")
            return False
    # Only reached when retries <= 0, i.e. no attempt was made
    return False
def scroll_to_element(driver, element, navbar_offset=100):
    """
    Scroll the page to make an element visible.

    The element is first scrolled into view aligned to the top of the
    viewport, then the page is scrolled back up by ``navbar_offset`` pixels
    so that a fixed navbar does not cover the element.

    This is best-effort: any error raised while scrolling is logged and
    swallowed, and the function returns None either way.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        element: WebElement to scroll to
        navbar_offset (int): Number of pixels to scroll back up after the
            element has been aligned to the top (default 100)
    """
    try:
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        # Adjust to avoid navbar overlay; the offset is passed as a script
        # argument instead of being spliced into the JavaScript source
        driver.execute_script("window.scrollBy(0, arguments[0]);", -navbar_offset)
    except Exception as e:
        logger.error(f"Error scrolling to element: {str(e)}")
def scroll_to_bottom(driver, scroll_pause_time=1.0, num_scrolls=None):
    """
    Repeatedly jump to the bottom of the page until it stops growing.

    After every jump the function pauses, re-reads the body's scroll height
    and stops as soon as the height is unchanged or the scroll limit has
    been reached.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        scroll_pause_time (float): Seconds to sleep after each scroll
        num_scrolls (int, optional): Upper bound on counted scrolls; None
            means no limit

    Returns:
        int: Number of scrolls after which the page height had changed
    """
    height_query = "return document.body.scrollHeight"

    previous_height = driver.execute_script(height_query)
    completed = 0

    # The limit is checked before each scroll, so num_scrolls=0 never scrolls.
    while num_scrolls is None or completed < num_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Pause before measuring the height again
        time.sleep(scroll_pause_time)

        current_height = driver.execute_script(height_query)
        if current_height == previous_height:
            # Height did not change: stop; this last scroll is not counted
            break

        previous_height = current_height
        completed += 1

    return completed