# News-Scraper/src/utils/webdriver_utils.py
# Nishitha03's picture
# Upload 15 files
# dd99def verified
"""
Utilities for creating and managing Selenium WebDriver instances.
This module provides reusable functions for browser automation.
"""
import time
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from urllib3.exceptions import ProtocolError
logger = logging.getLogger(__name__)
def create_chrome_driver(headless=True, load_images=False, page_load_strategy='eager'):
    """
    Build a Chrome WebDriver from a fixed set of command-line flags and prefs.

    Args:
        headless (bool): When truthy, the '--headless' flag is added.
        load_images (bool): When falsy, image loading is switched off both
            through a Blink setting flag and through the images content pref.
        page_load_strategy (str): Value assigned to the options'
            ``page_load_strategy`` ('normal', 'eager', or 'none').

    Returns:
        webdriver.Chrome: Driver constructed with the assembled options.
    """
    options = webdriver.ChromeOptions()

    # Collect the flags first so the conditional ones sit in their original
    # positions relative to the unconditional ones.
    flags = []
    if headless:
        flags.append('--headless')
    flags.extend([
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-extensions',
        '--disable-gpu',
        '--disable-infobars',
        '--disable-notifications',
    ])
    if not load_images:
        flags.append('--blink-settings=imagesEnabled=false')
    for flag in flags:
        options.add_argument(flag)

    options.page_load_strategy = page_load_strategy

    # 2 is written when images are disabled, 0 when they are enabled.
    image_setting = 0 if load_images else 2
    # NOTE(review): 'disk-cache-size' looks like a command-line switch name
    # rather than a profile preference - confirm it has any effect here.
    preferences = {
        'profile.default_content_setting_values.notifications': 2,
        'profile.managed_default_content_settings.images': image_setting,
        'disk-cache-size': 4096
    }
    options.add_experimental_option('prefs', preferences)

    return webdriver.Chrome(options=options)
def wait_for_page_load(driver, url, timeout=10, retries=3, backoff_factor=2):
    """
    Load a URL with retries and exponential backoff.

    Each attempt sets the driver's page-load timeout, navigates to ``url``
    and then waits (up to ``timeout`` seconds) for ``document.readyState``
    to become ``'complete'``.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        url (str): URL to load
        timeout (int): Page load timeout in seconds; also used as the limit
            for the readyState wait
        retries (int): Total number of attempts; zero or less means no
            attempt is made and False is returned
        backoff_factor (int): Base of the exponential delay. The pause after
            the k-th failed attempt (1-based) is ``backoff_factor ** k``
            seconds, e.g. 2s, 4s, 8s for the default factor of 2

    Returns:
        bool: True as soon as one attempt completes; False if every attempt
        failed or an unexpected exception was raised (unexpected exceptions
        are logged and not retried)
    """
    for attempt in range(retries):
        try:
            driver.set_page_load_timeout(timeout)
            driver.get(url)
            # Wait for DOM to be ready
            WebDriverWait(driver, timeout).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
            return True
        except (TimeoutException, WebDriverException, ProtocolError) as e:
            if attempt == retries - 1:
                logger.warning(f"Failed to load {url} after {retries} attempts: {str(e)}")
                return False
            # Exponential growth, as the contract above states: each retry
            # waits backoff_factor times longer than the previous one.
            wait_time = backoff_factor ** (attempt + 1)
            logger.info(f"Retrying page load for {url} (attempt {attempt + 2}/{retries}) in {wait_time}s")
            time.sleep(wait_time)
        except Exception as e:
            # Anything outside the known transient errors is not retried.
            logger.error(f"Unexpected error loading {url}: {str(e)}")
            return False
    # Only reached when retries <= 0 and the loop body never ran.
    return False
def scroll_to_element(driver, element):
    """
    Bring an element into view, then nudge the page up by 100 pixels.

    Any exception raised while running the scripts is logged at error level
    and swallowed; the function always returns None.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        element: WebElement to scroll to
    """
    # (script, positional arguments) pairs, executed in order. The second
    # step backs off 100px to avoid a navbar overlaying the element.
    steps = (
        ("arguments[0].scrollIntoView(true);", (element,)),
        ("window.scrollBy(0, -100);", ()),
    )
    try:
        for script, script_args in steps:
            driver.execute_script(script, *script_args)
    except Exception as exc:
        logger.error(f"Error scrolling to element: {str(exc)}")
def scroll_to_bottom(driver, scroll_pause_time=1.0, num_scrolls=None):
    """
    Repeatedly jump to the bottom of the page until its height stops growing.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        scroll_pause_time (float): Seconds to sleep after each scroll before
            re-reading the page height
        num_scrolls (int, optional): Upper bound on the number of counted
            scrolls; None means no bound

    Returns:
        int: Number of scrolls after which the page height changed. The
        final scroll that found no change is not counted.
    """
    height_script = "return document.body.scrollHeight"

    previous_height = driver.execute_script(height_script)
    completed = 0

    # The limit is checked before every scroll, so num_scrolls=0 performs none.
    while num_scrolls is None or completed < num_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Pause before measuring the height again
        time.sleep(scroll_pause_time)

        current_height = driver.execute_script(height_script)
        if current_height == previous_height:
            # Height unchanged: stop scrolling
            break

        previous_height = current_height
        completed += 1

    return completed