Spaces:

Nishitha03
/

News-Scraper

Sleeping

App Files Files Community

News-Scraper / src /utils /webdriver_utils.py

Nishitha03

Upload 15 files

dd99def verified 4 months ago

raw

history blame contribute delete

5.2 kB

	"""
	Utilities for creating and managing Selenium WebDriver instances.
	This module provides reusable functions for browser automation.
	"""

	import time
	import logging
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.common.exceptions import TimeoutException, WebDriverException
	from urllib3.exceptions import ProtocolError

	# Logger for this module, named via __name__ so it slots into the
	# importing application's logging hierarchy.
	logger = logging.getLogger(__name__)

	def create_chrome_driver(headless=True, load_images=False, page_load_strategy='eager'):
	    """
	    Build a Chrome WebDriver configured for lightweight page scraping.

	    Args:
	        headless (bool): Add the '--headless' flag when True.
	        load_images (bool): When False, images are disabled through both a
	            Blink setting and a profile preference; when True neither
	            restriction is applied.
	        page_load_strategy (str): Assigned to the options object's
	            page_load_strategy ('normal', 'eager', or 'none').

	    Returns:
	        webdriver.Chrome: A newly started driver. This function never quits
	        the driver; shutting it down is left to the caller.
	    """
	    chrome_options = webdriver.ChromeOptions()

	    if headless:
	        chrome_options.add_argument('--headless')

	    # Flags applied on every run, in the same order as before.
	    common_flags = (
	        '--no-sandbox',
	        '--disable-dev-shm-usage',
	        '--disable-extensions',
	        '--disable-gpu',
	        '--disable-infobars',
	        '--disable-notifications',
	    )
	    for flag in common_flags:
	        chrome_options.add_argument(flag)

	    if not load_images:
	        chrome_options.add_argument('--blink-settings=imagesEnabled=false')

	    chrome_options.page_load_strategy = page_load_strategy

	    # Profile preferences: 2 blocks the content type, 0 leaves the default.
	    # NOTE(review): 'disk-cache-size' is usually a Chrome command-line
	    # switch rather than a profile preference -- confirm this key has any
	    # effect before relying on it. It is kept unchanged here.
	    preferences = {
	        'profile.default_content_setting_values.notifications': 2,
	        'profile.managed_default_content_settings.images': 0 if load_images else 2,
	        'disk-cache-size': 4096,
	    }
	    chrome_options.add_experimental_option('prefs', preferences)

	    return webdriver.Chrome(options=chrome_options)

	def wait_for_page_load(driver, url, timeout=10, retries=3, backoff_factor=2):
	    """
	    Load a URL, retrying with exponential backoff when the load fails.

	    Each attempt sets the page-load timeout, navigates to the URL and then
	    waits until the document reports readyState 'complete'.

	    Args:
	        driver (webdriver.Chrome): WebDriver instance
	        url (str): URL to load
	        timeout (int): Seconds used both for the page-load timeout and for
	            the readyState wait
	        retries (int): Total number of attempts; 0 performs no attempt
	        backoff_factor (int): Base delay in seconds. The pause before the
	            next attempt is backoff_factor * 2 ** attempt, i.e. it doubles
	            after every failure (2s, 4s, 8s, ... with the default).

	    Returns:
	        bool: True as soon as one attempt completes; False when every
	        attempt fails or an unexpected exception is raised.
	    """
	    for attempt in range(retries):
	        try:
	            driver.set_page_load_timeout(timeout)
	            driver.get(url)

	            # Block until the browser reports the document fully loaded.
	            WebDriverWait(driver, timeout).until(
	                lambda d: d.execute_script('return document.readyState') == 'complete'
	            )
	            return True

	        except (TimeoutException, WebDriverException, ProtocolError) as e:
	            # Last attempt: give up without sleeping.
	            if attempt == retries - 1:
	                logger.warning(f"Failed to load {url} after {retries} attempts: {e}")
	                return False

	            # Exponential backoff: the delay doubles on each failed attempt.
	            wait_time = backoff_factor * (2 ** attempt)
	            logger.info(f"Retrying page load for {url} (attempt {attempt + 2}/{retries}) in {wait_time}s")
	            time.sleep(wait_time)

	        except Exception as e:
	            # Anything outside the known driver/transport errors is not retried.
	            logger.error(f"Unexpected error loading {url}: {e}")
	            return False

	    # Reached only when retries <= 0, so no attempt was made.
	    return False

	def scroll_to_element(driver, element):
	    """
	    Scroll the page so that an element is brought into view.

	    The element is aligned to the top of the viewport and the page is then
	    moved back up by 100 pixels. Any exception raised by the driver is
	    logged and suppressed, so the call is best-effort.

	    Args:
	        driver (webdriver.Chrome): WebDriver instance
	        element: WebElement to scroll to

	    Returns:
	        None
	    """
	    try:
	        driver.execute_script("arguments[0].scrollIntoView(true);", element)
	        # Back off by 100px so a fixed navbar does not cover the element.
	        driver.execute_script("window.scrollBy(0, -100);")
	    except Exception as e:
	        logger.error(f"Error scrolling to element: {e}")

	def scroll_to_bottom(driver, scroll_pause_time=1.0, num_scrolls=None):
	    """
	    Scroll to the bottom of the page repeatedly until it stops growing.

	    After each scroll the function pauses, re-reads
	    document.body.scrollHeight and stops once the height is unchanged or
	    the scroll limit has been reached.

	    Args:
	        driver (webdriver.Chrome): WebDriver instance
	        scroll_pause_time (float): Seconds to sleep after each scroll so
	            newly requested content has time to load
	        num_scrolls (int, optional): Upper bound on counted scrolls; None
	            means keep going until the height stops changing

	    Returns:
	        int: Number of scrolls that increased the page height. The final
	        scroll that finds no growth is not counted.
	    """
	    last_height = driver.execute_script("return document.body.scrollHeight")
	    scrolls_performed = 0

	    # The limit is checked before every scroll, so num_scrolls=0 never scrolls.
	    while num_scrolls is None or scrolls_performed < num_scrolls:
	        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

	        # Give the page time to load whatever the scroll triggered.
	        time.sleep(scroll_pause_time)

	        new_height = driver.execute_script("return document.body.scrollHeight")
	        if new_height == last_height:
	            # Nothing new was appended: the real bottom has been reached.
	            break

	        last_height = new_height
	        scrolls_performed += 1

	    return scrolls_performed