File size: 5,199 Bytes
dd99def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Utilities for creating and managing Selenium WebDriver instances.
This module provides reusable functions for browser automation.
"""

import time
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from urllib3.exceptions import ProtocolError

logger = logging.getLogger(__name__)

def create_chrome_driver(headless=True, load_images=False, page_load_strategy='eager'):
    """
    Build a Chrome WebDriver from a fixed set of command-line switches and prefs.

    Args:
        headless (bool): When truthy, '--headless' is passed to Chrome
        load_images (bool): When falsy, images are turned off through both a
            blink setting switch and the managed images content-setting pref
        page_load_strategy (str): Assigned to the options object as-is
            ('normal', 'eager', or 'none')

    Returns:
        webdriver.Chrome: Driver constructed from the assembled options
    """
    options = webdriver.ChromeOptions()

    # Collect switches first so the order they are handed to Chrome is
    # easy to read: optional headless flag, the common set, optional image flag.
    switches = ['--headless'] if headless else []
    switches += [
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-extensions',
        '--disable-gpu',
        '--disable-infobars',
        '--disable-notifications',
    ]
    if not load_images:
        switches.append('--blink-settings=imagesEnabled=false')

    for switch in switches:
        options.add_argument(switch)

    options.page_load_strategy = page_load_strategy

    # Content-setting value for images: 2 when images are not wanted, 0 otherwise.
    image_setting = 0 if load_images else 2
    prefs = {
        'profile.default_content_setting_values.notifications': 2,
        'profile.managed_default_content_settings.images': image_setting,
        'disk-cache-size': 4096,
    }
    options.add_experimental_option('prefs', prefs)

    return webdriver.Chrome(options=options)

def wait_for_page_load(driver, url, timeout=10, retries=3, backoff_factor=2):
    """
    Load a URL with retries and exponential backoff.

    Each attempt sets the driver's page-load timeout, navigates to ``url`` and
    then waits up to ``timeout`` seconds for ``document.readyState`` to equal
    ``'complete'``.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        url (str): URL to load
        timeout (int): Timeout in seconds, used for both the navigation and
            the readiness wait
        retries (int): Total number of attempts; if below 1 no attempt is made
        backoff_factor (int): Factor the wait time is multiplied by on each
            retry. The pause after the n-th failed attempt (1-based) is
            ``backoff_factor ** n`` seconds (2s, 4s, 8s, ... for the default).

    Returns:
        bool: True once the page reports it is completely loaded; False if all
            attempts failed, an unexpected error occurred, or no attempt was
            made
    """
    for attempt in range(retries):
        try:
            driver.set_page_load_timeout(timeout)
            driver.get(url)

            # Block until the document reports it has finished loading
            WebDriverWait(driver, timeout).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
            return True

        except (TimeoutException, WebDriverException, ProtocolError) as e:
            # Retryable failures: give up only once the last attempt is used
            if attempt == retries - 1:
                logger.warning("Failed to load %s after %s attempts: %s", url, retries, e)
                return False

            # Exponential growth, as documented: the delay is multiplied by
            # backoff_factor after every failed attempt.
            wait_time = backoff_factor ** (attempt + 1)
            logger.info(
                "Retrying page load for %s (attempt %s/%s) in %ss",
                url, attempt + 2, retries, wait_time,
            )
            time.sleep(wait_time)

        except Exception as e:
            # Anything else is not retried
            logger.error("Unexpected error loading %s: %s", url, e)
            return False

    # Only reached when retries < 1, i.e. the loop body never ran
    logger.warning("Did not attempt to load %s: retries=%s", url, retries)
    return False

def scroll_to_element(driver, element):
    """
    Bring an element into view, then nudge the page up by 100 pixels.

    Any exception raised while running the scripts is logged and swallowed,
    so the call is best-effort and always returns None.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        element: WebElement to scroll to
    """
    # (script, positional script arguments) pairs, executed in order
    steps = (
        ("arguments[0].scrollIntoView(true);", (element,)),
        ("window.scrollBy(0, -100);", ()),  # Adjust to avoid navbar overlay
    )
    try:
        for script, script_args in steps:
            driver.execute_script(script, *script_args)
    except Exception as exc:
        logger.error(f"Error scrolling to element: {exc!s}")

def scroll_to_bottom(driver, scroll_pause_time=1.0, num_scrolls=None):
    """
    Repeatedly jump to the bottom of the page until it stops growing.

    Args:
        driver (webdriver.Chrome): WebDriver instance
        scroll_pause_time (float): Seconds to sleep after each scroll
        num_scrolls (int, optional): Upper bound on counted scrolls; None
            means keep going until the page height stops changing

    Returns:
        int: Number of scrolls after which the page height changed (a final
            scroll that leaves the height unchanged is not counted)
    """
    height_query = "return document.body.scrollHeight"

    known_height = driver.execute_script(height_query)
    completed = 0

    # Keep going while no limit is set or the limit has not been reached
    while num_scrolls is None or completed < num_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Pause before measuring the page height again
        time.sleep(scroll_pause_time)

        current_height = driver.execute_script(height_query)
        if current_height == known_height:
            # Height unchanged: nothing more was loaded
            break

        known_height = current_height
        completed += 1

    return completed