| | """ |
| | Browser automation utilities using Playwright. |
| | Includes anti-detection measures for web automation. |
| | """ |
| | from playwright.sync_api import sync_playwright, Page, Browser, Playwright |
| |
|
| | |
| | try: |
| | from playwright_stealth import stealth_sync |
| | HAS_STEALTH = True |
| | except ImportError: |
| | HAS_STEALTH = False |
| |
|
| |
|
class BrowserManager:
    """
    Manages a Playwright browser instance with anti-detection measures.

    One manager owns at most one Playwright driver, browser, context and
    page at a time. It can be driven manually (``setup()`` / ``close()``)
    or used as a context manager, in which case ``close()`` runs on exit.
    """

    # Chromium command-line switches passed when launching a local browser.
    LAUNCH_ARGS = (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--disable-gpu',
        '--disable-software-rasterizer',
        '--window-size=1920,1080',
    )
    # User agent and viewport applied to the context of a local browser.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    VIEWPORT = {'width': 1920, 'height': 1080}
    # Default timeout set on the page, in milliseconds.
    DEFAULT_TIMEOUT_MS = 20000

    # Teardown order (attribute name, method to call): innermost first.
    _TEARDOWN_STEPS = (
        ('page', 'close'),
        ('context', 'close'),
        ('browser', 'close'),
        ('playwright', 'stop'),
    )

    def __init__(self, headless=False):
        """
        Args:
            headless: Whether a locally launched browser runs headless.
        """
        self.headless = headless
        # All handles stay None until setup() runs; close() resets them.
        self.playwright: "Playwright | None" = None
        self.browser: "Browser | None" = None
        self.context = None
        self.page: "Page | None" = None
        self.is_remote = False

    def setup(self, remote_url=None):
        """
        Initialize browser - local or remote via Browserless.io.

        If this manager already holds a session, that session is closed
        first so its driver and browser are not leaked. If initialization
        fails part-way, everything started so far is released before the
        error is re-raised.

        Args:
            remote_url: WebSocket URL for remote browser (e.g., Browserless.io)
                If None (or empty), launches local browser.

        Returns:
            Page object
        """
        if self.playwright is not None:
            self.close()

        self.playwright = sync_playwright().start()
        try:
            if remote_url:
                self._connect_remote(remote_url)
            else:
                self._launch_local()
            self._prepare_page()
        except BaseException:
            # Do not leave a half-initialized driver/browser behind; the
            # original error is the one worth reporting, so cleanup errors
            # are dropped here.
            self._teardown()
            raise

        return self.page

    def _connect_remote(self, remote_url):
        """Attach to a remote browser over CDP, reusing its context/page if any."""
        self.is_remote = True
        self.browser = self.playwright.chromium.connect_over_cdp(remote_url)
        contexts = self.browser.contexts
        self.context = contexts[0] if contexts else self.browser.new_context()
        pages = self.context.pages
        self.page = pages[0] if pages else self.context.new_page()

    def _launch_local(self):
        """Launch a local Chromium with a fresh context and page."""
        self.is_remote = False
        self.browser = self.playwright.chromium.launch(
            headless=self.headless,
            args=list(self.LAUNCH_ARGS),
        )
        self.context = self.browser.new_context(
            user_agent=self.USER_AGENT,
            viewport=dict(self.VIEWPORT),
            java_script_enabled=True,
        )
        self.page = self.context.new_page()

    def _prepare_page(self):
        """Apply the anti-detection tweaks and default timeout to self.page."""
        # playwright_stealth is optional; HAS_STEALTH records whether the
        # import succeeded.
        if HAS_STEALTH:
            stealth_sync(self.page)

        # Make navigator.webdriver read as undefined in scripts on this page.
        self.page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
        """)

        self.page.set_default_timeout(self.DEFAULT_TIMEOUT_MS)

    def _teardown(self):
        """
        Release every held resource and reset the handles to None.

        Each step is attempted even if an earlier one fails, so the
        Playwright driver is always stopped.

        Returns:
            List of exceptions raised by individual steps (empty on success).
        """
        errors = []
        for attr, method in self._TEARDOWN_STEPS:
            resource = getattr(self, attr)
            # Clear the handle first so a repeated close() is a no-op.
            setattr(self, attr, None)
            if resource is None:
                continue
            try:
                getattr(resource, method)()
            except Exception as exc:
                errors.append(exc)
        return errors

    def close(self):
        """
        Clean up browser resources.

        Safe to call more than once. All resources are released even if one
        of them fails to close.

        Raises:
            Exception: The first error raised while closing a resource,
                re-raised after the remaining resources were released.
        """
        errors = self._teardown()
        if errors:
            raise errors[0]

    def __enter__(self):
        """Context manager entry; runs setup() unless a page is already open."""
        # A manager that was already set up (e.g. against a remote URL) must
        # not be re-initialized as a local browser.
        if self.page is None:
            self.setup()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit; always releases the browser resources."""
        if exc_type is None:
            self.close()
        else:
            # An exception is already propagating from the with-body; do not
            # replace it with a cleanup error.
            self._teardown()
| |
|
| |
|
def find_element_with_fallback(page: "Page", selectors: list, timeout: int = 5000):
    """
    Try multiple selectors until one succeeds.

    Selectors are tried in order; each one is given up to ``timeout``
    milliseconds for its first match to become visible. A selector that
    matches several elements counts as a success (its first match is used).

    Args:
        page: Playwright page object
        selectors: List of CSS/XPath selectors. A single string is treated
            as a one-element list.
        timeout: Timeout per selector attempt in milliseconds

    Returns:
        Locator for the first matching element of the first selector that
        became visible, None if no selector did.
    """
    # Iterating a bare string would try each character as a selector.
    if isinstance(selectors, str):
        selectors = [selectors]

    for selector in selectors:
        try:
            # Locators are strict: waiting on one that matches several
            # elements raises, which used to be mistaken for "not found".
            # Narrowing to the first match avoids that.
            candidate = page.locator(selector).first
            candidate.wait_for(timeout=timeout, state='visible')
        except Exception:
            # Deliberately best-effort: a timeout or a selector the page
            # rejects just means "try the next one".
            continue
        return candidate
    return None
| |
|
| |
|
def scroll_to_bottom(page: "Page", max_scrolls: int = 10, wait_time: int = 2000,
                     settle_time: int = 1000):
    """
    Scroll to bottom of page to load dynamic content.

    Repeatedly scrolls to the current bottom and waits, stopping early once
    ``document.body.scrollHeight`` no longer grows between two scrolls.
    Afterwards the page is scrolled back to the top.

    Args:
        page: Playwright page object
        max_scrolls: Maximum number of scroll attempts
        wait_time: Wait time between scrolls in milliseconds
        settle_time: Wait time after scrolling back to the top, in
            milliseconds (defaults to the previously hard-coded 1000)
    """
    height_js = "document.body.scrollHeight"
    previous_height = page.evaluate(height_js)

    for _ in range(max_scrolls):
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(wait_time)

        current_height = page.evaluate(height_js)
        # No growth since the last scroll: nothing more was loaded.
        if current_height == previous_height:
            break
        previous_height = current_height

    # Return to the top of the page and wait before handing control back.
    page.evaluate("window.scrollTo(0, 0)")
    page.wait_for_timeout(settle_time)
| |
|
| |
|
def create_browser(headless: bool = False, remote_url=None):
    """
    Factory function to create a browser manager.

    The returned manager is in one of two states, depending on remote_url:

    * remote_url given: setup() has already been called against that URL,
      so the manager's page is ready to use. Do not call .setup() again
      unless a new session is wanted.
    * remote_url omitted (or empty): nothing has been started yet; call
      .setup() or use the manager in a ``with`` block to launch a local
      browser.

    Args:
        headless: Run in headless mode (default: False for debugging)
        remote_url: WebSocket URL for remote browser (e.g., Browserless.io)

    Returns:
        BrowserManager instance
    """
    manager = BrowserManager(headless=headless)
    if remote_url:
        manager.setup(remote_url=remote_url)
    return manager
| |
|