# ai-apply / browser_utils.py (Hugging Face; sk31415; 477bd93)
"""
Browser automation utilities using Playwright.
Includes anti-detection measures for web automation.
"""
from playwright.sync_api import sync_playwright, Page, Browser, Playwright
# Try to import stealth plugin if available
try:
from playwright_stealth import stealth_sync
HAS_STEALTH = True
except ImportError:
HAS_STEALTH = False
class BrowserManager:
    """
    Manages Playwright browser instance with anti-detection measures.

    Can be driven explicitly (``setup()`` ... ``close()``) or used as a
    context manager, in which case ``close()`` runs on exit.
    """

    def __init__(self, headless=False):
        """
        Store configuration; nothing is started until setup() is called.

        Args:
            headless: Forwarded to chromium.launch() when a local browser
                is launched.
        """
        self.headless = headless
        # Every handle is None until setup() succeeds, and None again
        # after close().
        self.playwright: Playwright = None
        self.browser: Browser = None
        self.context = None
        self.page: Page = None
        self.is_remote = False

    def setup(self, remote_url=None):
        """
        Initialize browser - local or remote via Browserless.io.

        If this manager already holds a session, it is closed first so
        that calling setup() again does not leak the previous handles.
        If initialization fails part-way, whatever was started is released
        before the error is re-raised.

        Args:
            remote_url: WebSocket URL for remote browser (e.g., Browserless.io)
                If None, launches local browser.

        Returns:
            Page object
        """
        if self.playwright is not None:
            self.close()

        self.playwright = sync_playwright().start()
        try:
            if remote_url:
                # Connect to remote browser (Browserless.io)
                self.is_remote = True
                self.browser = self.playwright.chromium.connect_over_cdp(remote_url)
                # Reuse the remote browser's first context/page when present,
                # otherwise create them.
                existing_contexts = self.browser.contexts
                self.context = (
                    existing_contexts[0]
                    if existing_contexts
                    else self.browser.new_context()
                )
                existing_pages = self.context.pages
                self.page = (
                    existing_pages[0]
                    if existing_pages
                    else self.context.new_page()
                )
            else:
                # Launch local browser with stealth options
                self.is_remote = False
                self.browser = self.playwright.chromium.launch(
                    headless=self.headless,
                    args=[
                        '--no-sandbox',
                        '--disable-dev-shm-usage',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-gpu',
                        '--disable-software-rasterizer',
                        '--window-size=1920,1080',
                    ]
                )
                # Create context with custom user agent
                self.context = self.browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                    viewport={'width': 1920, 'height': 1080},
                    java_script_enabled=True,
                )
                self.page = self.context.new_page()
                # Apply stealth if available (local only)
                if HAS_STEALTH:
                    stealth_sync(self.page)
                # Remove webdriver property (local only)
                self.page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined
                    });
                """)
            # Set default timeout (equivalent to WebDriverWait 20 seconds)
            self.page.set_default_timeout(20000)
        except BaseException:
            # Release whatever was started. A secondary cleanup failure is
            # deliberately ignored so the original error is the one raised.
            try:
                self.close()
            except Exception:
                pass
            raise
        return self.page

    def close(self):
        """
        Clean up browser resources.

        Every held handle is attempted (page, context, browser, then the
        Playwright driver) even if an earlier step fails, and all handles
        are reset to None so that calling close() again is a no-op. If any
        step raised, the first such error is re-raised after the remaining
        steps have run.
        """
        steps = (
            (self.page, 'close'),
            (self.context, 'close'),
            (self.browser, 'close'),
            (self.playwright, 'stop'),
        )
        # Drop the references up front: the handles must not be reused even
        # if one of the calls below fails.
        self.page = None
        self.context = None
        self.browser = None
        self.playwright = None

        first_error = None
        for handle, method_name in steps:
            if not handle:
                continue
            try:
                getattr(handle, method_name)()
            except Exception as exc:
                if first_error is None:
                    first_error = exc
        if first_error is not None:
            raise first_error

    def __enter__(self):
        """
        Context manager entry.

        Runs setup() only when no page is held yet, so a manager that was
        already initialized (for example with a remote URL) keeps its
        existing session.
        """
        if self.page is None:
            self.setup()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: release all browser resources."""
        self.close()
def find_element_with_fallback(page: Page, selectors: list, timeout: int = 5000):
    """
    Try multiple selectors until one succeeds.

    Each selector is resolved with ``page.locator(selector).first`` and
    waited on until visible. Waiting on ``.first`` means a selector that
    matches several elements is still accepted instead of being rejected
    as ambiguous and silently skipped.

    Args:
        page: Playwright page object
        selectors: List of CSS/XPath selectors. A single selector string is
            treated as a one-element list; None or an empty list yields None.
        timeout: Timeout per selector attempt in milliseconds

    Returns:
        Locator for the first match of the first selector that became
        visible, None otherwise
    """
    if not selectors:
        return None
    if isinstance(selectors, str):
        # Iterating a bare string would try each character as a selector.
        selectors = [selectors]

    for selector in selectors:
        try:
            candidate = page.locator(selector).first
            candidate.wait_for(timeout=timeout, state='visible')
        except Exception:
            # Best-effort lookup: any failure for this selector (timeout,
            # invalid selector, ...) just moves on to the next fallback.
            continue
        return candidate
    return None
def scroll_to_bottom(page: Page, max_scrolls: int = 10, wait_time: int = 2000,
                     settle_time: int = 1000):
    """
    Scroll to bottom of page to load dynamic content.

    Repeatedly scrolls to the current bottom, waits, and re-reads
    ``document.body.scrollHeight``; stops as soon as the height no longer
    changes or after ``max_scrolls`` attempts. Afterwards the page is
    scrolled back to the top.

    Args:
        page: Playwright page object
        max_scrolls: Maximum number of scroll attempts
        wait_time: Wait time between scrolls in milliseconds
        settle_time: Wait time in milliseconds after scrolling back to the
            top. Values <= 0 skip the wait.
    """
    read_height_js = "document.body.scrollHeight"
    jump_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight)"

    last_height = page.evaluate(read_height_js)
    for _ in range(max_scrolls):
        page.evaluate(jump_to_bottom_js)
        page.wait_for_timeout(wait_time)
        new_height = page.evaluate(read_height_js)
        if new_height == last_height:
            # Height is stable: nothing more was loaded by the last scroll.
            break
        last_height = new_height

    # Scroll back to top
    page.evaluate("window.scrollTo(0, 0)")
    if settle_time > 0:
        page.wait_for_timeout(settle_time)
def create_browser(headless=False, remote_url=None):
    """
    Build a BrowserManager, optionally connecting it to a remote browser.

    Args:
        headless: Headless flag handed to the BrowserManager constructor
            (default: False for debugging)
        remote_url: WebSocket URL for remote browser (e.g., Browserless.io)

    Returns:
        BrowserManager instance. When remote_url is given, setup() has
        already been called with it; otherwise setup() has not been called
        and the caller must invoke .setup() to initialize.
    """
    browser_manager = BrowserManager(headless=headless)
    if not remote_url:
        # Local case: hand back the un-initialized manager.
        return browser_manager
    browser_manager.setup(remote_url=remote_url)
    return browser_manager