from playwright.async_api import async_playwright import asyncio import random import logging from typing import Optional, List logger = logging.getLogger(__name__) # Your WebShare proxies WEBSHARE_PROXIES = [ "198.23.239.134:6540:zvubytfw:ak6yit5k2tvj", "207.244.217.165:6712:zvubytfw:ak6yit5k2tvj", "107.172.163.27:6543:zvubytfw:ak6yit5k2tvj", "161.123.152.115:6360:zvubytfw:ak6yit5k2tvj", "23.94.138.75:6349:zvubytfw:ak6yit5k2tvj", "216.10.27.159:6837:zvubytfw:ak6yit5k2tvj", "136.0.207.84:6661:zvubytfw:ak6yit5k2tvj", "64.64.118.149:6732:zvubytfw:ak6yit5k2tvj", "142.147.128.93:6593:zvubytfw:ak6yit5k2tvj", "154.36.110.199:6853:zvubytfw:ak6yit5k2tvj" ] # Track proxy failures proxy_failures = {} def get_random_proxy() -> List[str]: """Get a random proxy from the list, avoiding those with failures""" available_proxies = [p for p in WEBSHARE_PROXIES if proxy_failures.get(p, 0) < 3] if not available_proxies: # Reset failures if all proxies have failed for proxy in WEBSHARE_PROXIES: proxy_failures[proxy] = 0 available_proxies = WEBSHARE_PROXIES return random.choice(available_proxies) def mark_proxy_failure(proxy_str: str) -> None: """Mark a proxy as failing""" proxy_failures[proxy_str] = proxy_failures.get(proxy_str, 0) + 1 logger.warning(f"Marked proxy as failed: {proxy_str} (failure count: {proxy_failures[proxy_str]})") if proxy_failures[proxy_str] >= 3: logger.warning(f"Proxy {proxy_str} has failed multiple times, will not use for 5 minutes") asyncio.create_task(reset_proxy_after_delay(proxy_str)) async def reset_proxy_after_delay(proxy_str: str) -> None: """Reset a proxy's failure count after a delay""" await asyncio.sleep(300) # 5 minutes if proxy_str in proxy_failures: proxy_failures[proxy_str] = 0 logger.info(f"Reset failure count for proxy: {proxy_str}") async def fetch_page_with_browser(url: str, user_agent: str) -> Optional[str]: """Fetch a page using Playwright with a proxy""" logger.info(f"Requesting URL with browser: {url}") # Try up to 2 different proxies for attempt in range(2): proxy_str = get_random_proxy() ip, port, username, password = proxy_str.split(':') logger.info(f"Using proxy {ip}:{port} (attempt {attempt+1})") try: async with async_playwright() as p: browser = await p.chromium.launch( headless=True, proxy={ "server": f"http://{ip}:{port}", "username": username, "password": password } ) # Create context with realistic settings context = await browser.new_context( viewport={"width": 1920, "height": 1080}, user_agent=user_agent ) # Apply stealth mode await context.add_init_script(""" Object.defineProperty(navigator, 'webdriver', { get: () => false, }); """) # Create page and navigate page = await context.new_page() response = await page.goto(url, wait_until="networkidle", timeout=30000) if response and response.status in [200, 202]: # Wait a bit for any dynamic content to load await asyncio.sleep(3) # Get the page HTML html = await page.content() # Check if we got proper content if len(html) > 5000 and ("