"""Playwright-based HTML fetchers with retries and basic anti-bot settings."""

import asyncio
import random
from typing import Optional

from playwright.async_api import async_playwright

# User-agent strings presented to the site for each supported engine.
_CHROME_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
_FIREFOX_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
)
_WEBKIT_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/16.0 Safari/605.1.15"
)

# Command-line flags shared by every Chromium launch in this module.
# NOTE(review): '--disable-web-security' relaxes browser security checks and
# does not look necessary just to read a page's HTML -- confirm before removing.
_CHROMIUM_ARGS = (
    "--no-sandbox",
    "--disable-blink-features=AutomationControlled",
    "--disable-dev-shm-usage",
    "--disable-web-security",
    "--disable-features=VizDisplayCompositor",
)

# A page whose stripped HTML is not longer than this is treated as a failed
# fetch and retried.
_MIN_HTML_CHARS = 500

# Script injected before any page script runs; overrides a few navigator
# properties that automation-detection code commonly inspects.
_STEALTH_INIT_SCRIPT = """
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined,
    });
    window.chrome = {
        runtime: {},
    };
    Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5],
    });
    Object.defineProperty(navigator, 'languages', {
        get: () => ['ar', 'en-US', 'en'],
    });
"""


def _accept_html(html, label, attempt):
    """Return ``html`` if it is long enough to count as a real page, else None.

    Logs an ``[OK]`` line with a short preview on success and a ``[WARN]``
    line when the content is too short. ``label`` is the text printed right
    after ``[OK]`` (the URL, optionally followed by the browser name).
    """
    if len(html.strip()) > _MIN_HTML_CHARS:
        preview = html[:300].replace("\n", " ")
        print(f"[OK] {label} (Attempt {attempt}) | Preview: {preview[:150]}...")
        return html
    print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
    return None


def _log_bad_status(response, attempt):
    """Log a warning for a missing response or a non-200 status code."""
    status = response.status if response else "No response"
    print(f"[WARN] Status {status} on attempt {attempt}")


async def _pause_before_retry(attempt, max_retries, min_delay, max_delay):
    """Sleep a random interval in [min_delay, max_delay] unless this was the last attempt."""
    if attempt < max_retries:
        wait_time = random.uniform(min_delay, max_delay)
        print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
        await asyncio.sleep(wait_time)


async def _close_browser(browser):
    """Close ``browser``, logging instead of raising if the close itself fails.

    A failure here must not hide the error that caused the attempt to fail,
    nor throw away HTML that was already fetched successfully.
    """
    try:
        await browser.close()
    except Exception as exc:
        print(f"[WARN] Failed to close browser cleanly: {exc}")


async def _launch_browser(playwright, browser_type, headless):
    """Launch the requested engine and return ``(browser, user_agent)``.

    Any value other than 'firefox' or 'webkit' (case-insensitive) falls back
    to Chromium.
    """
    engine = browser_type.lower()
    if engine == "firefox":
        browser = await playwright.firefox.launch(headless=headless)
        return browser, _FIREFOX_USER_AGENT
    if engine == "webkit":
        browser = await playwright.webkit.launch(headless=headless)
        return browser, _WEBKIT_USER_AGENT
    browser = await playwright.chromium.launch(
        headless=headless,
        args=list(_CHROMIUM_ARGS),
    )
    return browser, _CHROME_USER_AGENT


async def fetch_html(url, max_retries=5, min_delay=1, max_delay=3,
                     headless=True, timeout=30000) -> Optional[str]:
    """
    Fetch HTML using Playwright (Chromium) with retries and anti-bot measures.

    Each attempt starts a fresh Playwright session and browser. An attempt
    succeeds only when the response status is 200 and the stripped HTML is
    longer than 500 characters; otherwise the attempt is logged and retried
    after a random delay. Exceptions raised during an attempt are logged and
    treated as a failed attempt.

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        *_CHROMIUM_ARGS,
                        f"--user-agent={_CHROME_USER_AGENT}",
                    ],
                )
                try:
                    context = await browser.new_context(
                        user_agent=_CHROME_USER_AGENT,
                        locale="ar-SA",
                        timezone_id="Asia/Riyadh",
                        viewport={"width": 1920, "height": 1080},
                        extra_http_headers={
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                            "Accept-Language": "ar,en-US;q=0.9,en;q=0.8",
                            "Accept-Encoding": "gzip, deflate, br",
                            "DNT": "1",
                            "Connection": "keep-alive",
                            "Upgrade-Insecure-Requests": "1",
                            "Referer": "https://shamela.ws/",
                        },
                    )
                    page = await context.new_page()
                    await page.add_init_script(_STEALTH_INIT_SCRIPT)

                    response = await page.goto(
                        url, wait_until="domcontentloaded", timeout=timeout
                    )
                    if response and response.status == 200:
                        # Give late-running scripts a moment to populate the DOM.
                        await page.wait_for_timeout(random.randint(1000, 3000))
                        html = await page.content()
                        if _accept_html(html, url, attempt) is not None:
                            return html
                    else:
                        _log_bad_status(response, attempt)
                finally:
                    # Runs on success, on a rejected page, and on exceptions.
                    await _close_browser(browser)
        except Exception as e:
            print(f"[ERROR] Attempt {attempt}: {e}")

        await _pause_before_retry(attempt, max_retries, min_delay, max_delay)

    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None


async def fetch_html_with_js_execution(url, max_retries=5, min_delay=1, max_delay=3,
                                       headless=True, timeout=30000,
                                       wait_for_js=True) -> Optional[str]:
    """
    Enhanced version that can execute JavaScript and wait for dynamic content.

    Navigation waits for the 'networkidle' state. When ``wait_for_js`` is
    true, the function additionally waits (best effort, up to 10 s) for the
    network to go idle again and then sleeps a random 2-5 s before reading
    the page content.

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds
        wait_for_js (bool): Whether to wait for JavaScript to execute

    Returns:
        str or None: HTML content as string or None if failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with JS execution (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=headless,
                    args=list(_CHROMIUM_ARGS),
                )
                try:
                    context = await browser.new_context(
                        user_agent=_CHROME_USER_AGENT,
                        locale="ar-SA",
                        timezone_id="Asia/Riyadh",
                        viewport={"width": 1920, "height": 1080},
                        java_script_enabled=True,
                    )
                    page = await context.new_page()

                    response = await page.goto(
                        url, wait_until="networkidle", timeout=timeout
                    )
                    if response and response.status == 200:
                        if wait_for_js:
                            try:
                                await page.wait_for_load_state(
                                    "networkidle", timeout=10000
                                )
                            except Exception:
                                # Best effort: continue even if the network
                                # never idles. Only Exception is caught so
                                # task cancellation and KeyboardInterrupt
                                # still propagate.
                                pass
                            # NOTE(review): the extra settle delay is treated
                            # as part of the wait_for_js branch -- confirm.
                            await page.wait_for_timeout(random.randint(2000, 5000))

                        html = await page.content()
                        if _accept_html(html, url, attempt) is not None:
                            return html
                    else:
                        _log_bad_status(response, attempt)
                finally:
                    # Runs on success, on a rejected page, and on exceptions.
                    await _close_browser(browser)
        except Exception as e:
            print(f"[ERROR] Attempt {attempt}: {e}")

        await _pause_before_retry(attempt, max_retries, min_delay, max_delay)

    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None


async def fetch_html_with_browser(url, browser_type='chromium', max_retries=5,
                                  min_delay=1, max_delay=3, headless=True,
                                  timeout=30000) -> Optional[str]:
    """
    Fetch HTML using different browser types (chromium, firefox, webkit).

    ``browser_type`` is matched case-insensitively; any value other than
    'firefox' or 'webkit' falls back to Chromium. Success criteria and retry
    behaviour match ``fetch_html``: status 200 and more than 500 characters
    of stripped HTML, with a random delay between attempts.

    Args:
        url (str): The URL to fetch
        browser_type (str): Browser type ('chromium', 'firefox', 'webkit')
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with {browser_type} (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                # Resolved inside the try block so that a bad browser_type
                # value is logged as a failed attempt rather than raised.
                browser, user_agent = await _launch_browser(
                    p, browser_type, headless
                )
                try:
                    context = await browser.new_context(
                        user_agent=user_agent,
                        locale="ar-SA",
                        timezone_id="Asia/Riyadh",
                        viewport={"width": 1920, "height": 1080},
                    )
                    page = await context.new_page()

                    response = await page.goto(
                        url, wait_until="domcontentloaded", timeout=timeout
                    )
                    if response and response.status == 200:
                        # Give late-running scripts a moment to populate the DOM.
                        await page.wait_for_timeout(random.randint(1000, 3000))
                        html = await page.content()
                        label = f"{url} with {browser_type}"
                        if _accept_html(html, label, attempt) is not None:
                            return html
                    else:
                        _log_bad_status(response, attempt)
                finally:
                    # Runs on success, on a rejected page, and on exceptions.
                    await _close_browser(browser)
        except Exception as e:
            print(f"[ERROR] Attempt {attempt} with {browser_type}: {e}")

        await _pause_before_retry(attempt, max_retries, min_delay, max_delay)

    print(f"[FAIL] Could not fetch {url} with {browser_type} after {max_retries} attempts.")
    return None