# Spaces: HydraBolt / SanadLLM (status: Sleeping)
# File size: 13,170 Bytes



import random
import asyncio
from playwright.async_api import async_playwright
import random

async def fetch_html(url, max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000):
    """
    Fetch HTML using Playwright (Chromium) with retries and anti-bot measures.

    Every attempt starts a fresh Playwright session and browser. An attempt
    succeeds only when navigation returns HTTP status 200 and the page HTML
    is longer than 500 characters once surrounding whitespace is stripped.
    Any other outcome (different status, no response, short HTML, or any
    exception) is logged with ``print`` and retried after a random delay.

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of attempts; values below 1 make
            the function return None without launching a browser
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if every attempt failed
    """
    # Single source of truth for the user agent: it is passed both as a
    # Chromium launch flag and as the browser-context user agent.
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    # Loop-invariant init script: overrides navigator.webdriver, defines
    # window.chrome, and fakes navigator.plugins / navigator.languages.
    stealth_script = """
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined,
                    });
                    
                    window.chrome = {
                        runtime: {},
                    };
                    
                    Object.defineProperty(navigator, 'plugins', {
                        get: () => [1, 2, 3, 4, 5],
                    });
                    
                    Object.defineProperty(navigator, 'languages', {
                        get: () => ['ar', 'en-US', 'en'],
                    });
                """

    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} (Attempt {attempt}/{max_retries})")

        try:
            async with async_playwright() as p:
                # Launch browser with realistic settings.
                # NOTE(review): '--disable-web-security' relaxes Chromium's
                # same-origin checks; confirm it is really required here.
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        '--no-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor',
                        f'--user-agent={user_agent}',
                    ]
                )

                # try/finally guarantees the browser is closed on every path:
                # success, soft failure (bad status / short HTML) and errors.
                try:
                    # Create context with realistic settings
                    context = await browser.new_context(
                        user_agent=user_agent,
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                        extra_http_headers={
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                            "Accept-Language": "ar,en-US;q=0.9,en;q=0.8",
                            "Accept-Encoding": "gzip, deflate, br",
                            "DNT": "1",
                            "Connection": "keep-alive",
                            "Upgrade-Insecure-Requests": "1",
                            "Referer": "https://shamela.ws/",
                        }
                    )

                    # Create page and hide webdriver traces
                    page = await context.new_page()
                    await page.add_init_script(stealth_script)

                    # Navigate to the page
                    response = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)

                    if response and response.status == 200:
                        # Wait a bit (1-3 s, randomized) for any dynamic content
                        await page.wait_for_timeout(random.randint(1000, 3000))

                        html = await page.content()

                        # Very short documents are treated as failed loads.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html

                        print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()

        except Exception as e:
            # Best-effort fetch: log and fall through to the retry logic.
            print(f"[ERROR] Attempt {attempt}: {e}")

        # Wait before retrying (no wait after the final attempt)
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)

    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None


async def fetch_html_with_js_execution(url, max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000, wait_for_js=True):
    """
    Enhanced version that can execute JavaScript and wait for dynamic content.

    Navigation waits for the 'networkidle' load state. When ``wait_for_js``
    is true, the function additionally waits (up to 10 s) for the network to
    be idle again and then sleeps a random 2-5 s before reading the HTML.
    An attempt succeeds only on HTTP status 200 with more than 500 characters
    of stripped HTML; every other outcome is logged with ``print`` and
    retried after a random delay.

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of attempts; values below 1 make
            the function return None without launching a browser
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds
        wait_for_js (bool): Whether to wait for JavaScript to execute

    Returns:
        str or None: HTML content as string or None if every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with JS execution (Attempt {attempt}/{max_retries})")

        try:
            async with async_playwright() as p:
                # NOTE(review): '--disable-web-security' relaxes Chromium's
                # same-origin checks; confirm it is really required here.
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        '--no-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor'
                    ]
                )

                # try/finally guarantees the browser is closed on every path:
                # success, soft failure (bad status / short HTML) and errors.
                try:
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (X11; Linux x86_64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36"
                        ),
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                        java_script_enabled=True
                    )

                    page = await context.new_page()

                    # Navigate and wait for network to be idle
                    response = await page.goto(url, wait_until='networkidle', timeout=timeout)

                    if response and response.status == 200:
                        if wait_for_js:
                            # Best-effort extra idle wait. Catch Exception only:
                            # a bare `except:` would also swallow
                            # asyncio.CancelledError and KeyboardInterrupt and
                            # make this coroutine ignore cancellation.
                            try:
                                await page.wait_for_load_state('networkidle', timeout=10000)
                            except Exception as idle_error:
                                # Continue even if network doesn't idle
                                print(f"[WARN] Network did not become idle: {idle_error}")

                            # Additional wait for any remaining dynamic content
                            await page.wait_for_timeout(random.randint(2000, 5000))

                        html = await page.content()

                        # Very short documents are treated as failed loads.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html

                        print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()

        except Exception as e:
            # Best-effort fetch: log and fall through to the retry logic.
            print(f"[ERROR] Attempt {attempt}: {e}")

        # Wait before retrying (no wait after the final attempt)
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)

    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None


async def fetch_html_with_browser(url, browser_type='chromium', max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000):
    """
    Fetch HTML using different browser types (chromium, firefox, webkit).

    ``browser_type`` is matched case-insensitively; any value other than
    'firefox' or 'webkit' falls back to Chromium. An attempt succeeds only
    on HTTP status 200 with more than 500 characters of stripped HTML; every
    other outcome (including any exception) is logged with ``print`` and
    retried after a random delay.

    Args:
        url (str): The URL to fetch
        browser_type (str): Browser type ('chromium', 'firefox', 'webkit')
        max_retries (int): Maximum number of attempts; values below 1 make
            the function return None without launching a browser
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with {browser_type} (Attempt {attempt}/{max_retries})")

        try:
            async with async_playwright() as p:
                # Normalized once per attempt; kept inside the try so that a
                # non-string browser_type is logged and retried, not raised.
                engine = browser_type.lower()

                # Get the appropriate browser and a matching user agent
                if engine == 'firefox':
                    browser = await p.firefox.launch(headless=headless)
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
                elif engine == 'webkit':
                    browser = await p.webkit.launch(headless=headless)
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15"
                else:  # default to chromium
                    # NOTE(review): '--disable-web-security' relaxes Chromium's
                    # same-origin checks; confirm it is really required here.
                    browser = await p.chromium.launch(
                        headless=headless,
                        args=[
                            '--no-sandbox',
                            '--disable-blink-features=AutomationControlled',
                            '--disable-dev-shm-usage',
                            '--disable-web-security',
                            '--disable-features=VizDisplayCompositor'
                        ]
                    )
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

                # try/finally guarantees the browser is closed on every path:
                # success, soft failure (bad status / short HTML) and errors.
                try:
                    context = await browser.new_context(
                        user_agent=user_agent,
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080}
                    )

                    page = await context.new_page()

                    # Navigate to the page
                    response = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)

                    if response and response.status == 200:
                        # Wait a bit (1-3 s, randomized) for any dynamic content
                        await page.wait_for_timeout(random.randint(1000, 3000))

                        html = await page.content()

                        # Very short documents are treated as failed loads.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} with {browser_type} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html

                        print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()

        except Exception as e:
            # Best-effort fetch: log and fall through to the retry logic.
            print(f"[ERROR] Attempt {attempt} with {browser_type}: {e}")

        # Wait before retrying (no wait after the final attempt)
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)

    print(f"[FAIL] Could not fetch {url} with {browser_type} after {max_retries} attempts.")
    return None