|
|
|
|
|
|
|
|
import random |
|
|
import asyncio |
|
|
from playwright.async_api import async_playwright |
|
|
import random |
|
|
|
|
|
async def fetch_html(url, max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000):
    """
    Fetch a page's HTML with headless Chromium (Playwright), retrying on failure.

    Every attempt starts a fresh Playwright session and Chromium instance,
    opens a context with a fixed desktop Chrome user agent, the ``ar-SA``
    locale, the ``Asia/Riyadh`` timezone and a set of browser-like request
    headers, and installs an init script that overrides a few ``navigator``
    properties before navigating.

    An attempt succeeds only when the navigation response has status 200 and
    the rendered HTML is longer than 500 characters after stripping. Any other
    outcome (other status, short content, or an exception) counts as a failed
    attempt and is retried after a random pause.

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of attempts; values <= 0 perform no
            attempt at all
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if every attempt failed
    """
    # Single source of truth for the UA: it is passed both as a Chromium
    # command-line flag and as the context-level user agent.
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} (Attempt {attempt}/{max_retries})")

        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        '--no-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        # NOTE(review): this turns off the same-origin policy
                        # for the whole browser; confirm it is really needed.
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor',
                        f'--user-agent={user_agent}',
                    ],
                )

                # From here on the browser exists, so it must be closed on
                # every exit path, including exceptions and the early return.
                try:
                    context = await browser.new_context(
                        user_agent=user_agent,
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                        extra_http_headers={
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                            "Accept-Language": "ar,en-US;q=0.9,en;q=0.8",
                            "Accept-Encoding": "gzip, deflate, br",
                            "DNT": "1",
                            "Connection": "keep-alive",
                            "Upgrade-Insecure-Requests": "1",
                            "Referer": "https://shamela.ws/",
                        },
                    )

                    page = await context.new_page()

                    # Runs in the page before any site script: overrides the
                    # navigator.webdriver/plugins/languages getters and
                    # defines window.chrome.
                    await page.add_init_script("""
                        Object.defineProperty(navigator, 'webdriver', {
                            get: () => undefined,
                        });

                        window.chrome = {
                            runtime: {},
                        };

                        Object.defineProperty(navigator, 'plugins', {
                            get: () => [1, 2, 3, 4, 5],
                        });

                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['ar', 'en-US', 'en'],
                        });
                    """)

                    response = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)

                    if response and response.status == 200:
                        # Random 1-3 s pause after DOMContentLoaded before
                        # taking the snapshot.
                        await page.wait_for_timeout(random.randint(1000, 3000))

                        html = await page.content()

                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html

                        print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()

        except Exception as e:
            # Best-effort fetcher: any failure is logged and turned into a retry.
            print(f"[ERROR] Attempt {attempt}: {e}")

        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)

    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None
|
|
|
|
|
|
|
|
async def fetch_html_with_js_execution(url, max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000, wait_for_js=True):
    """
    Fetch a page's HTML after letting its JavaScript run, retrying on failure.

    Every attempt starts a fresh Playwright session and Chromium instance and
    navigates with ``wait_until='networkidle'``. When ``wait_for_js`` is true
    it additionally waits (up to 10 s, best effort) for the network-idle load
    state and then pauses a random 2-5 s before reading the DOM.

    An attempt succeeds only when the navigation response has status 200 and
    the rendered HTML is longer than 500 characters after stripping. Any other
    outcome (other status, short content, or an exception) counts as a failed
    attempt and is retried after a random pause.

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of attempts; values <= 0 perform no
            attempt at all
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds
        wait_for_js (bool): Whether to do the extra post-navigation waits

    Returns:
        str or None: HTML content as string or None if every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with JS execution (Attempt {attempt}/{max_retries})")

        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        '--no-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        # NOTE(review): this turns off the same-origin policy
                        # for the whole browser; confirm it is really needed.
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor',
                    ],
                )

                # From here on the browser exists, so it must be closed on
                # every exit path, including exceptions and the early return.
                try:
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (X11; Linux x86_64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36"
                        ),
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                        java_script_enabled=True,
                    )

                    page = await context.new_page()

                    response = await page.goto(url, wait_until='networkidle', timeout=timeout)

                    if response and response.status == 200:
                        if wait_for_js:
                            # Best-effort settle: a timeout here must not fail
                            # the attempt. Only Exception is caught so task
                            # cancellation and KeyboardInterrupt still
                            # propagate.
                            try:
                                await page.wait_for_load_state('networkidle', timeout=10000)
                            except Exception as wait_error:
                                print(f"[WARN] Network idle wait did not complete on attempt {attempt}: {wait_error}")

                            await page.wait_for_timeout(random.randint(2000, 5000))

                        html = await page.content()

                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html

                        print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()

        except Exception as e:
            # Best-effort fetcher: any failure is logged and turned into a retry.
            print(f"[ERROR] Attempt {attempt}: {e}")

        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)

    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None
|
|
|
|
|
|
|
|
async def fetch_html_with_browser(url, browser_type='chromium', max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000):
    """
    Fetch a page's HTML with a selectable Playwright browser engine, with retries.

    ``browser_type`` is matched case-insensitively: ``'firefox'`` and
    ``'webkit'`` launch those engines with default options; any other value
    (including ``'chromium'``) launches Chromium with the extra command-line
    flags listed below. Each engine gets a matching user-agent string.

    An attempt succeeds only when the navigation response has status 200 and
    the rendered HTML is longer than 500 characters after stripping. Any other
    outcome (other status, short content, or an exception) counts as a failed
    attempt and is retried after a random pause.

    Args:
        url (str): The URL to fetch
        browser_type (str): Browser type ('chromium', 'firefox', 'webkit');
            unrecognised values fall back to Chromium
        max_retries (int): Maximum number of attempts; values <= 0 perform no
            attempt at all
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with {browser_type} (Attempt {attempt}/{max_retries})")

        try:
            async with async_playwright() as p:
                # Normalised inside the try so that a non-string browser_type
                # is reported and retried like any other failure.
                engine = browser_type.lower()

                if engine == 'firefox':
                    browser = await p.firefox.launch(headless=headless)
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
                elif engine == 'webkit':
                    browser = await p.webkit.launch(headless=headless)
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15"
                else:
                    browser = await p.chromium.launch(
                        headless=headless,
                        args=[
                            '--no-sandbox',
                            '--disable-blink-features=AutomationControlled',
                            '--disable-dev-shm-usage',
                            # NOTE(review): this turns off the same-origin
                            # policy for the whole browser; confirm it is
                            # really needed.
                            '--disable-web-security',
                            '--disable-features=VizDisplayCompositor',
                        ],
                    )
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

                # From here on the browser exists, so it must be closed on
                # every exit path, including exceptions and the early return.
                try:
                    context = await browser.new_context(
                        user_agent=user_agent,
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                    )

                    page = await context.new_page()

                    response = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)

                    if response and response.status == 200:
                        # Random 1-3 s pause after DOMContentLoaded before
                        # taking the snapshot.
                        await page.wait_for_timeout(random.randint(1000, 3000))

                        html = await page.content()

                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} with {browser_type} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html

                        print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()

        except Exception as e:
            # Best-effort fetcher: any failure is logged and turned into a retry.
            print(f"[ERROR] Attempt {attempt} with {browser_type}: {e}")

        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)

    print(f"[FAIL] Could not fetch {url} with {browser_type} after {max_retries} attempts.")
    return None