# SanadLLM / app/tools/fetch.py
# Branch: Hydra-Bolt ("restructured", commit eef2a73)
import asyncio
import random

from playwright.async_api import async_playwright
async def fetch_html(url, max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000):
    """
    Fetch HTML using Playwright with retries and anti-bot measures.

    Launches a stealth-configured Chromium, navigates to *url*, and returns
    the rendered HTML. Retries with a random delay on any failure (non-200
    status, suspiciously short body, or an exception from Playwright).

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                # Launch browser with realistic settings
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        '--no-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor',
                        '--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    ]
                )
                # Everything after launch runs under try/finally so the
                # browser is closed even if navigation or extraction raises
                # (the original leaked it on exceptions until the
                # async_playwright context unwound).
                try:
                    # Create context with realistic settings
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (X11; Linux x86_64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36"
                        ),
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                        extra_http_headers={
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                            "Accept-Language": "ar,en-US;q=0.9,en;q=0.8",
                            "Accept-Encoding": "gzip, deflate, br",
                            "DNT": "1",
                            "Connection": "keep-alive",
                            "Upgrade-Insecure-Requests": "1",
                            "Referer": "https://shamela.ws/",
                        }
                    )
                    # Create page and set additional stealth measures
                    page = await context.new_page()
                    # Hide webdriver traces (common headless-detection probes)
                    await page.add_init_script("""
                        Object.defineProperty(navigator, 'webdriver', {
                            get: () => undefined,
                        });
                        window.chrome = {
                            runtime: {},
                        };
                        Object.defineProperty(navigator, 'plugins', {
                            get: () => [1, 2, 3, 4, 5],
                        });
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['ar', 'en-US', 'en'],
                        });
                    """)
                    # Navigate to the page
                    response = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)
                    if response and response.status == 200:
                        # Wait a bit for any dynamic content
                        await page.wait_for_timeout(random.randint(1000, 3000))
                        # Get the HTML content
                        html = await page.content()
                        # Bodies under ~500 chars are treated as bot-block
                        # placeholders rather than real pages.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html
                        else:
                            print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()
        except Exception as e:
            print(f"[ERROR] Attempt {attempt}: {e}")
        # Wait before retrying
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)
    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None
async def fetch_html_with_js_execution(url, max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000, wait_for_js=True):
    """
    Enhanced version that can execute JavaScript and wait for dynamic content.

    Like fetch_html(), but waits for the network to go idle so JS-rendered
    pages are fully populated before the HTML is captured.

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds
        wait_for_js (bool): Whether to wait for JavaScript to execute

    Returns:
        str or None: HTML content as string or None if failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with JS execution (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        '--no-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor'
                    ]
                )
                # try/finally guarantees the browser closes even when
                # navigation raises (original leaked it on exceptions).
                try:
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (X11; Linux x86_64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36"
                        ),
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                        java_script_enabled=True
                    )
                    page = await context.new_page()
                    # Navigate and wait for network to be idle
                    response = await page.goto(url, wait_until='networkidle', timeout=timeout)
                    if response and response.status == 200:
                        if wait_for_js:
                            # Wait for common dynamic content indicators.
                            # Narrowed from a bare `except:` so task
                            # cancellation (asyncio.CancelledError) still
                            # propagates.
                            try:
                                await page.wait_for_load_state('networkidle', timeout=10000)
                            except Exception:
                                pass  # Continue even if network doesn't idle
                            # Additional wait for any remaining dynamic content
                            await page.wait_for_timeout(random.randint(2000, 5000))
                        html = await page.content()
                        # Under ~500 chars is assumed to be a block page.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html
                        else:
                            print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()
        except Exception as e:
            print(f"[ERROR] Attempt {attempt}: {e}")
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)
    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None
async def fetch_html_with_browser(url, browser_type='chromium', max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000):
    """
    Fetch HTML using different browser types (chromium, firefox, webkit).

    Unknown values of *browser_type* fall back to chromium, matching the
    user agent to the engine actually launched.

    Args:
        url (str): The URL to fetch
        browser_type (str): Browser type ('chromium', 'firefox', 'webkit')
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if failed
    """
    # Normalize once instead of calling .lower() per branch.
    engine = browser_type.lower()
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with {browser_type} (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                # Get the appropriate browser
                if engine == 'firefox':
                    browser = await p.firefox.launch(headless=headless)
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
                elif engine == 'webkit':
                    browser = await p.webkit.launch(headless=headless)
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15"
                else:  # default to chromium
                    browser = await p.chromium.launch(
                        headless=headless,
                        args=[
                            '--no-sandbox',
                            '--disable-blink-features=AutomationControlled',
                            '--disable-dev-shm-usage',
                            '--disable-web-security',
                            '--disable-features=VizDisplayCompositor'
                        ]
                    )
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                # try/finally guarantees the browser closes even when
                # navigation raises (original leaked it on exceptions).
                try:
                    context = await browser.new_context(
                        user_agent=user_agent,
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080}
                    )
                    page = await context.new_page()
                    # Navigate to the page
                    response = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)
                    if response and response.status == 200:
                        # Wait a bit for any dynamic content
                        await page.wait_for_timeout(random.randint(1000, 3000))
                        html = await page.content()
                        # Under ~500 chars is assumed to be a block page.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} with {browser_type} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html
                        else:
                            print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()
        except Exception as e:
            print(f"[ERROR] Attempt {attempt} with {browser_type}: {e}")
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)
    print(f"[FAIL] Could not fetch {url} with {browser_type} after {max_retries} attempts.")
    return None