# SanadLLM / app/tools/fetch.py
# Branch: Hydra-Bolt ("restructured", commit eef2a73)
import asyncio
import random

from playwright.async_api import async_playwright
async def fetch_html(url, max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000):
    """
    Fetch HTML using Playwright with retries and anti-bot measures.

    Launches a stealth-configured Chromium, navigates to *url*, and returns
    the rendered HTML. Retries with a random delay on any failure (non-200
    status, suspiciously short body, or an exception from Playwright).

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                # Launch browser with realistic settings
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        '--no-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor',
                        '--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    ]
                )
                # Everything after launch runs under try/finally so the
                # browser is closed even if navigation or extraction raises
                # (the original leaked it on exceptions until the
                # async_playwright context unwound).
                try:
                    # Create context with realistic settings
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (X11; Linux x86_64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36"
                        ),
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                        extra_http_headers={
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                            "Accept-Language": "ar,en-US;q=0.9,en;q=0.8",
                            "Accept-Encoding": "gzip, deflate, br",
                            "DNT": "1",
                            "Connection": "keep-alive",
                            "Upgrade-Insecure-Requests": "1",
                            "Referer": "https://shamela.ws/",
                        }
                    )
                    # Create page and set additional stealth measures
                    page = await context.new_page()
                    # Hide webdriver traces (common headless-detection probes)
                    await page.add_init_script("""
                        Object.defineProperty(navigator, 'webdriver', {
                            get: () => undefined,
                        });
                        window.chrome = {
                            runtime: {},
                        };
                        Object.defineProperty(navigator, 'plugins', {
                            get: () => [1, 2, 3, 4, 5],
                        });
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['ar', 'en-US', 'en'],
                        });
                    """)
                    # Navigate to the page
                    response = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)
                    if response and response.status == 200:
                        # Wait a bit for any dynamic content
                        await page.wait_for_timeout(random.randint(1000, 3000))
                        # Get the HTML content
                        html = await page.content()
                        # Bodies under ~500 chars are treated as bot-block
                        # placeholders rather than real pages.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html
                        else:
                            print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()
        except Exception as e:
            print(f"[ERROR] Attempt {attempt}: {e}")
        # Wait before retrying
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)
    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None
async def fetch_html_with_js_execution(url, max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000, wait_for_js=True):
    """
    Enhanced version that can execute JavaScript and wait for dynamic content.

    Like fetch_html(), but waits for the network to go idle so JS-rendered
    pages are fully populated before the HTML is captured.

    Args:
        url (str): The URL to fetch
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds
        wait_for_js (bool): Whether to wait for JavaScript to execute

    Returns:
        str or None: HTML content as string or None if failed
    """
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with JS execution (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=headless,
                    args=[
                        '--no-sandbox',
                        '--disable-blink-features=AutomationControlled',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor'
                    ]
                )
                # try/finally guarantees the browser closes even when
                # navigation raises (original leaked it on exceptions).
                try:
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (X11; Linux x86_64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36"
                        ),
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080},
                        java_script_enabled=True
                    )
                    page = await context.new_page()
                    # Navigate and wait for network to be idle
                    response = await page.goto(url, wait_until='networkidle', timeout=timeout)
                    if response and response.status == 200:
                        if wait_for_js:
                            # Wait for common dynamic content indicators.
                            # Narrowed from a bare `except:` so task
                            # cancellation (asyncio.CancelledError) still
                            # propagates.
                            try:
                                await page.wait_for_load_state('networkidle', timeout=10000)
                            except Exception:
                                pass  # Continue even if network doesn't idle
                            # Additional wait for any remaining dynamic content
                            await page.wait_for_timeout(random.randint(2000, 5000))
                        html = await page.content()
                        # Under ~500 chars is assumed to be a block page.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html
                        else:
                            print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()
        except Exception as e:
            print(f"[ERROR] Attempt {attempt}: {e}")
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)
    print(f"[FAIL] Could not fetch {url} after {max_retries} attempts.")
    return None
async def fetch_html_with_browser(url, browser_type='chromium', max_retries=5, min_delay=1, max_delay=3, headless=True, timeout=30000):
    """
    Fetch HTML using different browser types (chromium, firefox, webkit).

    Unknown values of *browser_type* fall back to chromium, matching the
    user agent to the engine actually launched.

    Args:
        url (str): The URL to fetch
        browser_type (str): Browser type ('chromium', 'firefox', 'webkit')
        max_retries (int): Maximum number of retry attempts
        min_delay (float): Minimum delay between retries in seconds
        max_delay (float): Maximum delay between retries in seconds
        headless (bool): Whether to run browser in headless mode
        timeout (int): Page load timeout in milliseconds

    Returns:
        str or None: HTML content as string or None if failed
    """
    # Normalize once instead of calling .lower() per branch.
    engine = browser_type.lower()
    for attempt in range(1, max_retries + 1):
        print(f"[INFO] Fetching {url} with {browser_type} (Attempt {attempt}/{max_retries})")
        try:
            async with async_playwright() as p:
                # Get the appropriate browser
                if engine == 'firefox':
                    browser = await p.firefox.launch(headless=headless)
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
                elif engine == 'webkit':
                    browser = await p.webkit.launch(headless=headless)
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15"
                else:  # default to chromium
                    browser = await p.chromium.launch(
                        headless=headless,
                        args=[
                            '--no-sandbox',
                            '--disable-blink-features=AutomationControlled',
                            '--disable-dev-shm-usage',
                            '--disable-web-security',
                            '--disable-features=VizDisplayCompositor'
                        ]
                    )
                    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                # try/finally guarantees the browser closes even when
                # navigation raises (original leaked it on exceptions).
                try:
                    context = await browser.new_context(
                        user_agent=user_agent,
                        locale='ar-SA',
                        timezone_id='Asia/Riyadh',
                        viewport={'width': 1920, 'height': 1080}
                    )
                    page = await context.new_page()
                    # Navigate to the page
                    response = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)
                    if response and response.status == 200:
                        # Wait a bit for any dynamic content
                        await page.wait_for_timeout(random.randint(1000, 3000))
                        html = await page.content()
                        # Under ~500 chars is assumed to be a block page.
                        if len(html.strip()) > 500:
                            preview = html[:300].replace("\n", " ")
                            print(f"[OK] {url} with {browser_type} (Attempt {attempt}) | Preview: {preview[:150]}...")
                            return html
                        else:
                            print(f"[WARN] Short HTML content ({len(html)} chars) on attempt {attempt}")
                    else:
                        status = response.status if response else "No response"
                        print(f"[WARN] Status {status} on attempt {attempt}")
                finally:
                    await browser.close()
        except Exception as e:
            print(f"[ERROR] Attempt {attempt} with {browser_type}: {e}")
        if attempt < max_retries:
            wait_time = random.uniform(min_delay, max_delay)
            print(f"[INFO] Waiting {wait_time:.1f} seconds before retry...")
            await asyncio.sleep(wait_time)
    print(f"[FAIL] Could not fetch {url} with {browser_type} after {max_retries} attempts.")
    return None