# bold-tool-81 / crawler.py
# samihalawa — Deploy Gradio app with multiple files (commit 577ff23, verified)
import time
import zlib
from collections import deque
from io import BytesIO
from typing import Any, Dict, List
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image
from playwright.sync_api import sync_playwright
# --- Configuration ---
# Desktop-Chrome-style User-Agent; sent on both the requests.get fetches and
# the Playwright browser context so the crawl presents a single identity.
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 GradioCrawler/1.0'
# ---------------------
# Global set to store visited URLs for the current crawl session.
# Cleared at the start of each run by crawl_and_analyze_pipeline(); read by
# get_links_and_crawl() to filter out already-seen links.
visited_urls = set()
def _get_abs_url(base_url, href):
"""Converts a relative or partial URL to an absolute URL."""
try:
return urljoin(base_url, href)
except ValueError:
return None
def get_links_and_crawl(session_url: str, html_content: str, max_depth: int, current_depth: int) -> List[str]:
    """Extract new, unvisited, same-domain links from HTML.

    Args:
        session_url: URL the HTML was fetched from; used both to resolve
            relative links and to pin the crawl to a single domain.
        html_content: Raw HTML to parse.
        max_depth: Maximum crawl depth; returns [] once reached.
        current_depth: Depth of the page this HTML came from.

    Returns:
        Unique, normalized absolute URLs in document order. URLs already in
        the global ``visited_urls`` set are excluded; query strings and
        fragments are stripped for the uniqueness check.
    """
    if current_depth >= max_depth:
        return []
    base_domain = urlparse(session_url).netloc
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
    except Exception:
        # Unparseable HTML: treat the page as a leaf rather than failing the crawl.
        return []
    found_links: List[str] = []
    # Dedupe in-loop while preserving document order; the previous
    # list(set(...)) return made the crawl order nondeterministic between runs.
    seen = set()
    for anchor in soup.find_all('a', href=True):
        abs_url = _get_abs_url(session_url, anchor.get('href'))
        if not abs_url:
            continue
        parsed_url = urlparse(abs_url)
        # Same-domain HTTP(S) links only; drops mailto:, javascript:, etc.
        if parsed_url.scheme not in ('http', 'https') or parsed_url.netloc != base_domain:
            continue
        # Normalize URL (remove query parameters and fragments for uniqueness check).
        normalized_url = parsed_url._replace(query='', fragment='').geturl()
        # visited_urls filters across pages; `seen` dedupes within this page.
        if normalized_url not in visited_urls and normalized_url not in seen:
            seen.add(normalized_url)
            found_links.append(normalized_url)
    return found_links
def take_screenshot(url: str, page) -> Image.Image | None:
    """Capture a full-page screenshot of *url* using an open Playwright page.

    Returns the screenshot as a PIL Image, or None when navigation or the
    capture fails (the error is printed, not raised, so the crawl continues).
    """
    try:
        # Wait for network idle so lazy-loaded assets are rendered before capture.
        page.goto(url, wait_until="networkidle", timeout=30000)
        # JPEG at quality 85 keeps full-page captures reasonably small.
        raw_bytes = page.screenshot(full_page=True, type="jpeg", quality=85)
        return Image.open(BytesIO(raw_bytes))
    except Exception as e:
        print(f"Error taking screenshot for {url}: {e}")
        return None
def analyze_screenshot_vlm(image: "Image.Image", url: str, delay: float = 2.0) -> str:
    """Simulated Multimodal Model (VLM) analysis function.

    In a production system, replace this with an API call to GPT-4o, Claude,
    etc., passing the image data and URL as context.

    Args:
        image: Screenshot to "analyze"; only its ``size`` is read.
        url: Page URL, used to deterministically pick a simulated finding.
        delay: Simulated inference latency in seconds (default 2.0 preserves
            the original behavior; pass 0 in tests).

    Returns:
        A Markdown formatted report summary.
    """
    # --- Simulate VLM Inference and Latency ---
    time.sleep(delay)
    width, height = image.size
    # Stable digest: builtin hash() of a str is randomized per process
    # (PYTHONHASHSEED), which made the simulated finding for a given URL
    # change between runs. crc32 is deterministic for the same URL.
    url_digest = zlib.crc32(url.encode('utf-8'))
    issues = []
    if url_digest % 7 == 0:
        issues.append("## [CRITICAL] Broken Element Detection")
        issues.append("The model detected elements overlapping or failing to load correctly in the viewport. This is a severe UX issue affecting the primary CTA zone.")
    elif url_digest % 5 == 0:
        issues.append("### [WARNING] Accessibility Warning")
        issues.append("The VLM flagged insufficient font size on mobile emulation, potentially failing WCAG guidelines for text readability.")
    # Simulate success or general feedback
    if not issues:
        issues.append("## Analysis Summary: Clean Page")
        issues.append("No critical UX/UI issues or functional anomalies were immediately detected.")
        issues.append(f"The page rendered fully (Dimensions: {width}x{height} pixels, processed via simulated VLM).")
    else:
        issues.insert(0, f"## Analysis Results for {url}")
    return "\n\n---\n\n".join(issues)
def crawl_and_analyze_pipeline(start_url: str, max_depth: int, max_pages: int, progress: callable) -> List[Dict[str, Any]]:
    """Main pipeline function: crawls, screenshots, and analyzes pages in sequence.

    Args:
        start_url: Seed URL; 'https://' is prepended when no scheme is given.
        max_depth: Maximum link depth to follow from the seed.
        max_pages: Hard cap on the number of pages processed.
        progress: Callback taking (value, desc=...) for UI progress updates.

    Returns:
        One dict per processed page with keys "url", "image" (PIL Image),
        and "report" (Markdown string).

    Raises:
        ValueError: If start_url has no recognizable domain.
    """
    global visited_urls
    visited_urls.clear()
    # Normalize/validate the seed BEFORE building the queue. Previously the
    # queue was seeded first, so a schemeless input ("example.com") was
    # enqueued un-normalized and every fetch of it failed.
    if not start_url.startswith('http'):
        start_url = 'https://' + start_url
    if not urlparse(start_url).netloc:
        raise ValueError("Invalid starting URL format or domain.")
    # BFS queue of (URL, depth); deque gives O(1) popleft vs list.pop(0).
    queue = deque([(start_url, 0)])
    results: List[Dict[str, Any]] = []  # one {"url", "image", "report"} per page
    pages_processed = 0
    with sync_playwright() as p:
        # Using a fixed 1920x1080 viewport for consistent screenshots
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=DEFAULT_USER_AGENT,
                viewport={'width': 1920, 'height': 1080}
            )
            page = context.new_page()
            progress(0, desc="Starting crawl and initializing browser...")
            while queue and pages_processed < max_pages:
                current_url, current_depth = queue.popleft()
                # A URL can be queued from several pages; skip already-fetched ones.
                if current_url in visited_urls:
                    continue
                # --- Fetch HTML and Mark Visited ---
                try:
                    response = requests.get(current_url, headers={'User-Agent': DEFAULT_USER_AGENT}, timeout=10)
                    if response.status_code != 200 or 'text/html' not in response.headers.get('Content-Type', ''):
                        print(f"Skipping {current_url}: Status {response.status_code} or not HTML.")
                        visited_urls.add(current_url)
                        continue
                    html_content = response.text
                except requests.RequestException as e:
                    print(f"Failed to fetch HTML for {current_url}: {e}")
                    visited_urls.add(current_url)
                    continue
                visited_urls.add(current_url)
                pages_processed += 1
                # --- Screenshot and Analysis Step ---
                progress_value = (pages_processed / max_pages) * 100
                progress(progress_value, desc=f"Processing Page {pages_processed}/{max_pages} (Depth {current_depth}): {current_url}")
                # 1. Take Screenshot using Playwright
                screenshot = take_screenshot(current_url, page)
                if screenshot:
                    # 2. Analyze Screenshot
                    report = analyze_screenshot_vlm(screenshot, current_url)
                    results.append({
                        "url": current_url,
                        "image": screenshot,
                        "report": report
                    })
                else:
                    # Placeholder entry so the failure stays visible in the results.
                    failed_img = Image.new('RGB', (200, 100), color='red')
                    results.append({
                        "url": current_url,
                        "image": failed_img,
                        "report": f"## Error\nCould not capture screenshot for {current_url}. Page may have timed out or failed to load correctly."
                    })
                # --- Link Discovery Step ---
                if current_depth < max_depth and pages_processed < max_pages:
                    new_links = get_links_and_crawl(current_url, html_content, max_depth, current_depth)
                    # Add new links to the queue, respecting the max pages limit.
                    for link in new_links:
                        if len(queue) + pages_processed < max_pages and link not in visited_urls:
                            queue.append((link, current_depth + 1))
        finally:
            # Always release the browser, even when the crawl raises mid-run;
            # previously an exception leaked the headless Chromium process.
            browser.close()
    return results