# bold-tool-81 / crawler.py
# samihalawa — Deploy Gradio app with multiple files (commit 577ff23, verified)
import time
import zlib
from collections import deque
from io import BytesIO
from typing import Any, Dict, List
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image
from playwright.sync_api import sync_playwright
# --- Configuration ---
# Desktop-Chrome-style User-Agent; sent on both the requests.get fetches and
# the Playwright browser context so the crawl presents a single identity.
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 GradioCrawler/1.0'
# ---------------------
# Global set to store visited URLs for the current crawl session.
# Cleared at the start of each run by crawl_and_analyze_pipeline(); read by
# get_links_and_crawl() to filter out already-seen links.
visited_urls = set()
def _get_abs_url(base_url, href):
"""Converts a relative or partial URL to an absolute URL."""
try:
return urljoin(base_url, href)
except ValueError:
return None
def get_links_and_crawl(session_url: str, html_content: str, max_depth: int, current_depth: int) -> List[str]:
    """Extract new, unvisited, same-domain links from HTML.

    Args:
        session_url: URL the HTML was fetched from; used both to resolve
            relative links and to pin the crawl to a single domain.
        html_content: Raw HTML to parse.
        max_depth: Maximum crawl depth; returns [] once reached.
        current_depth: Depth of the page this HTML came from.

    Returns:
        Unique, normalized absolute URLs in document order. URLs already in
        the global ``visited_urls`` set are excluded; query strings and
        fragments are stripped for the uniqueness check.
    """
    if current_depth >= max_depth:
        return []
    base_domain = urlparse(session_url).netloc
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
    except Exception:
        # Unparseable HTML: treat the page as a leaf rather than failing the crawl.
        return []
    found_links: List[str] = []
    # Dedupe in-loop while preserving document order; the previous
    # list(set(...)) return made the crawl order nondeterministic between runs.
    seen = set()
    for anchor in soup.find_all('a', href=True):
        abs_url = _get_abs_url(session_url, anchor.get('href'))
        if not abs_url:
            continue
        parsed_url = urlparse(abs_url)
        # Same-domain HTTP(S) links only; drops mailto:, javascript:, etc.
        if parsed_url.scheme not in ('http', 'https') or parsed_url.netloc != base_domain:
            continue
        # Normalize URL (remove query parameters and fragments for uniqueness check).
        normalized_url = parsed_url._replace(query='', fragment='').geturl()
        # visited_urls filters across pages; `seen` dedupes within this page.
        if normalized_url not in visited_urls and normalized_url not in seen:
            seen.add(normalized_url)
            found_links.append(normalized_url)
    return found_links
def take_screenshot(url: str, page) -> Image.Image | None:
    """Capture a full-page screenshot of *url* using an open Playwright page.

    Returns the screenshot as a PIL Image, or None when navigation or the
    capture fails (the error is printed, not raised, so the crawl continues).
    """
    try:
        # Wait for network idle so lazy-loaded assets are rendered before capture.
        page.goto(url, wait_until="networkidle", timeout=30000)
        # JPEG at quality 85 keeps full-page captures reasonably small.
        raw_bytes = page.screenshot(full_page=True, type="jpeg", quality=85)
        return Image.open(BytesIO(raw_bytes))
    except Exception as e:
        print(f"Error taking screenshot for {url}: {e}")
        return None
def analyze_screenshot_vlm(image: "Image.Image", url: str, delay: float = 2.0) -> str:
    """Simulated Multimodal Model (VLM) analysis function.

    In a production system, replace this with an API call to GPT-4o, Claude,
    etc., passing the image data and URL as context.

    Args:
        image: Screenshot to "analyze"; only its ``size`` is read.
        url: Page URL, used to deterministically pick a simulated finding.
        delay: Simulated inference latency in seconds (default 2.0 preserves
            the original behavior; pass 0 in tests).

    Returns:
        A Markdown formatted report summary.
    """
    # --- Simulate VLM Inference and Latency ---
    time.sleep(delay)
    width, height = image.size
    # Stable digest: builtin hash() of a str is randomized per process
    # (PYTHONHASHSEED), which made the simulated finding for a given URL
    # change between runs. crc32 is deterministic for the same URL.
    url_digest = zlib.crc32(url.encode('utf-8'))
    issues = []
    if url_digest % 7 == 0:
        issues.append("## [CRITICAL] Broken Element Detection")
        issues.append("The model detected elements overlapping or failing to load correctly in the viewport. This is a severe UX issue affecting the primary CTA zone.")
    elif url_digest % 5 == 0:
        issues.append("### [WARNING] Accessibility Warning")
        issues.append("The VLM flagged insufficient font size on mobile emulation, potentially failing WCAG guidelines for text readability.")
    # Simulate success or general feedback
    if not issues:
        issues.append("## Analysis Summary: Clean Page")
        issues.append("No critical UX/UI issues or functional anomalies were immediately detected.")
        issues.append(f"The page rendered fully (Dimensions: {width}x{height} pixels, processed via simulated VLM).")
    else:
        issues.insert(0, f"## Analysis Results for {url}")
    return "\n\n---\n\n".join(issues)
def crawl_and_analyze_pipeline(start_url: str, max_depth: int, max_pages: int, progress: callable) -> List[Dict[str, Any]]:
    """Main pipeline function: crawls, screenshots, and analyzes pages in sequence.

    Args:
        start_url: Seed URL; 'https://' is prepended when no scheme is given.
        max_depth: Maximum link depth to follow from the seed.
        max_pages: Hard cap on the number of pages processed.
        progress: Callback taking (value, desc=...) for UI progress updates.

    Returns:
        One dict per processed page with keys "url", "image" (PIL Image),
        and "report" (Markdown string).

    Raises:
        ValueError: If start_url has no recognizable domain.
    """
    global visited_urls
    visited_urls.clear()
    # Normalize/validate the seed BEFORE building the queue. Previously the
    # queue was seeded first, so a schemeless input ("example.com") was
    # enqueued un-normalized and every fetch of it failed.
    if not start_url.startswith('http'):
        start_url = 'https://' + start_url
    if not urlparse(start_url).netloc:
        raise ValueError("Invalid starting URL format or domain.")
    # BFS queue of (URL, depth); deque gives O(1) popleft vs list.pop(0).
    queue = deque([(start_url, 0)])
    results: List[Dict[str, Any]] = []  # one {"url", "image", "report"} per page
    pages_processed = 0
    with sync_playwright() as p:
        # Using a fixed 1920x1080 viewport for consistent screenshots
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent=DEFAULT_USER_AGENT,
                viewport={'width': 1920, 'height': 1080}
            )
            page = context.new_page()
            progress(0, desc="Starting crawl and initializing browser...")
            while queue and pages_processed < max_pages:
                current_url, current_depth = queue.popleft()
                # A URL can be queued from several pages; skip already-fetched ones.
                if current_url in visited_urls:
                    continue
                # --- Fetch HTML and Mark Visited ---
                try:
                    response = requests.get(current_url, headers={'User-Agent': DEFAULT_USER_AGENT}, timeout=10)
                    if response.status_code != 200 or 'text/html' not in response.headers.get('Content-Type', ''):
                        print(f"Skipping {current_url}: Status {response.status_code} or not HTML.")
                        visited_urls.add(current_url)
                        continue
                    html_content = response.text
                except requests.RequestException as e:
                    print(f"Failed to fetch HTML for {current_url}: {e}")
                    visited_urls.add(current_url)
                    continue
                visited_urls.add(current_url)
                pages_processed += 1
                # --- Screenshot and Analysis Step ---
                progress_value = (pages_processed / max_pages) * 100
                progress(progress_value, desc=f"Processing Page {pages_processed}/{max_pages} (Depth {current_depth}): {current_url}")
                # 1. Take Screenshot using Playwright
                screenshot = take_screenshot(current_url, page)
                if screenshot:
                    # 2. Analyze Screenshot
                    report = analyze_screenshot_vlm(screenshot, current_url)
                    results.append({
                        "url": current_url,
                        "image": screenshot,
                        "report": report
                    })
                else:
                    # Placeholder entry so the failure stays visible in the results.
                    failed_img = Image.new('RGB', (200, 100), color='red')
                    results.append({
                        "url": current_url,
                        "image": failed_img,
                        "report": f"## Error\nCould not capture screenshot for {current_url}. Page may have timed out or failed to load correctly."
                    })
                # --- Link Discovery Step ---
                if current_depth < max_depth and pages_processed < max_pages:
                    new_links = get_links_and_crawl(current_url, html_content, max_depth, current_depth)
                    # Add new links to the queue, respecting the max pages limit.
                    for link in new_links:
                        if len(queue) + pages_processed < max_pages and link not in visited_urls:
                            queue.append((link, current_depth + 1))
        finally:
            # Always release the browser, even when the crawl raises mid-run;
            # previously an exception leaked the headless Chromium process.
            browser.close()
    return results