Spaces:
Runtime error
Runtime error
import time
import zlib
from io import BytesIO
from typing import Any, Dict, List
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image
from playwright.sync_api import sync_playwright
# --- Configuration ---
# Browser-like User-Agent sent both with the requests HTML fetches and as the
# Playwright browser-context UA, so the two fetch paths look identical to servers.
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 GradioCrawler/1.0'
# ---------------------

# Global set of URLs already visited in the current crawl session.
# NOTE(review): module-global state — cleared at the start of each run by
# crawl_and_analyze_pipeline, so concurrent crawls would interfere with each other.
visited_urls = set()
| def _get_abs_url(base_url, href): | |
| """Converts a relative or partial URL to an absolute URL.""" | |
| try: | |
| return urljoin(base_url, href) | |
| except ValueError: | |
| return None | |
def get_links_and_crawl(session_url: str, html_content: str, max_depth: int, current_depth: int) -> List[str]:
    """
    Extract new, unvisited, same-domain links from *html_content*.

    Args:
        session_url: URL the HTML was fetched from; base for resolving relative
            hrefs and the reference domain for filtering.
        html_content: Raw HTML to scan for <a href="..."> anchors.
        max_depth: Crawl depth limit; no links are returned at or past it.
        current_depth: Depth of the page being scanned.

    Returns:
        Unique normalized URLs (query and fragment stripped), in document
        order, excluding anything already in the module-level ``visited_urls``
        set. The old ``list(set(...))`` return shuffled the order
        nondeterministically; document order makes crawls reproducible.
    """
    if current_depth >= max_depth:
        return []

    base_domain = urlparse(session_url).netloc
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
    except Exception:
        # Unparseable HTML: treat the page as a leaf.
        return []

    found_links: List[str] = []
    seen = set()  # page-local dedup so repeated anchors are emitted once
    for link in soup.find_all('a', href=True):
        abs_url = _get_abs_url(session_url, link.get('href'))
        if not abs_url:
            continue
        parsed_url = urlparse(abs_url)
        # Keep only http(s) links that stay on the starting domain.
        if parsed_url.scheme not in ('http', 'https') or parsed_url.netloc != base_domain:
            continue
        # Normalize so /page?x=1 and /page#top collapse to one crawl target.
        normalized_url = parsed_url._replace(query='', fragment='').geturl()
        # visited_urls does the cross-page filtering; ``seen`` the intra-page one.
        if normalized_url not in visited_urls and normalized_url not in seen:
            seen.add(normalized_url)
            found_links.append(normalized_url)
    return found_links
def take_screenshot(url: str, page) -> Image.Image | None:
    """
    Navigate the Playwright *page* to *url* and return a full-page screenshot
    as a PIL image, or None when navigation/capture fails (e.g. timeout).
    """
    try:
        # Wait for the network to quiesce so lazy content has a chance to render.
        page.goto(url, wait_until="networkidle", timeout=30000)
        # JPEG keeps full-page captures of long pages reasonably small.
        raw = page.screenshot(full_page=True, type="jpeg", quality=85)
        return Image.open(BytesIO(raw))
    except Exception as exc:
        print(f"Error taking screenshot for {url}: {exc}")
        return None
def analyze_screenshot_vlm(image: Image.Image, url: str, *, latency: float = 2.0) -> str:
    """
    Simulated multimodal model (VLM) analysis of a page screenshot.

    In a production system, replace this with an API call to GPT-4o, Claude,
    etc., passing the image data and URL as context.

    Args:
        image: Screenshot to "analyze"; only its dimensions are read.
        url: Page URL, used to deterministically pick a simulated finding.
        latency: Simulated inference delay in seconds (keyword-only; defaults
            to the original 2 s pause, pass 0 to skip it in tests).

    Returns:
        A Markdown formatted report summary.
    """
    # --- Simulate VLM inference latency ---
    if latency > 0:
        time.sleep(latency)

    width, height = image.size

    # BUG FIX: built-in hash() on str is randomized per interpreter run
    # (PYTHONHASHSEED), so findings changed between runs. crc32 is stable.
    digest = zlib.crc32(url.encode('utf-8'))

    # Simulated issue detection keyed off the URL digest.
    issues = []
    if digest % 7 == 0:
        issues.append("## [CRITICAL] Broken Element Detection")
        issues.append("The model detected elements overlapping or failing to load correctly in the viewport. This is a severe UX issue affecting the primary CTA zone.")
    elif digest % 5 == 0:
        issues.append("### [WARNING] Accessibility Warning")
        issues.append("The VLM flagged insufficient font size on mobile emulation, potentially failing WCAG guidelines for text readability.")

    if not issues:
        # Clean page: generic success summary including render dimensions.
        issues.append("## Analysis Summary: Clean Page")
        issues.append("No critical UX/UI issues or functional anomalies were immediately detected.")
        issues.append(f"The page rendered fully (Dimensions: {width}x{height} pixels, processed via simulated VLM).")
    else:
        issues.insert(0, f"## Analysis Results for {url}")

    return "\n\n---\n\n".join(issues)
def crawl_and_analyze_pipeline(start_url: str, max_depth: int, max_pages: int, progress: callable) -> List[Dict[str, Any]]:
    """
    Main pipeline: BFS-crawl from *start_url*, screenshot every page with
    Playwright, and run (simulated) VLM analysis on each screenshot.

    Args:
        start_url: Seed URL; 'https://' is prepended when no scheme is given.
        max_depth: Maximum link depth to follow from the seed (seed is depth 0).
        max_pages: Hard cap on pages fetched/processed and on queue growth.
        progress: Callable accepting (value, desc=...) for UI progress updates.

    Returns:
        List of dicts with keys "url", "image" (PIL image; a red 200x100
        placeholder when the screenshot failed) and "report" (Markdown).

    Raises:
        ValueError: If *start_url* has no parseable domain.
    """
    global visited_urls
    visited_urls.clear()

    # BUG FIX: normalize and validate the seed BEFORE building the queue.
    # The old code enqueued the raw URL first, so scheme-less input
    # ("example.com") was fetched un-normalized and always failed.
    if not start_url.startswith('http'):
        start_url = 'https://' + start_url
    if not urlparse(start_url).netloc:
        raise ValueError("Invalid starting URL format or domain.")

    queue = [(start_url, 0)]  # (URL, depth) tuples; FIFO pop => breadth-first
    results = []              # dicts of url/image/report
    pages_processed = 0

    with sync_playwright() as p:
        # Fixed 1920x1080 viewport for consistent screenshots.
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(
            user_agent=DEFAULT_USER_AGENT,
            viewport={'width': 1920, 'height': 1080}
        )
        page = context.new_page()
        progress(0, desc="Starting crawl and initializing browser...")

        try:
            while queue and pages_processed < max_pages:
                current_url, current_depth = queue.pop(0)
                # Skip anything already fetched (queue may hold duplicates
                # discovered from different pages).
                if current_url in visited_urls:
                    continue

                # --- Fetch HTML and mark visited ---
                try:
                    response = requests.get(current_url, headers={'User-Agent': DEFAULT_USER_AGENT}, timeout=10)
                    if response.status_code != 200 or 'text/html' not in response.headers.get('Content-Type', ''):
                        print(f"Skipping {current_url}: Status {response.status_code} or not HTML.")
                        visited_urls.add(current_url)
                        continue
                    html_content = response.text
                except requests.RequestException as e:
                    print(f"Failed to fetch HTML for {current_url}: {e}")
                    visited_urls.add(current_url)
                    continue

                visited_urls.add(current_url)
                pages_processed += 1

                # --- Screenshot and analysis step ---
                progress((pages_processed / max_pages) * 100,
                         desc=f"Processing Page {pages_processed}/{max_pages} (Depth {current_depth}): {current_url}")

                screenshot = take_screenshot(current_url, page)
                if screenshot:
                    results.append({
                        "url": current_url,
                        "image": screenshot,
                        "report": analyze_screenshot_vlm(screenshot, current_url)
                    })
                else:
                    # Red placeholder image so the gallery slot is never empty.
                    results.append({
                        "url": current_url,
                        "image": Image.new('RGB', (200, 100), color='red'),
                        "report": f"## Error\nCould not capture screenshot for {current_url}. Page may have timed out or failed to load correctly."
                    })

                # --- Link discovery step ---
                if current_depth < max_depth and pages_processed < max_pages:
                    for link in get_links_and_crawl(current_url, html_content, max_depth, current_depth):
                        # Cap total work: queued + processed never exceeds max_pages.
                        if len(queue) + pages_processed < max_pages and link not in visited_urls:
                            queue.append((link, current_depth + 1))
        finally:
            # BUG FIX: always release the browser, even if the crawl loop raises.
            browser.close()

    return results