Spaces:
Sleeping
Sleeping
| """ | |
| Advanced analysis features: visual testing, link extraction, structured data | |
| """ | |
| import json | |
| import time | |
| import logging | |
| from datetime import datetime | |
| from browser.driver import get_driver, cleanup_driver, create_driver | |
| logger = logging.getLogger(__name__) | |
def extract_structured_data(url: str, use_persistent: bool = False) -> str:
    """Extract structured data (JSON-LD, microdata, meta tags) from a page.

    Args:
        url: Page to load.
        use_persistent: Reuse the shared persistent driver instead of a
            throwaway one (passed through to get_driver / cleanup_driver).

    Returns:
        A JSON string with keys ``jsonld``, ``meta``, ``opengraph``,
        ``twitter``, ``microdata``, ``schema_org`` and ``summary`` — or an
        ``"Error: ..."`` string on failure.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)
        # All extraction happens in a single in-page script so we only pay
        # one round-trip to the browser.
        structured_data = driver.execute_script("""
            const data = {
                jsonld: [],
                meta: {},
                opengraph: {},
                twitter: {},
                microdata: [],
                schema_org: []
            };
            // Extract JSON-LD blocks.
            document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
                try {
                    const parsed = JSON.parse(script.textContent);
                    data.jsonld.push(parsed);
                    // @context may be a string, an array, or an object per
                    // JSON-LD 1.1.  Stringify it so .includes() never throws
                    // and array contexts like ["https://schema.org"] are
                    // still detected (the old string-only check missed them).
                    const ctx = JSON.stringify(parsed['@context'] || '');
                    if (ctx.includes('schema.org')) {
                        data.schema_org.push(parsed);
                    }
                } catch(e) {
                    console.error('Failed to parse JSON-LD:', e);
                }
            });
            // Extract meta tags, splitting OpenGraph and Twitter cards out.
            document.querySelectorAll('meta').forEach(meta => {
                const name = meta.getAttribute('name') || meta.getAttribute('property');
                const content = meta.getAttribute('content');
                if (name && content) {
                    if (name.startsWith('og:')) {
                        data.opengraph[name] = content;
                    } else if (name.startsWith('twitter:')) {
                        data.twitter[name] = content;
                    } else {
                        data.meta[name] = content;
                    }
                }
            });
            // Extract microdata (itemscope / itemprop attributes).
            document.querySelectorAll('[itemscope]').forEach(item => {
                const itemData = {
                    type: item.getAttribute('itemtype'),
                    properties: {}
                };
                item.querySelectorAll('[itemprop]').forEach(prop => {
                    const propName = prop.getAttribute('itemprop');
                    const propValue = prop.getAttribute('content') ||
                                      prop.getAttribute('href') ||
                                      prop.textContent.trim();
                    itemData.properties[propName] = propValue;
                });
                data.microdata.push(itemData);
            });
            return data;
        """)
        # Summarize which kinds of structured data were present.
        structured_data['summary'] = {
            'has_jsonld': len(structured_data['jsonld']) > 0,
            'has_opengraph': len(structured_data['opengraph']) > 0,
            'has_twitter_cards': len(structured_data['twitter']) > 0,
            'has_microdata': len(structured_data['microdata']) > 0,
            'total_meta_tags': len(structured_data['meta'])
        }
        return json.dumps(structured_data, indent=2)
    except Exception as e:
        logger.error(f"Error in extract_structured_data: {e}")
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)
def visual_regression_test(url1: str, url2: str, threshold: float = 0.98) -> str:
    """Capture two URLs for visual comparison.

    Screenshots are saved to /tmp; actual pixel diffing is left to external
    tooling, but page dimensions are captured for both pages and compared
    here as a cheap first signal.

    Args:
        url1: Baseline page.
        url2: Page to compare against the baseline.
        threshold: Similarity threshold, recorded in the report for the
            downstream image-diff step.

    Returns:
        JSON report string, or an ``"Error: ..."`` string on failure.
    """
    driver = None
    try:
        driver = create_driver(persistent=False)

        def _capture(url: str, shot_path: str):
            """Load a URL, screenshot it, and return (page_info, dimensions)."""
            driver.get(url)
            time.sleep(3)  # crude wait for the page to stabilize
            driver.save_screenshot(shot_path)
            info = {
                "title": driver.title,
                "url": driver.current_url
            }
            dims = driver.execute_script("""
                return {
                    width: document.documentElement.scrollWidth,
                    height: document.documentElement.scrollHeight,
                    viewport: {
                        width: window.innerWidth,
                        height: window.innerHeight
                    }
                }
            """)
            return info, dims

        screenshot1_path = "/tmp/screenshot1.png"
        screenshot2_path = "/tmp/screenshot2.png"
        # BUG FIX: dimensions were previously read only once, after
        # navigating to url2 — so the report labelled page 2's dimensions as
        # page 1's, and page 1's were never captured.  Capture both.
        page1_info, dimensions1 = _capture(url1, screenshot1_path)
        page2_info, dimensions2 = _capture(url2, screenshot2_path)

        result = {
            "url1": url1,
            "url2": url2,
            "page1_info": page1_info,
            "page2_info": page2_info,
            "screenshots": {
                "screenshot1": screenshot1_path,
                "screenshot2": screenshot2_path
            },
            "dimensions": {
                "page1": dimensions1,
                "page2": dimensions2
            },
            "dimensions_match": dimensions1 == dimensions2,
            "threshold": threshold,
            "timestamp": datetime.now().isoformat(),
            "note": "Visual comparison requires external image processing. Screenshots saved for manual review."
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error(f"Error in visual_regression_test: {e}")
        return f"Error: {e}"
    finally:
        # Always release the dedicated driver, on success or failure.
        if driver:
            try:
                driver.quit()
            except Exception:
                pass
def extract_all_links(url: str, include_external: bool = True, use_persistent: bool = False) -> str:
    """Extract and categorize all links on a page.

    Links are bucketed into internal / external / email / phone /
    javascript / anchor / file_downloads, with a per-bucket summary.

    Args:
        url: Page to load.
        include_external: When False, links to other hostnames are skipped.
        use_persistent: Reuse the shared persistent driver (passed through
            to get_driver / cleanup_driver).

    Returns:
        JSON string with ``links``, ``summary`` and ``page_info`` keys, or
        an ``"Error: ..."`` string on failure.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)
        # Pass include_external as a script argument (arguments[0]) rather
        # than f-string-interpolating it into the JS source: no doubled
        # braces, and no risk of Python values being parsed as JS code.
        links_data = driver.execute_script("""
            const includeExternal = arguments[0];
            const currentDomain = new URL(window.location.href).hostname;
            const links = {
                internal: [],
                external: [],
                email: [],
                phone: [],
                javascript: [],
                anchor: [],
                file_downloads: []
            };
            // Common file extensions treated as downloads.
            const fileExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.csv', '.txt'];
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.getAttribute('href');
                const text = a.textContent.trim();
                const linkData = {
                    href: href,
                    text: text.substring(0, 100),
                    title: a.title,
                    target: a.target,
                    rel: a.rel
                };
                if (href.startsWith('mailto:')) {
                    links.email.push(linkData);
                } else if (href.startsWith('tel:')) {
                    links.phone.push(linkData);
                } else if (href.startsWith('javascript:')) {
                    links.javascript.push(linkData);
                } else if (href.startsWith('#')) {
                    links.anchor.push(linkData);
                } else {
                    try {
                        const linkUrl = new URL(href, window.location.href);
                        // Check whether the path looks like a file download.
                        const isFileDownload = fileExtensions.some(ext =>
                            linkUrl.pathname.toLowerCase().endsWith(ext)
                        );
                        if (isFileDownload) {
                            links.file_downloads.push({...linkData, absoluteUrl: linkUrl.href});
                        } else if (linkUrl.hostname === currentDomain) {
                            links.internal.push({...linkData, absoluteUrl: linkUrl.href});
                        } else if (includeExternal) {
                            links.external.push({...linkData, absoluteUrl: linkUrl.href});
                        }
                    } catch(e) {
                        // Invalid URL, add to javascript category
                        links.javascript.push(linkData);
                    }
                }
            });
            return {
                links: links,
                summary: {
                    total: document.querySelectorAll('a[href]').length,
                    internal: links.internal.length,
                    external: links.external.length,
                    email: links.email.length,
                    phone: links.phone.length,
                    javascript: links.javascript.length,
                    anchor: links.anchor.length,
                    file_downloads: links.file_downloads.length
                },
                page_info: {
                    title: document.title,
                    url: window.location.href,
                    domain: currentDomain
                }
            };
        """, include_external)
        return json.dumps(links_data, indent=2)
    except Exception as e:
        logger.error(f"Error in extract_all_links: {e}")
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)
def seo_analysis(url: str, use_persistent: bool = False) -> str:
    """Perform SEO analysis on a page.

    Loads the URL, runs a single in-page script that inspects the title,
    meta description, heading hierarchy, image alt text, link counts,
    canonical/robots tags and JSON-LD block count, then derives a 0-100
    score (10 points deducted per issue found).

    Args:
        url: Page to analyze.
        use_persistent: Reuse the shared persistent driver (passed through
            to get_driver / cleanup_driver).

    Returns:
        JSON string with ``url``, ``seo_score``, ``analysis``,
        ``total_issues`` and ``recommendations`` — or an ``"Error: ..."``
        string on failure.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)
        # Single round-trip: the whole audit runs inside the page.  The
        # ${...} below is a JS template literal, not Python formatting —
        # this is a plain (non-f) string, so it passes through untouched.
        seo_data = driver.execute_script("""
            const analysis = {
                title: {
                    content: document.title,
                    length: document.title.length,
                    issues: []
                },
                meta_description: {
                    content: null,
                    length: 0,
                    issues: []
                },
                headings: {
                    h1_count: 0,
                    h1_texts: [],
                    hierarchy: [],
                    issues: []
                },
                images: {
                    total: 0,
                    without_alt: 0,
                    issues: []
                },
                links: {
                    total: 0,
                    external: 0,
                    nofollow: 0
                },
                canonical: null,
                robots: null,
                lang: document.documentElement.lang,
                structured_data_count: 0
            };
            // Check title
            if (analysis.title.length < 30) {
                analysis.title.issues.push('Title too short (recommended: 30-60 characters)');
            } else if (analysis.title.length > 60) {
                analysis.title.issues.push('Title too long (recommended: 30-60 characters)');
            }
            // Check meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                analysis.meta_description.content = metaDesc.content;
                analysis.meta_description.length = metaDesc.content.length;
                if (metaDesc.content.length < 120) {
                    analysis.meta_description.issues.push('Description too short (recommended: 120-160 characters)');
                } else if (metaDesc.content.length > 160) {
                    analysis.meta_description.issues.push('Description too long (recommended: 120-160 characters)');
                }
            } else {
                analysis.meta_description.issues.push('No meta description found');
            }
            // Check headings
            const h1s = document.querySelectorAll('h1');
            analysis.headings.h1_count = h1s.length;
            h1s.forEach(h1 => {
                analysis.headings.h1_texts.push(h1.textContent.trim());
            });
            if (h1s.length === 0) {
                analysis.headings.issues.push('No H1 tag found');
            } else if (h1s.length > 1) {
                analysis.headings.issues.push('Multiple H1 tags found (recommended: 1)');
            }
            // Get heading hierarchy
            const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
            allHeadings.forEach(h => {
                analysis.headings.hierarchy.push({
                    level: h.tagName,
                    text: h.textContent.trim().substring(0, 50)
                });
            });
            // Check images
            const images = document.querySelectorAll('img');
            analysis.images.total = images.length;
            images.forEach(img => {
                if (!img.alt) {
                    analysis.images.without_alt++;
                }
            });
            if (analysis.images.without_alt > 0) {
                analysis.images.issues.push(`${analysis.images.without_alt} images without alt text`);
            }
            // Check links
            const links = document.querySelectorAll('a[href]');
            analysis.links.total = links.length;
            links.forEach(link => {
                try {
                    const linkUrl = new URL(link.href, window.location.href);
                    if (linkUrl.hostname !== window.location.hostname) {
                        analysis.links.external++;
                    }
                    if (link.rel && link.rel.includes('nofollow')) {
                        analysis.links.nofollow++;
                    }
                } catch(e) {}
            });
            // Check canonical
            const canonical = document.querySelector('link[rel="canonical"]');
            if (canonical) {
                analysis.canonical = canonical.href;
            }
            // Check robots meta
            const robots = document.querySelector('meta[name="robots"]');
            if (robots) {
                analysis.robots = robots.content;
            }
            // Count structured data
            analysis.structured_data_count = document.querySelectorAll('script[type="application/ld+json"]').length;
            return analysis;
        """)
        # Score: start at 100 and subtract 10 points per issue found across
        # the four issue-bearing sections; clamp at 0.
        score = 100
        total_issues = 0
        for key in ['title', 'meta_description', 'headings', 'images']:
            if key in seo_data and 'issues' in seo_data[key]:
                issues = len(seo_data[key]['issues'])
                total_issues += issues
                score -= (issues * 10)
        score = max(0, score)
        result = {
            "url": url,
            "seo_score": score,
            "analysis": seo_data,
            "total_issues": total_issues,
            # Flat recommendation list derived from the same analysis dict.
            "recommendations": get_seo_recommendations(seo_data)
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error(f"Error in seo_analysis: {e}")
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)
def get_seo_recommendations(seo_data) -> list:
    """Build a flat list of SEO recommendations from an analysis payload.

    Tolerates partially populated analysis dicts (e.g. when the in-page
    script failed part-way) by treating missing sections as issue-free,
    instead of raising KeyError like direct indexing would.

    Args:
        seo_data: Dict shaped like the object returned by seo_analysis's
            in-page script (sections with ``issues`` lists, plus
            ``canonical``, ``lang`` and ``structured_data_count``).

    Returns:
        List of human-readable recommendation strings (possibly empty).
    """
    recommendations = []
    # Per-section issue lists first, in a stable order; .get() guards
    # against sections that never made it into the dict.
    for section in ('title', 'meta_description', 'headings', 'images'):
        issues = seo_data.get(section, {}).get('issues')
        if issues:
            recommendations.extend(issues)
    # Page-level checks: falsy means absent/empty in all three cases.
    if not seo_data.get('canonical'):
        recommendations.append("Add canonical URL to prevent duplicate content issues")
    if not seo_data.get('lang'):
        recommendations.append("Add lang attribute to HTML tag for better internationalization")
    if seo_data.get('structured_data_count', 0) == 0:
        recommendations.append("Add structured data (JSON-LD) for better search engine understanding")
    return recommendations