""" Advanced analysis features: visual testing, link extraction, structured data """ import json import time import logging from datetime import datetime from browser.driver import get_driver, cleanup_driver, create_driver logger = logging.getLogger(__name__) def extract_structured_data(url: str, use_persistent: bool = False) -> str: """Extract structured data (JSON-LD, microdata, meta tags) from page""" driver = None try: driver = get_driver(url, use_persistent) # Extract various types of structured data structured_data = driver.execute_script(""" const data = { jsonld: [], meta: {}, opengraph: {}, twitter: {}, microdata: [], schema_org: [] }; // Extract JSON-LD document.querySelectorAll('script[type="application/ld+json"]').forEach(script => { try { const parsed = JSON.parse(script.textContent); data.jsonld.push(parsed); // Also add to schema.org if it's schema.org data if (parsed['@context'] && parsed['@context'].includes('schema.org')) { data.schema_org.push(parsed); } } catch(e) { console.error('Failed to parse JSON-LD:', e); } }); // Extract meta tags document.querySelectorAll('meta').forEach(meta => { const name = meta.getAttribute('name') || meta.getAttribute('property'); const content = meta.getAttribute('content'); if (name && content) { if (name.startsWith('og:')) { data.opengraph[name] = content; } else if (name.startsWith('twitter:')) { data.twitter[name] = content; } else { data.meta[name] = content; } } }); // Extract microdata document.querySelectorAll('[itemscope]').forEach(item => { const itemData = { type: item.getAttribute('itemtype'), properties: {} }; item.querySelectorAll('[itemprop]').forEach(prop => { const propName = prop.getAttribute('itemprop'); const propValue = prop.getAttribute('content') || prop.getAttribute('href') || prop.textContent.trim(); itemData.properties[propName] = propValue; }); data.microdata.push(itemData); }); return data; """) # Add summary structured_data['summary'] = { 'has_jsonld': len(structured_data['jsonld']) > 0, 'has_opengraph': len(structured_data['opengraph']) > 0, 'has_twitter_cards': len(structured_data['twitter']) > 0, 'has_microdata': len(structured_data['microdata']) > 0, 'total_meta_tags': len(structured_data['meta']) } return json.dumps(structured_data, indent=2) except Exception as e: logger.error(f"Error in extract_structured_data: {e}") return f"Error: {e}" finally: cleanup_driver(driver, use_persistent) def visual_regression_test(url1: str, url2: str, threshold: float = 0.98) -> str: """Compare two URLs visually for differences""" driver = None try: driver = create_driver(persistent=False) # Take screenshot of first URL driver.get(url1) time.sleep(3) # Wait for page to stabilize screenshot1_path = "/tmp/screenshot1.png" driver.save_screenshot(screenshot1_path) page1_info = { "title": driver.title, "url": driver.current_url } # Take screenshot of second URL driver.get(url2) time.sleep(3) # Wait for page to stabilize screenshot2_path = "/tmp/screenshot2.png" driver.save_screenshot(screenshot2_path) page2_info = { "title": driver.title, "url": driver.current_url } # Get page dimensions for comparison dimensions1 = driver.execute_script(""" return { width: document.documentElement.scrollWidth, height: document.documentElement.scrollHeight, viewport: { width: window.innerWidth, height: window.innerHeight } } """) driver.quit() # Create comparison result result = { "url1": url1, "url2": url2, "page1_info": page1_info, "page2_info": page2_info, "screenshots": { "screenshot1": screenshot1_path, "screenshot2": screenshot2_path }, "dimensions_match": dimensions1, "threshold": threshold, "timestamp": datetime.now().isoformat(), "note": "Visual comparison requires external image processing. Screenshots saved for manual review." } return json.dumps(result, indent=2) except Exception as e: logger.error(f"Error in visual_regression_test: {e}") if driver: try: driver.quit() except: pass return f"Error: {e}" def extract_all_links(url: str, include_external: bool = True, use_persistent: bool = False) -> str: """Extract all links from a page with categorization""" driver = None try: driver = get_driver(url, use_persistent) # Extract and categorize links links_data = driver.execute_script(f""" const currentDomain = new URL(window.location.href).hostname; const links = {{ internal: [], external: [], email: [], phone: [], javascript: [], anchor: [], file_downloads: [] }}; // Common file extensions for downloads const fileExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.csv', '.txt']; document.querySelectorAll('a[href]').forEach(a => {{ const href = a.getAttribute('href'); const text = a.textContent.trim(); const linkData = {{ href: href, text: text.substring(0, 100), title: a.title, target: a.target, rel: a.rel }}; if (href.startsWith('mailto:')) {{ links.email.push(linkData); }} else if (href.startsWith('tel:')) {{ links.phone.push(linkData); }} else if (href.startsWith('javascript:')) {{ links.javascript.push(linkData); }} else if (href.startsWith('#')) {{ links.anchor.push(linkData); }} else {{ try {{ const linkUrl = new URL(href, window.location.href); // Check if it's a file download const isFileDownload = fileExtensions.some(ext => linkUrl.pathname.toLowerCase().endsWith(ext) ); if (isFileDownload) {{ links.file_downloads.push({{...linkData, absoluteUrl: linkUrl.href}}); }} else if (linkUrl.hostname === currentDomain) {{ links.internal.push({{...linkData, absoluteUrl: linkUrl.href}}); }} else if ({str(include_external).lower()}) {{ links.external.push({{...linkData, absoluteUrl: linkUrl.href}}); }} }} catch(e) {{ // Invalid URL, add to javascript category links.javascript.push(linkData); }} }} }}); return {{ links: links, summary: {{ total: document.querySelectorAll('a[href]').length, internal: links.internal.length, external: links.external.length, email: links.email.length, phone: links.phone.length, javascript: links.javascript.length, anchor: links.anchor.length, file_downloads: links.file_downloads.length }}, page_info: {{ title: document.title, url: window.location.href, domain: currentDomain }} }}; """) return json.dumps(links_data, indent=2) except Exception as e: logger.error(f"Error in extract_all_links: {e}") return f"Error: {e}" finally: cleanup_driver(driver, use_persistent) def seo_analysis(url: str, use_persistent: bool = False) -> str: """Perform SEO analysis on a page""" driver = None try: driver = get_driver(url, use_persistent) # Perform SEO analysis seo_data = driver.execute_script(""" const analysis = { title: { content: document.title, length: document.title.length, issues: [] }, meta_description: { content: null, length: 0, issues: [] }, headings: { h1_count: 0, h1_texts: [], hierarchy: [], issues: [] }, images: { total: 0, without_alt: 0, issues: [] }, links: { total: 0, external: 0, nofollow: 0 }, canonical: null, robots: null, lang: document.documentElement.lang, structured_data_count: 0 }; // Check title if (analysis.title.length < 30) { analysis.title.issues.push('Title too short (recommended: 30-60 characters)'); } else if (analysis.title.length > 60) { analysis.title.issues.push('Title too long (recommended: 30-60 characters)'); } // Check meta description const metaDesc = document.querySelector('meta[name="description"]'); if (metaDesc) { analysis.meta_description.content = metaDesc.content; analysis.meta_description.length = metaDesc.content.length; if (metaDesc.content.length < 120) { analysis.meta_description.issues.push('Description too short (recommended: 120-160 characters)'); } else if (metaDesc.content.length > 160) { analysis.meta_description.issues.push('Description too long (recommended: 120-160 characters)'); } } else { analysis.meta_description.issues.push('No meta description found'); } // Check headings const h1s = document.querySelectorAll('h1'); analysis.headings.h1_count = h1s.length; h1s.forEach(h1 => { analysis.headings.h1_texts.push(h1.textContent.trim()); }); if (h1s.length === 0) { analysis.headings.issues.push('No H1 tag found'); } else if (h1s.length > 1) { analysis.headings.issues.push('Multiple H1 tags found (recommended: 1)'); } // Get heading hierarchy const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6'); allHeadings.forEach(h => { analysis.headings.hierarchy.push({ level: h.tagName, text: h.textContent.trim().substring(0, 50) }); }); // Check images const images = document.querySelectorAll('img'); analysis.images.total = images.length; images.forEach(img => { if (!img.alt) { analysis.images.without_alt++; } }); if (analysis.images.without_alt > 0) { analysis.images.issues.push(`${analysis.images.without_alt} images without alt text`); } // Check links const links = document.querySelectorAll('a[href]'); analysis.links.total = links.length; links.forEach(link => { try { const linkUrl = new URL(link.href, window.location.href); if (linkUrl.hostname !== window.location.hostname) { analysis.links.external++; } if (link.rel && link.rel.includes('nofollow')) { analysis.links.nofollow++; } } catch(e) {} }); // Check canonical const canonical = document.querySelector('link[rel="canonical"]'); if (canonical) { analysis.canonical = canonical.href; } // Check robots meta const robots = document.querySelector('meta[name="robots"]'); if (robots) { analysis.robots = robots.content; } // Count structured data analysis.structured_data_count = document.querySelectorAll('script[type="application/ld+json"]').length; return analysis; """) # Calculate SEO score score = 100 total_issues = 0 for key in ['title', 'meta_description', 'headings', 'images']: if key in seo_data and 'issues' in seo_data[key]: issues = len(seo_data[key]['issues']) total_issues += issues score -= (issues * 10) score = max(0, score) result = { "url": url, "seo_score": score, "analysis": seo_data, "total_issues": total_issues, "recommendations": get_seo_recommendations(seo_data) } return json.dumps(result, indent=2) except Exception as e: logger.error(f"Error in seo_analysis: {e}") return f"Error: {e}" finally: cleanup_driver(driver, use_persistent) def get_seo_recommendations(seo_data): """Get SEO recommendations based on analysis""" recommendations = [] if seo_data['title']['issues']: recommendations.extend(seo_data['title']['issues']) if seo_data['meta_description']['issues']: recommendations.extend(seo_data['meta_description']['issues']) if seo_data['headings']['issues']: recommendations.extend(seo_data['headings']['issues']) if seo_data['images']['issues']: recommendations.extend(seo_data['images']['issues']) if not seo_data['canonical']: recommendations.append("Add canonical URL to prevent duplicate content issues") if not seo_data['lang']: recommendations.append("Add lang attribute to HTML tag for better internationalization") if seo_data['structured_data_count'] == 0: recommendations.append("Add structured data (JSON-LD) for better search engine understanding") return recommendations