Spaces:

diamond-in
/

Browser-Use-mcp

Sleeping

File size: 16,493 Bytes

492241f

"""
Advanced analysis features: visual testing, link extraction, structured data, and SEO analysis
"""
import json
import time
import logging
from datetime import datetime
from browser.driver import get_driver, cleanup_driver, create_driver

logger = logging.getLogger(__name__)

def extract_structured_data(url: str, use_persistent: bool = False) -> str:
    """Extract structured data (JSON-LD, meta tags, microdata) from a page.

    The page is opened through ``get_driver`` and an in-page script collects:

    * ``jsonld``     - every ``<script type="application/ld+json">`` payload
      that parses as JSON.
    * ``schema_org`` - the subset of those payloads whose ``@context``
      mentions ``schema.org``.
    * ``meta`` / ``opengraph`` / ``twitter`` - ``<meta>`` tags keyed by their
      ``name`` (or ``property``) attribute, split on the ``og:`` and
      ``twitter:`` prefixes.
    * ``microdata``  - one entry per ``[itemscope]`` element with its
      ``itemtype`` and a ``properties`` mapping.

    A ``summary`` section with presence flags and the plain meta-tag count is
    added on the Python side.

    Args:
        url: Address of the page to inspect.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        The collected data as an indented JSON string, or the text
        ``"Error: <message>"`` if anything raised along the way.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Extract various types of structured data
        structured_data = driver.execute_script("""
            const data = {
                jsonld: [],
                meta: {},
                opengraph: {},
                twitter: {},
                microdata: [],
                schema_org: []
            };

            // Extract JSON-LD
            document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
                try {
                    const parsed = JSON.parse(script.textContent);
                    data.jsonld.push(parsed);

                    // A payload may be a single node or an array of nodes, and
                    // @context may be a string, an array or an object. Search
                    // the serialised context so every form is recognised.
                    const nodes = Array.isArray(parsed) ? parsed : [parsed];
                    const isSchemaOrg = nodes.some(node =>
                        node && node['@context'] &&
                        JSON.stringify(node['@context']).includes('schema.org')
                    );
                    if (isSchemaOrg) {
                        data.schema_org.push(parsed);
                    }
                } catch(e) {
                    console.error('Failed to parse JSON-LD:', e);
                }
            });

            // Extract meta tags
            document.querySelectorAll('meta').forEach(meta => {
                const name = meta.getAttribute('name') || meta.getAttribute('property');
                const content = meta.getAttribute('content');
                if (name && content) {
                    if (name.startsWith('og:')) {
                        data.opengraph[name] = content;
                    } else if (name.startsWith('twitter:')) {
                        data.twitter[name] = content;
                    } else {
                        data.meta[name] = content;
                    }
                }
            });

            // Extract microdata
            // querySelectorAll matches every descendant, so properties that
            // belong to a nested itemscope are reported on its ancestors too,
            // and a repeated itemprop name keeps only the last value.
            document.querySelectorAll('[itemscope]').forEach(item => {
                const itemData = {
                    type: item.getAttribute('itemtype'),
                    properties: {}
                };
                item.querySelectorAll('[itemprop]').forEach(prop => {
                    const propName = prop.getAttribute('itemprop');
                    const propValue = prop.getAttribute('content') ||
                                     prop.getAttribute('href') ||
                                     prop.textContent.trim();
                    itemData.properties[propName] = propValue;
                });
                data.microdata.push(itemData);
            });

            return data;
        """)

        # Add summary
        structured_data['summary'] = {
            'has_jsonld': bool(structured_data['jsonld']),
            'has_opengraph': bool(structured_data['opengraph']),
            'has_twitter_cards': bool(structured_data['twitter']),
            'has_microdata': bool(structured_data['microdata']),
            'total_meta_tags': len(structured_data['meta'])
        }

        return json.dumps(structured_data, indent=2)
    except Exception as e:
        # Callers receive errors as text rather than as a raised exception.
        logger.error("Error in extract_structured_data: %s", e)
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)

def _capture_page(driver, url: str, screenshot_path: str, settle_seconds: float = 3) -> dict:
    """Load ``url``, save a screenshot of it and describe the loaded page.

    Returns a dict with ``info`` (title and final URL) and ``dimensions``
    (document scroll size plus viewport size), all read while ``url`` is still
    the current page.
    """
    driver.get(url)
    time.sleep(settle_seconds)  # Wait for page to stabilize
    driver.save_screenshot(screenshot_path)
    info = {
        "title": driver.title,
        "url": driver.current_url
    }
    dimensions = driver.execute_script("""
        return {
            width: document.documentElement.scrollWidth,
            height: document.documentElement.scrollHeight,
            viewport: {
                width: window.innerWidth,
                height: window.innerHeight
            }
        }
    """)
    return {"info": info, "dimensions": dimensions}


def visual_regression_test(url1: str, url2: str, threshold: float = 0.98) -> str:
    """Capture two URLs for visual comparison.

    Each URL is opened in a fresh non-persistent driver, screenshotted to a
    fixed path under ``/tmp`` and measured. No pixel comparison is performed
    here; the screenshots are left on disk for external or manual review.

    Args:
        url1: First page to capture (saved to ``/tmp/screenshot1.png``).
        url2: Second page to capture (saved to ``/tmp/screenshot2.png``).
        threshold: Echoed back in the result; it is not used in any
            calculation by this function.

    Returns:
        An indented JSON string holding both pages' title/URL, the screenshot
        paths, the dimensions measured for each page, ``dimensions_match``
        (``true`` when both measurements are equal), the threshold, a
        timestamp and an explanatory note. On failure the text
        ``"Error: <message>"`` is returned instead.
    """
    screenshot1_path = "/tmp/screenshot1.png"
    screenshot2_path = "/tmp/screenshot2.png"
    driver = None
    try:
        driver = create_driver(persistent=False)

        # Measure each page right after its own screenshot; measuring once at
        # the end would only ever describe the second page.
        page1 = _capture_page(driver, url1, screenshot1_path)
        page2 = _capture_page(driver, url2, screenshot2_path)

        # Create comparison result
        result = {
            "url1": url1,
            "url2": url2,
            "page1_info": page1["info"],
            "page2_info": page2["info"],
            "screenshots": {
                "screenshot1": screenshot1_path,
                "screenshot2": screenshot2_path
            },
            "dimensions": {
                "page1": page1["dimensions"],
                "page2": page2["dimensions"]
            },
            "dimensions_match": page1["dimensions"] == page2["dimensions"],
            "threshold": threshold,
            "timestamp": datetime.now().isoformat(),
            "note": "Visual comparison requires external image processing. Screenshots saved for manual review."
        }

        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error("Error in visual_regression_test: %s", e)
        return f"Error: {e}"
    finally:
        # Single shutdown point for both the success and the error path.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best effort: a failed shutdown must not mask the result.
                logger.debug("driver.quit() failed during cleanup", exc_info=True)

def extract_all_links(url: str, include_external: bool = True, use_persistent: bool = False) -> str:
    """Extract all links from a page with categorization.

    Every ``<a href>`` on the page is placed in exactly one bucket:
    ``email`` (``mailto:``), ``phone`` (``tel:``), ``javascript``
    (``javascript:`` or an href that cannot be parsed as a URL), ``anchor``
    (starts with ``#``), ``file_downloads`` (path ends in a known document or
    archive extension), ``internal`` (same hostname as the page) or
    ``external`` (everything else).

    Args:
        url: Address of the page to inspect.
        include_external: When falsy, external links are left out of the
            ``external`` bucket. ``summary.total`` still counts every
            ``<a href>`` on the page.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        An indented JSON string with ``links``, ``summary`` (per-bucket
        counts) and ``page_info``, or the text ``"Error: <message>"`` if
        anything raised along the way.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # The flag is handed over as a script argument instead of being
        # spliced into the JavaScript source: a non-bool truthy value would
        # otherwise become an undefined identifier inside the script.
        links_data = driver.execute_script("""
            const includeExternal = Boolean(arguments[0]);
            const currentDomain = new URL(window.location.href).hostname;
            const links = {
                internal: [],
                external: [],
                email: [],
                phone: [],
                javascript: [],
                anchor: [],
                file_downloads: []
            };

            // Common file extensions for downloads
            const fileExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.csv', '.txt'];

            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.getAttribute('href');
                const text = a.textContent.trim();
                const linkData = {
                    href: href,
                    text: text.substring(0, 100),
                    title: a.title,
                    target: a.target,
                    rel: a.rel
                };

                // URL schemes are case-insensitive and surrounding whitespace
                // is ignored when a URL is parsed, so normalise before
                // matching on the prefix.
                const scheme = href.trim().toLowerCase();

                if (scheme.startsWith('mailto:')) {
                    links.email.push(linkData);
                } else if (scheme.startsWith('tel:')) {
                    links.phone.push(linkData);
                } else if (scheme.startsWith('javascript:')) {
                    links.javascript.push(linkData);
                } else if (scheme.startsWith('#')) {
                    links.anchor.push(linkData);
                } else {
                    try {
                        const linkUrl = new URL(href, window.location.href);

                        // Check if it's a file download
                        const isFileDownload = fileExtensions.some(ext =>
                            linkUrl.pathname.toLowerCase().endsWith(ext)
                        );

                        if (isFileDownload) {
                            links.file_downloads.push({...linkData, absoluteUrl: linkUrl.href});
                        } else if (linkUrl.hostname === currentDomain) {
                            links.internal.push({...linkData, absoluteUrl: linkUrl.href});
                        } else if (includeExternal) {
                            links.external.push({...linkData, absoluteUrl: linkUrl.href});
                        }
                    } catch(e) {
                        // Invalid URL, add to javascript category
                        links.javascript.push(linkData);
                    }
                }
            });

            return {
                links: links,
                summary: {
                    total: document.querySelectorAll('a[href]').length,
                    internal: links.internal.length,
                    external: links.external.length,
                    email: links.email.length,
                    phone: links.phone.length,
                    javascript: links.javascript.length,
                    anchor: links.anchor.length,
                    file_downloads: links.file_downloads.length
                },
                page_info: {
                    title: document.title,
                    url: window.location.href,
                    domain: currentDomain
                }
            };
        """, bool(include_external))

        return json.dumps(links_data, indent=2)
    except Exception as e:
        # Callers receive errors as text rather than as a raised exception.
        logger.error("Error in extract_all_links: %s", e)
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)

def seo_analysis(url: str, use_persistent: bool = False) -> str:
    """Perform SEO analysis on a page.

    An in-page script inspects the title, meta description, headings,
    images, links, canonical link, robots meta tag, ``<html lang>`` and the
    number of JSON-LD script blocks. Title, meta description, headings and
    images each carry an ``issues`` list.

    The score starts at 100 and loses 10 points for every issue found in
    those four sections, never dropping below 0.

    Args:
        url: Address of the page to analyse.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        An indented JSON string with ``url``, ``seo_score``, ``analysis``,
        ``total_issues`` and ``recommendations``, or the text
        ``"Error: <message>"`` if anything raised along the way.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Perform SEO analysis
        seo_data = driver.execute_script("""
            const analysis = {
                title: {
                    content: document.title,
                    length: document.title.length,
                    issues: []
                },
                meta_description: {
                    content: null,
                    length: 0,
                    issues: []
                },
                headings: {
                    h1_count: 0,
                    h1_texts: [],
                    hierarchy: [],
                    issues: []
                },
                images: {
                    total: 0,
                    without_alt: 0,
                    issues: []
                },
                links: {
                    total: 0,
                    external: 0,
                    nofollow: 0
                },
                canonical: null,
                robots: null,
                lang: document.documentElement.lang,
                structured_data_count: 0
            };

            // Check title
            if (analysis.title.length < 30) {
                analysis.title.issues.push('Title too short (recommended: 30-60 characters)');
            } else if (analysis.title.length > 60) {
                analysis.title.issues.push('Title too long (recommended: 30-60 characters)');
            }

            // Check meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                analysis.meta_description.content = metaDesc.content;
                analysis.meta_description.length = metaDesc.content.length;

                if (metaDesc.content.length < 120) {
                    analysis.meta_description.issues.push('Description too short (recommended: 120-160 characters)');
                } else if (metaDesc.content.length > 160) {
                    analysis.meta_description.issues.push('Description too long (recommended: 120-160 characters)');
                }
            } else {
                analysis.meta_description.issues.push('No meta description found');
            }

            // Check headings
            const h1s = document.querySelectorAll('h1');
            analysis.headings.h1_count = h1s.length;
            h1s.forEach(h1 => {
                analysis.headings.h1_texts.push(h1.textContent.trim());
            });

            if (h1s.length === 0) {
                analysis.headings.issues.push('No H1 tag found');
            } else if (h1s.length > 1) {
                analysis.headings.issues.push('Multiple H1 tags found (recommended: 1)');
            }

            // Get heading hierarchy
            const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
            allHeadings.forEach(h => {
                analysis.headings.hierarchy.push({
                    level: h.tagName,
                    text: h.textContent.trim().substring(0, 50)
                });
            });

            // Check images
            const images = document.querySelectorAll('img');
            analysis.images.total = images.length;
            images.forEach(img => {
                if (!img.alt) {
                    analysis.images.without_alt++;
                }
            });

            if (analysis.images.without_alt > 0) {
                analysis.images.issues.push(`${analysis.images.without_alt} images without alt text`);
            }

            // Check links
            const links = document.querySelectorAll('a[href]');
            analysis.links.total = links.length;
            links.forEach(link => {
                try {
                    const linkUrl = new URL(link.href, window.location.href);
                    // mailto:, tel: and javascript: URLs have an empty
                    // hostname; without the protocol check they would all be
                    // counted as external.
                    const isWebLink = linkUrl.protocol === 'http:' || linkUrl.protocol === 'https:';
                    if (isWebLink && linkUrl.hostname !== window.location.hostname) {
                        analysis.links.external++;
                    }
                    if (link.rel && link.rel.includes('nofollow')) {
                        analysis.links.nofollow++;
                    }
                } catch(e) {}
            });

            // Check canonical
            const canonical = document.querySelector('link[rel="canonical"]');
            if (canonical) {
                analysis.canonical = canonical.href;
            }

            // Check robots meta
            const robots = document.querySelector('meta[name="robots"]');
            if (robots) {
                analysis.robots = robots.content;
            }

            // Count structured data
            analysis.structured_data_count = document.querySelectorAll('script[type="application/ld+json"]').length;

            return analysis;
        """)

        # Calculate SEO score: 10 points off per issue in the scored sections.
        scored_sections = ('title', 'meta_description', 'headings', 'images')
        total_issues = sum(
            len(seo_data[section]['issues'])
            for section in scored_sections
            if section in seo_data and 'issues' in seo_data[section]
        )
        score = max(0, 100 - 10 * total_issues)

        result = {
            "url": url,
            "seo_score": score,
            "analysis": seo_data,
            "total_issues": total_issues,
            "recommendations": get_seo_recommendations(seo_data)
        }

        return json.dumps(result, indent=2)
    except Exception as e:
        # Callers receive errors as text rather than as a raised exception.
        logger.error("Error in seo_analysis: %s", e)
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)

def get_seo_recommendations(seo_data: dict) -> list:
    """Get SEO recommendations based on analysis.

    The issues recorded for the title, meta description, headings and images
    are copied over in that order, followed by generic advice when the
    canonical URL, the ``lang`` attribute or structured data is absent.

    Sections or fields that are missing from ``seo_data`` are treated as
    "no issues recorded" / "not present on the page" instead of raising
    ``KeyError``, so partial analysis results can be passed in.

    Args:
        seo_data: Analysis mapping with optional ``title``,
            ``meta_description``, ``headings`` and ``images`` sections (each
            holding an ``issues`` list) plus optional ``canonical``, ``lang``
            and ``structured_data_count`` entries.

    Returns:
        A new list of recommendation strings; ``seo_data`` is not modified.
    """
    seo_data = seo_data or {}
    recommendations = []

    # Section issues first, in a fixed order.
    for section in ('title', 'meta_description', 'headings', 'images'):
        section_data = seo_data.get(section) or {}
        recommendations.extend(section_data.get('issues') or [])

    if not seo_data.get('canonical'):
        recommendations.append("Add canonical URL to prevent duplicate content issues")

    if not seo_data.get('lang'):
        recommendations.append("Add lang attribute to HTML tag for better internationalization")

    if seo_data.get('structured_data_count', 0) == 0:
        recommendations.append("Add structured data (JSON-LD) for better search engine understanding")

    return recommendations