diamond-in committed on
Commit
492241f
·
verified ·
1 Parent(s): a8acddd

Update features/analysis.py

Browse files
Files changed (1) hide show
  1. features/analysis.py +433 -0
features/analysis.py CHANGED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Advanced analysis features: visual testing, link extraction, structured data
3
+ """
4
+ import json
5
+ import time
6
+ import logging
7
+ from datetime import datetime
8
+ from browser.driver import get_driver, cleanup_driver, create_driver
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
def extract_structured_data(url: str, use_persistent: bool = False) -> str:
    """Extract structured data (JSON-LD, microdata, meta tags) from page.

    Loads ``url`` via ``get_driver`` and runs an in-page script that
    collects, into one object:

    * ``jsonld``     - parsed ``<script type="application/ld+json">`` blocks
    * ``schema_org`` - the subset of JSON-LD whose ``@context`` mentions
      schema.org
    * ``meta`` / ``opengraph`` / ``twitter`` - meta tags split by the
      ``og:`` / ``twitter:`` name prefixes
    * ``microdata``  - ``itemscope`` elements and their ``itemprop`` values

    A ``summary`` section with has/count booleans is appended on the
    Python side before serialization.

    Args:
        url: Page to analyse.
        use_persistent: Forwarded to ``get_driver``/``cleanup_driver``;
            presumably selects a reusable driver instance — semantics are
            defined in ``browser.driver`` (not visible here).

    Returns:
        A pretty-printed JSON string of the collected data, or an
        ``"Error: ..."`` string on any failure (this function never raises).
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Extract various types of structured data.
        # NOTE(review): inside the JS try-block below, a JSON-LD item is
        # pushed to data.jsonld BEFORE the @context check; if @context is
        # not a string/array, .includes throws and the item lands in
        # jsonld but not schema_org — confirm this asymmetry is intended.
        structured_data = driver.execute_script("""
            const data = {
                jsonld: [],
                meta: {},
                opengraph: {},
                twitter: {},
                microdata: [],
                schema_org: []
            };

            // Extract JSON-LD
            document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
                try {
                    const parsed = JSON.parse(script.textContent);
                    data.jsonld.push(parsed);
                    // Also add to schema.org if it's schema.org data
                    if (parsed['@context'] && parsed['@context'].includes('schema.org')) {
                        data.schema_org.push(parsed);
                    }
                } catch(e) {
                    console.error('Failed to parse JSON-LD:', e);
                }
            });

            // Extract meta tags
            document.querySelectorAll('meta').forEach(meta => {
                const name = meta.getAttribute('name') || meta.getAttribute('property');
                const content = meta.getAttribute('content');
                if (name && content) {
                    if (name.startsWith('og:')) {
                        data.opengraph[name] = content;
                    } else if (name.startsWith('twitter:')) {
                        data.twitter[name] = content;
                    } else {
                        data.meta[name] = content;
                    }
                }
            });

            // Extract microdata
            document.querySelectorAll('[itemscope]').forEach(item => {
                const itemData = {
                    type: item.getAttribute('itemtype'),
                    properties: {}
                };
                item.querySelectorAll('[itemprop]').forEach(prop => {
                    const propName = prop.getAttribute('itemprop');
                    const propValue = prop.getAttribute('content') ||
                                      prop.getAttribute('href') ||
                                      prop.textContent.trim();
                    itemData.properties[propName] = propValue;
                });
                data.microdata.push(itemData);
            });

            return data;
        """)

        # Add summary — quick booleans/counts so callers can check presence
        # without walking the payload.
        structured_data['summary'] = {
            'has_jsonld': len(structured_data['jsonld']) > 0,
            'has_opengraph': len(structured_data['opengraph']) > 0,
            'has_twitter_cards': len(structured_data['twitter']) > 0,
            'has_microdata': len(structured_data['microdata']) > 0,
            'total_meta_tags': len(structured_data['meta'])
        }

        return json.dumps(structured_data, indent=2)
    except Exception as e:
        logger.error(f"Error in extract_structured_data: {e}")
        return f"Error: {e}"
    finally:
        # Driver is released (or kept, if persistent) in all paths.
        cleanup_driver(driver, use_persistent)
92
+
93
def visual_regression_test(url1: str, url2: str, threshold: float = 0.98) -> str:
    """Capture screenshots and page dimensions of two URLs for comparison.

    Loads each URL in a fresh (non-persistent) driver, waits briefly for
    the page to stabilize, saves a full-window screenshot to ``/tmp``, and
    records title/URL and document/viewport dimensions per page. Actual
    pixel diffing is out of scope (see ``note`` in the result); this
    function gathers the artifacts for external processing.

    Args:
        url1: Baseline page.
        url2: Candidate page to compare against the baseline.
        threshold: Similarity threshold, echoed into the result for the
            downstream image-comparison step (not used here).

    Returns:
        A pretty-printed JSON string describing both pages, the screenshot
        paths, per-page dimensions, and a boolean ``dimensions_match``, or
        an ``"Error: ..."`` string on failure (this function never raises).
    """
    driver = None
    try:
        driver = create_driver(persistent=False)

        def _capture(url: str, path: str):
            """Load *url*, settle, screenshot to *path*; return (info, dims)."""
            driver.get(url)
            time.sleep(3)  # crude settle wait; dynamic pages may need longer
            driver.save_screenshot(path)
            info = {
                "title": driver.title,
                "url": driver.current_url
            }
            dims = driver.execute_script("""
                return {
                    width: document.documentElement.scrollWidth,
                    height: document.documentElement.scrollHeight,
                    viewport: {
                        width: window.innerWidth,
                        height: window.innerHeight
                    }
                }
            """)
            return info, dims

        screenshot1_path = "/tmp/screenshot1.png"
        screenshot2_path = "/tmp/screenshot2.png"

        # BUG FIX: dimensions were previously read only once, AFTER
        # navigating to url2, yet labelled as page 1's ("dimensions_match"
        # carried page-2 geometry). Capture per page instead.
        page1_info, dimensions1 = _capture(url1, screenshot1_path)
        page2_info, dimensions2 = _capture(url2, screenshot2_path)

        result = {
            "url1": url1,
            "url2": url2,
            "page1_info": page1_info,
            "page2_info": page2_info,
            "screenshots": {
                "screenshot1": screenshot1_path,
                "screenshot2": screenshot2_path
            },
            "dimensions": {
                "page1": dimensions1,
                "page2": dimensions2
            },
            # True when full-document width and height agree; viewport size
            # is a property of the shared driver window, not the page.
            "dimensions_match": (
                dimensions1["width"] == dimensions2["width"]
                and dimensions1["height"] == dimensions2["height"]
            ),
            "threshold": threshold,
            "timestamp": datetime.now().isoformat(),
            "note": "Visual comparison requires external image processing. Screenshots saved for manual review."
        }

        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error(f"Error in visual_regression_test: {e}")
        return f"Error: {e}"
    finally:
        # Single cleanup path (original duplicated quit logic in the try
        # body and the except handler, leaking the driver on some paths).
        if driver:
            try:
                driver.quit()
            except Exception:
                pass
158
+
159
def extract_all_links(url: str, include_external: bool = True, use_persistent: bool = False) -> str:
    """Extract all links from a page with categorization.

    Loads ``url`` via ``get_driver`` and runs an in-page script that walks
    every ``a[href]`` element, bucketing links into: ``internal``,
    ``external``, ``email`` (mailto:), ``phone`` (tel:), ``javascript``,
    ``anchor`` (#fragment), and ``file_downloads`` (by extension). Each
    entry records href, truncated text, title, target and rel; resolvable
    URLs also get an ``absoluteUrl``.

    Args:
        url: Page to analyse.
        include_external: When False, links on other hostnames are dropped
            entirely (they are not counted in any bucket).
        use_persistent: Forwarded to ``get_driver``/``cleanup_driver``;
            semantics are defined in ``browser.driver`` (not visible here).

    Returns:
        A pretty-printed JSON string with ``links``, per-bucket ``summary``
        counts, and ``page_info``, or an ``"Error: ..."`` string on failure
        (this function never raises).
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Extract and categorize links.
        # NOTE: this is an f-string, so every literal JS brace is doubled
        # ({{ }}). The ONLY interpolation is {str(include_external).lower()},
        # which bakes the Python bool into the script as a JS `true`/`false`
        # literal gating the external-link branch.
        links_data = driver.execute_script(f"""
            const currentDomain = new URL(window.location.href).hostname;
            const links = {{
                internal: [],
                external: [],
                email: [],
                phone: [],
                javascript: [],
                anchor: [],
                file_downloads: []
            }};

            // Common file extensions for downloads
            const fileExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.csv', '.txt'];

            document.querySelectorAll('a[href]').forEach(a => {{
                const href = a.getAttribute('href');
                const text = a.textContent.trim();
                const linkData = {{
                    href: href,
                    text: text.substring(0, 100),
                    title: a.title,
                    target: a.target,
                    rel: a.rel
                }};

                if (href.startsWith('mailto:')) {{
                    links.email.push(linkData);
                }} else if (href.startsWith('tel:')) {{
                    links.phone.push(linkData);
                }} else if (href.startsWith('javascript:')) {{
                    links.javascript.push(linkData);
                }} else if (href.startsWith('#')) {{
                    links.anchor.push(linkData);
                }} else {{
                    try {{
                        const linkUrl = new URL(href, window.location.href);

                        // Check if it's a file download
                        const isFileDownload = fileExtensions.some(ext =>
                            linkUrl.pathname.toLowerCase().endsWith(ext)
                        );

                        if (isFileDownload) {{
                            links.file_downloads.push({{...linkData, absoluteUrl: linkUrl.href}});
                        }} else if (linkUrl.hostname === currentDomain) {{
                            links.internal.push({{...linkData, absoluteUrl: linkUrl.href}});
                        }} else if ({str(include_external).lower()}) {{
                            links.external.push({{...linkData, absoluteUrl: linkUrl.href}});
                        }}
                    }} catch(e) {{
                        // Invalid URL, add to javascript category
                        links.javascript.push(linkData);
                    }}
                }}
            }});

            return {{
                links: links,
                summary: {{
                    total: document.querySelectorAll('a[href]').length,
                    internal: links.internal.length,
                    external: links.external.length,
                    email: links.email.length,
                    phone: links.phone.length,
                    javascript: links.javascript.length,
                    anchor: links.anchor.length,
                    file_downloads: links.file_downloads.length
                }},
                page_info: {{
                    title: document.title,
                    url: window.location.href,
                    domain: currentDomain
                }}
            }};
        """)

        return json.dumps(links_data, indent=2)
    except Exception as e:
        logger.error(f"Error in extract_all_links: {e}")
        return f"Error: {e}"
    finally:
        # Driver is released (or kept, if persistent) in all paths.
        cleanup_driver(driver, use_persistent)
249
+
250
def seo_analysis(url: str, use_persistent: bool = False) -> str:
    """Perform SEO analysis on a page.

    Loads ``url`` via ``get_driver`` and runs an in-page script that checks
    common on-page SEO signals: title length (30-60 chars), meta description
    length (120-160 chars), H1 count and heading hierarchy, images missing
    alt text, external/nofollow link counts, canonical link, robots meta,
    html lang attribute, and JSON-LD block count. A numeric score and a
    recommendation list are computed on the Python side.

    Args:
        url: Page to analyse.
        use_persistent: Forwarded to ``get_driver``/``cleanup_driver``;
            semantics are defined in ``browser.driver`` (not visible here).

    Returns:
        A pretty-printed JSON string with ``seo_score`` (0-100),
        ``analysis``, ``total_issues`` and ``recommendations``, or an
        ``"Error: ..."`` string on failure (this function never raises).
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Perform SEO analysis entirely in the page context.
        seo_data = driver.execute_script("""
            const analysis = {
                title: {
                    content: document.title,
                    length: document.title.length,
                    issues: []
                },
                meta_description: {
                    content: null,
                    length: 0,
                    issues: []
                },
                headings: {
                    h1_count: 0,
                    h1_texts: [],
                    hierarchy: [],
                    issues: []
                },
                images: {
                    total: 0,
                    without_alt: 0,
                    issues: []
                },
                links: {
                    total: 0,
                    external: 0,
                    nofollow: 0
                },
                canonical: null,
                robots: null,
                lang: document.documentElement.lang,
                structured_data_count: 0
            };

            // Check title
            if (analysis.title.length < 30) {
                analysis.title.issues.push('Title too short (recommended: 30-60 characters)');
            } else if (analysis.title.length > 60) {
                analysis.title.issues.push('Title too long (recommended: 30-60 characters)');
            }

            // Check meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                analysis.meta_description.content = metaDesc.content;
                analysis.meta_description.length = metaDesc.content.length;

                if (metaDesc.content.length < 120) {
                    analysis.meta_description.issues.push('Description too short (recommended: 120-160 characters)');
                } else if (metaDesc.content.length > 160) {
                    analysis.meta_description.issues.push('Description too long (recommended: 120-160 characters)');
                }
            } else {
                analysis.meta_description.issues.push('No meta description found');
            }

            // Check headings
            const h1s = document.querySelectorAll('h1');
            analysis.headings.h1_count = h1s.length;
            h1s.forEach(h1 => {
                analysis.headings.h1_texts.push(h1.textContent.trim());
            });

            if (h1s.length === 0) {
                analysis.headings.issues.push('No H1 tag found');
            } else if (h1s.length > 1) {
                analysis.headings.issues.push('Multiple H1 tags found (recommended: 1)');
            }

            // Get heading hierarchy
            const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
            allHeadings.forEach(h => {
                analysis.headings.hierarchy.push({
                    level: h.tagName,
                    text: h.textContent.trim().substring(0, 50)
                });
            });

            // Check images
            const images = document.querySelectorAll('img');
            analysis.images.total = images.length;
            images.forEach(img => {
                if (!img.alt) {
                    analysis.images.without_alt++;
                }
            });

            if (analysis.images.without_alt > 0) {
                analysis.images.issues.push(`${analysis.images.without_alt} images without alt text`);
            }

            // Check links
            const links = document.querySelectorAll('a[href]');
            analysis.links.total = links.length;
            links.forEach(link => {
                try {
                    const linkUrl = new URL(link.href, window.location.href);
                    if (linkUrl.hostname !== window.location.hostname) {
                        analysis.links.external++;
                    }
                    if (link.rel && link.rel.includes('nofollow')) {
                        analysis.links.nofollow++;
                    }
                } catch(e) {}
            });

            // Check canonical
            const canonical = document.querySelector('link[rel="canonical"]');
            if (canonical) {
                analysis.canonical = canonical.href;
            }

            // Check robots meta
            const robots = document.querySelector('meta[name="robots"]');
            if (robots) {
                analysis.robots = robots.content;
            }

            // Count structured data
            analysis.structured_data_count = document.querySelectorAll('script[type="application/ld+json"]').length;

            return analysis;
        """)

        # Calculate SEO score: start at 100, deduct 10 points per issue
        # across the four issue-bearing sections, floor at 0.
        score = 100
        total_issues = 0

        for key in ['title', 'meta_description', 'headings', 'images']:
            if key in seo_data and 'issues' in seo_data[key]:
                issues = len(seo_data[key]['issues'])
                total_issues += issues
                score -= (issues * 10)

        score = max(0, score)

        result = {
            "url": url,
            "seo_score": score,
            "analysis": seo_data,
            "total_issues": total_issues,
            # Flat, human-readable action list derived from the analysis.
            "recommendations": get_seo_recommendations(seo_data)
        }

        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error(f"Error in seo_analysis: {e}")
        return f"Error: {e}"
    finally:
        # Driver is released (or kept, if persistent) in all paths.
        cleanup_driver(driver, use_persistent)
407
+
408
def get_seo_recommendations(seo_data):
    """Build a flat list of SEO recommendations from an analysis dict.

    Collects the per-section ``issues`` lists produced by ``seo_analysis``
    (title, meta description, headings, images — in that order), then
    appends page-level advice for a missing canonical URL, missing html
    ``lang`` attribute, and absent structured data.

    Args:
        seo_data: Analysis dict as produced by ``seo_analysis``'s in-page
            script. Missing sections/keys are tolerated and simply skipped
            (the original raised ``KeyError`` on partial data).

    Returns:
        list[str]: Recommendation strings, possibly empty.
    """
    recommendations = []

    # Per-section issues, in a fixed report-friendly order.
    for section in ('title', 'meta_description', 'headings', 'images'):
        issues = seo_data.get(section, {}).get('issues') or []
        recommendations.extend(issues)

    if not seo_data.get('canonical'):
        recommendations.append("Add canonical URL to prevent duplicate content issues")

    if not seo_data.get('lang'):
        recommendations.append("Add lang attribute to HTML tag for better internationalization")

    # Covers both an explicit count of 0 and a missing key.
    if not seo_data.get('structured_data_count'):
        recommendations.append("Add structured data (JSON-LD) for better search engine understanding")

    return recommendations