# Browser-Use-mcp / features/analysis.py
# diamond-in's picture
# Update features/analysis.py
# 492241f verified
"""
Advanced analysis features: visual testing, link extraction, structured data
"""
import json
import time
import logging
from datetime import datetime
from browser.driver import get_driver, cleanup_driver, create_driver
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def extract_structured_data(url: str, use_persistent: bool = False) -> str:
    """Extract structured data (JSON-LD, microdata, meta tags) from a page.

    Args:
        url: Page address, handed to ``get_driver``.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        A JSON document (2-space indent) with the keys ``jsonld``, ``meta``,
        ``opengraph``, ``twitter``, ``microdata``, ``schema_org`` and
        ``summary``. If anything raises, the string ``"Error: <message>"``
        is returned instead and the error is logged.
    """
    driver = None
    try:
        # NOTE(review): get_driver presumably opens `url` before returning
        # the driver -- confirm against browser.driver.
        driver = get_driver(url, use_persistent)

        # The collection runs inside the page; the script's return value is
        # used below as a dict.
        structured_data = driver.execute_script("""
            const data = {
                jsonld: [],
                meta: {},
                opengraph: {},
                twitter: {},
                microdata: [],
                schema_org: []
            };

            // `@context` may be a string, an array or an object. Serialising
            // it first lets one substring test cover every shape without
            // throwing on non-string values.
            const mentionsSchemaOrg = (node) => {
                if (!node || typeof node !== 'object') {
                    return false;
                }
                const context = node['@context'];
                if (context === undefined || context === null) {
                    return false;
                }
                return JSON.stringify(context).includes('schema.org');
            };

            // Extract JSON-LD
            document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
                try {
                    const parsed = JSON.parse(script.textContent);
                    data.jsonld.push(parsed);

                    // A script may hold a single node or an array of nodes.
                    const nodes = Array.isArray(parsed) ? parsed : [parsed];
                    if (nodes.some(mentionsSchemaOrg)) {
                        data.schema_org.push(parsed);
                    }
                } catch(e) {
                    console.error('Failed to parse JSON-LD:', e);
                }
            });

            // Extract meta tags, bucketed by their name/property prefix
            document.querySelectorAll('meta').forEach(meta => {
                const name = meta.getAttribute('name') || meta.getAttribute('property');
                const content = meta.getAttribute('content');
                if (name && content) {
                    if (name.startsWith('og:')) {
                        data.opengraph[name] = content;
                    } else if (name.startsWith('twitter:')) {
                        data.twitter[name] = content;
                    } else {
                        data.meta[name] = content;
                    }
                }
            });

            // Extract microdata
            document.querySelectorAll('[itemscope]').forEach(item => {
                const itemData = {
                    type: item.getAttribute('itemtype'),
                    properties: {}
                };
                item.querySelectorAll('[itemprop]').forEach(prop => {
                    const propName = prop.getAttribute('itemprop');
                    const propValue = prop.getAttribute('content') ||
                                      prop.getAttribute('href') ||
                                      prop.textContent.trim();
                    // A repeated itemprop name overwrites the earlier value.
                    itemData.properties[propName] = propValue;
                });
                data.microdata.push(itemData);
            });

            return data;
        """)

        # Add summary
        structured_data['summary'] = {
            'has_jsonld': len(structured_data['jsonld']) > 0,
            'has_opengraph': len(structured_data['opengraph']) > 0,
            'has_twitter_cards': len(structured_data['twitter']) > 0,
            'has_microdata': len(structured_data['microdata']) > 0,
            'total_meta_tags': len(structured_data['meta']),
        }
        return json.dumps(structured_data, indent=2)
    except Exception as e:
        # Boundary handler: callers receive an error string, not an exception.
        logger.error("Error in extract_structured_data: %s", e)
        return f"Error: {e}"
    finally:
        # Runs on success and failure alike; `driver` is still None when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)
def _capture_page(driver, url: str, screenshot_path: str):
    """Load *url*, save a screenshot, and return ``(page_info, dimensions)``."""
    driver.get(url)
    time.sleep(3)  # Wait for page to stabilize
    driver.save_screenshot(screenshot_path)
    page_info = {
        "title": driver.title,
        "url": driver.current_url,
    }
    # Measured while this page is still loaded, so each page gets its own
    # numbers.
    dimensions = driver.execute_script("""
        return {
            width: document.documentElement.scrollWidth,
            height: document.documentElement.scrollHeight,
            viewport: {
                width: window.innerWidth,
                height: window.innerHeight
            }
        };
    """)
    return page_info, dimensions


def visual_regression_test(url1: str, url2: str, threshold: float = 0.98) -> str:
    """Capture two URLs for visual comparison and report their page dimensions.

    No pixel comparison happens here: both pages are screenshotted to fixed
    paths under ``/tmp`` (overwritten on every call) and ``threshold`` is
    only echoed back in the result.

    Args:
        url1: First page to capture.
        url2: Second page to capture.
        threshold: Similarity threshold, reported in the output unchanged.

    Returns:
        A JSON document (2-space indent) describing both captures, or the
        string ``"Error: <message>"`` if anything raises. In the result,
        ``dimensions_match`` keeps the second page's ``width`` / ``height`` /
        ``viewport`` at its top level (what this key held before) and adds
        ``page1``, ``page2`` and a boolean ``match``.
    """
    driver = None
    try:
        driver = create_driver(persistent=False)

        screenshot1_path = "/tmp/screenshot1.png"
        screenshot2_path = "/tmp/screenshot2.png"
        page1_info, dimensions1 = _capture_page(driver, url1, screenshot1_path)
        page2_info, dimensions2 = _capture_page(driver, url2, screenshot2_path)

        # Legacy page-2 fields first, then the actual comparison.
        dimensions_match = dict(dimensions2 or {})
        dimensions_match.update(
            page1=dimensions1,
            page2=dimensions2,
            match=dimensions1 == dimensions2,
        )

        # Create comparison result
        result = {
            "url1": url1,
            "url2": url2,
            "page1_info": page1_info,
            "page2_info": page2_info,
            "screenshots": {
                "screenshot1": screenshot1_path,
                "screenshot2": screenshot2_path,
            },
            "dimensions_match": dimensions_match,
            "threshold": threshold,
            "timestamp": datetime.now().isoformat(),
            "note": "Visual comparison requires external image processing. Screenshots saved for manual review.",
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error("Error in visual_regression_test: %s", e)
        return f"Error: {e}"
    finally:
        # Single cleanup point. Quitting is best-effort: a failure here must
        # not replace a result (or error message) that was already produced.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                logger.warning(
                    "Failed to quit driver in visual_regression_test: %s",
                    quit_error,
                )
def extract_all_links(url: str, include_external: bool = True, use_persistent: bool = False) -> str:
    """Extract all links from a page with categorization.

    Every ``a[href]`` element is sorted into one of: ``email``, ``phone``,
    ``javascript``, ``anchor``, ``file_downloads``, ``internal`` or
    ``external``.

    Args:
        url: Page address, handed to ``get_driver``.
        include_external: When falsy, links to other hostnames are left out
            of the ``external`` list. File downloads are listed regardless
            of host, and ``summary.total`` still counts every anchor.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        A JSON document (2-space indent) with ``links``, ``summary`` and
        ``page_info``, or the string ``"Error: <message>"`` if anything
        raises.
    """
    # The flag reaches the script as an argument rather than being spliced
    # into the JavaScript source, so no Python value can yield invalid code.
    script = """
        const includeExternal = Boolean(arguments[0]);
        const currentDomain = new URL(window.location.href).hostname;
        const links = {
            internal: [],
            external: [],
            email: [],
            phone: [],
            javascript: [],
            anchor: [],
            file_downloads: []
        };

        // Common file extensions for downloads
        const fileExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.csv', '.txt'];

        document.querySelectorAll('a[href]').forEach(a => {
            const href = a.getAttribute('href');
            const text = a.textContent.trim();
            const linkData = {
                href: href,
                text: text.substring(0, 100),
                title: a.title,
                target: a.target,
                rel: a.rel
            };

            // URL schemes are case-insensitive and surrounding whitespace is
            // ignored when a link is resolved, so classify on a normalised
            // copy while reporting the attribute exactly as written.
            const normalized = href.trim().toLowerCase();

            if (normalized.startsWith('mailto:')) {
                links.email.push(linkData);
            } else if (normalized.startsWith('tel:')) {
                links.phone.push(linkData);
            } else if (normalized.startsWith('javascript:')) {
                links.javascript.push(linkData);
            } else if (normalized.startsWith('#')) {
                links.anchor.push(linkData);
            } else {
                try {
                    const linkUrl = new URL(href, window.location.href);

                    // Check if it's a file download
                    const isFileDownload = fileExtensions.some(ext =>
                        linkUrl.pathname.toLowerCase().endsWith(ext)
                    );

                    if (isFileDownload) {
                        links.file_downloads.push({...linkData, absoluteUrl: linkUrl.href});
                    } else if (linkUrl.hostname === currentDomain) {
                        links.internal.push({...linkData, absoluteUrl: linkUrl.href});
                    } else if (includeExternal) {
                        links.external.push({...linkData, absoluteUrl: linkUrl.href});
                    }
                } catch(e) {
                    // Invalid URL, add to javascript category
                    links.javascript.push(linkData);
                }
            }
        });

        return {
            links: links,
            summary: {
                total: document.querySelectorAll('a[href]').length,
                internal: links.internal.length,
                external: links.external.length,
                email: links.email.length,
                phone: links.phone.length,
                javascript: links.javascript.length,
                anchor: links.anchor.length,
                file_downloads: links.file_downloads.length
            },
            page_info: {
                title: document.title,
                url: window.location.href,
                domain: currentDomain
            }
        };
    """

    driver = None
    try:
        driver = get_driver(url, use_persistent)
        # Extract and categorize links
        links_data = driver.execute_script(script, bool(include_external))
        return json.dumps(links_data, indent=2)
    except Exception as e:
        # Boundary handler: callers receive an error string, not an exception.
        logger.error("Error in extract_all_links: %s", e)
        return f"Error: {e}"
    finally:
        # Runs on success and failure alike; `driver` is still None when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)
def seo_analysis(url: str, use_persistent: bool = False) -> str:
    """Perform SEO analysis on a page.

    An in-page script inspects the title, meta description, headings,
    images, links, canonical link, robots meta tag, ``lang`` attribute and
    JSON-LD script count. The score starts at 100 and loses 10 points per
    issue found in the title, meta description, headings and images
    sections, with a floor of 0.

    Args:
        url: Page address, handed to ``get_driver``.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        A JSON document (2-space indent) with ``url``, ``seo_score``,
        ``analysis``, ``total_issues`` and ``recommendations``, or the string
        ``"Error: <message>"`` if anything raises.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Perform SEO analysis
        seo_data = driver.execute_script("""
            const analysis = {
                title: {
                    content: document.title,
                    length: document.title.length,
                    issues: []
                },
                meta_description: {
                    content: null,
                    length: 0,
                    issues: []
                },
                headings: {
                    h1_count: 0,
                    h1_texts: [],
                    hierarchy: [],
                    issues: []
                },
                images: {
                    total: 0,
                    without_alt: 0,
                    issues: []
                },
                links: {
                    total: 0,
                    external: 0,
                    nofollow: 0
                },
                canonical: null,
                robots: null,
                lang: document.documentElement.lang,
                structured_data_count: 0
            };

            // Check title
            if (analysis.title.length < 30) {
                analysis.title.issues.push('Title too short (recommended: 30-60 characters)');
            } else if (analysis.title.length > 60) {
                analysis.title.issues.push('Title too long (recommended: 30-60 characters)');
            }

            // Check meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                analysis.meta_description.content = metaDesc.content;
                analysis.meta_description.length = metaDesc.content.length;
                if (metaDesc.content.length < 120) {
                    analysis.meta_description.issues.push('Description too short (recommended: 120-160 characters)');
                } else if (metaDesc.content.length > 160) {
                    analysis.meta_description.issues.push('Description too long (recommended: 120-160 characters)');
                }
            } else {
                analysis.meta_description.issues.push('No meta description found');
            }

            // Check headings
            const h1s = document.querySelectorAll('h1');
            analysis.headings.h1_count = h1s.length;
            h1s.forEach(h1 => {
                analysis.headings.h1_texts.push(h1.textContent.trim());
            });
            if (h1s.length === 0) {
                analysis.headings.issues.push('No H1 tag found');
            } else if (h1s.length > 1) {
                analysis.headings.issues.push('Multiple H1 tags found (recommended: 1)');
            }

            // Get heading hierarchy
            const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
            allHeadings.forEach(h => {
                analysis.headings.hierarchy.push({
                    level: h.tagName,
                    text: h.textContent.trim().substring(0, 50)
                });
            });

            // Check images
            const images = document.querySelectorAll('img');
            analysis.images.total = images.length;
            images.forEach(img => {
                if (!img.alt) {
                    analysis.images.without_alt++;
                }
            });
            if (analysis.images.without_alt > 0) {
                analysis.images.issues.push(`${analysis.images.without_alt} images without alt text`);
            }

            // Check links
            const links = document.querySelectorAll('a[href]');
            analysis.links.total = links.length;
            links.forEach(link => {
                try {
                    const linkUrl = new URL(link.href, window.location.href);
                    // Only web links can point at another site: mailto:,
                    // tel: and javascript: URLs have an empty hostname and
                    // would otherwise always be counted as external.
                    const isWebLink = linkUrl.protocol === 'http:' ||
                                      linkUrl.protocol === 'https:';
                    if (isWebLink && linkUrl.hostname !== window.location.hostname) {
                        analysis.links.external++;
                    }
                    if (link.rel && link.rel.includes('nofollow')) {
                        analysis.links.nofollow++;
                    }
                } catch(e) {}
            });

            // Check canonical
            const canonical = document.querySelector('link[rel="canonical"]');
            if (canonical) {
                analysis.canonical = canonical.href;
            }

            // Check robots meta
            const robots = document.querySelector('meta[name="robots"]');
            if (robots) {
                analysis.robots = robots.content;
            }

            // Count structured data
            analysis.structured_data_count = document.querySelectorAll('script[type="application/ld+json"]').length;

            return analysis;
        """)

        # Calculate SEO score: 10 points off per issue, never below zero.
        # Sections missing from the payload are skipped, not treated as errors.
        total_issues = sum(
            len(seo_data[key]['issues'])
            for key in ('title', 'meta_description', 'headings', 'images')
            if key in seo_data and 'issues' in seo_data[key]
        )
        score = max(0, 100 - 10 * total_issues)

        result = {
            "url": url,
            "seo_score": score,
            "analysis": seo_data,
            "total_issues": total_issues,
            "recommendations": get_seo_recommendations(seo_data),
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        # Boundary handler: callers receive an error string, not an exception.
        logger.error("Error in seo_analysis: %s", e)
        return f"Error: {e}"
    finally:
        # Runs on success and failure alike; `driver` is still None when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)
def get_seo_recommendations(seo_data: dict) -> list:
    """Get SEO recommendations based on analysis.

    Args:
        seo_data: Analysis mapping. The ``issues`` lists under ``title``,
            ``meta_description``, ``headings`` and ``images`` are copied
            through in that order; ``canonical``, ``lang`` and
            ``structured_data_count`` drive the generic advice. Missing keys
            (or ``None`` for the whole mapping) are treated as "nothing
            found" rather than raising.

    Returns:
        A list of recommendation strings: section issues first, then the
        generic advice.
    """
    seo_data = seo_data or {}
    recommendations = []

    # Section issues are passed through verbatim, in a fixed order.
    for section in ('title', 'meta_description', 'headings', 'images'):
        section_data = seo_data.get(section) or {}
        recommendations.extend(section_data.get('issues') or [])

    if not seo_data.get('canonical'):
        recommendations.append("Add canonical URL to prevent duplicate content issues")
    if not seo_data.get('lang'):
        recommendations.append("Add lang attribute to HTML tag for better internationalization")
    # A missing count is treated like zero.
    if not seo_data.get('structured_data_count'):
        recommendations.append("Add structured data (JSON-LD) for better search engine understanding")
    return recommendations