Spaces:
Sleeping
Sleeping
| """ | |
| Advanced analysis features: visual testing, link extraction, structured data | |
| """ | |
| import json | |
| import time | |
| import logging | |
| from datetime import datetime | |
| from browser.driver import get_driver, cleanup_driver, create_driver | |
| logger = logging.getLogger(__name__) | |
def extract_structured_data(url: str, use_persistent: bool = False) -> str:
    """Extract structured data (JSON-LD, microdata, meta tags) from a page.

    Args:
        url: Page to load.
        use_persistent: Reuse the shared persistent driver instead of a
            throwaway one (passed through to get_driver / cleanup_driver).

    Returns:
        A JSON string with keys ``jsonld``, ``meta``, ``opengraph``,
        ``twitter``, ``microdata``, ``schema_org`` and ``summary`` — or an
        ``"Error: ..."`` string on failure.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)
        # All extraction happens in a single in-page script so we only pay
        # one round-trip to the browser.
        structured_data = driver.execute_script("""
            const data = {
                jsonld: [],
                meta: {},
                opengraph: {},
                twitter: {},
                microdata: [],
                schema_org: []
            };
            // Extract JSON-LD blocks.
            document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
                try {
                    const parsed = JSON.parse(script.textContent);
                    data.jsonld.push(parsed);
                    // @context may be a string, an array, or an object per
                    // JSON-LD 1.1.  Stringify it so .includes() never throws
                    // and array contexts like ["https://schema.org"] are
                    // still detected (the old string-only check missed them).
                    const ctx = JSON.stringify(parsed['@context'] || '');
                    if (ctx.includes('schema.org')) {
                        data.schema_org.push(parsed);
                    }
                } catch(e) {
                    console.error('Failed to parse JSON-LD:', e);
                }
            });
            // Extract meta tags, splitting OpenGraph and Twitter cards out.
            document.querySelectorAll('meta').forEach(meta => {
                const name = meta.getAttribute('name') || meta.getAttribute('property');
                const content = meta.getAttribute('content');
                if (name && content) {
                    if (name.startsWith('og:')) {
                        data.opengraph[name] = content;
                    } else if (name.startsWith('twitter:')) {
                        data.twitter[name] = content;
                    } else {
                        data.meta[name] = content;
                    }
                }
            });
            // Extract microdata (itemscope / itemprop attributes).
            document.querySelectorAll('[itemscope]').forEach(item => {
                const itemData = {
                    type: item.getAttribute('itemtype'),
                    properties: {}
                };
                item.querySelectorAll('[itemprop]').forEach(prop => {
                    const propName = prop.getAttribute('itemprop');
                    const propValue = prop.getAttribute('content') ||
                                      prop.getAttribute('href') ||
                                      prop.textContent.trim();
                    itemData.properties[propName] = propValue;
                });
                data.microdata.push(itemData);
            });
            return data;
        """)
        # Summarize which kinds of structured data were present.
        structured_data['summary'] = {
            'has_jsonld': len(structured_data['jsonld']) > 0,
            'has_opengraph': len(structured_data['opengraph']) > 0,
            'has_twitter_cards': len(structured_data['twitter']) > 0,
            'has_microdata': len(structured_data['microdata']) > 0,
            'total_meta_tags': len(structured_data['meta'])
        }
        return json.dumps(structured_data, indent=2)
    except Exception as e:
        logger.error(f"Error in extract_structured_data: {e}")
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)
def visual_regression_test(url1: str, url2: str, threshold: float = 0.98) -> str:
    """Capture two URLs for visual comparison.

    Screenshots are saved to /tmp; actual pixel diffing is left to external
    tooling, but page dimensions are captured for both pages and compared
    here as a cheap first signal.

    Args:
        url1: Baseline page.
        url2: Page to compare against the baseline.
        threshold: Similarity threshold, recorded in the report for the
            downstream image-diff step.

    Returns:
        JSON report string, or an ``"Error: ..."`` string on failure.
    """
    driver = None
    try:
        driver = create_driver(persistent=False)

        def _capture(url: str, shot_path: str):
            """Load a URL, screenshot it, and return (page_info, dimensions)."""
            driver.get(url)
            time.sleep(3)  # crude wait for the page to stabilize
            driver.save_screenshot(shot_path)
            info = {
                "title": driver.title,
                "url": driver.current_url
            }
            dims = driver.execute_script("""
                return {
                    width: document.documentElement.scrollWidth,
                    height: document.documentElement.scrollHeight,
                    viewport: {
                        width: window.innerWidth,
                        height: window.innerHeight
                    }
                }
            """)
            return info, dims

        screenshot1_path = "/tmp/screenshot1.png"
        screenshot2_path = "/tmp/screenshot2.png"
        # BUG FIX: dimensions were previously read only once, after
        # navigating to url2 — so the report labelled page 2's dimensions as
        # page 1's, and page 1's were never captured.  Capture both.
        page1_info, dimensions1 = _capture(url1, screenshot1_path)
        page2_info, dimensions2 = _capture(url2, screenshot2_path)

        result = {
            "url1": url1,
            "url2": url2,
            "page1_info": page1_info,
            "page2_info": page2_info,
            "screenshots": {
                "screenshot1": screenshot1_path,
                "screenshot2": screenshot2_path
            },
            "dimensions": {
                "page1": dimensions1,
                "page2": dimensions2
            },
            "dimensions_match": dimensions1 == dimensions2,
            "threshold": threshold,
            "timestamp": datetime.now().isoformat(),
            "note": "Visual comparison requires external image processing. Screenshots saved for manual review."
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error(f"Error in visual_regression_test: {e}")
        return f"Error: {e}"
    finally:
        # Always release the dedicated driver, on success or failure.
        if driver:
            try:
                driver.quit()
            except Exception:
                pass
def extract_all_links(url: str, include_external: bool = True, use_persistent: bool = False) -> str:
    """Extract and categorize all links on a page.

    Links are bucketed into internal / external / email / phone /
    javascript / anchor / file_downloads, with a per-bucket summary.

    Args:
        url: Page to load.
        include_external: When False, links to other hostnames are skipped.
        use_persistent: Reuse the shared persistent driver (passed through
            to get_driver / cleanup_driver).

    Returns:
        JSON string with ``links``, ``summary`` and ``page_info`` keys, or
        an ``"Error: ..."`` string on failure.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)
        # Pass include_external as a script argument (arguments[0]) rather
        # than f-string-interpolating it into the JS source: no doubled
        # braces, and no risk of Python values being parsed as JS code.
        links_data = driver.execute_script("""
            const includeExternal = arguments[0];
            const currentDomain = new URL(window.location.href).hostname;
            const links = {
                internal: [],
                external: [],
                email: [],
                phone: [],
                javascript: [],
                anchor: [],
                file_downloads: []
            };
            // Common file extensions treated as downloads.
            const fileExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.csv', '.txt'];
            document.querySelectorAll('a[href]').forEach(a => {
                const href = a.getAttribute('href');
                const text = a.textContent.trim();
                const linkData = {
                    href: href,
                    text: text.substring(0, 100),
                    title: a.title,
                    target: a.target,
                    rel: a.rel
                };
                if (href.startsWith('mailto:')) {
                    links.email.push(linkData);
                } else if (href.startsWith('tel:')) {
                    links.phone.push(linkData);
                } else if (href.startsWith('javascript:')) {
                    links.javascript.push(linkData);
                } else if (href.startsWith('#')) {
                    links.anchor.push(linkData);
                } else {
                    try {
                        const linkUrl = new URL(href, window.location.href);
                        // Check whether the path looks like a file download.
                        const isFileDownload = fileExtensions.some(ext =>
                            linkUrl.pathname.toLowerCase().endsWith(ext)
                        );
                        if (isFileDownload) {
                            links.file_downloads.push({...linkData, absoluteUrl: linkUrl.href});
                        } else if (linkUrl.hostname === currentDomain) {
                            links.internal.push({...linkData, absoluteUrl: linkUrl.href});
                        } else if (includeExternal) {
                            links.external.push({...linkData, absoluteUrl: linkUrl.href});
                        }
                    } catch(e) {
                        // Invalid URL, add to javascript category
                        links.javascript.push(linkData);
                    }
                }
            });
            return {
                links: links,
                summary: {
                    total: document.querySelectorAll('a[href]').length,
                    internal: links.internal.length,
                    external: links.external.length,
                    email: links.email.length,
                    phone: links.phone.length,
                    javascript: links.javascript.length,
                    anchor: links.anchor.length,
                    file_downloads: links.file_downloads.length
                },
                page_info: {
                    title: document.title,
                    url: window.location.href,
                    domain: currentDomain
                }
            };
        """, include_external)
        return json.dumps(links_data, indent=2)
    except Exception as e:
        logger.error(f"Error in extract_all_links: {e}")
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)
def seo_analysis(url: str, use_persistent: bool = False) -> str:
    """Perform SEO analysis on a page.

    Loads the URL, runs a single in-page script that inspects the title,
    meta description, heading hierarchy, image alt text, link counts,
    canonical/robots tags and JSON-LD block count, then derives a 0-100
    score (10 points deducted per issue found).

    Args:
        url: Page to analyze.
        use_persistent: Reuse the shared persistent driver (passed through
            to get_driver / cleanup_driver).

    Returns:
        JSON string with ``url``, ``seo_score``, ``analysis``,
        ``total_issues`` and ``recommendations`` — or an ``"Error: ..."``
        string on failure.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)
        # Single round-trip: the whole audit runs inside the page.  The
        # ${...} below is a JS template literal, not Python formatting —
        # this is a plain (non-f) string, so it passes through untouched.
        seo_data = driver.execute_script("""
            const analysis = {
                title: {
                    content: document.title,
                    length: document.title.length,
                    issues: []
                },
                meta_description: {
                    content: null,
                    length: 0,
                    issues: []
                },
                headings: {
                    h1_count: 0,
                    h1_texts: [],
                    hierarchy: [],
                    issues: []
                },
                images: {
                    total: 0,
                    without_alt: 0,
                    issues: []
                },
                links: {
                    total: 0,
                    external: 0,
                    nofollow: 0
                },
                canonical: null,
                robots: null,
                lang: document.documentElement.lang,
                structured_data_count: 0
            };
            // Check title
            if (analysis.title.length < 30) {
                analysis.title.issues.push('Title too short (recommended: 30-60 characters)');
            } else if (analysis.title.length > 60) {
                analysis.title.issues.push('Title too long (recommended: 30-60 characters)');
            }
            // Check meta description
            const metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) {
                analysis.meta_description.content = metaDesc.content;
                analysis.meta_description.length = metaDesc.content.length;
                if (metaDesc.content.length < 120) {
                    analysis.meta_description.issues.push('Description too short (recommended: 120-160 characters)');
                } else if (metaDesc.content.length > 160) {
                    analysis.meta_description.issues.push('Description too long (recommended: 120-160 characters)');
                }
            } else {
                analysis.meta_description.issues.push('No meta description found');
            }
            // Check headings
            const h1s = document.querySelectorAll('h1');
            analysis.headings.h1_count = h1s.length;
            h1s.forEach(h1 => {
                analysis.headings.h1_texts.push(h1.textContent.trim());
            });
            if (h1s.length === 0) {
                analysis.headings.issues.push('No H1 tag found');
            } else if (h1s.length > 1) {
                analysis.headings.issues.push('Multiple H1 tags found (recommended: 1)');
            }
            // Get heading hierarchy
            const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
            allHeadings.forEach(h => {
                analysis.headings.hierarchy.push({
                    level: h.tagName,
                    text: h.textContent.trim().substring(0, 50)
                });
            });
            // Check images
            const images = document.querySelectorAll('img');
            analysis.images.total = images.length;
            images.forEach(img => {
                if (!img.alt) {
                    analysis.images.without_alt++;
                }
            });
            if (analysis.images.without_alt > 0) {
                analysis.images.issues.push(`${analysis.images.without_alt} images without alt text`);
            }
            // Check links
            const links = document.querySelectorAll('a[href]');
            analysis.links.total = links.length;
            links.forEach(link => {
                try {
                    const linkUrl = new URL(link.href, window.location.href);
                    if (linkUrl.hostname !== window.location.hostname) {
                        analysis.links.external++;
                    }
                    if (link.rel && link.rel.includes('nofollow')) {
                        analysis.links.nofollow++;
                    }
                } catch(e) {}
            });
            // Check canonical
            const canonical = document.querySelector('link[rel="canonical"]');
            if (canonical) {
                analysis.canonical = canonical.href;
            }
            // Check robots meta
            const robots = document.querySelector('meta[name="robots"]');
            if (robots) {
                analysis.robots = robots.content;
            }
            // Count structured data
            analysis.structured_data_count = document.querySelectorAll('script[type="application/ld+json"]').length;
            return analysis;
        """)
        # Score: start at 100 and subtract 10 points per issue found across
        # the four issue-bearing sections; clamp at 0.
        score = 100
        total_issues = 0
        for key in ['title', 'meta_description', 'headings', 'images']:
            if key in seo_data and 'issues' in seo_data[key]:
                issues = len(seo_data[key]['issues'])
                total_issues += issues
                score -= (issues * 10)
        score = max(0, score)
        result = {
            "url": url,
            "seo_score": score,
            "analysis": seo_data,
            "total_issues": total_issues,
            # Flat recommendation list derived from the same analysis dict.
            "recommendations": get_seo_recommendations(seo_data)
        }
        return json.dumps(result, indent=2)
    except Exception as e:
        logger.error(f"Error in seo_analysis: {e}")
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)
def get_seo_recommendations(seo_data) -> list:
    """Build a flat list of SEO recommendations from an analysis payload.

    Tolerates partially populated analysis dicts (e.g. when the in-page
    script failed part-way) by treating missing sections as issue-free,
    instead of raising KeyError like direct indexing would.

    Args:
        seo_data: Dict shaped like the object returned by seo_analysis's
            in-page script (sections with ``issues`` lists, plus
            ``canonical``, ``lang`` and ``structured_data_count``).

    Returns:
        List of human-readable recommendation strings (possibly empty).
    """
    recommendations = []
    # Per-section issue lists first, in a stable order; .get() guards
    # against sections that never made it into the dict.
    for section in ('title', 'meta_description', 'headings', 'images'):
        issues = seo_data.get(section, {}).get('issues')
        if issues:
            recommendations.extend(issues)
    # Page-level checks: falsy means absent/empty in all three cases.
    if not seo_data.get('canonical'):
        recommendations.append("Add canonical URL to prevent duplicate content issues")
    if not seo_data.get('lang'):
        recommendations.append("Add lang attribute to HTML tag for better internationalization")
    if seo_data.get('structured_data_count', 0) == 0:
        recommendations.append("Add structured data (JSON-LD) for better search engine understanding")
    return recommendations