# Source: Browser-Use-mcp / features/extraction.py
# Last commit: "Update features/extraction.py" (a44d039, verified), by diamond-in
"""
Text and HTML extraction features
"""
import json
import logging
import re
from datetime import datetime
from selenium.webdriver.support.ui import WebDriverWait
from browser.driver import get_driver, cleanup_driver
logger = logging.getLogger(__name__)
def get_html_source(url: str, use_persistent: bool = False) -> str:
    """Return the complete HTML source of the page at ``url``.

    A driver is obtained from ``get_driver`` and the function waits up to
    10 seconds for ``document.readyState`` to report ``"complete"`` before
    reading the page source.

    Args:
        url: Address handed to ``get_driver``.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        The page source on success. If anything inside the ``try`` block
        raises, the error is logged and the string ``"Error: <exception>"``
        is returned instead.
    """

    def _document_is_complete(drv) -> bool:
        # Predicate polled by WebDriverWait until it returns True.
        return drv.execute_script("return document.readyState") == "complete"

    browser = None
    try:
        browser = get_driver(url, use_persistent)
        load_wait = WebDriverWait(browser, 10)
        load_wait.until(_document_is_complete)
        return browser.page_source
    except Exception as exc:
        logger.error(f"Error in get_html_source: {exc}")
        return f"Error: {exc}"
    finally:
        # Runs on both the success and the failure path; ``browser`` is still
        # None here when ``get_driver`` itself raised.
        cleanup_driver(browser, use_persistent)
def save_html_to_file(url: str, filename: str = "", use_persistent: bool = False) -> str:
    """Save the HTML source of ``url`` to a file directly inside ``/tmp``.

    The page is loaded through ``get_driver`` and the function waits up to
    10 seconds for ``document.readyState`` to report ``"complete"`` before
    reading the page source.

    Args:
        url: Address handed to ``get_driver``; also used to derive a default
            file name when ``filename`` is empty.
        filename: Optional name of the output file. Only its final path
            component is used, so directory parts (including ``..``) are
            discarded and the file can never be written outside ``/tmp``.
            When empty, a name of the form ``<sanitised-url>_<timestamp>.html``
            is generated.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        The path of the written file on success. If anything inside the
        ``try`` block raises (including a ``filename`` that contains no
        usable file name), the error is logged and the string
        ``"Error: <exception>"`` is returned instead.
    """
    import os  # function-scope import: only this function needs path handling

    driver = None
    try:
        driver = get_driver(url, use_persistent)
        # Wait for page to fully load
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        html = driver.page_source

        if filename:
            # Security: the name comes from the caller, so strip every
            # directory component. Without this, a value such as
            # "../etc/cron.d/x" would escape /tmp (path traversal).
            # Backslashes are normalised first so Windows-style separators
            # are stripped as well.
            target_name = os.path.basename(filename.replace("\\", "/"))
            if target_name in ("", ".", ".."):
                raise ValueError(
                    f"filename {filename!r} does not contain a usable file name"
                )
        else:
            # Derive a name from the URL: drop the scheme, remove every
            # character that is not a word character, whitespace or '-',
            # and cap the length at 50 characters.
            safe_url = re.sub(r'[^\w\s-]', '', url.replace('https://', '').replace('http://', ''))[:50]
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            target_name = f"{safe_url}_{timestamp}.html"

        filepath = f"/tmp/{target_name}"
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html)
        return filepath
    except Exception as e:
        logger.error(f"Error in save_html_to_file: {e}")
        return f"Error: {e}"
    finally:
        cleanup_driver(driver, use_persistent)
def get_page_info(url: str, use_persistent: bool = False) -> str:
    """Collect summary information about the page at ``url`` as JSON text.

    Unlike the other helpers in this module, this function does not
    explicitly wait for ``document.readyState``; it reports whatever state
    the page is in once ``get_driver`` returns.

    Args:
        url: Address handed to ``get_driver``.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        A JSON object (indented by 2 spaces) with the keys ``title``,
        ``url``, ``page_source_length``, ``cookies_count``, ``viewport``
        (``window.innerWidth`` / ``window.innerHeight``), ``scroll_height``
        (``document.body.scrollHeight``), ``ready_state``, ``links_count``,
        ``images_count`` and ``forms_count``. If anything inside the ``try``
        block raises, the error is logged and the string
        ``"Error: <exception>"`` is returned instead.
    """
    browser = None
    try:
        browser = get_driver(url, use_persistent)

        def run_js(script):
            # Evaluate a JavaScript snippet in the page and return its value.
            return browser.execute_script(script)

        def tag_total(tag):
            # Number of elements in the page with the given tag name.
            return len(browser.find_elements("tag name", tag))

        # Built key by key so both the JSON key order and the order of the
        # underlying driver calls stay fixed.
        summary = {}
        summary["title"] = browser.title
        summary["url"] = browser.current_url
        summary["page_source_length"] = len(browser.page_source)
        summary["cookies_count"] = len(browser.get_cookies())
        summary["viewport"] = run_js("return {width: window.innerWidth, height: window.innerHeight};")
        summary["scroll_height"] = run_js("return document.body.scrollHeight;")
        summary["ready_state"] = run_js("return document.readyState;")
        summary["links_count"] = tag_total("a")
        summary["images_count"] = tag_total("img")
        summary["forms_count"] = tag_total("form")

        return json.dumps(summary, indent=2)
    except Exception as exc:
        logger.error(f"Error in get_page_info: {exc}")
        return f"Error: {exc}"
    finally:
        cleanup_driver(browser, use_persistent)