"""
Text and HTML extraction features
"""

import json
import logging
import os
import re
from datetime import datetime

from selenium.webdriver.support.ui import WebDriverWait

from browser.driver import get_driver, cleanup_driver

logger = logging.getLogger(__name__)

# Directory that save_html_to_file writes into.
_OUTPUT_DIR = "/tmp"

# Seconds to wait for document.readyState to become "complete".
_PAGE_LOAD_TIMEOUT = 10


def _wait_for_page_load(driver, timeout: int = _PAGE_LOAD_TIMEOUT) -> None:
    """Block until the page's document.readyState is "complete"."""
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )


def _default_filename(url: str) -> str:
    """Build a "<sanitized-url>_<timestamp>.html" name from the URL."""
    stripped = url.replace('https://', '').replace('http://', '')
    # Drop everything except word characters, whitespace and hyphens, then
    # cap the length so the resulting filename stays reasonably short.
    safe_url = re.sub(r'[^\w\s-]', '', stripped)[:50]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{safe_url}_{timestamp}.html"


def _ensure_inside_output_dir(filepath: str) -> None:
    """Raise ValueError if filepath resolves outside the output directory."""
    # Resolve both sides so ".." components and symlinks cannot be used to
    # escape the output directory.
    root = os.path.realpath(_OUTPUT_DIR)
    target = os.path.realpath(filepath)
    if os.path.commonpath([root, target]) != root:
        raise ValueError(
            f"Refusing to write outside {_OUTPUT_DIR}: {filepath!r}"
        )


def get_html_source(url: str, use_persistent: bool = False) -> str:
    """Get full HTML source code of the page.

    Args:
        url: Page address, passed to get_driver.
        use_persistent: Forwarded unchanged to get_driver and cleanup_driver
            (presumably selects a reusable browser session; confirm in
            browser.driver).

    Returns:
        The driver's page source once the document has finished loading, or
        a string of the form "Error: <message>" if anything fails.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)
        _wait_for_page_load(driver)
        return driver.page_source
    except Exception as e:
        logger.error("Error in get_html_source: %s", e)
        return f"Error: {e}"
    finally:
        # Runs on success and failure alike; driver is still None here when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)


def save_html_to_file(url: str, filename: str = "", use_persistent: bool = False) -> str:
    """Save HTML source to file.

    Args:
        url: Page address, passed to get_driver.
        filename: Name of the file to create under /tmp. When empty, a name
            is derived from the URL and the current local time. Names that
            resolve outside /tmp (for example via "..") are rejected.
        use_persistent: Forwarded unchanged to get_driver and cleanup_driver
            (presumably selects a reusable browser session; confirm in
            browser.driver).

    Returns:
        The path of the written file, or a string of the form
        "Error: <message>" if anything fails (including a rejected filename).
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)
        _wait_for_page_load(driver)
        html = driver.page_source

        if not filename:
            filename = _default_filename(url)

        filepath = f"{_OUTPUT_DIR}/{filename}"
        # filename may come from an untrusted caller; never write outside
        # the output directory.
        _ensure_inside_output_dir(filepath)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html)

        return filepath
    except Exception as e:
        logger.error("Error in save_html_to_file: %s", e)
        return f"Error: {e}"
    finally:
        # Runs on success and failure alike; driver is still None here when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)


def get_page_info(url: str, use_persistent: bool = False) -> str:
    """Get comprehensive page information.

    Args:
        url: Page address, passed to get_driver.
        use_persistent: Forwarded unchanged to get_driver and cleanup_driver
            (presumably selects a reusable browser session; confirm in
            browser.driver).

    Returns:
        A JSON document (2-space indent) with the keys title, url,
        page_source_length, cookies_count, viewport, scroll_height,
        ready_state, links_count, images_count and forms_count, or a string
        of the form "Error: <message>" if anything fails.
    """
    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # No explicit load wait here: the document's current readyState is
        # reported as part of the result instead.
        info = {
            "title": driver.title,
            "url": driver.current_url,
            "page_source_length": len(driver.page_source),
            "cookies_count": len(driver.get_cookies()),
            "viewport": driver.execute_script(
                "return {width: window.innerWidth, height: window.innerHeight};"
            ),
            "scroll_height": driver.execute_script("return document.body.scrollHeight;"),
            "ready_state": driver.execute_script("return document.readyState;"),
            "links_count": len(driver.find_elements("tag name", "a")),
            "images_count": len(driver.find_elements("tag name", "img")),
            "forms_count": len(driver.find_elements("tag name", "form")),
        }

        return json.dumps(info, indent=2)
    except Exception as e:
        logger.error("Error in get_page_info: %s", e)
        return f"Error: {e}"
    finally:
        # Runs on success and failure alike; driver is still None here when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)