Spaces:
Sleeping
Sleeping
| """ | |
| Text and HTML extraction features | |
| """ | |
| import json | |
| import logging | |
| import re | |
| from datetime import datetime | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from browser.driver import get_driver, cleanup_driver | |
| logger = logging.getLogger(__name__) | |
def get_html_source(url: str, use_persistent: bool = False) -> str:
    """Return the complete HTML markup of the page at ``url``.

    The page is opened through ``get_driver`` and the markup is read only
    once ``document.readyState`` reports ``"complete"`` (waited on via
    ``WebDriverWait(driver, 10)``).

    Args:
        url: Address of the page to load.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        The page source, or a string of the form ``"Error: <exception>"``
        when an exception is raised while loading or reading the page.
    """

    def _document_ready(browser) -> bool:
        # The page counts as loaded when the DOM reports "complete".
        return browser.execute_script("return document.readyState") == "complete"

    driver = None
    try:
        driver = get_driver(url, use_persistent)
        WebDriverWait(driver, 10).until(_document_ready)
        return driver.page_source
    except Exception as exc:
        logger.error(f"Error in get_html_source: {exc}")
        return f"Error: {exc}"
    finally:
        # Runs on success and failure alike; driver is still None when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)
def save_html_to_file(url: str, filename: str = "", use_persistent: bool = False) -> str:
    """Save the HTML source of the page at ``url`` to a file inside ``/tmp``.

    Args:
        url: Address of the page to load.
        filename: Name of the output file. Only the final path component is
            used, so the file is always created directly inside ``/tmp``
            (directory parts such as ``../`` are discarded). When empty, or
            when nothing usable remains after stripping directories, a name
            is derived from the URL plus the current timestamp.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        The path of the written file, or a string of the form
        ``"Error: <exception>"`` when an exception is raised while loading
        the page or writing the file.
    """
    import os  # local import: the module header does not import os

    driver = None
    try:
        driver = get_driver(url, use_persistent)

        # Wait for the page to finish loading before reading its source.
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        html = driver.page_source

        # Security: the filename comes from the caller, so strip directory
        # components to stop values like "../../etc/x" escaping /tmp.
        if filename:
            filename = os.path.basename(filename)
        if filename in ("", ".", ".."):
            # Nothing usable supplied: build "<sanitised-url>_<timestamp>.html".
            safe_url = re.sub(r'[^\w\s-]', '', url.replace('https://', '').replace('http://', ''))[:50]
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{safe_url}_{timestamp}.html"

        filepath = os.path.join("/tmp", filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html)
        return filepath
    except Exception as e:
        logger.error(f"Error in save_html_to_file: {e}")
        return f"Error: {e}"
    finally:
        # Runs on success and failure alike; driver is still None when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)
def get_page_info(url: str, use_persistent: bool = False) -> str:
    """Collect summary facts about the page at ``url`` as a JSON string.

    The report contains the title, current URL, page-source length, cookie
    count, viewport size, scroll height, ``document.readyState`` and the
    number of ``<a>``, ``<img>`` and ``<form>`` elements.

    Args:
        url: Address of the page to load.
        use_persistent: Forwarded unchanged to ``get_driver`` and
            ``cleanup_driver``.

    Returns:
        The report serialised with ``json.dumps(..., indent=2)``, or a
        string of the form ``"Error: <exception>"`` when an exception is
        raised while gathering it.
    """
    # Report key -> tag name, in the order the keys appear in the output.
    counted_tags = (
        ("links_count", "a"),
        ("images_count", "img"),
        ("forms_count", "form"),
    )

    driver = None
    try:
        driver = get_driver(url, use_persistent)

        report = {}
        report["title"] = driver.title
        report["url"] = driver.current_url
        report["page_source_length"] = len(driver.page_source)
        report["cookies_count"] = len(driver.get_cookies())
        report["viewport"] = driver.execute_script(
            "return {width: window.innerWidth, height: window.innerHeight};"
        )
        report["scroll_height"] = driver.execute_script("return document.body.scrollHeight;")
        report["ready_state"] = driver.execute_script("return document.readyState;")
        for key, tag in counted_tags:
            report[key] = len(driver.find_elements("tag name", tag))

        return json.dumps(report, indent=2)
    except Exception as exc:
        logger.error(f"Error in get_page_info: {exc}")
        return f"Error: {exc}"
    finally:
        # Runs on success and failure alike; driver is still None when
        # get_driver itself raised.
        cleanup_driver(driver, use_persistent)