Spaces:
Paused
Paused
| import time | |
| import os | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.options import Options | |
| from webdriver_manager.chrome import ChromeDriverManager | |
class WebArchiver:
    """Capture a screenshot and basic metadata of a web page via headless Chrome.

    Chrome options are built once at construction time. Each call to
    ``archive_url`` launches its own browser instance and shuts it down
    afterwards, so jobs do not share cookies, cache, or page state.
    """

    # Directory (relative to the working directory) that receives screenshots.
    ARCHIVE_DIR = "archive_store"
    # Seconds to wait after navigation so dynamic content has a chance to render.
    RENDER_WAIT_SECONDS = 2

    def __init__(self):
        self._init_driver()

    def _init_driver(self):
        """Build the Chrome options reused by every archive job.

        Despite the name, no browser is started here; the result is stored
        on ``self.chrome_options`` and consumed by ``archive_url``.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Bind to the installed Chrome binary when the environment names one.
        chrome_bin = os.getenv("CHROME_BIN")
        if chrome_bin:
            chrome_options.binary_location = chrome_bin

        self.chrome_options = chrome_options

    def archive_url(self, url):
        """Load *url* in headless Chrome, save a screenshot, and summarize it.

        Args:
            url: Absolute ``http://`` or ``https://`` URL (scheme matched
                case-insensitively).

        Returns:
            A human-readable status string: a success report containing the
            page title, screenshot path and page-source size, or an error
            message. Failures are reported in the returned string instead of
            being raised.
        """
        # NOTE(review): the leading glyph in the status strings below looks
        # mis-encoded (probably an emoji originally) -- confirm against the
        # deployed file before changing it; it is kept as-is here.

        # Require a real HTTP(S) scheme: a bare "http" prefix would also let
        # through things like "httpfoo://", and non-string input used to raise
        # AttributeError before the error handling below was reached.
        if not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
            return "โ Error: Invalid URL scheme."

        try:
            # Make sure the screenshot target exists before starting a browser.
            os.makedirs(self.ARCHIVE_DIR, exist_ok=True)

            # Set up headless Chrome for each job to ensure a clean state.
            from selenium.webdriver.chrome.service import Service

            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=self.chrome_options)
            try:
                driver.get(url)
                time.sleep(self.RENDER_WAIT_SECONDS)  # Wait for dynamic content

                title = driver.title
                screenshot_path = f"{self.ARCHIVE_DIR}/snap_{int(time.time())}.png"
                # save_screenshot signals an I/O failure by returning False;
                # surface that instead of reporting a snapshot that is not there.
                if not driver.save_screenshot(screenshot_path):
                    raise OSError(f"could not write screenshot to {screenshot_path}")

                # Simulate WARC metadata. Measure encoded bytes, not characters,
                # so the figure reported as KB is accurate for non-ASCII pages.
                page_source = driver.page_source
                size_kb = len(page_source.encode("utf-8", errors="replace")) / 1024

                return f"โ Archival Complete.\n\nTitle: {title}\nSnapshot: {screenshot_path}\nSize: {size_kb:.1f} KB\nEngine: Chrome Headless"
            finally:
                # Always release the browser process, even when the job fails.
                driver.quit()
        except Exception as e:
            # Deliberate boundary: callers get a status string, never an exception.
            return f"โ Archival Failed: {str(e)}"