import time
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


class WebArchiver:
    """Capture a screenshot and basic metadata of a web page via headless Chrome.

    A fresh Chrome driver is started for every call to ``archive_url`` and
    shut down afterwards; only the Chrome options are shared between calls.
    """

    # Directory (relative to the current working directory) where snapshots go.
    ARCHIVE_DIR = "archive_store"
    # Seconds to wait after navigation so dynamic content can render.
    RENDER_WAIT_SECONDS = 2

    def __init__(self):
        self._init_driver()

    def _init_driver(self):
        """Build the Chrome options reused by every archive job.

        Despite the name, no driver is started here; the result is stored on
        ``self.chrome_options``.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Bind to the installed Chrome binary when CHROME_BIN is set.
        chrome_bin = os.getenv("CHROME_BIN")
        if chrome_bin:
            chrome_options.binary_location = chrome_bin

        self.chrome_options = chrome_options

    def archive_url(self, url):
        """Load ``url`` in headless Chrome and save a PNG screenshot of it.

        Args:
            url: Address to archive; must start with ``http://`` or
                ``https://`` (case-insensitive).

        Returns:
            A human-readable status string. On success it contains the page
            title, the screenshot path and the UTF-8 size of the page source
            in KB. On failure it starts with "❌" and describes the problem.
            Failures are reported through the return value, not raised.
        """
        # Reject non-strings and look-alike schemes such as "httpfoo://".
        if not isinstance(url, str) or not url.lower().startswith(
            ("http://", "https://")
        ):
            return "❌ Error: Invalid URL scheme."

        try:
            # Imported here so a missing/old selenium install is reported as
            # a failed job rather than breaking import of this module.
            from selenium.webdriver.chrome.service import Service

            # save_screenshot cannot create missing directories.
            os.makedirs(self.ARCHIVE_DIR, exist_ok=True)

            # Set up headless Chrome for each job to ensure a clean state.
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=self.chrome_options)

            try:
                driver.get(url)
                time.sleep(self.RENDER_WAIT_SECONDS)  # Wait for dynamic content

                title = driver.title
                screenshot_path = f"{self.ARCHIVE_DIR}/snap_{int(time.time())}.png"
                # save_screenshot signals I/O failure by returning False
                # instead of raising, so the result must be checked.
                if not driver.save_screenshot(screenshot_path):
                    raise OSError(f"could not write screenshot to {screenshot_path}")

                # Simulate WARC metadata: report the encoded byte size, not
                # the character count, so the "KB" label is accurate.
                page_source = driver.page_source
                size_kb = len(page_source.encode("utf-8", errors="replace")) / 1024

                return f"✅ Archival Complete.\n\nTitle: {title}\nSnapshot: {screenshot_path}\nSize: {size_kb:.1f} KB\nEngine: Chrome Headless"
            finally:
                # Always release the browser process, even on failure.
                driver.quit()
        except Exception as e:
            return f"❌ Archival Failed: {str(e)}"