Web-Archive

Paused

File size: 1,906 Bytes

e67896b

import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

class WebArchiver:
    """Archive a web page by rendering it in headless Chrome.

    Each call to :meth:`archive_url` starts a fresh Chrome session, loads the
    page, stores a PNG screenshot under ``ARCHIVE_DIR`` and returns a
    human-readable status string. Errors are reported through the returned
    string rather than raised.
    """

    # Directory (relative to the current working directory) for snapshots.
    ARCHIVE_DIR = "archive_store"
    # Seconds to pause after navigation so dynamic content can render.
    RENDER_WAIT_SECONDS = 2
    # Only these URL prefixes are accepted (compared case-insensitively).
    ALLOWED_SCHEMES = ("http://", "https://")

    def __init__(self):
        """Prepare the Chrome options reused by every archive job."""
        self._init_driver()

    def _init_driver(self):
        """Build the headless Chrome options and store them on the instance.

        Despite the name, no browser is started here; the driver itself is
        created per job inside :meth:`archive_url`.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # NOTE(review): these two flags are presumably here for containerised
        # environments (no sandbox, small /dev/shm) -- confirm with deployment.
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Use the Chrome binary named by CHROME_BIN when the variable is set;
        # otherwise Selenium's default lookup applies.
        chrome_bin = os.getenv("CHROME_BIN")
        if chrome_bin:
            chrome_options.binary_location = chrome_bin

        self.chrome_options = chrome_options

    def archive_url(self, url):
        """Load *url* in headless Chrome, save a screenshot and report on it.

        Args:
            url: Absolute ``http://`` or ``https://`` URL to archive.

        Returns:
            A status string: a success summary (title, snapshot path, page
            source size in KB, engine), ``"❌ Error: Invalid URL scheme."``
            for a rejected URL, or ``"❌ Archival Failed: ..."`` when anything
            goes wrong while driving the browser or writing the snapshot.
        """
        # Require a real http(s) URL. A bare startswith("http") would accept
        # strings such as "httpfoo://x" and crash on non-string input.
        if not isinstance(url, str) or not url.lower().startswith(self.ALLOWED_SCHEMES):
            return "❌ Error: Invalid URL scheme."

        # This method is the reporting boundary: every failure is converted
        # into a status string, hence the broad except at the bottom.
        try:
            from selenium.webdriver.chrome.service import Service

            # The screenshot call does not create missing directories.
            os.makedirs(self.ARCHIVE_DIR, exist_ok=True)

            # A new driver per job keeps browser state from leaking between
            # archives.
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=self.chrome_options)
            try:
                driver.get(url)
                time.sleep(self.RENDER_WAIT_SECONDS)  # Wait for dynamic content

                title = driver.title
                # NOTE: second-resolution timestamp; two archives finishing in
                # the same second will share (and overwrite) one file name.
                screenshot_path = f"{self.ARCHIVE_DIR}/snap_{int(time.time())}.png"
                # save_screenshot signals I/O failure by returning False, so
                # the result must be checked to avoid reporting a snapshot
                # that was never written.
                if not driver.save_screenshot(screenshot_path):
                    return f"❌ Archival Failed: could not write snapshot to {screenshot_path}"

                # Simulated WARC metadata: size of the rendered source,
                # measured in encoded bytes rather than characters.
                page_source = driver.page_source
                size_kb = len(page_source.encode("utf-8", errors="replace")) / 1024

                return f"✅ Archival Complete.\n\nTitle: {title}\nSnapshot: {screenshot_path}\nSize: {size_kb:.1f} KB\nEngine: Chrome Headless"
            finally:
                # Always shut the browser down, even when the job fails.
                driver.quit()
        except Exception as e:
            return f"❌ Archival Failed: {str(e)}"