Spaces:
Paused
Paused
File size: 1,906 Bytes
e67896b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
class WebArchiver:
    """Capture a screenshot and basic page metadata using headless Chrome.

    The Chrome options are built once at construction time; a new browser
    instance is started (and shut down) for every call to ``archive_url``.
    """

    # Directory, relative to the current working directory, that receives
    # the screenshots written by ``archive_url``.
    ARCHIVE_DIR = "archive_store"

    def __init__(self):
        """Prepare the Chrome options shared by all archive jobs."""
        self._init_driver()

    def _init_driver(self):
        """Build the headless Chrome options and store them on the instance.

        No browser is launched here; the result is kept in
        ``self.chrome_options`` for later use by ``archive_url``.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # These two flags are commonly required when Chrome runs inside a
        # container (no usable sandbox, small /dev/shm).
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Bind to the installed Chrome binary when CHROME_BIN is set;
        # otherwise Selenium's default lookup is used.
        chrome_bin = os.getenv("CHROME_BIN")
        if chrome_bin:
            chrome_options.binary_location = chrome_bin

        self.chrome_options = chrome_options

    def archive_url(self, url):
        """Load ``url`` in headless Chrome, save a screenshot, and report.

        Args:
            url: Absolute ``http://`` or ``https://`` URL to archive.

        Returns:
            A human-readable status string. On success it lists the page
            title, the screenshot path and the page-source size. If the URL
            is rejected or any step fails, the string starts with ``❌``;
            exceptions raised while archiving are caught and reported in
            the returned text rather than propagated.
        """
        # Require a real HTTP(S) scheme: a bare "http" prefix check would
        # also accept inputs such as "httpfoo://..." or "http".
        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
            return "❌ Error: Invalid URL scheme."

        try:
            # Set up headless Chrome for each job to ensure a clean state.
            from selenium.webdriver.chrome.service import Service

            # The screenshot directory must exist: save_screenshot() does
            # not create it and only signals I/O failure via its return value.
            os.makedirs(self.ARCHIVE_DIR, exist_ok=True)

            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=self.chrome_options)
            try:
                driver.get(url)
                time.sleep(2)  # Wait for dynamic content

                title = driver.title
                # NOTE: one-second timestamp resolution; two jobs finishing
                # within the same second write to the same file name.
                screenshot_path = f"{self.ARCHIVE_DIR}/snap_{int(time.time())}.png"
                if not driver.save_screenshot(screenshot_path):
                    # Do not claim success when nothing was written to disk.
                    return (
                        "❌ Archival Failed: could not write screenshot to "
                        f"{screenshot_path}"
                    )

                # Simulate WARC metadata. The size is the character count of
                # the page source divided by 1024, i.e. an approximation.
                page_source = driver.page_source
                size_kb = len(page_source) / 1024

                return (
                    "✅ Archival Complete.\n\n"
                    f"Title: {title}\n"
                    f"Snapshot: {screenshot_path}\n"
                    f"Size: {size_kb:.1f} KB\n"
                    "Engine: Chrome Headless"
                )
            finally:
                # Always shut the browser down, including on early return.
                driver.quit()
        except Exception as e:
            # Top-level boundary: callers expect a status string, not a raise.
            return f"❌ Archival Failed: {str(e)}"
|