Spaces:

ztcoco
/

Semantic-Bookmark

Paused

File size: 1,694 Bytes

840261a

import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

class BrowserScraper:
    """Scrape a page's title and a short text summary using headless Chrome.

    A fresh browser is started for every fetch and always shut down
    afterwards. The chromedriver path is resolved once per instance and
    reused by later fetches.
    """

    def __init__(self, settle_seconds=2, summary_length=500):
        """
        Configure the Chrome options shared by every fetch.

        Args:
            settle_seconds: Seconds to pause after navigation so that
                client-side JavaScript can finish rendering.
            summary_length: Maximum number of characters of body text kept
                as the summary.

        Raises:
            ValueError: If either argument is negative.
        """
        if settle_seconds < 0:
            raise ValueError("settle_seconds must be non-negative")
        if summary_length < 0:
            raise ValueError("summary_length must be non-negative")

        self.settle_seconds = settle_seconds
        self.summary_length = summary_length
        # Filled in lazily by _resolve_driver_path().
        self._driver_path = None

        self.options = Options()
        self.options.add_argument("--headless")
        # NOTE(review): these two flags are presumably here so Chrome can run
        # inside a container -- confirm against the deployment environment.
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")

        # Allow the deployment to point at a specific Chrome binary.
        chrome_bin = os.getenv("CHROME_BIN")
        if chrome_bin:
            self.options.binary_location = chrome_bin

    def _resolve_driver_path(self):
        """Return the chromedriver path, resolving it only on first use."""
        if self._driver_path is None:
            self._driver_path = ChromeDriverManager().install()
        return self._driver_path

    def fetch_page_metadata(self, url):
        """
        Uses explicit Browser (Selenium) to scrape the page title and summary.
        This fulfills the requirement of 'real browser operation'.

        Args:
            url: Address of the page to open; must be a non-empty string.

        Returns:
            A dict with the keys "title", "summary" and "status". On success
            "status" is "success" and "summary" holds the leading body text.
            On any failure "status" is "failed", "title" is "Error" and
            "summary" holds the error message. Errors are reported through
            the returned dict rather than raised.
        """
        # Reject unusable input up front instead of paying for a browser
        # launch that is guaranteed to fail.
        if not isinstance(url, str) or not url.strip():
            return {
                "title": "Error",
                "summary": f"Invalid URL: expected a non-empty string, got {url!r}",
                "status": "failed"
            }

        driver = None
        try:
            print(f"🌍 Browser Navigating to: {url}")
            try:
                service = Service(self._resolve_driver_path())
                driver = webdriver.Chrome(service=service, options=self.options)
            except Exception:
                # The cached driver may be stale (e.g. Chrome was upgraded);
                # forget it so the next call resolves it again.
                self._driver_path = None
                raise

            driver.get(url)
            time.sleep(self.settle_seconds)  # Wait for JS to load

            title = driver.title
            # "tag name" is the locator strategy string for lookups by tag.
            body_text = driver.find_element("tag name", "body").text
            content = body_text[:self.summary_length]

            return {
                "title": title,
                "summary": content,
                "status": "success"
            }
        except Exception as e:
            # Deliberate best-effort boundary: callers always get a dict back.
            return {
                "title": "Error",
                "summary": str(e),
                "status": "failed"
            }
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_error:
                    # An exception escaping `finally` would replace the
                    # result computed above, so report it and move on.
                    print(f"Browser did not shut down cleanly: {quit_error}")