Spaces:
Paused
Paused
| import time | |
| import os | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium.webdriver.chrome.service import Service | |
| from webdriver_manager.chrome import ChromeDriverManager | |
| from bs4 import BeautifulSoup | |
class BrowserScraper:
    """Scrape a page's title and a short text summary using headless Chrome.

    A fresh Selenium-driven Chrome instance is started for every call to
    :meth:`fetch_page_metadata` and shut down again before the call returns.
    """

    def __init__(self):
        """Build the Chrome options shared by every browser session.

        Chrome is run headless with ``--no-sandbox`` and
        ``--disable-dev-shm-usage`` (presumably for containerised hosts --
        confirm against the deployment environment). When the ``CHROME_BIN``
        environment variable is set, it is used as the Chrome binary location.
        """
        self.options = Options()
        self.options.add_argument("--headless")
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        chrome_bin = os.getenv("CHROME_BIN")
        if chrome_bin:
            self.options.binary_location = chrome_bin

    def fetch_page_metadata(self, url, wait_seconds=2, summary_length=500):
        """
        Uses explicit Browser (Selenium) to scrape the page title and summary.
        This fulfills the requirement of 'real browser operation'.

        Args:
            url: Address of the page to load. Must be a non-empty string.
            wait_seconds: Seconds to pause after navigation so client-side
                JavaScript can render. Values <= 0 skip the pause.
            summary_length: Maximum number of characters of the ``<body>``
                text to keep as the summary (``None`` keeps all of it).

        Returns:
            A dict with the keys ``"title"``, ``"summary"`` and ``"status"``.
            On success ``status`` is ``"success"``; on any failure ``status``
            is ``"failed"``, ``title`` is ``"Error"`` and ``summary`` holds
            the error message. The method does not raise for scraping errors.
        """
        # Reject unusable input up front instead of paying for a driver
        # download and a browser launch that is guaranteed to fail.
        if not isinstance(url, str) or not url.strip():
            return {
                "title": "Error",
                "summary": "URL must be a non-empty string",
                "status": "failed"
            }

        driver = None
        try:
            print(f"๐ Browser Navigating to: {url}")
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=self.options)
            driver.get(url)
            if wait_seconds > 0:
                time.sleep(wait_seconds)  # Wait for JS to load
            title = driver.title
            body_text = driver.find_element("tag name", "body").text
            return {
                "title": title,
                "summary": body_text[:summary_length],
                "status": "success"
            }
        except Exception as e:
            # Deliberate best-effort boundary: callers get a failure record
            # rather than an exception.
            return {
                "title": "Error",
                "summary": str(e),
                "status": "failed"
            }
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_error:
                    # An exception escaping ``finally`` would replace the
                    # dict being returned, so report it instead of raising.
                    print(f"Browser shutdown failed: {quit_error}")