Spaces:
Sleeping
Sleeping
| # scraper_agent.py | |
| import os | |
| import time | |
| from dotenv import load_dotenv | |
| import requests | |
| from bs4 import BeautifulSoup | |
| load_dotenv(override=True) | |
class ScraperAgent:
    """Fetch a web page with ``requests`` and pull basic content out of it.

    Settings are read from environment variables when the instance is created:

    * ``SCRAPER_USER_AGENT`` -- value sent as the ``User-Agent`` header
      (defaults to a desktop Chrome string).
    * ``SCRAPER_TIMEOUT`` -- request timeout in seconds (default ``10``).
    * ``SCRAPER_DELAY`` -- seconds to sleep after each successful fetch
      (default ``0.5``).
    """

    def __init__(self) -> None:
        """Read the scraper settings from the environment.

        Raises:
            ValueError: If ``SCRAPER_TIMEOUT`` or ``SCRAPER_DELAY`` is set to
                something that is not a number.
        """
        self.user_agent = os.getenv(
            "SCRAPER_USER_AGENT",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
        )
        self.timeout = self._parse_timeout(os.getenv("SCRAPER_TIMEOUT", "10"))
        self.delay = float(os.getenv("SCRAPER_DELAY", "0.5"))

    @staticmethod
    def _parse_timeout(raw: str) -> float:
        """Convert the ``SCRAPER_TIMEOUT`` text into a number of seconds.

        Whole numbers stay ``int``; fractional values such as ``"2.5"`` are
        returned as ``float`` instead of being rejected. Non-numeric text
        still raises ``ValueError``.
        """
        try:
            return int(raw)
        except ValueError:
            return float(raw)

    def fetch(self, url: str) -> dict:
        """Download ``url`` and return its title, raw HTML, images and text.

        Args:
            url: Address of the page to request.

        Returns:
            A dict with four keys:

            * ``"title"`` -- the ``<title>`` text as a plain ``str``; ``""``
              when the tag is missing or has no single text child.
            * ``"html"`` -- the response body as decoded by ``requests``.
            * ``"images"`` -- the ``src`` value of every ``<img>`` that has
              one, exactly as written in the page.
            * ``"text"`` -- the ``<body>`` text with each fragment stripped
              and joined by newlines; ``""`` when there is no ``<body>``.

        Raises:
            requests.HTTPError: If the response has an error status
                (raised by ``raise_for_status``).

        Any other exception raised by ``requests.get`` propagates unchanged.
        """
        headers = {"User-Agent": self.user_agent}
        resp = requests.get(url, headers=headers, timeout=self.timeout)
        resp.raise_for_status()

        html = resp.text
        soup = BeautifulSoup(html, "html.parser")

        images = [img["src"] for img in soup.find_all("img", src=True)]
        body = soup.body.get_text("\n", strip=True) if soup.body else ""

        # ``.string`` is None when <title> is empty or has nested children;
        # fall back to "" so the key always holds a string. ``str()`` drops
        # the NavigableString's reference to the parse tree.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        title = str(title_text) if title_text is not None else ""

        # Pause after a successful fetch; skipped when an exception was raised.
        time.sleep(self.delay)

        return {
            "title": title,
            "html": html,
            "images": images,
            "text": body,
        }

    def close(self) -> None:
        """Release resources held by the scraper.

        Nothing needs releasing for this requests-based scraper; the method
        exists so a pipeline can always call ``scraper.close()``.
        """