# scraper_agent.py

import os
import time

from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup

load_dotenv(override=True)


class ScraperAgent:
    def __init__(self):
        self.user_agent = os.getenv(
            "SCRAPER_USER_AGENT",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        )
        self.timeout = int(os.getenv("SCRAPER_TIMEOUT", "10"))
        self.delay = float(os.getenv("SCRAPER_DELAY", "0.5"))

    def fetch(self, url: str) -> dict:
        headers = {"User-Agent": self.user_agent}
        resp = requests.get(url, headers=headers, timeout=self.timeout)
        resp.raise_for_status()
        html = resp.text

        soup = BeautifulSoup(html, "html.parser")
        images = [img["src"] for img in soup.find_all("img", src=True)]
        body = soup.body.get_text("\n", strip=True) if soup.body else ""

        time.sleep(self.delay)
        return {
            "title": soup.title.string if soup.title else "",
            "html": html,
            "images": images,
            "text": body,
        }

    def close(self):
        """
        Clean up any resources. No-op for the requests-based scraper,
        but lets the pipeline always call scraper.close().
        """
        pass
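
# Minimal usage sketch (not part of the original module): the target URL and
# the printed fields below are assumptions, included only to illustrate how a
# pipeline might drive ScraperAgent and always call close() afterwards.
if __name__ == "__main__":
    agent = ScraperAgent()
    try:
        # example.com is a placeholder; substitute a real URL in practice
        page = agent.fetch("https://example.com")
        print(page["title"])
        print(f"{len(page['images'])} images, {len(page['text'])} chars of text")
    finally:
        agent.close()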