# Website_intelligent_report / scraper_agent.py
# Upload metadata: Tngarg — "Upload 12 files" — commit db16232 (verified)
# scraper_agent.py
import os
import time
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
load_dotenv(override=True)
class ScraperAgent:
    """Fetch a web page and extract its title, raw HTML, image URLs and text.

    Configuration comes from environment variables (loaded via dotenv):
      SCRAPER_USER_AGENT  User-Agent header (defaults to a desktop Chrome UA).
      SCRAPER_TIMEOUT     Request timeout in seconds (default 10).
      SCRAPER_DELAY       Politeness pause in seconds after each fetch (default 0.5).
    """

    def __init__(self):
        self.user_agent = os.getenv(
            "SCRAPER_USER_AGENT",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        )
        self.timeout = int(os.getenv("SCRAPER_TIMEOUT", "10"))
        self.delay = float(os.getenv("SCRAPER_DELAY", "0.5"))
        # One shared session gives connection pooling across fetches and
        # makes close() meaningful instead of a no-op.
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": self.user_agent})

    def fetch(self, url: str) -> dict:
        """Download *url* and return a dict of extracted page data.

        Returns:
            dict with keys:
              "title"  (str)  page <title> text, "" if absent
              "html"   (str)  full response body
              "images" (list) src attributes of all <img> tags (as written,
                              possibly relative URLs)
              "text"   (str)  visible body text, newline-separated

        Raises:
            requests.HTTPError on non-2xx responses;
            requests.RequestException on network/timeout failures.
        """
        resp = self.session.get(url, timeout=self.timeout)
        resp.raise_for_status()
        html = resp.text
        soup = BeautifulSoup(html, "html.parser")
        images = [img["src"] for img in soup.find_all("img", src=True)]
        body = soup.body.get_text("\n", strip=True) if soup.body else ""
        # Politeness delay so back-to-back fetches don't hammer the host.
        time.sleep(self.delay)
        return {
            # get_text() always yields str; .string would be None for an
            # empty or multi-child <title>, leaking None to str consumers.
            "title": soup.title.get_text(strip=True) if soup.title else "",
            "html": html,
            "images": images,
            "text": body,
        }

    def close(self):
        """Release the session's pooled HTTP connections.

        Safe to call multiple times; kept so the pipeline can always
        call scraper.close().
        """
        self.session.close()