Spaces:

ShivanshCodex
/

Web_Scrapping_Agent

Runtime error

Upload 47 files

f085180 verified 11 months ago

1.7 kB

	# agent/scraper_tool.py

	import requests
	from bs4 import BeautifulSoup
	from typing import List, Dict
	import logging

	# ─── Logging Config ─────────────────────────────────────────────
	logger = logging.getLogger(__name__)
	logger.setLevel(logging.INFO)

	class WebScraperTool:
	def __init__(self, user_agent: str = "Mozilla/5.0"):
	self.headers = {"User-Agent": user_agent}

	def scrape(self, url: str) -> Dict[str, str]:
	"""
	Scrapes clean text content (from <p> tags) from a webpage.

	Args:
	url (str): URL of the page to scrape

	Returns:
	Dict[str, str]: Dictionary with 'url' and clipped 'content'
	"""
	try:
	response = requests.get(url, headers=self.headers, timeout=10)
	response.raise_for_status()

	soup = BeautifulSoup(response.text, "html.parser")
	paragraphs = soup.find_all("p")
	if not paragraphs:
	logger.warning(f"No <p> tags found at: {url}")

	clean_text = "\n".join(p.get_text().strip() for p in paragraphs if p.get_text())

	return {
	"url": url,
	"content": clean_text[:5000] # Limit content to avoid LLM overflow
	}

	except requests.exceptions.RequestException as e:
	logger.error(f"Failed to scrape {url}: {e}")
	return {"url": url, "content": ""}

	except Exception as e:
	logger.exception(f"Unexpected error scraping {url}")
	return {"url": url, "content": ""}