Spaces:

OjciecTadeusz
/

agent-dev-07-space

Runtime error

agent-dev-07-space / src /tools /web_scraper.py

Upload src/tools/web_scraper.py with huggingface_hub

6701858 verified about 1 month ago

1.66 kB

	"""Simple web‑scraping tool used by the agent.

	The real implementation would use :pypi:`trafilatura` (or Playwright) to fetch
	and clean the main article text from a URL. Here we provide a lightweight
	fallback that works in the HF Space without extra system dependencies.
	"""
	import os
	import requests
	from loguru import logger


	def scrape(url: str, timeout: int = 10) -> str:
	    """Return the main textual content of the page at *url*.

	    The page is fetched with ``requests``. If the optional ``trafilatura``
	    package is importable and extracts non-empty text, that text is returned.
	    Otherwise (package missing, extraction raised, or extraction produced
	    nothing) a crude fallback strips ``<script>``/``<style>`` blocks and all
	    remaining tags, unescapes HTML entities and collapses whitespace.

	    Args:
	        url: Address of the page to download.
	        timeout: Timeout passed to ``requests.get``.

	    Returns:
	        The extracted text. If the download fails for any reason, no
	        exception propagates; a string starting with
	        ``"Error: could not retrieve the page"`` is returned instead.
	    """
	    logger.info(f"Scraping URL: {url}")
	    try:
	        resp = requests.get(
	            url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"}
	        )
	        resp.raise_for_status()
	        html = resp.text
	    except Exception as e:
	        # Best-effort tool: report the failure as text rather than raising.
	        logger.error(f"Failed to fetch {url}: {e}")
	        return f"Error: could not retrieve the page ({e})"

	    # Try to use trafilatura if installed – it gives a clean article body.
	    try:
	        import trafilatura

	        text = trafilatura.extract(html)
	        if text:
	            return text.strip()
	    except Exception as e:
	        # Covers both a missing package (ImportError) and extraction errors.
	        logger.debug(f"trafilatura extraction unavailable or failed: {e}")

	    # Fallback: very naive tag removal. This lives outside the ``except``
	    # so it also runs when trafilatura succeeds but returns nothing;
	    # previously that path fell off the end of the function and returned
	    # None.
	    from html import unescape
	    import re

	    # HTML tag names are case-insensitive, so match <SCRIPT>/<STYLE> as well.
	    block_flags = re.DOTALL | re.IGNORECASE
	    text = re.sub(r"<script.*?</script>", "", html, flags=block_flags)
	    text = re.sub(r"<style.*?</style>", "", text, flags=block_flags)
	    text = re.sub(r"<[^>]+>", " ", text)
	    text = unescape(text)
	    # Collapse whitespace.
	    return re.sub(r"\s+", " ", text).strip()