Spaces:

MinhTai
/

ai-agent-app

Running

App Files Files Community

ai-agent-app / scripts /crawl /sources /generic_html.py

MinhTai

deploy: 622c6e2

34331f9 1 day ago

raw

history blame contribute delete

2.16 kB

	import asyncio
	import logging
	import re
	from urllib.parse import urljoin, urlparse
	import httpx
	from bs4 import BeautifulSoup
	from crawl.http_utils import fetch_with_retry, safe_text

	# Module-level logger; the crawler below uses it for progress and fetch-failure messages.
	logger = logging.getLogger(__name__)


	async def fetch_generic_html(
	    index_url: str,
	    seen: set[str],
	    link_pattern: str = r".*",
	    max_pages: int = 150,
	    crawl_delay: float = 0.0,
	) -> tuple[list[tuple[str, str]], int]:
	    """Fetch index_url, follow links matching link_pattern, return (pages, skipped_count).

	    Discovered links are resolved against ``index_url``, stripped of their
	    fragment and de-duplicated. Only ``http``/``https`` links are followed;
	    malformed hrefs are ignored. The index page itself is never returned as
	    a content page.

	    Args:
	        index_url: Page whose ``<a href>`` anchors are scanned for candidates.
	        seen: URLs that must not be fetched again. Each candidate found in
	            this set is counted in ``skipped_count``. The set is not modified.
	        link_pattern: Regular expression applied with ``search`` to each
	            absolute, fragment-free candidate URL.
	        max_pages: Upper bound on the number of candidates kept. The cap is
	            applied before the ``seen`` check, so already-seen URLs count
	            toward it. Negative values are treated as zero.
	        crawl_delay: Seconds to sleep before each page fetch; a falsy value
	            disables the delay.

	    Returns:
	        A ``(pages, skipped_count)`` tuple. ``pages`` holds ``(url, text)``
	        pairs, where ``text`` is what ``safe_text`` extracts from the
	        response. If the index fetch fails, a warning is logged and
	        ``([], 0)`` is returned. Pages whose fetch fails are logged and
	        omitted.

	    Raises:
	        re.error: If ``link_pattern`` is not a valid regular expression
	            (raised only after the index page was fetched successfully).
	    """
	    try:
	        index_resp = await fetch_with_retry(index_url)
	    except (httpx.HTTPStatusError, httpx.RequestError) as exc:
	        logger.warning("Generic index fetch failed for %r: %s", index_url, exc)
	        return [], 0

	    soup = BeautifulSoup(safe_text(index_resp), "html.parser")
	    pattern = re.compile(link_pattern)
	    index_bare = index_url.split("#")[0]

	    seen_urls: set[str] = set()
	    discovered: list[str] = []
	    for a in soup.find_all("a", href=True):
	        href = a["href"].strip()
	        if not href:
	            continue
	        try:
	            url = urljoin(index_url, href).split("#")[0]
	            scheme = urlparse(url).scheme
	        except ValueError:
	            # urllib raises on malformed hrefs (e.g. an unbalanced "[" in the
	            # host); one bad anchor must not abort the whole crawl.
	            continue
	        # Whitelist instead of a prefix blacklist: this also rejects tel:,
	        # data:, ftp: and upper-case variants such as "MAILTO:". The parsed
	        # scheme is already lower-cased by urllib.
	        if scheme not in ("http", "https"):
	            continue
	        if url == index_bare or url in seen_urls:
	            continue
	        if pattern.search(url):
	            seen_urls.add(url)
	            discovered.append(url)

	    # Clamp so a negative max_pages cannot turn into a from-the-end slice.
	    discovered = discovered[: max(max_pages, 0)]
	    logger.info("Generic source: found %d candidate pages from %s", len(discovered), index_url)

	    results: list[tuple[str, str]] = []
	    skipped = 0
	    for url in discovered:
	        if url in seen:
	            skipped += 1
	            continue
	        if crawl_delay:
	            await asyncio.sleep(crawl_delay)
	        try:
	            page_resp = await fetch_with_retry(url)
	            results.append((url, safe_text(page_resp)))
	        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
	            # Best effort: a single failing page is logged and left out.
	            logger.warning("Generic page fetch failed for %r: %s", url, exc)

	    return results, skipped