import logging
from typing import Dict

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class WebsiteViewerError(Exception):
    """Raised when fetching or parsing a website fails."""
async def fetch_website_content(url: str) -> Dict:
    """
    Fetch website content using crawl4ai's AsyncWebCrawler.

    Args:
        url (str): The URL of the website to fetch.

    Returns:
        Dict: A dictionary containing:
            - title: The page title (str)
            - markdown: The page content rendered as markdown (str)
            - links: Links found on the page, as collected by the crawler
            - media: Images and other media found on the page
            - url: The URL that was fetched (str)

    Raises:
        WebsiteViewerError: If the page cannot be fetched or parsed.
    """
    try:
        # Default to HTTPS when no scheme is given
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)

        # Parse the raw HTML to extract the <title> tag
        soup = BeautifulSoup(result.html, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else ''

        return {
            "title": title,
            "markdown": result.markdown,
            "links": result.links,  # links collected by the crawler
            "media": result.media,  # images and other media collected by the crawler
            "url": url,
        }
    except Exception as e:
        logger.error(f"Error fetching website content: {e}")
        raise WebsiteViewerError(f"Failed to fetch website content: {e}") from e
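
# --- Usage sketch (not part of the original module; added for illustration) ---
# fetch_website_content is a coroutine, so it must be driven by an event loop.
# A minimal way to exercise it from a script is shown below; "example.com" is
# a placeholder URL, and the printed fields match the keys returned above.
if __name__ == "__main__":
    import asyncio

    async def main():
        data = await fetch_website_content("example.com")
        print(data["title"])
        print(data["markdown"][:500])  # preview the first 500 characters

    asyncio.run(main())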