# File size: 1,936 Bytes
# e51e040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import logging
import aiohttp
from bs4 import BeautifulSoup
from html2text import HTML2Text
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse
from crawl4ai import *


# Configure logging: runs at import time, requesting INFO level via
# logging.basicConfig, then creates the logger used by this module
# (named after the module through __name__).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class WebsiteViewerError(Exception):
    """Raised when a website's content could not be fetched or processed."""

async def fetch_website_content(url: str) -> Dict:
    """
    Crawl a web page with AsyncWebCrawler and summarise the result.

    If ``url`` does not start with ``http://`` or ``https://``, the
    ``https://`` scheme is prepended before crawling.

    Args:
        url (str): The URL of the website to fetch

    Returns:
        Dict: A dictionary containing:
            - title: Text of the page's <title> element, or '' when the
              page has no title or the title has no single text node (str)
            - markdown: The crawler result's ``markdown`` attribute
            - links: The crawler result's ``links`` attribute
            - media: The crawler result's ``media`` attribute
            - url: The URL that was crawled, including any added scheme (str)

    Raises:
        WebsiteViewerError: If anything fails while crawling or parsing.
            The underlying exception is chained as ``__cause__``.
    """
    try:
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=url,
            )
            soup = BeautifulSoup(result.html, 'html.parser')

            # `.string` is None when <title> is empty or has nested children,
            # so fall back to '' to keep the documented str type. str() also
            # detaches the value from the parse tree.
            title_node = soup.title.string if soup.title else None
            title = str(title_node) if title_node is not None else ''

            # Links and media are passed through exactly as the crawler
            # reports them; no extra normalisation happens here.
            return {
                "title": title,
                "markdown": result.markdown,
                "links": result.links,
                "media": result.media,
                "url": url
            }

    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the root
        # cause attached for callers that inspect __cause__.
        logger.exception("Error fetching website content: %s", e)
        raise WebsiteViewerError(f"Failed to fetch website content: {str(e)}") from e