Spaces:

simar007
/

web-scraper

Running

File size: 1,343 Bytes

ae4572b

# scraper.py
import urllib.request
from bs4 import BeautifulSoup

def extract_content(url):
    """Fetch a web page and extract its main content.

    Args:
        url: Address of the page to download. It is passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        A dict with the following keys, or ``None`` if anything fails while
        fetching or parsing the page (the error is printed, not raised):

        - ``"headings"``: text of every ``h1``-``h6`` tag, grouped by level
          (all ``h1`` first, then all ``h2``, ...), not in document order.
        - ``"paragraphs"``: text of every non-empty ``p`` tag.
        - ``"images"``: ``src`` attribute of every ``img`` tag that has one.
        - ``"links"``: ``href`` attribute of every ``a`` tag that has one.
        - ``"text"``: all text of the page joined with single spaces.

        Image and link values are returned exactly as written in the HTML,
        so relative URLs stay relative.
    """
    try:
        # NOTE(review): urlopen also accepts non-HTTP schemes (e.g. file://).
        # If `url` can come from untrusted users, validate the scheme first.
        #
        # The context manager closes the connection even if read() raises;
        # previously the response was never closed.
        with urllib.request.urlopen(url) as response:
            page_data = response.read()

        soup = BeautifulSoup(page_data, "html5lib")

        # Headings, collected level by level (h1 ... h6).
        headings = [
            heading.get_text(strip=True)
            for level in range(1, 7)
            for heading in soup.find_all(f"h{level}")
        ]

        # Paragraphs: compute each text once and drop the empty ones.
        paragraph_texts = (p.get_text(strip=True) for p in soup.find_all("p"))
        paragraphs = [text for text in paragraph_texts if text]

        # Images and hyperlinks: only tags that actually carry the attribute.
        images = [img["src"] for img in soup.find_all("img", src=True)]
        links = [a["href"] for a in soup.find_all("a", href=True)]

        # Whole-page text, whitespace-normalised per text node.
        text = soup.get_text(separator=" ", strip=True)

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
        }

    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure with None instead of propagating to the caller.
        print("❌ Error while fetching webpage:", e)
        return None