# scraper.py
import urllib.request

from bs4 import BeautifulSoup


def extract_content(url, timeout=10):
    """Fetch a web page and extract its main content.

    Args:
        url: The URL of the page to fetch.
        timeout: Socket timeout in seconds for the HTTP request
            (default 10) so a dead server cannot hang the caller.

    Returns:
        A dict with keys:
            - "headings": text of all h1-h6 tags
            - "paragraphs": non-empty paragraph texts
            - "images": src attributes of all <img> tags that have one
            - "links": href attributes of all <a> tags that have one
            - "text": all visible text, space-separated
        or None if fetching/parsing the page fails.
    """
    try:
        # Use a context manager so the HTTP response is always closed,
        # even if read() or parsing raises (the original leaked it).
        with urllib.request.urlopen(url, timeout=timeout) as response:
            page_data = response.read()
        soup = BeautifulSoup(page_data, "html5lib")
    except Exception as e:
        # Broad catch kept deliberately: callers rely on the
        # None-on-error contract rather than exceptions.
        print("❌ Error while fetching webpage:", e)
        return None

    # Headings h1 through h6, in document order per level.
    headings = []
    for level in range(1, 7):
        headings += [h.get_text(strip=True) for h in soup.find_all(f'h{level}')]

    # Paragraphs, skipping ones that are empty after stripping.
    paragraphs = [p.get_text(strip=True)
                  for p in soup.find_all('p')
                  if p.get_text(strip=True)]

    # Image sources and hyperlink targets; the attribute filters
    # (src=True / href=True) avoid KeyError on tags missing them.
    images = [img['src'] for img in soup.find_all('img', src=True)]
    links = [a['href'] for a in soup.find_all('a', href=True)]

    # All visible text flattened to a single space-separated string.
    text = soup.get_text(separator=' ', strip=True)

    return {
        "headings": headings,
        "paragraphs": paragraphs,
        "images": images,
        "links": links,
        "text": text,
    }