# scraper.py
import urllib.request

from bs4 import BeautifulSoup


def extract_content(url, timeout=10):
    """Fetch a web page and extract its main content.

    Args:
        url: The URL of the page to fetch.
        timeout: Socket timeout in seconds for the HTTP request
            (default 10) so a dead server cannot hang the caller.

    Returns:
        A dict with keys:
            - "headings": text of all h1-h6 tags
            - "paragraphs": non-empty paragraph texts
            - "images": src attributes of all <img> tags that have one
            - "links": href attributes of all <a> tags that have one
            - "text": all visible text, space-separated
        or None if fetching/parsing the page fails.
    """
    try:
        # Use a context manager so the HTTP response is always closed,
        # even if read() or parsing raises (the original leaked it).
        with urllib.request.urlopen(url, timeout=timeout) as response:
            page_data = response.read()
        soup = BeautifulSoup(page_data, "html5lib")
    except Exception as e:
        # Broad catch kept deliberately: callers rely on the
        # None-on-error contract rather than exceptions.
        print("❌ Error while fetching webpage:", e)
        return None

    # Headings h1 through h6, in document order per level.
    headings = []
    for level in range(1, 7):
        headings += [h.get_text(strip=True) for h in soup.find_all(f'h{level}')]

    # Paragraphs, skipping ones that are empty after stripping.
    paragraphs = [p.get_text(strip=True)
                  for p in soup.find_all('p')
                  if p.get_text(strip=True)]

    # Image sources and hyperlink targets; the attribute filters
    # (src=True / href=True) avoid KeyError on tags missing them.
    images = [img['src'] for img in soup.find_all('img', src=True)]
    links = [a['href'] for a in soup.find_all('a', href=True)]

    # All visible text flattened to a single space-separated string.
    text = soup.get_text(separator=' ', strip=True)

    return {
        "headings": headings,
        "paragraphs": paragraphs,
        "images": images,
        "links": links,
        "text": text,
    }