# web-scraper / scraper.py
# simar007's picture
# Upload 3 files
# ae4572b verified
# scraper.py
import urllib.request
from bs4 import BeautifulSoup
def extract_content(url, timeout=30.0):
    """Fetch a web page and extract its main pieces of content.

    Args:
        url: Address of the page to download; passed straight to
            ``urllib.request.urlopen``.
        timeout: Maximum number of seconds to wait on the network before
            giving up. Without a timeout a stalled server would block the
            caller indefinitely.

    Returns:
        A dict with the following keys, or ``None`` if anything goes wrong
        while fetching or parsing (the error is printed, not raised):

        - ``"headings"``: text of every ``h1``-``h6`` element, grouped by
          level (all ``h1`` first, then all ``h2``, ...).
        - ``"paragraphs"``: text of every non-empty ``p`` element.
        - ``"images"``: the raw ``src`` value of every ``img`` that has one
          (relative URLs are not resolved).
        - ``"links"``: the raw ``href`` value of every ``a`` that has one
          (relative URLs are not resolved).
        - ``"text"``: the text of the whole document, joined with single
          spaces.
    """
    try:
        # The context manager closes the HTTP connection even if read() or
        # the parsing below raises.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            page_data = response.read()

        soup = BeautifulSoup(page_data, "html5lib")

        # Headings: one pass per level keeps the level-grouped ordering.
        headings = [
            heading.get_text(strip=True)
            for level in range(1, 7)
            for heading in soup.find_all(f"h{level}")
        ]

        # Paragraphs: compute each text once, then drop the empty ones.
        paragraph_texts = (p.get_text(strip=True) for p in soup.find_all("p"))
        paragraphs = [text for text in paragraph_texts if text]

        # Images and hyperlinks: only tags that actually carry the attribute.
        images = [img["src"] for img in soup.find_all("img", src=True)]
        links = [a["href"] for a in soup.find_all("a", href=True)]

        # Full document text.
        text = soup.get_text(separator=" ", strip=True)

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
        }
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on getting None
        # (plus a printed message) for any fetch or parse failure.
        print("❌ Error while fetching webpage:", e)
        return None