Spaces:

simar007
/

web-scraper

Running

web-scraper / scraper.py

Upload 3 files

ae4572b verified 3 months ago

1.34 kB

	# scraper.py
	import urllib.request
	from bs4 import BeautifulSoup

	def extract_content(url, timeout=10):
	    """
	    Fetch a web page and extract its main pieces of content.

	    The page is downloaded with ``urllib.request.urlopen`` and parsed by
	    BeautifulSoup using the ``html5lib`` parser.

	    Args:
	        url: Address of the page to fetch; passed to ``urlopen`` unchanged.
	        timeout: Seconds to wait on blocking network operations before
	            giving up, so that a stalled server cannot hang the caller
	            indefinitely. Defaults to 10.

	    Returns:
	        A dict with the following keys, or ``None`` if fetching or parsing
	        fails for any reason (the error is printed to stdout):

	        - "headings": stripped text of every h1-h6 element, grouped by
	          level (all h1 first, then all h2, and so on)
	        - "paragraphs": stripped text of every <p> element whose text is
	          not empty
	        - "images": the ``src`` value of every <img> that has one, exactly
	          as written in the page
	        - "links": the ``href`` value of every <a> that has one, exactly
	          as written in the page
	        - "text": the stripped text fragments of the whole document,
	          joined by single spaces
	    """
	    try:
	        # NOTE(review): urlopen's default opener also accepts non-HTTP
	        # schemes (file:, ftp:, data:). If `url` can come from an untrusted
	        # source, validate the scheme before calling this function.
	        # The context manager closes the response even when read() raises.
	        with urllib.request.urlopen(url, timeout=timeout) as response:
	            page_data = response.read()

	        soup = BeautifulSoup(page_data, "html5lib")

	        # Headings: collected one level at a time, so the result is ordered
	        # by heading level rather than by position in the document.
	        headings = [
	            heading.get_text(strip=True)
	            for level in range(1, 7)
	            for heading in soup.find_all(f'h{level}')
	        ]

	        # Paragraphs: extract each text once, then drop the empty ones.
	        paragraph_texts = (p.get_text(strip=True) for p in soup.find_all('p'))
	        paragraphs = [text for text in paragraph_texts if text]

	        # Images and hyperlinks: tags lacking the attribute are skipped.
	        images = [img['src'] for img in soup.find_all('img', src=True)]
	        links = [a['href'] for a in soup.find_all('a', href=True)]

	        return {
	            "headings": headings,
	            "paragraphs": paragraphs,
	            "images": images,
	            "links": links,
	            "text": soup.get_text(separator=' ', strip=True),
	        }

	    except Exception as e:
	        # Deliberate best-effort boundary: any network, HTTP or parsing
	        # failure is reported and turned into a None result for the caller.
	        print("❌ Error while fetching webpage:", e)
	        return None