Spaces:

selfDotOsman
/

Web-sight

Sleeping

Web-sight / scraper.py

done

54e8517 over 1 year ago

1.02 kB

	import requests
	from bs4 import BeautifulSoup
	import re

	def extract_info(url, timeout=10):
	    """Fetch a web page and return its main readable text as one string.

	    The page is downloaded, boilerplate elements (navigation, header,
	    footer, sidebars, scripts, styles) are removed, and the text of every
	    paragraph and heading inside the most likely "main content" container
	    is collected.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up. Passed
	            straight to ``requests.get``; without it a stalled server
	            would block the caller forever.

	    Returns:
	        The non-empty paragraph/heading texts joined by single spaces
	        (possibly an empty string), or ``None`` if the page could not be
	        fetched (the error is printed).
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()
	    except requests.RequestException as e:
	        print(f"Error fetching the website: {e}")
	        return None

	    soup = BeautifulSoup(response.text, 'html.parser')

	    # Remove common noise elements
	    for element in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
	        element.decompose()

	    # Find the main content area (adjust as needed for specific websites).
	    # The class pattern uses an unescaped "|" so it is a real alternation;
	    # an escaped "\|" would only match a literal pipe character and the
	    # <div> fallback would never fire. If nothing matches, fall back to
	    # <body>, and finally to the whole document for pages without <body>
	    # so that find_all is never called on None.
	    main_content = (
	        soup.find('main')
	        or soup.find('article')
	        or soup.find('div', class_=re.compile(r'content|main|body'))
	        or soup.body
	        or soup
	    )

	    texts = (
	        elem.get_text(strip=True)
	        for elem in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
	    )
	    # Ignore empty paragraphs
	    return ' '.join(text for text in texts if text)