Web-sight / scraper.py
selfDotOsman's picture
done
54e8517
import requests
from bs4 import BeautifulSoup
import re
def extract_info(url, timeout=10):
    """Fetch a web page and return the text of its main content area.

    The page is downloaded, navigation/boilerplate elements are removed, and
    the text of every paragraph and heading inside the main content area is
    collected and joined with single spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server would
            block the caller indefinitely.

    Returns:
        The extracted text as a single string (empty if the page contains no
        paragraphs or headings), or ``None`` if the page could not be
        fetched. Fetch errors are printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the website: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove common noise elements
    for element in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
        element.decompose()

    # Find the main content area (adjust as needed for specific websites).
    # html.parser does not synthesize a <body>, so for fragments or empty
    # responses soup.body is None; fall back to the whole document instead
    # of failing on None.find_all below.
    main_content = (
        soup.find('main')
        or soup.find('article')
        or soup.find('div', class_=re.compile(r'content|main|body'))
        or soup.body
        or soup
    )

    important_text = []
    for elem in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        # get_text(strip=True) strips every text node and glues them together
        # with no separator, turning "Hello <b>world</b>" into "Helloworld".
        # Take the raw text and normalize whitespace so word boundaries
        # across inline tags are preserved.
        text = ' '.join(elem.get_text().split())
        if text:  # Ignore empty paragraphs
            important_text.append(text)

    return ' '.join(important_text)