Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
def extract_info(url, *, timeout=10):
    """Fetch a web page and return its main readable text as one string.

    The page is downloaded, navigation/boilerplate elements are stripped,
    a likely "main content" container is located, and the text of every
    paragraph and heading inside it is joined with single spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server would
            block the caller indefinitely.

    Returns:
        The extracted text (possibly an empty string when the page has no
        paragraphs or headings), or ``None`` if the page could not be
        fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching the website: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove common noise elements
    for element in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
        element.decompose()

    # Find the main content area (adjust as needed for specific websites)
    main_content = (
        soup.find('main')
        or soup.find('article')
        or soup.find('div', class_=re.compile(r'content|main|body'))
        or soup.body
    )
    if main_content is None:
        # Empty or fragmentary documents have no <body>; there is nothing
        # to extract, so report "no text" instead of raising AttributeError.
        return ''

    texts = (
        elem.get_text(strip=True)
        for elem in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    )
    # Ignore empty paragraphs
    return ' '.join(text for text in texts if text)