File size: 1,019 Bytes
54e8517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import requests
from bs4 import BeautifulSoup
import re

def extract_info(url, timeout=10):
    """Fetch a web page and return the text of its main content area.

    The page is downloaded with ``requests``, parsed with BeautifulSoup's
    ``html.parser``, stripped of ``nav``/``header``/``footer``/``aside``/
    ``script``/``style`` elements, and the text of every paragraph and
    heading (``p``, ``h1``-``h6``) inside the main content area is joined
    with single spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Value passed to ``requests.get(timeout=...)``. Without a
            timeout a non-responsive server would block the call forever.

    Returns:
        The extracted text as a single string (empty if no paragraph or
        heading text was found), or ``None`` if the request failed or the
        server answered with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the website: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove common noise elements
    for element in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
        element.decompose()

    # Find the main content area (adjust as needed for specific websites).
    # html.parser does not add a missing <body>, so for fragments or empty
    # responses soup.body is None; fall back to the whole document instead
    # of crashing on None.find_all().
    main_content = (
        soup.find('main')
        or soup.find('article')
        or soup.find('div', class_=re.compile(r'content|main|body'))
        or soup.body
        or soup
    )

    text_elements = main_content.find_all(
        ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    )
    stripped_texts = (elem.get_text(strip=True) for elem in text_elements)

    # Ignore empty paragraphs/headings so the result has no doubled spaces.
    return ' '.join(text for text in stripped_texts if text)