File size: 2,382 Bytes
cbcc9fd
 
 
 
 
50eb7ef
cbcc9fd
50eb7ef
 
ae85af3
cbcc9fd
 
 
 
 
 
50eb7ef
 
 
 
cbcc9fd
 
 
 
 
ae85af3
cbcc9fd
 
50eb7ef
 
cbcc9fd
50eb7ef
 
 
 
 
 
 
 
cbcc9fd
 
 
50eb7ef
 
 
 
 
cbcc9fd
ae85af3
50eb7ef
 
cbcc9fd
 
50eb7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from functools import lru_cache

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

def fetch_webpage_content(url):
    """Fetch the raw HTML content of a webpage.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response body as a string, or None if the request fails
        (network error, timeout, or non-2xx status).
    """
    try:
        # timeout prevents hanging indefinitely on an unresponsive server
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

def parse_and_segment_content(html_content, max_chunk=500):
    """Parse HTML and segment its headline/paragraph text into word-limited chunks.

    Only <h1> and <p> elements are extracted. The text is split on
    sentence-ending punctuation so chunks end on sentence boundaries
    where possible.

    Args:
        html_content: Raw HTML string; may be None or empty.
        max_chunk: Soft maximum number of words per chunk (default 500).
            A single sentence longer than this still becomes one chunk.

    Returns:
        A list of plain-text chunks; [] if html_content is falsy.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    elements = soup.find_all(['h1', 'p'])
    text = ' '.join(element.text for element in elements)

    # Mark sentence boundaries so we can split on whole sentences
    # instead of cutting a chunk mid-sentence.
    text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
    sentences = text.split('<eos>')

    chunks = []
    current_words = []
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        # Close the current chunk if adding this sentence would exceed the limit.
        if current_words and len(current_words) + len(words) > max_chunk:
            chunks.append(' '.join(current_words))
            current_words = []
        current_words.extend(words)
    if current_words:
        chunks.append(' '.join(current_words))
    return chunks

@lru_cache(maxsize=1)
def _get_summarizer():
    """Build the summarization pipeline once; the model load is expensive."""
    return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


def summarize_text(chunks):
    """Summarize the given text chunks with a DistilBART model.

    Args:
        chunks: List of text strings to summarize independently.

    Returns:
        All chunk summaries joined by spaces, or a placeholder message
        when chunks is empty. A chunk that fails to summarize contributes
        an error placeholder instead of aborting the whole run.
    """
    if not chunks:
        return "No content to summarize."

    summarizer = _get_summarizer()
    summaries = []
    print("Summarizing content")
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Deliberate best-effort: log and continue with the remaining chunks.
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")

    return ' '.join(summaries)

# Example usage
# url = "https://example.com"
# html_content = fetch_webpage_content(url)
# if html_content:
#     chunks = parse_and_segment_content(html_content)
#     summary = summarize_text(chunks)
#     print(summary)
# else:
#     print("Failed to fetch or parse webpage content.")