File size: 2,382 Bytes
cbcc9fd
 
 
 
 
50eb7ef
cbcc9fd
50eb7ef
 
ae85af3
cbcc9fd
 
 
 
 
 
50eb7ef
 
 
 
cbcc9fd
 
 
 
 
ae85af3
cbcc9fd
 
50eb7ef
 
cbcc9fd
50eb7ef
 
 
 
 
 
 
 
cbcc9fd
 
 
50eb7ef
 
 
 
 
cbcc9fd
ae85af3
50eb7ef
 
cbcc9fd
 
50eb7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from functools import lru_cache

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

def fetch_webpage_content(url):
    """Fetch the raw HTML content of a webpage.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response body as a string, or None if the request fails
        (network error, timeout, or non-2xx status).
    """
    try:
        # timeout prevents hanging indefinitely on an unresponsive server
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

def parse_and_segment_content(html_content, max_chunk=500):
    """Parse HTML and segment its headline/paragraph text into word-limited chunks.

    Only <h1> and <p> elements are extracted. The text is split on
    sentence-ending punctuation so chunks end on sentence boundaries
    where possible.

    Args:
        html_content: Raw HTML string; may be None or empty.
        max_chunk: Soft maximum number of words per chunk (default 500).
            A single sentence longer than this still becomes one chunk.

    Returns:
        A list of plain-text chunks; [] if html_content is falsy.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    elements = soup.find_all(['h1', 'p'])
    text = ' '.join(element.text for element in elements)

    # Mark sentence boundaries so we can split on whole sentences
    # instead of cutting a chunk mid-sentence.
    text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
    sentences = text.split('<eos>')

    chunks = []
    current_words = []
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        # Close the current chunk if adding this sentence would exceed the limit.
        if current_words and len(current_words) + len(words) > max_chunk:
            chunks.append(' '.join(current_words))
            current_words = []
        current_words.extend(words)
    if current_words:
        chunks.append(' '.join(current_words))
    return chunks

@lru_cache(maxsize=1)
def _get_summarizer():
    """Build the summarization pipeline once; the model load is expensive."""
    return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


def summarize_text(chunks):
    """Summarize the given text chunks with a DistilBART model.

    Args:
        chunks: List of text strings to summarize independently.

    Returns:
        All chunk summaries joined by spaces, or a placeholder message
        when chunks is empty. A chunk that fails to summarize contributes
        an error placeholder instead of aborting the whole run.
    """
    if not chunks:
        return "No content to summarize."

    summarizer = _get_summarizer()
    summaries = []
    print("Summarizing content")
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Deliberate best-effort: log and continue with the remaining chunks.
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")

    return ' '.join(summaries)

# Example usage
# url = "https://example.com"
# html_content = fetch_webpage_content(url)
# if html_content:
#     chunks = parse_and_segment_content(html_content)
#     summary = summarize_text(chunks)
#     print(summary)
# else:
#     print("Failed to fetch or parse webpage content.")