Spaces:
Runtime error
Runtime error
enstazao
committed on
Commit
·
50eb7ef
1
Parent(s):
9a4b00c
updated code
Browse files
main.py
CHANGED
|
@@ -2,53 +2,66 @@ from transformers import pipeline
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
import requests
|
| 4 |
|
| 5 |
-
# @desc Get the content of the web page
|
| 6 |
def fetch_webpage_content(url):
|
|
|
|
| 7 |
try:
|
| 8 |
-
response = requests.get(url)
|
| 9 |
-
response.raise_for_status() # Raises an
|
| 10 |
return response.text
|
| 11 |
except requests.exceptions.RequestException as e:
|
| 12 |
print(f"Error fetching the webpage: {e}")
|
| 13 |
return None
|
| 14 |
|
| 15 |
-
# @desc Get the chunks of the content from the the scrapped content
|
| 16 |
def parse_and_segment_content(html_content):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 18 |
results = soup.find_all(['h1', 'p'])
|
| 19 |
text = ' '.join([result.text for result in results])
|
| 20 |
text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
|
| 21 |
sentences = text.split('<eos>')
|
| 22 |
-
|
| 23 |
max_chunk = 500
|
| 24 |
chunks = []
|
| 25 |
-
current_chunk =
|
| 26 |
-
|
| 27 |
for sentence in sentences:
|
| 28 |
-
if len(sentence)
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
chunks.append(sentence.split(' '))
|
| 37 |
-
|
| 38 |
-
for chunk_id in range(len(chunks)):
|
| 39 |
-
chunks[chunk_id] = ' '.join(chunks[chunk_id]).strip()
|
| 40 |
-
|
| 41 |
return chunks
|
| 42 |
|
| 43 |
-
# @desc Summarize the content and then return that back
|
| 44 |
def summarize_text(chunks):
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
summaries = []
|
| 47 |
-
|
| 48 |
-
|
|
|
|
| 49 |
summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
|
| 50 |
summaries.append(summary[0]['summary_text'])
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
import requests
|
| 4 |
|
|
|
|
| 5 |
def fetch_webpage_content(url):
    """Fetch the raw HTML body of *url*.

    Args:
        url: The address to fetch.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status).
    """
    try:
        # 10-second deadline on connect/read so a slow or unresponsive
        # server cannot hang the app indefinitely.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise for 4xx/5xx responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
| 14 |
|
|
|
|
| 15 |
def parse_and_segment_content(html_content):
    """Split scraped HTML into word-bounded text chunks for summarization.

    Extracts the text of every <h1> and <p> tag, marks likely sentence
    boundaries with an '<eos>' token, then packs whole sentences into
    chunks of at most 500 words each.

    Args:
        html_content: Raw HTML string (may be ``None``/empty).

    Returns:
        A list of chunk strings; an empty list when *html_content* is falsy.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    combined = ' '.join(tag.text for tag in soup.find_all(['h1', 'p']))

    # Tag sentence-ending punctuation so we can split on it below.
    for mark in ('.', '!', '?'):
        combined = combined.replace(mark, mark + '<eos>')

    max_chunk = 500          # maximum number of words per chunk
    word_chunks = []         # each entry is a list of words

    for sentence in combined.split('<eos>'):
        words = sentence.split()
        if not words:
            # Skip whitespace-only fragments between boundary markers.
            continue
        # Open a new chunk when none exists yet or this sentence would
        # push the current one past the word budget.
        if not word_chunks or len(word_chunks[-1]) + len(words) > max_chunk:
            word_chunks.append([])
        word_chunks[-1].extend(words)

    return [' '.join(words).strip() for words in word_chunks]
| 40 |
|
|
|
|
| 41 |
def summarize_text(chunks):
    """Summarize each text chunk and join the summaries into one string.

    Args:
        chunks: List of text strings (as produced by
            ``parse_and_segment_content``).

    Returns:
        The chunk summaries joined with spaces, or
        ``"No content to summarize."`` when *chunks* is empty/None.
        A chunk that fails to summarize contributes the placeholder
        ``"Error summarizing text."`` so the rest of the output survives.
    """
    if not chunks:
        return "No content to summarize."

    # Building the pipeline loads the full model from disk, which is
    # expensive -- cache it on the function so repeated calls reuse it
    # instead of reloading the model every time.
    summarizer = getattr(summarize_text, "_summarizer", None)
    if summarizer is None:
        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
        summarize_text._summarizer = summarizer

    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Best-effort: log and keep going so one bad chunk does not
            # abort the whole summary.
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")  # Keep the flow even if summarization fails

    return ' '.join(summaries)
|
| 58 |
+
|
| 59 |
+
# Example usage
|
| 60 |
+
# url = "https://example.com"
|
| 61 |
+
# html_content = fetch_webpage_content(url)
|
| 62 |
+
# if html_content:
|
| 63 |
+
# chunks = parse_and_segment_content(html_content)
|
| 64 |
+
# summary = summarize_text(chunks)
|
| 65 |
+
# print(summary)
|
| 66 |
+
# else:
|
| 67 |
+
# print("Failed to fetch or parse webpage content.")
|