Spaces:
Runtime error
Runtime error
enstazao
committed on
Commit
·
50eb7ef
1
Parent(s):
9a4b00c
updated code
Browse files
main.py
CHANGED
|
@@ -2,53 +2,66 @@ from transformers import pipeline
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
import requests
|
| 4 |
|
| 5 |
-
# @desc Get the content of the web page
|
| 6 |
def fetch_webpage_content(url):
|
|
|
|
| 7 |
try:
|
| 8 |
-
response = requests.get(url)
|
| 9 |
-
response.raise_for_status() # Raises an
|
| 10 |
return response.text
|
| 11 |
except requests.exceptions.RequestException as e:
|
| 12 |
print(f"Error fetching the webpage: {e}")
|
| 13 |
return None
|
| 14 |
|
| 15 |
-
# @desc Get the chunks of the content from the the scrapped content
|
| 16 |
def parse_and_segment_content(html_content):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 18 |
results = soup.find_all(['h1', 'p'])
|
| 19 |
text = ' '.join([result.text for result in results])
|
| 20 |
text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
|
| 21 |
sentences = text.split('<eos>')
|
| 22 |
-
|
| 23 |
max_chunk = 500
|
| 24 |
chunks = []
|
| 25 |
-
current_chunk =
|
| 26 |
-
|
| 27 |
for sentence in sentences:
|
| 28 |
-
if len(sentence)
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
chunks.append(sentence.split(' '))
|
| 37 |
-
|
| 38 |
-
for chunk_id in range(len(chunks)):
|
| 39 |
-
chunks[chunk_id] = ' '.join(chunks[chunk_id]).strip()
|
| 40 |
-
|
| 41 |
return chunks
|
| 42 |
|
| 43 |
-
# @desc Summarize the content and then return that back
|
| 44 |
def summarize_text(chunks):
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
summaries = []
|
| 47 |
-
|
| 48 |
-
|
|
|
|
| 49 |
summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
|
| 50 |
summaries.append(summary[0]['summary_text'])
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
import requests
|
| 4 |
|
|
|
|
| 5 |
def fetch_webpage_content(url):
    """Fetch the raw HTML body of *url*.

    Args:
        url: The address to fetch.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status).
    """
    try:
        # 10-second deadline on connect/read so a slow or unresponsive
        # server cannot hang the app indefinitely.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise for 4xx/5xx responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
| 14 |
|
|
|
|
| 15 |
def parse_and_segment_content(html_content):
    """Split scraped HTML into word-bounded text chunks for summarization.

    Extracts the text of every <h1> and <p> tag, marks likely sentence
    boundaries with an '<eos>' token, then packs whole sentences into
    chunks of at most 500 words each.

    Args:
        html_content: Raw HTML string (may be ``None``/empty).

    Returns:
        A list of chunk strings; an empty list when *html_content* is falsy.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    combined = ' '.join(tag.text for tag in soup.find_all(['h1', 'p']))

    # Tag sentence-ending punctuation so we can split on it below.
    for mark in ('.', '!', '?'):
        combined = combined.replace(mark, mark + '<eos>')

    max_chunk = 500          # maximum number of words per chunk
    word_chunks = []         # each entry is a list of words

    for sentence in combined.split('<eos>'):
        words = sentence.split()
        if not words:
            # Skip whitespace-only fragments between boundary markers.
            continue
        # Open a new chunk when none exists yet or this sentence would
        # push the current one past the word budget.
        if not word_chunks or len(word_chunks[-1]) + len(words) > max_chunk:
            word_chunks.append([])
        word_chunks[-1].extend(words)

    return [' '.join(words).strip() for words in word_chunks]
| 40 |
|
|
|
|
| 41 |
def summarize_text(chunks):
    """Summarize each text chunk and join the summaries into one string.

    Args:
        chunks: List of text strings (as produced by
            ``parse_and_segment_content``).

    Returns:
        The chunk summaries joined with spaces, or
        ``"No content to summarize."`` when *chunks* is empty/None.
        A chunk that fails to summarize contributes the placeholder
        ``"Error summarizing text."`` so the rest of the output survives.
    """
    if not chunks:
        return "No content to summarize."

    # Building the pipeline loads the full model from disk, which is
    # expensive -- cache it on the function so repeated calls reuse it
    # instead of reloading the model every time.
    summarizer = getattr(summarize_text, "_summarizer", None)
    if summarizer is None:
        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
        summarize_text._summarizer = summarizer

    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=50, min_length=30, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Best-effort: log and keep going so one bad chunk does not
            # abort the whole summary.
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")  # Keep the flow even if summarization fails

    return ' '.join(summaries)
|
| 58 |
+
|
| 59 |
+
# Example usage
|
| 60 |
+
# url = "https://example.com"
|
| 61 |
+
# html_content = fetch_webpage_content(url)
|
| 62 |
+
# if html_content:
|
| 63 |
+
# chunks = parse_and_segment_content(html_content)
|
| 64 |
+
# summary = summarize_text(chunks)
|
| 65 |
+
# print(summary)
|
| 66 |
+
# else:
|
| 67 |
+
# print("Failed to fetch or parse webpage content.")
|