Spaces:
Runtime error
Runtime error
| from transformers import pipeline | |
| from bs4 import BeautifulSoup | |
| import requests | |
def fetch_webpage_content(url):
    """Fetch the raw HTML of a webpage.

    Args:
        url: The URL to request.

    Returns:
        The response body as text, or None if the request failed.
    """
    try:
        # 10-second timeout guards against hanging on unresponsive hosts.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise for 4xx/5xx status codes.
        return response.text
    except requests.exceptions.RequestException as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error fetching the webpage: {e}")
        return None
def parse_and_segment_content(html_content, max_chunk=500):
    """Parse HTML and segment its visible text into word-limited chunks.

    Extracts the text of all <h1> and <p> tags, splits it into rough
    sentences on '.', '!' and '?', then greedily packs sentences into
    chunks of at most ``max_chunk`` words (a single sentence longer than
    the limit still becomes its own chunk).

    Args:
        html_content: Raw HTML string, or a falsy value.
        max_chunk: Maximum number of words per chunk (default 500).

    Returns:
        A list of chunk strings; an empty list when there is no content.
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    results = soup.find_all(['h1', 'p'])
    text = ' '.join(result.text for result in results)
    # Crude sentence segmentation: tag sentence-final punctuation with a
    # marker, then split on it. (Abbreviations/decimals will over-split.)
    text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
    sentences = text.split('<eos>')
    print("Doing segmentation")
    chunks = []  # each chunk is a list of words while being built
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue  # skip empty / whitespace-only fragments
        # Open a new chunk when none exists yet, or when adding this
        # sentence would exceed the word budget of the current one.
        if not chunks or len(chunks[-1]) + len(words) > max_chunk:
            chunks.append([])
        chunks[-1].extend(words)
    return [' '.join(chunk) for chunk in chunks]
def summarize_text(chunks, *, max_length=50, min_length=30):
    """Summarize each text chunk and join the summaries into one string.

    Args:
        chunks: List of text chunks to summarize.
        max_length: Upper bound (in tokens) for each chunk's summary.
        min_length: Lower bound (in tokens) for each chunk's summary.

    Returns:
        The concatenated chunk summaries, or a fallback message when
        there is nothing to summarize.
    """
    if not chunks:
        return "No content to summarize."
    # Build the summarization pipeline once, outside the per-chunk loop.
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    summaries = []
    print("Summarizing content")
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Best-effort: record a placeholder so the output stays
            # aligned with the input chunks instead of aborting the run.
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")
    return ' '.join(summaries)
| # Example usage | |
| # url = "https://example.com" | |
| # html_content = fetch_webpage_content(url) | |
| # if html_content: | |
| # chunks = parse_and_segment_content(html_content) | |
| # summary = summarize_text(chunks) | |
| # print(summary) | |
| # else: | |
| # print("Failed to fetch or parse webpage content.") | |