Spaces:

VoltIC
/

News_Summarizer

Sleeping

App Files Files Community

News_Summarizer / app.py

VoltIC

Update app.py

eae07dc verified 3 days ago

raw

history blame contribute delete

5.81 kB

	import gradio as gr
	import torch
	from transformers import pipeline
	import nltk
	from newspaper import Article, Config
	import re
	import cloudscraper
	from fpdf import FPDF
	import os

	# Ensure NLTK data is available: download both tokenizer resources at
	# start-up (split_text() calls nltk.sent_tokenize).
	for _nltk_resource in ('punkt', 'punkt_tab'):
	    nltk.download(_nltk_resource)

	# ── Load Model ──
	# Use device 0 when torch reports CUDA, otherwise pass -1.
	# (For HF spaces, CPU is the default unless you upgrade to a GPU space)
	_device_index = 0 if torch.cuda.is_available() else -1

	# Module-level summarization pipeline shared by every request.
	summarizer = pipeline(
	    "summarization",
	    model="facebook/bart-large-cnn",
	    device=_device_index,
	)

	# ── Fetch Article from URL ──
	def get_article(url, timeout=30):
	    """Download *url* and return the article text extracted from the page.

	    The page is fetched through a cloudscraper session created with a
	    desktop Chrome-on-Windows browser profile; the response HTML is then
	    handed to newspaper's ``Article`` for parsing.

	    Args:
	        url: Address of the article page. Leading/trailing whitespace
	            (common when a link is pasted) is removed before use.
	        timeout: Seconds to wait on the HTTP request before giving up.
	            Without it a stalled server would block the request forever.

	    Returns:
	        The ``text`` attribute of the parsed article.

	    Raises:
	        Any exception raised by the scraper or the parser (connection
	        errors, timeouts, ...) propagates to the caller.
	    """
	    url = url.strip()

	    scraper = cloudscraper.create_scraper(browser={
	        'browser': 'chrome',
	        'platform': 'windows',
	        'desktop': True,
	    })
	    # NOTE: the HTTP status code is not inspected; whatever HTML comes back
	    # (including an error or block page) is parsed like a normal article.
	    response = scraper.get(url, timeout=timeout)

	    article = Article(url)
	    article.set_html(response.text)
	    article.parse()
	    return article.text

	# ── Split Text ──
	def split_text(text, max_words=500):
	    """Group the sentences of *text* into chunks of at most *max_words* words.

	    Sentences are found with ``nltk.sent_tokenize`` and are never split:
	    a chunk is closed just before the sentence that would push it past
	    the limit. The only chunk that can exceed *max_words* is one made of a
	    single sentence that is itself longer than the limit.

	    Args:
	        text: The text to split.
	        max_words: Upper bound on whitespace-separated words per chunk.

	    Returns:
	        A list of chunk strings (sentences re-joined with single spaces),
	        in original order. Empty or whitespace-only input gives ``[]``.
	    """
	    if not text or not text.strip():
	        return []

	    chunks = []
	    current = []
	    current_words = 0  # running total, so no chunk is re-joined just to count it

	    for sentence in nltk.sent_tokenize(text):
	        sentence_words = len(sentence.split())

	        # Close the current chunk *before* it would overflow the limit.
	        if current and current_words + sentence_words > max_words:
	            chunks.append(" ".join(current))
	            current = []
	            current_words = 0

	        current.append(sentence)
	        current_words += sentence_words

	    if current:
	        chunks.append(" ".join(current))

	    return chunks

	# ── Summarize ──
	def summarize_text(text):
	    """Summarize *text* chunk by chunk and return the joined summaries.

	    Text with fewer than 30 whitespace-separated words is not summarized;
	    the sentinel string ``"Article too short to summarize."`` is returned
	    instead. Otherwise the text is cut into chunks with ``split_text`` and
	    each chunk is passed to the module-level ``summarizer`` pipeline with
	    length bounds derived from the chunk's word count.

	    Args:
	        text: The article text.

	    Returns:
	        The chunk summaries joined with single spaces, or the sentinel
	        string described above.
	    """
	    if len(text.split()) < 30:
	        return "Article too short to summarize."

	    summaries = []

	    for chunk in split_text(text):
	        words = len(chunk.split())
	        if not words:
	            # Nothing to summarize in a blank chunk.
	            continue

	        # Aim for 30%-60% of the chunk's length, capped at 200.
	        max_len = min(200, int(words * 0.6))
	        min_len = int(words * 0.3)

	        if min_len >= max_len:
	            min_len = max_len - 10

	        min_len = max(min_len, 10)

	        # For chunks shorter than 19 words (typically a short trailing
	        # chunk) the floor of 10 on min_len meets or exceeds max_len.
	        # Widen max_len so the bounds given to the model are satisfiable.
	        if max_len <= min_len:
	            max_len = min_len + 10

	        result = summarizer(
	            chunk,
	            max_length=max_len,
	            min_length=min_len,
	            do_sample=False,
	            # Guard against a chunk that is longer than the model's input limit.
	            truncation=True,
	        )
	        summaries.append(result[0]['summary_text'])

	    return " ".join(summaries)

	# ── Save PDF ──
	def save_summary_to_pdf(summary_text):
	    """Write *summary_text* to a PDF file and return the path of that file.

	    The file name is built from the first five words of the summary
	    (ASCII letters and digits only, joined with underscores), falling back
	    to ``summary`` when nothing usable remains. Each call writes into its
	    own freshly created temporary directory, so two requests whose
	    summaries start with the same words cannot overwrite each other's file
	    and nothing is written next to the application code.

	    Args:
	        summary_text: The summary to render.

	    Returns:
	        Path of the generated ``.pdf`` file.
	    """
	    import tempfile  # local import: not among the file-level imports

	    # Derive a readable base name; cap its length so unusually long "words"
	    # (e.g. a URL) cannot produce an over-long file name.
	    cleaned_words = (
	        re.sub(r'[^a-zA-Z0-9]', '', word) for word in summary_text.split()[:5]
	    )
	    base_name = "_".join(word for word in cleaned_words if word)[:80] or "summary"

	    # NOTE(review): Gradio is expected to serve files returned from the
	    # system temp directory by default - confirm for the pinned version.
	    out_dir = tempfile.mkdtemp(prefix="news_summary_")
	    filename = os.path.join(out_dir, base_name + ".pdf")

	    # Map typographic quotes and dashes to ASCII, then replace anything
	    # still outside latin-1 with '?', because the text is rendered with
	    # the built-in 'helvetica' font below.
	    ascii_punctuation = str.maketrans({
	        '\u2019': "'",
	        '\u2018': "'",
	        '\u201c': '"',
	        '\u201d': '"',
	        '\u2013': '-',
	        '\u2014': '-',
	    })
	    clean_text = summary_text.translate(ascii_punctuation)
	    clean_text = clean_text.encode('latin-1', 'replace').decode('latin-1')

	    pdf = FPDF()
	    pdf.add_page()

	    # Centered bold heading followed by a small gap.
	    pdf.set_font('helvetica', 'B', 16)
	    pdf.cell(0, 10, 'News Summary', align='C', ln=1)
	    pdf.ln(5)

	    # Body text, wrapped across the full page width.
	    pdf.set_font('helvetica', '', 12)
	    pdf.multi_cell(0, 8, clean_text)

	    pdf.output(filename)
	    return filename

	# ── Gradio Inference Functions ──
	def process_article(input_text):
	    """Gradio handler: summarize pasted text and build its PDF.

	    Returns a ``(message, pdf_path)`` pair. On success the message is the
	    summary and ``pdf_path`` is whatever ``save_summary_to_pdf`` returned.
	    For blank input, text that ``summarize_text`` reports as too short, or
	    any exception, the message is a user-facing explanation and the second
	    item is ``None``.
	    """
	    too_short_marker = "Article too short to summarize."

	    try:
	        stripped = input_text.strip()
	        if len(stripped) == 0:
	            return "Please provide some text to summarize.", None

	        summary = summarize_text(input_text)

	        if summary != too_short_marker:
	            return summary, save_summary_to_pdf(summary)

	        # summarize_text signalled that there was not enough text.
	        return (
	            "The provided text was too short to summarize. Please paste a longer block of text.",
	            None,
	        )

	    except Exception as exc:
	        # UI boundary: show the failure to the user instead of crashing.
	        return f"An error occurred: {exc!s}", None

	def process_url(url):
	    """Gradio handler: fetch an article by URL, then summarize it.

	    Returns a ``(message, pdf_path)`` pair. A blank URL, an extraction that
	    yields fewer than 30 words, or any exception produces a user-facing
	    message with ``None`` as the second item; otherwise the extracted text
	    is passed on to ``process_article`` and its result is returned.
	    """
	    try:
	        if len(url.strip()) == 0:
	            return "Please provide a valid URL.", None

	        article_text = get_article(url)

	        # Missing, blank and very short extractions all count as failures:
	        # a falsy or whitespace-only result simply has zero words.
	        word_count = len(article_text.split()) if article_text else 0
	        if word_count < 30:
	            return "Could not extract a full article from the provided URL. The website might be blocking Hugging Face servers from downloading the text (like a paywall or an anti-bot filter) or the page contains virtually no text formatting. Please copy the text and try the 'Via Text' tab.", None

	        return process_article(article_text)

	    except Exception as exc:
	        # UI boundary: show the failure to the user instead of crashing.
	        return f"An error occurred fetching the URL: {exc!s}", None

	# ── Gradio UI ──
	# Build the Gradio interface: two input tabs (article URL / pasted text),
	# a summary textbox and a PDF download slot, plus the two button handlers.
	# NOTE(review): this copy of the file has lost its indentation, so the
	# nesting of the `with` blocks below cannot be read from the text.
	# Presumably it is Row > (input Column containing the Tabs, output
	# Column), with the .click() wiring inside the Blocks context - confirm
	# against the original app.py.
	with gr.Blocks(title="News & Text Summarizer") as demo:
	gr.Markdown("# 📰 News & Text Summarizer")
	gr.Markdown("Choose whether to summarize via a URL link or by pasting the raw text. You can download the result as a PDF!")

	with gr.Row():
	# Input side: one tab per input method.
	with gr.Column():
	with gr.Tabs():
	with gr.TabItem("🔗 Via URL"):
	gr.Markdown("Works best with: BBC, NPR, The Guardian, Reuters, TechCrunch, & standard blogs.\nMight fail/block on: Times of India, NYT, WSJ, Forbes. (Use the 'Via Text' tab for these!)")
	url_input = gr.Textbox(lines=1, placeholder="Paste News Article URL Here...", label="Article URL")
	url_submit_btn = gr.Button("Summarize Link", variant="primary")

	with gr.TabItem("📝 Via Text"):
	text_input = gr.Textbox(lines=10, placeholder="Copy and paste the article text here...", label="Article Text Input")
	text_submit_btn = gr.Button("Summarize Text", variant="primary")

	# Output side: summary text and the downloadable PDF.
	with gr.Column():
	summary_output = gr.Textbox(label="Generated Summary", lines=8)
	pdf_output = gr.File(label="Download PDF Summary")

	# Both handlers return a (message, pdf path or None) pair, which fills
	# the two output components in order.
	url_submit_btn.click(
	fn=process_url,
	inputs=url_input,
	outputs=[summary_output, pdf_output]
	)

	text_submit_btn.click(
	fn=process_article,
	inputs=text_input,
	outputs=[summary_output, pdf_output]
	)

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	demo.launch()