# News_Summarizer / app.py
# VoltIC's picture
# Update app.py
# eae07dc verified
import gradio as gr
import torch
from transformers import pipeline
import nltk
from newspaper import Article, Config
import re
import cloudscraper
from fpdf import FPDF
import os
# ── Tokenizer Data ──
# Make sure the NLTK resources used for sentence splitting are available.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# ── Load Model ──
# (For HF spaces, CPU is the default unless you upgrade to a GPU space)
# Device index 0 is used when CUDA is available; otherwise -1 is passed.
_device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=_device,
)
# ── Fetch Article from URL ──
def get_article(url, timeout=30):
    """Download *url* and return the article text extracted from the page.

    The page is fetched through a cloudscraper session configured with a
    desktop Chrome-on-Windows browser profile, and the returned HTML is
    handed to newspaper's ``Article`` for parsing.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled server would block the request forever.

    Returns:
        The parsed ``Article.text``. Callers should be prepared for an
        empty or very short string when extraction finds little content.

    Raises:
        Whatever the scraper or parser raises (network errors, timeouts,
        parse failures) is propagated unchanged to the caller.
    """
    scraper = cloudscraper.create_scraper(
        browser={
            'browser': 'chrome',
            'platform': 'windows',
            'desktop': True,
        }
    )
    response = scraper.get(url, timeout=timeout)

    # Parse the HTML we already downloaded instead of letting newspaper
    # fetch the page a second time with its own HTTP client.
    article = Article(url)
    article.set_html(response.text)
    article.parse()
    return article.text
# ── Split Text ──
def split_text(text, max_words=500):
    """Split *text* into sentence-aligned chunks of at most *max_words* words.

    Sentences (as found by ``nltk.sent_tokenize``) are packed greedily: a
    chunk is closed as soon as adding the next sentence would push it past
    the limit, so sentences are never cut in half.

    Args:
        text: The full text to split.
        max_words: Upper bound on whitespace-separated words per chunk.

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space. Empty input yields an empty list. A single sentence
        that is itself longer than *max_words* is kept intact as its own
        chunk, which is the only case where a chunk exceeds the limit.
    """
    chunks = []
    current, current_words = [], 0
    for sentence in nltk.sent_tokenize(text):
        sentence_words = len(sentence.split())
        # Close the chunk *before* it overflows; closing it afterwards
        # would let every chunk exceed max_words by up to one sentence.
        if current and current_words + sentence_words > max_words:
            chunks.append(" ".join(current))
            current, current_words = [], 0
        current.append(sentence)
        # Running total avoids re-joining and re-splitting the whole chunk
        # for every sentence.
        current_words += sentence_words
    if current:
        chunks.append(" ".join(current))
    return chunks
# ── Summarize ──
def summarize_text(text):
    """Summarize *text* chunk by chunk and return the joined summaries.

    The text is split with ``split_text`` and each chunk is passed to the
    module-level ``summarizer`` pipeline with length bounds derived from
    the chunk's word count (roughly 30%-60% of it, capped at 200).

    Args:
        text: The article text to summarize.

    Returns:
        The chunk summaries joined by single spaces, or the literal string
        ``"Article too short to summarize."`` when *text* has fewer than
        30 whitespace-separated words.
    """
    if len(text.split()) < 30:
        return "Article too short to summarize."

    summaries = []
    for chunk in split_text(text):
        words = len(chunk.split())
        max_len = min(200, int(words * 0.6))
        min_len = int(words * 0.3)
        if min_len >= max_len:
            min_len = max_len - 10
        min_len = max(min_len, 10)
        # A short chunk (e.g. a trailing sentence) yields max_len <= 10,
        # which the floor on min_len would then meet or exceed. Widen
        # max_len so the bounds handed to the model are always valid.
        if max_len <= min_len:
            max_len = min_len + 10

        result = summarizer(
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            # Word count only approximates token count; truncate rather
            # than fail if a chunk exceeds the model's input limit.
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)
# ── Save PDF ──
def save_summary_to_pdf(summary_text):
    """Write *summary_text* to a PDF file and return the file name.

    The file name is built from the first five words of the summary with
    every non-alphanumeric character removed, joined by underscores; if
    nothing is left, ``summary`` is used. The file is written relative to
    the current working directory.

    Args:
        summary_text: The summary to render.

    Returns:
        The name of the PDF file that was written.
    """
    # Derive the file name from the leading words, dropping words that
    # have no alphanumeric characters at all.
    stems = [re.sub(r'[^a-zA-Z0-9]', '', word) for word in summary_text.split()[:5]]
    filename = "_".join(stem for stem in stems if stem) or "summary"
    filename += ".pdf"

    # Map typographic quotes and dashes to plain equivalents, then replace
    # anything still outside latin-1 with '?' (presumably because the
    # built-in PDF font cannot render those characters).
    punctuation_map = str.maketrans({
        '\u2019': "'",
        '\u2018': "'",
        '\u201c': '"',
        '\u201d': '"',
        '\u2013': '-',
        '\u2014': '-',
    })
    clean_text = summary_text.translate(punctuation_map)
    clean_text = clean_text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()

    # Centered bold heading followed by a small gap.
    pdf.set_font('helvetica', 'B', 16)
    pdf.cell(0, 10, 'News Summary', align='C', ln=1)
    pdf.ln(5)

    # Body text, wrapped across the page width.
    pdf.set_font('helvetica', '', 12)
    pdf.multi_cell(0, 8, clean_text)

    # Save the PDF to the local directory
    pdf.output(filename)
    return filename
# ── Gradio Inference Functions ──
def process_article(input_text):
    """Summarize pasted text and produce a downloadable PDF.

    Args:
        input_text: Raw article text from the UI. ``None``, empty and
            whitespace-only values are all treated as "nothing provided".

    Returns:
        A ``(message, pdf_path)`` tuple. On success ``message`` is the
        summary and ``pdf_path`` the generated PDF file name. On any
        failure ``message`` is a user-facing explanation and ``pdf_path``
        is ``None``.
    """
    try:
        # Guard against None as well as blank input so a missing value
        # gets the friendly prompt instead of a raw AttributeError message.
        if not input_text or not input_text.strip():
            return "Please provide some text to summarize.", None

        final_summary = summarize_text(input_text)
        # summarize_text signals "too short" with this sentinel string.
        if final_summary == "Article too short to summarize.":
            error_msg = "The provided text was too short to summarize. Please paste a longer block of text."
            return error_msg, None

        pdf_filename = save_summary_to_pdf(final_summary)
        return final_summary, pdf_filename
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing
        # the request.
        return f"An error occurred: {str(e)}", None
def process_url(url):
    """Fetch an article from *url*, summarize it and produce a PDF.

    Args:
        url: Article address from the UI. ``None``, empty and
            whitespace-only values are all treated as "nothing provided".
            Surrounding whitespace is removed before fetching.

    Returns:
        A ``(message, pdf_path)`` tuple as returned by ``process_article``
        on success. When the URL is missing, the page yields fewer than
        30 words of text, or fetching fails, ``message`` is a user-facing
        explanation and ``pdf_path`` is ``None``.
    """
    try:
        # Guard against None as well as blank input so a missing value
        # gets the friendly prompt instead of a raw AttributeError message.
        if not url or not url.strip():
            return "Please provide a valid URL.", None

        # Pasted links often carry stray spaces or newlines.
        article_text = get_article(url.strip())

        # An empty or whitespace-only string splits into zero words, so
        # the word-count check also covers the "nothing extracted" case.
        if not article_text or len(article_text.split()) < 30:
            return "Could not extract a full article from the provided URL. The website might be blocking Hugging Face servers from downloading the text (like a paywall or an anti-bot filter) or the page contains virtually no text formatting. Please copy the text and try the 'Via Text' tab.", None

        return process_article(article_text)
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing
        # the request.
        return f"An error occurred fetching the URL: {str(e)}", None
# ── Gradio UI ──
# Layout: a two-column row. The left column holds the two input tabs (URL
# and raw text); the right column shows the summary and the PDF download.
with gr.Blocks(title="News & Text Summarizer") as demo:
    gr.Markdown("# 📰 News & Text Summarizer")
    gr.Markdown("Choose whether to summarize via a URL link or by pasting the raw text. You can download the result as a PDF!")

    with gr.Row():
        # Left column: input tabs.
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("🔗 Via URL"):
                    gr.Markdown("*Works best with:* **BBC, NPR, The Guardian, Reuters, TechCrunch, & standard blogs.**\n*Might fail/block on:* **Times of India, NYT, WSJ, Forbes.** (Use the 'Via Text' tab for these!)")
                    url_input = gr.Textbox(
                        lines=1,
                        placeholder="Paste News Article URL Here...",
                        label="Article URL",
                    )
                    url_submit_btn = gr.Button("Summarize Link", variant="primary")

                with gr.TabItem("📝 Via Text"):
                    text_input = gr.Textbox(
                        lines=10,
                        placeholder="Copy and paste the article text here...",
                        label="Article Text Input",
                    )
                    text_submit_btn = gr.Button("Summarize Text", variant="primary")

        # Right column: results shared by both tabs.
        with gr.Column():
            summary_output = gr.Textbox(label="Generated Summary", lines=8)
            pdf_output = gr.File(label="Download PDF Summary")

    # Both buttons write to the same pair of output components.
    url_submit_btn.click(
        fn=process_url,
        inputs=url_input,
        outputs=[summary_output, pdf_output],
    )
    text_submit_btn.click(
        fn=process_article,
        inputs=text_input,
        outputs=[summary_output, pdf_output],
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()