"""Gradio app that summarizes news articles (from a URL or pasted text).

The summary is produced with the ``facebook/bart-large-cnn`` model and can be
downloaded as a PDF.
"""

import os
import re
import tempfile

import cloudscraper
import gradio as gr
import nltk
import torch
from fpdf import FPDF
from newspaper import Article, Config
from transformers import pipeline

# Ensure NLTK data is available
nltk.download('punkt')
nltk.download('punkt_tab')

# Sentinel returned by summarize_text() when the input is too short; the
# Gradio handlers compare against it, so it lives in exactly one place.
TOO_SHORT_MESSAGE = "Article too short to summarize."

# Minimum number of whitespace-separated words needed to attempt a summary.
MIN_ARTICLE_WORDS = 30

# Upper bound (seconds) for fetching a page, so a stalled server cannot hang
# the request forever.
REQUEST_TIMEOUT_SECONDS = 30

# Typographic characters mapped to plain equivalents before the text is
# forced into latin-1 for the PDF core font.
_PDF_CHAR_MAP = str.maketrans({
    '\u2019': "'",
    '\u2018': "'",
    '\u201c': '"',
    '\u201d': '"',
    '\u2013': '-',
    '\u2014': '-',
})

# ── Load Model ──
# (For HF spaces, CPU is the default unless you upgrade to a GPU space)
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1
)


# ── Fetch Article from URL ──
def get_article(url):
    """Download *url* and return the article body text extracted from it.

    The page is fetched with a cloudscraper session configured as desktop
    Chrome on Windows, then the HTML is handed to newspaper's ``Article``
    for parsing.

    NOTE(review): *url* comes straight from the user and is fetched
    server-side; consider restricting schemes/hosts if this is deployed
    somewhere with access to internal services.
    """
    scraper = cloudscraper.create_scraper(browser={
        'browser': 'chrome',
        'platform': 'windows',
        'desktop': True
    })
    response = scraper.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    article = Article(url)
    article.set_html(response.text)
    article.parse()
    return article.text


# ── Split Text ──
def split_text(text, max_words=500):
    """Split *text* into chunks of whole sentences of at most *max_words* words.

    A chunk is closed *before* adding a sentence that would push it past
    *max_words*, so chunks only exceed the limit when a single sentence is
    itself longer than *max_words*.

    Returns a list of strings (empty when *text* contains no sentences).
    """
    chunks = []
    current = []
    current_words = 0
    for sentence in nltk.sent_tokenize(text):
        sentence_words = len(sentence.split())
        if current and current_words + sentence_words > max_words:
            chunks.append(" ".join(current))
            current, current_words = [], 0
        current.append(sentence)
        current_words += sentence_words
    if current:
        chunks.append(" ".join(current))
    return chunks


def _summary_length_bounds(word_count):
    """Return ``(min_length, max_length)`` generation bounds for a chunk.

    Targets roughly 30%-60% of the chunk's word count, capped at 200, while
    guaranteeing ``10 <= min_length < max_length`` even for tiny chunks.
    """
    max_len = max(min(200, int(word_count * 0.6)), 20)
    min_len = min(max(int(word_count * 0.3), 10), max_len - 10)
    return min_len, max_len


# ── Summarize ──
def summarize_text(text):
    """Summarize *text* chunk by chunk and return the joined summaries.

    Returns ``TOO_SHORT_MESSAGE`` when *text* has fewer than
    ``MIN_ARTICLE_WORDS`` words.
    """
    if len(text.split()) < MIN_ARTICLE_WORDS:
        return TOO_SHORT_MESSAGE

    summaries = []
    for chunk in split_text(text):
        min_len, max_len = _summary_length_bounds(len(chunk.split()))
        result = summarizer(
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            # A single very long sentence can still exceed the model's input
            # window; truncate instead of failing.
            truncation=True
        )
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)


# ── Save PDF ──
def save_summary_to_pdf(summary_text):
    """Render *summary_text* into a PDF and return the path to the file.

    The file name is built from the first five words of the summary
    (alphanumerics only), falling back to ``summary.pdf``. Each PDF is
    written into its own temporary directory so that summaries starting
    with the same words do not overwrite each other.
    """
    cleaned_words = (
        re.sub(r'[^a-zA-Z0-9]', '', word) for word in summary_text.split()[:5]
    )
    filename = "_".join(word for word in cleaned_words if word)
    if not filename:
        filename = "summary"
    filename += ".pdf"

    # The built-in helvetica font only covers latin-1: normalize common
    # typographic punctuation, then replace anything still unencodable.
    clean_text = summary_text.translate(_PDF_CHAR_MAP)
    clean_text = clean_text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('helvetica', 'B', 16)
    pdf.cell(0, 10, 'News Summary', align='C', ln=1)
    pdf.ln(5)
    pdf.set_font('helvetica', '', 12)
    pdf.multi_cell(0, 8, clean_text)

    # Save the PDF into a fresh temporary directory
    output_path = os.path.join(tempfile.mkdtemp(prefix="summary_"), filename)
    pdf.output(output_path)
    return output_path


# ── Gradio Inference Functions ──
def process_article(input_text):
    """Gradio handler: summarize pasted text.

    Returns a ``(message_or_summary, pdf_path_or_None)`` tuple. Any
    exception is reported as a message rather than raised, so the UI
    always receives something to display.
    """
    try:
        if not input_text or not input_text.strip():
            return "Please provide some text to summarize.", None

        final_summary = summarize_text(input_text)
        if final_summary == TOO_SHORT_MESSAGE:
            error_msg = "The provided text was too short to summarize. Please paste a longer block of text."
            return error_msg, None

        pdf_filename = save_summary_to_pdf(final_summary)
        return final_summary, pdf_filename
    except Exception as e:
        return f"An error occurred: {str(e)}", None


def process_url(url):
    """Gradio handler: fetch the article at *url* and summarize it.

    Returns a ``(message_or_summary, pdf_path_or_None)`` tuple. Fetch or
    parse failures are reported as a message rather than raised.
    """
    try:
        if not url or not url.strip():
            return "Please provide a valid URL.", None

        article_text = get_article(url.strip())
        if not article_text or len(article_text.split()) < MIN_ARTICLE_WORDS:
            return (
                "Could not extract a full article from the provided URL. "
                "The website might be blocking Hugging Face servers from "
                "downloading the text (like a paywall or an anti-bot filter) "
                "or the page contains virtually no text formatting. "
                "Please copy the text and try the 'Via Text' tab."
            ), None

        return process_article(article_text)
    except Exception as e:
        return f"An error occurred fetching the URL: {str(e)}", None


# ── Gradio UI ──
with gr.Blocks(title="News & Text Summarizer") as demo:
    gr.Markdown("# 📰 News & Text Summarizer")
    gr.Markdown("Choose whether to summarize via a URL link or by pasting the raw text. You can download the result as a PDF!")

    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("🔗 Via URL"):
                    gr.Markdown("*Works best with:* **BBC, NPR, The Guardian, Reuters, TechCrunch, & standard blogs.**\n*Might fail/block on:* **Times of India, NYT, WSJ, Forbes.** (Use the 'Via Text' tab for these!)")
                    url_input = gr.Textbox(lines=1, placeholder="Paste News Article URL Here...", label="Article URL")
                    url_submit_btn = gr.Button("Summarize Link", variant="primary")
                with gr.TabItem("📝 Via Text"):
                    text_input = gr.Textbox(lines=10, placeholder="Copy and paste the article text here...", label="Article Text Input")
                    text_submit_btn = gr.Button("Summarize Text", variant="primary")

        with gr.Column():
            summary_output = gr.Textbox(label="Generated Summary", lines=8)
            pdf_output = gr.File(label="Download PDF Summary")

    # Both tabs feed the same pair of output widgets.
    url_submit_btn.click(
        fn=process_url,
        inputs=url_input,
        outputs=[summary_output, pdf_output]
    )
    text_submit_btn.click(
        fn=process_article,
        inputs=text_input,
        outputs=[summary_output, pdf_output]
    )

if __name__ == "__main__":
    demo.launch()