Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import pipeline | |
| import nltk | |
| from newspaper import Article, Config | |
| import re | |
| import cloudscraper | |
| from fpdf import FPDF | |
| import os | |
# Fetch the NLTK tokenizer resources this app relies on (sentence splitting
# in split_text goes through nltk.sent_tokenize).
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
| # ── Load Model ── | |
| # (For HF spaces, CPU is the default unless you upgrade to a GPU space) | |
# Use CUDA device 0 when torch can see a GPU; -1 otherwise.
_pipeline_device = 0 if torch.cuda.is_available() else -1

# Shared summarization pipeline, built once at import time and reused for
# every request.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=_pipeline_device,
)
| # ── Fetch Article from URL ── | |
def get_article(url, timeout=30):
    """Download a page and return the article text extracted from it.

    The page is fetched through a cloudscraper session configured with a
    desktop Chrome-on-Windows browser profile. The HTML is then handed to
    newspaper's ``Article`` via ``set_html`` so that newspaper only parses
    and does not perform its own download.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait on the HTTP request before giving up.
            Without a timeout an unresponsive host would block the calling
            worker indefinitely.

    Returns:
        The extracted article body (``article.text``); may be empty when
        the parser finds no article content.

    Raises:
        Any exception raised by the HTTP request (including timeouts) or by
        the parser is propagated unchanged to the caller.
    """
    browser_profile = {
        'browser': 'chrome',
        'platform': 'windows',
        'desktop': True,
    }
    # The scraper is a requests-style session; the context manager releases
    # its pooled connections once the page has been read.
    with cloudscraper.create_scraper(browser=browser_profile) as scraper:
        response = scraper.get(url, timeout=timeout)
        html = response.text

    article = Article(url)
    article.set_html(html)
    article.parse()
    return article.text
| # ── Split Text ── | |
def split_text(text, max_words=500):
    """Split text into sentence-aligned chunks of at most ``max_words`` words.

    Sentences are never cut: a chunk is closed just before the sentence that
    would push it past the limit. The only chunk that can exceed
    ``max_words`` is one made of a single sentence that is itself longer
    than the limit.

    Args:
        text: The text to split.
        max_words: Upper bound on whitespace-separated words per chunk.

    Returns:
        A list of chunk strings, each the space-joined sentences it holds.
        Empty when the tokenizer yields no sentences.
    """
    chunks = []
    current = []
    current_words = 0  # running total, avoids re-joining the chunk each step

    for sentence in nltk.sent_tokenize(text):
        sentence_words = len(sentence.split())
        # Close the chunk *before* it overflows; checking after the append
        # would let every chunk exceed the limit by up to one sentence.
        if current and current_words + sentence_words > max_words:
            chunks.append(" ".join(current))
            current = []
            current_words = 0
        current.append(sentence)
        current_words += sentence_words

    if current:
        chunks.append(" ".join(current))
    return chunks
| # ── Summarize ── | |
def summarize_text(text):
    """Summarize text chunk by chunk and join the partial summaries.

    The text is split with ``split_text`` and each chunk is passed to the
    module-level ``summarizer`` pipeline. Length bounds are derived from the
    chunk's word count: at most 60% of it (capped at 200) and at least 30%
    of it (floored at 10).

    Args:
        text: The article text to summarize.

    Returns:
        The space-joined chunk summaries, or the literal string
        ``"Article too short to summarize."`` when the input has fewer than
        30 words (callers compare against this exact string).
    """
    min_words = 30  # below this there is too little material to condense

    if len(text.split()) < min_words:
        return "Article too short to summarize."

    summaries = []
    for chunk in split_text(text):
        words = len(chunk.split())

        if words < min_words:
            # A short trailing chunk would yield max_length below the
            # min_length floor of 10 (an infeasible constraint), so keep it
            # verbatim rather than asking the model to pad it out.
            summaries.append(chunk)
            continue

        max_len = min(200, int(words * 0.6))
        min_len = int(words * 0.3)
        if min_len >= max_len:
            # Only reachable once max_len is capped at 200.
            min_len = max_len - 10
        min_len = max(min_len, 10)

        result = summarizer(
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            # A single very long sentence can still produce an oversized
            # chunk; truncate instead of overflowing the model's input.
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])

    return " ".join(summaries)
| # ── Save PDF ── | |
def save_summary_to_pdf(summary_text):
    """Write the summary to a PDF file and return the file name.

    The file name is built from the first five words of the summary with
    every non-alphanumeric character removed, joined by underscores, and
    falls back to ``summary`` when nothing usable remains. The PDF has a
    bold "News Summary" heading followed by the summary body.

    Args:
        summary_text: The summary to render.

    Returns:
        The name of the PDF file, written relative to the current working
        directory.
    """
    max_stem_chars = 100  # keep the name well under filesystem limits

    # Sanitize each leading word once and drop the ones that end up empty.
    sanitized = (
        re.sub(r'[^a-zA-Z0-9]', '', word) for word in summary_text.split()[:5]
    )
    stem = "_".join(part for part in sanitized if part)
    # Unusually long tokens (e.g. a URL) could otherwise produce a name the
    # filesystem rejects.
    stem = stem[:max_stem_chars].rstrip("_")
    filename = (stem or "summary") + ".pdf"
    # NOTE(review): two summaries with the same leading words map to the same
    # file in the working directory and overwrite each other - confirm this
    # is acceptable for concurrent users.

    # Map typographic quotes and dashes to ASCII in one pass, then replace
    # anything else outside Latin-1 with '?'.
    # NOTE(review): presumably needed because the built-in helvetica font is
    # limited to Latin-1 - confirm against the installed fpdf version.
    ascii_punctuation = str.maketrans({
        '\u2019': "'",
        '\u2018': "'",
        '\u201c': '"',
        '\u201d': '"',
        '\u2013': '-',
        '\u2014': '-',
    })
    clean_text = summary_text.translate(ascii_punctuation)
    clean_text = clean_text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('helvetica', 'B', 16)
    pdf.cell(0, 10, 'News Summary', align='C', ln=1)
    pdf.ln(5)
    pdf.set_font('helvetica', '', 12)
    pdf.multi_cell(0, 8, clean_text)

    # Save the PDF to the local directory
    pdf.output(filename)
    return filename
| # ── Gradio Inference Functions ── | |
def process_article(input_text):
    """Summarize pasted text and render the summary to a PDF.

    This is a Gradio handler: it never raises, and reports every failure
    through the first element of the returned pair.

    Args:
        input_text: The raw article text; ``None``, empty, or
            whitespace-only input is rejected with a prompt.

    Returns:
        A ``(message, pdf_path)`` tuple. On success ``message`` is the
        summary and ``pdf_path`` the generated file name; otherwise
        ``message`` explains the problem and ``pdf_path`` is ``None``.
    """
    try:
        # `not input_text` also covers None, which would otherwise surface
        # as an opaque AttributeError message.
        if not input_text or not input_text.strip():
            return "Please provide some text to summarize.", None

        final_summary = summarize_text(input_text)
        # summarize_text signals "too short" with this exact string.
        if final_summary == "Article too short to summarize.":
            error_msg = "The provided text was too short to summarize. Please paste a longer block of text."
            return error_msg, None

        pdf_filename = save_summary_to_pdf(final_summary)
        return final_summary, pdf_filename
    except Exception as e:
        # UI boundary: show the failure in the output box instead of crashing.
        return f"An error occurred: {e}", None
def process_url(url):
    """Fetch an article from a URL, summarize it, and render a PDF.

    This is a Gradio handler: it never raises, and reports every failure
    through the first element of the returned pair.

    Args:
        url: Address of the article; ``None``, empty, or whitespace-only
            input is rejected with a prompt. Surrounding whitespace is
            stripped before the request is made.

    Returns:
        A ``(message, pdf_path)`` tuple as produced by ``process_article``,
        or ``(explanation, None)`` when the URL is missing, the fetch fails,
        or fewer than 30 words could be extracted.
    """
    try:
        # Pasted URLs often carry stray whitespace or a trailing newline.
        cleaned_url = (url or "").strip()
        if not cleaned_url:
            return "Please provide a valid URL.", None

        article_text = get_article(cleaned_url)
        # Whitespace-only text splits into zero words, so the word count
        # alone covers the blank case.
        if not article_text or len(article_text.split()) < 30:
            return "Could not extract a full article from the provided URL. The website might be blocking Hugging Face servers from downloading the text (like a paywall or an anti-bot filter) or the page contains virtually no text formatting. Please copy the text and try the 'Via Text' tab.", None

        return process_article(article_text)
    except Exception as e:
        # UI boundary: show the failure in the output box instead of crashing.
        return f"An error occurred fetching the URL: {e}", None
| # ── Gradio UI ── | |
with gr.Blocks(title="News & Text Summarizer") as demo:
    # Page heading and a short usage hint.
    gr.Markdown("# 📰 News & Text Summarizer")
    gr.Markdown("Choose whether to summarize via a URL link or by pasting the raw text. You can download the result as a PDF!")

    with gr.Row():
        # Input column: one tab per input mode, each with its own button.
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("🔗 Via URL"):
                    gr.Markdown("*Works best with:* **BBC, NPR, The Guardian, Reuters, TechCrunch, & standard blogs.**\n*Might fail/block on:* **Times of India, NYT, WSJ, Forbes.** (Use the 'Via Text' tab for these!)")
                    url_input = gr.Textbox(
                        lines=1,
                        placeholder="Paste News Article URL Here...",
                        label="Article URL",
                    )
                    url_submit_btn = gr.Button("Summarize Link", variant="primary")

                with gr.TabItem("📝 Via Text"):
                    text_input = gr.Textbox(
                        lines=10,
                        placeholder="Copy and paste the article text here...",
                        label="Article Text Input",
                    )
                    text_submit_btn = gr.Button("Summarize Text", variant="primary")

        # Output column: the summary text plus the downloadable PDF.
        with gr.Column():
            summary_output = gr.Textbox(label="Generated Summary", lines=8)
            pdf_output = gr.File(label="Download PDF Summary")

    # Both buttons write to the same pair of output components.
    url_submit_btn.click(
        fn=process_url,
        inputs=url_input,
        outputs=[summary_output, pdf_output],
    )
    text_submit_btn.click(
        fn=process_article,
        inputs=text_input,
        outputs=[summary_output, pdf_output],
    )

if __name__ == "__main__":
    demo.launch()