import os
import re
import tempfile

import cloudscraper
import gradio as gr
import nltk
import torch
from fpdf import FPDF
from newspaper import Article, Config
from transformers import pipeline

# Fetch the sentence-tokenizer resources NLTK needs before anything calls it.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# ── Load Model ──
# Run on the first CUDA device when one is available, otherwise on CPU
# (device=-1). On HF Spaces, CPU is the default unless the Space is upgraded
# to GPU hardware.
_device_index = 0 if torch.cuda.is_available() else -1

summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    device=_device_index,
)

# ── Fetch Article from URL ──
def get_article(url, timeout=30):
    """Download ``url`` and return the article text extracted from its HTML.

    The page is fetched with a cloudscraper session configured to present
    itself as desktop Chrome on Windows, and the returned HTML is handed to
    newspaper's ``Article`` for parsing.

    Args:
        url: Address of the page to fetch. Surrounding whitespace (common
            when a link is pasted) is removed before the request is made.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Returns:
        The parsed ``article.text``. The HTTP status is not checked here, so
        an error or block page is parsed like any other page and typically
        yields little or no text.

    Raises:
        Whatever the scraper raises on network failure or timeout
        (presumably ``requests`` exceptions, since cloudscraper builds on
        requests -- verify against the installed version).
    """
    url = url.strip()

    # The session is used as a context manager so its connections are
    # released even when the request raises.
    with cloudscraper.create_scraper(browser={
        'browser': 'chrome',
        'platform': 'windows',
        'desktop': True
    }) as scraper:
        response = scraper.get(url, timeout=timeout)
        html = response.text

    article = Article(url)
    article.set_html(html)
    article.parse()
    return article.text

# ── Split Text ──
def split_text(text, max_words=500):
    """Split ``text`` into sentence-aligned chunks of at most ``max_words`` words.

    Sentences (as found by ``nltk.sent_tokenize``) are packed greedily: a
    chunk is closed *before* adding a sentence that would push it past the
    limit, so no chunk exceeds ``max_words``. A single sentence that is
    longer than the limit on its own (e.g. text with no punctuation) is cut
    on word boundaries into ``max_words``-sized pieces; its final, shorter
    piece stays open so following sentences can be packed onto it.

    Args:
        text: The text to split.
        max_words: Upper bound on whitespace-separated words per chunk.

    Returns:
        A list of chunk strings in original order; empty if ``text``
        contains no sentences.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    chunks = []
    current, current_words = [], 0  # open chunk and its running word count

    for sentence in nltk.sent_tokenize(text):
        words = sentence.split()
        if not words:
            continue

        if len(words) > max_words:
            # Oversized sentence: close the open chunk, then emit the
            # sentence in fixed-size word windows.
            if current:
                chunks.append(" ".join(current))
            pieces = [
                words[start:start + max_words]
                for start in range(0, len(words), max_words)
            ]
            chunks.extend(" ".join(piece) for piece in pieces[:-1])
            tail = pieces[-1]
            current, current_words = [" ".join(tail)], len(tail)
            continue

        # Close the open chunk first if this sentence would overflow it.
        if current and current_words + len(words) > max_words:
            chunks.append(" ".join(current))
            current, current_words = [], 0

        current.append(sentence)
        current_words += len(words)

    if current:
        chunks.append(" ".join(current))

    return chunks

# ── Summarize ──
def summarize_text(text):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    The text is split with ``split_text`` and every chunk is passed to the
    module-level ``summarizer`` pipeline with length bounds derived from the
    chunk's word count.

    Args:
        text: The article text to summarize.

    Returns:
        The chunk summaries joined with single spaces, or the literal string
        ``"Article too short to summarize."`` when ``text`` has fewer than
        30 whitespace-separated words.
    """
    if len(text.split()) < 30:
        return "Article too short to summarize."

    summaries = []

    for chunk in split_text(text):
        word_count = len(chunk.split())

        # Aim for roughly 30-60% of the chunk's word count, capped at 200.
        max_len = min(200, int(word_count * 0.6))
        min_len = int(word_count * 0.3)

        # Very long chunks hit the 200 cap; keep a 10-unit gap below it.
        if min_len >= max_len:
            min_len = max_len - 10

        # Never ask for fewer than 10.
        min_len = max(min_len, 10)

        # For short chunks (under ~18 words, e.g. a trailing sentence) the
        # floor above reaches or passes max_len, which is an infeasible
        # constraint for generation. Widen the window instead.
        if max_len <= min_len:
            max_len = min_len + 10

        result = summarizer(
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            # Guard against chunks that tokenize to more than the model's
            # maximum input length.
            truncation=True
        )
        summaries.append(result[0]['summary_text'])

    return " ".join(summaries)

# ── Save PDF ──
def save_summary_to_pdf(summary_text):
    """Render ``summary_text`` into a PDF and return the path of the file.

    The file name is built from the first five words of the summary
    (non-alphanumeric characters removed, words joined with underscores),
    falling back to ``summary.pdf`` when nothing usable remains. Each call
    writes into its own freshly created temporary directory so that
    concurrent or repeated requests producing the same file name cannot
    overwrite one another.

    Args:
        summary_text: The summary to place in the PDF body.

    Returns:
        Path of the written PDF as a string.
    """
    # Derive a readable, filesystem-safe file name from the leading words.
    stripped_words = (
        re.sub(r'[^a-zA-Z0-9]', '', word) for word in summary_text.split()[:5]
    )
    # Cap the length so an unusually long "word" cannot exceed file-name limits.
    stem = "_".join(part for part in stripped_words if part)[:100]
    if not stem:
        stem = "summary"

    # Map typographic quotes and dashes to ASCII, then replace anything else
    # outside Latin-1 with '?'. Presumably needed because the built-in
    # helvetica font cannot render other characters -- confirm against the
    # installed FPDF version.
    ascii_punctuation = str.maketrans({
        '\u2019': "'",
        '\u2018': "'",
        '\u201c': '"',
        '\u201d': '"',
        '\u2013': '-',
        '\u2014': '-',
    })
    clean_text = summary_text.translate(ascii_punctuation)
    clean_text = clean_text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()

    # Centered bold heading followed by a small vertical gap.
    pdf.set_font('helvetica', 'B', 16)
    pdf.cell(0, 10, 'News Summary', align='C', ln=1)
    pdf.ln(5)

    # Body text, wrapped across the full page width.
    pdf.set_font('helvetica', '', 12)
    pdf.multi_cell(0, 8, clean_text)

    # NOTE(review): Gradio's default file-access rules are expected to allow
    # serving files from the system temp directory -- confirm for the
    # deployed Gradio version.
    output_dir = tempfile.mkdtemp(prefix="news_summary_")
    pdf_path = os.path.join(output_dir, stem + ".pdf")
    pdf.output(pdf_path)
    return pdf_path

# ── Gradio Inference Functions ──
def process_article(input_text):
    """Summarize pasted text and produce a downloadable PDF of the result.

    Args:
        input_text: Raw article text from the UI. ``None``, empty and
            whitespace-only values are all treated as "nothing provided".

    Returns:
        A ``(message, pdf_path)`` tuple. On success ``message`` is the
        summary and ``pdf_path`` the generated PDF. On any failure
        ``message`` is a human-readable explanation and ``pdf_path`` is
        ``None``. Exceptions are never propagated, because this function is
        wired directly to a UI event.
    """
    try:
        # `not input_text` covers None as well as "", so a missing value
        # gets the friendly prompt rather than a raw AttributeError message.
        if not input_text or not input_text.strip():
            return "Please provide some text to summarize.", None

        final_summary = summarize_text(input_text)

        # summarize_text signals "too short" with this sentinel string.
        if final_summary == "Article too short to summarize.":
            error_msg = "The provided text was too short to summarize. Please paste a longer block of text."
            return error_msg, None

        pdf_filename = save_summary_to_pdf(final_summary)
        return final_summary, pdf_filename

    except Exception as e:
        # UI boundary: report the problem in the summary box instead of crashing.
        return f"An error occurred: {e}", None

def process_url(url):
    """Fetch the article at ``url``, summarize it and produce a PDF.

    Args:
        url: Address entered in the UI. ``None``, empty and whitespace-only
            values are all treated as "nothing provided"; surrounding
            whitespace is removed before the page is fetched.

    Returns:
        A ``(message, pdf_path)`` tuple with the same meaning as the one
        returned by ``process_article``. ``pdf_path`` is ``None`` whenever
        the URL is missing, fewer than 30 words could be extracted, or an
        error occurred.
    """
    try:
        cleaned_url = (url or "").strip()
        if not cleaned_url:
            return "Please provide a valid URL.", None

        article_text = get_article(cleaned_url)

        # A missing, blank or very short extraction all mean the same thing:
        # the real article body was not obtained.
        if len((article_text or "").split()) < 30:
            return (
                "Could not extract a full article from the provided URL. "
                "The website might be blocking Hugging Face servers from downloading the text "
                "(like a paywall or an anti-bot filter) or the page contains virtually no text formatting. "
                "Please copy the text and try the 'Via Text' tab."
            ), None

        return process_article(article_text)

    except Exception as e:
        # UI boundary: report the problem in the summary box instead of crashing.
        return f"An error occurred fetching the URL: {e}", None

# ── Gradio UI ──
# Build the page: a heading, then one row holding two columns -- the first
# with the input tabs (URL or pasted text), the second with the outputs.
with gr.Blocks(title="News & Text Summarizer") as demo:
    gr.Markdown("# 📰 News & Text Summarizer")
    gr.Markdown("Choose whether to summarize via a URL link or by pasting the raw text. You can download the result as a PDF!")
    
    with gr.Row():
        # First column: the two alternative input methods, one per tab.
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("🔗 Via URL"):
                    gr.Markdown("*Works best with:* **BBC, NPR, The Guardian, Reuters, TechCrunch, & standard blogs.**\n*Might fail/block on:* **Times of India, NYT, WSJ, Forbes.** (Use the 'Via Text' tab for these!)")
                    url_input = gr.Textbox(lines=1, placeholder="Paste News Article URL Here...", label="Article URL")
                    url_submit_btn = gr.Button("Summarize Link", variant="primary")
                    
                with gr.TabItem("📝 Via Text"):
                    text_input = gr.Textbox(lines=10, placeholder="Copy and paste the article text here...", label="Article Text Input")
                    text_submit_btn = gr.Button("Summarize Text", variant="primary")
            
        # Second column: outputs shared by both tabs.
        with gr.Column():
            summary_output = gr.Textbox(label="Generated Summary", lines=8)
            pdf_output = gr.File(label="Download PDF Summary")
            
    # Both buttons write to the same two outputs; each handler returns a
    # (text, file path or None) pair matching [summary_output, pdf_output].
    url_submit_btn.click(
        fn=process_url,
        inputs=url_input,
        outputs=[summary_output, pdf_output]
    )
    
    text_submit_btn.click(
        fn=process_article,
        inputs=text_input,
        outputs=[summary_output, pdf_output]
    )

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()