Spaces:

BasitAliii
/

Smart-PDF-Summarizer

Sleeping

App Files Files Community

BasitAliii committed on Nov 2, 2025

Commit

ddec509

verified ·

1 Parent(s): 108f6c2

Create app.py

Browse files

Files changed (1) hide show

app.py +247 -0

app.py ADDED Viewed

	@@ -0,0 +1,247 @@

+import os
+import re
+import tempfile
+from datetime import datetime
+import gradio as gr
+from transformers import pipeline
+import pdfplumber
+from gtts import gTTS
+import nltk
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+# ==========================================================
+# 🧠 NLTK Setup (Fix for punkt_tab)
+# ==========================================================
+# Check each sentence-tokenizer resource and download the ones nltk cannot
+# locate on its data path.
+for pkg in ("punkt", "punkt_tab"):
+    try:
+        nltk.data.find("tokenizers/" + pkg)
+    except LookupError:
+        # Resource is not available locally yet, so fetch it.
+        nltk.download(pkg)
+# ==========================================================
+# ⚙️ Model Setup
+# ==========================================================
+DEVICE = -1  # CPU (-1), 0 for GPU if available
+SUMMARIZER_MODEL = "facebook/bart-large-cnn"
+QA_MODEL = "deepset/roberta-base-squad2"
+def _load_pipeline(task, model_name):
+    """Build one transformers pipeline, or return None if loading fails.
+
+    Each model is loaded on its own so that a failure in one of them does
+    not throw away the other one when it loaded successfully.
+    """
+    try:
+        return pipeline(task, model=model_name, device=DEVICE)
+    except Exception as e:
+        print(f"Model load error ({task}, {model_name}):", e)
+        return None
+print("Loading models... please wait ⏳")
+# Either name may be None; the functions that use them check for that.
+summarizer = _load_pipeline("summarization", SUMMARIZER_MODEL)
+qa_pipeline = _load_pipeline("question-answering", QA_MODEL)
+# ==========================================================
+# 🧩 Utility Functions
+# ==========================================================
+# A "References" section heading: the word alone on its line, optionally
+# numbered ("7. References", "VII. REFERENCES") and optionally followed by a
+# colon. The match runs from that heading to the end of the text.
+_REFERENCES_SECTION_RE = re.compile(
+    r'^[ \t]*(?:[\dIVX]+[.)]?[ \t]+)?references[ \t]*:?[ \t]*$[\s\S]*',
+    flags=re.IGNORECASE | re.MULTILINE,
+)
+def clean_text(text: str) -> str:
+    """Normalize extracted text into a single whitespace-collapsed line.
+
+    Steps, in order:
+      1. Convert CRLF / CR line endings to LF.
+      2. Drop a trailing References section. Only a heading that stands
+         alone on its line triggers the cut, so the word "references"
+         used inside a sentence no longer truncates the document.
+      3. Replace every run of non-ASCII characters with a space.
+      4. Collapse all whitespace runs (newlines included) to one space.
+
+    Args:
+        text: Raw text, e.g. as extracted from a PDF.
+
+    Returns:
+        The cleaned text with leading and trailing whitespace removed.
+    """
+    text = re.sub(r'\r\n?', '\n', text)
+    # Must run while line breaks are still present: the heading is
+    # recognized by being alone on its line.
+    text = _REFERENCES_SECTION_RE.sub('', text)
+    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()
+def extract_text_from_pdf(path: str) -> str:
+    """Return the text of all pages of the PDF at *path*.
+
+    Pages that yield no text are skipped; the text of each remaining page
+    is followed by a blank line. Failures are reported through the return
+    value rather than raised: a message starting with
+    "Error extracting text:" on any exception, or
+    "No text extracted from PDF." when nothing could be extracted.
+    """
+    try:
+        with pdfplumber.open(path) as pdf:
+            per_page = (page.extract_text() for page in pdf.pages)
+            joined = "".join(content + "\n\n" for content in per_page if content)
+        stripped = joined.strip()
+        if stripped:
+            return stripped
+        return "No text extracted from PDF."
+    except Exception as e:
+        return f"Error extracting text: {e}"
+def sentence_tokenize(text: str):
+    """Split *text* into sentences with nltk and drop very short ones.
+
+    Returns a list of stripped sentences; only sentences longer than 10
+    characters after stripping are kept.
+    """
+    kept = []
+    for raw in nltk.tokenize.sent_tokenize(text):
+        sentence = raw.strip()
+        if len(sentence) > 10:
+            kept.append(sentence)
+    return kept
+def chunk_text(text: str, max_chars=1500):
+    """Group the sentences of *text* into chunks of roughly *max_chars*.
+
+    Sentences come from sentence_tokenize() and are joined with single
+    spaces. A sentence is added to the current chunk while the combined
+    length of chunk and sentence stays below *max_chars*; otherwise the
+    current chunk is closed and the sentence starts a new one. A single
+    sentence longer than *max_chars* is kept whole as its own chunk.
+
+    Args:
+        text: Text to split.
+        max_chars: Target upper bound for the length of a chunk.
+
+    Returns:
+        A list of non-empty chunk strings (empty if there are no sentences).
+    """
+    chunks, cur = [], ""
+    for s in sentence_tokenize(text):
+        if len(cur) + len(s) < max_chars:
+            cur += (" " if cur else "") + s
+        else:
+            # cur is still empty when the very first sentence is already
+            # too long; do not emit an empty chunk in that case.
+            if cur:
+                chunks.append(cur)
+            cur = s
+    if cur:
+        chunks.append(cur)
+    return chunks
+def extract_keywords_tfidf(text: str, top_k=8):
+    """Return up to *top_k* terms with the highest mean TF-IDF score.
+
+    The text is split into paragraphs on blank lines, each paragraph is
+    treated as one document, and unigrams and bigrams (English stop words
+    removed) are ranked by their score averaged over all paragraphs.
+    Any failure results in an empty list.
+    """
+    try:
+        paragraphs = []
+        for piece in re.split(r'\n{2,}', text):
+            piece = piece.strip()
+            if piece:
+                paragraphs.append(piece)
+        tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
+        matrix = tfidf.fit_transform(paragraphs)
+        vocabulary = tfidf.get_feature_names_out()
+        mean_scores = np.asarray(matrix.mean(axis=0)).ravel()
+        # Descending order of score, truncated to the requested count.
+        ranked = np.argsort(mean_scores)[::-1][:top_k]
+        return [vocabulary[i] for i in ranked]
+    except Exception:
+        return []
+# ==========================================================
+# ✍️ Summarization
+# ==========================================================
+def _run_summarizer(piece, max_len, min_len):
+    """Summarize one piece of text and return the summary string.
+
+    truncation=True makes the pipeline cut inputs that exceed the model's
+    maximum input length instead of failing on them.
+    """
+    result = summarizer(
+        piece,
+        max_length=max_len,
+        min_length=min_len,
+        do_sample=False,
+        truncation=True,
+    )
+    return result[0]["summary_text"]
+def summarize_long_text(text: str) -> str:
+    """Summarize *text*, scaling summary length with the input length.
+
+    The text is cleaned with clean_text() first. Text that fits in one
+    chunk is summarized directly. Longer text is split with chunk_text();
+    the first six chunks are summarized individually (chunks whose
+    summarization fails are skipped) and the joined partial summaries are
+    summarized once more.
+
+    Args:
+        text: Text to summarize.
+
+    Returns:
+        The summary, or a short message when the model is unavailable,
+        there is nothing to summarize, or every chunk failed.
+    """
+    if summarizer is None:
+        return "Summarization model unavailable."
+    text = clean_text(text)
+    if not text:
+        # Nothing left after cleaning; do not call the model on "".
+        return "No text to summarize."
+    text_len = len(text)
+    if text_len < 1500:
+        max_len, min_len, chunk_size = 180, 60, 1400
+    elif text_len < 5000:
+        max_len, min_len, chunk_size = 250, 100, 1600
+    elif text_len < 15000:
+        max_len, min_len, chunk_size = 350, 150, 1800
+    else:
+        max_len, min_len, chunk_size = 500, 200, 2000
+    if text_len <= chunk_size:
+        return _run_summarizer(text, max_len, min_len)
+    # Only the first six chunks are summarized to bound processing time.
+    parts = chunk_text(text, max_chars=chunk_size)[:6]
+    summaries = []
+    for p in parts:
+        try:
+            summaries.append(_run_summarizer(p, 200, 80))
+        except Exception:
+            # Best effort: one bad chunk must not lose the whole summary.
+            continue
+    if not summaries:
+        # Every chunk failed (or there were none); there is nothing to
+        # merge, so do not run the final pass on an empty string.
+        return "Summarization failed."
+    combined = " ".join(summaries)
+    return _run_summarizer(combined, max_len, min_len)
+# ==========================================================
+# 🔊 Text to Speech
+# ==========================================================
+def text_to_speech(text):
+    """Synthesize English speech for *text* and return the MP3 file path.
+
+    Only the first 900 characters of *text* are spoken.
+
+    Args:
+        text: Text to read aloud.
+
+    Returns:
+        Path of a newly created temporary ".mp3" file, or None when *text*
+        is empty or synthesis fails. The caller owns the returned file.
+    """
+    if not text:
+        return None
+    try:
+        # Only a unique path is needed. Closing the handle right away
+        # avoids leaking it and lets gTTS reopen the file by name.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
+            audio_path = tmp.name
+    except OSError:
+        return None
+    try:
+        gTTS(text=text[:900], lang="en").save(audio_path)
+        return audio_path
+    except Exception:
+        # Best effort, as before: report failure as None, but do not leave
+        # the empty placeholder file behind.
+        try:
+            os.remove(audio_path)
+        except OSError:
+            pass
+        return None
+# ==========================================================
+# 💬 Q&A Generation
+# ==========================================================
+def generate_auto_questions(text: str, n=5):
+    """Build template questions from the first *n* sentences of *text*.
+
+    Each of those sentences that has more than five words produces one
+    question quoting its first eight words, so fewer than *n* questions
+    may be returned.
+    """
+    leading_sentences = sentence_tokenize(text)[:n]
+    word_lists = (sentence.split() for sentence in leading_sentences)
+    return [
+        f"What is meant by: '{' '.join(words[:8])}...'?"
+        for words in word_lists
+        if len(words) > 5
+    ]
+def answer_question(question, context):
+    """Answer *question* from *context* with the question-answering model.
+
+    Returns the model's answer text, or a fixed message when the model is
+    not loaded, the context is empty, or the model call fails.
+    """
+    model_ready = qa_pipeline is not None
+    if not (model_ready and context):
+        return "Q&A model unavailable or no context."
+    try:
+        result = qa_pipeline(question=question, context=context)
+        answer = result.get("answer", "No answer found.")
+    except Exception:
+        return "Error while generating answer."
+    return answer
+# ==========================================================
+# 📄 PDF Handler
+# ==========================================================
+def process_pdf(pdf_file):
+    """Run the full pipeline on an uploaded PDF.
+
+    Args:
+        pdf_file: Path of the uploaded PDF, or a falsy value when nothing
+            was uploaded.
+
+    Returns:
+        A 5-tuple (text, summary, audio, keywords, questions):
+        cleaned text, its summary, the path of the spoken summary (or
+        None), comma-separated keywords, and newline-separated generated
+        questions. On failure the first element carries a message and the
+        remaining elements are "", None, "", "".
+    """
+    if not pdf_file:
+        return "Please upload a PDF.", "", None, "", ""
+    text = extract_text_from_pdf(pdf_file)
+    # extract_text_from_pdf reports failure through these exact strings.
+    # Match them precisely so a document that merely begins with "Error"
+    # or "No text" is not mistaken for a failed extraction.
+    extraction_failed = (
+        text.startswith("Error extracting text:")
+        or text == "No text extracted from PDF."
+    )
+    if extraction_failed:
+        return text, "", None, "", ""
+    text = clean_text(text)
+    if not text:
+        # Cleaning can remove everything (e.g. only non-ASCII content).
+        return "No usable text found in PDF.", "", None, "", ""
+    summary = summarize_long_text(text)
+    keywords = ", ".join(extract_keywords_tfidf(text))
+    audio = text_to_speech(summary)
+    auto_qs = "\n".join(generate_auto_questions(text, n=6))
+    return text, summary, audio, keywords, auto_qs
+# ==========================================================
+# 🎨 Gradio Interface
+# ==========================================================
+# Build the three-tab Gradio UI. Components are laid out in the order they
+# are created, so statement order below determines the page layout.
+with gr.Blocks(title="AI PDF Assistant", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📘 AI PDF Assistant — Smart Chat & Summarizer")
+    gr.Markdown("Easily extract, summarize, and chat with your PDFs using AI.")
+    # --- Analyze PDF Tab ---
+    # Left column: upload and trigger button. Right column: read-only results.
+    with gr.Tab("📄 Analyze PDF"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                # type="filepath" hands process_pdf a path string.
+                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
+                process_btn = gr.Button("🚀 Process PDF", variant="primary")
+            with gr.Column(scale=2):
+                extracted_text = gr.Textbox(label="Extracted Text", lines=8, interactive=False)
+                summary_box = gr.Textbox(label="Summary", lines=6, interactive=False)
+                audio_box = gr.Audio(label="Summary Audio", interactive=False)
+                keywords_box = gr.Textbox(label="Top Keywords", lines=2, interactive=False)
+    # --- Chat with PDF Tab ---
+    # Shows the auto-generated questions and lets the user ask their own.
+    with gr.Tab("💬 Chat with PDF"):
+        gr.Markdown("### Auto-Generated Questions")
+        auto_q_box = gr.Textbox(label="Generated Questions", lines=6, interactive=False)
+        gr.Markdown("### Ask Your Own Question")
+        user_q = gr.Textbox(label="Your Question", placeholder="Type your question here...")
+        ask_btn = gr.Button("Ask", variant="primary")
+        answer_box = gr.Textbox(label="Answer", lines=4, interactive=False)
+    # --- About Tab ---
+    # Static description of the app.
+    with gr.Tab("ℹ️ About"):
+        gr.Markdown("""
+## 📘 About AI PDF Assistant
+**AI PDF Assistant** helps you understand and interact with PDFs effortlessly.
+### Features
+- Extracts and cleans text
+- Generates adaptive summaries
+- Identifies keywords
+- Creates audio summaries
+- Auto-generates Q&A
+- Lets you chat with your PDF content
+Built with ❤️ using Hugging Face Transformers, gTTS, and Gradio.
+        """)
+    # --- Event Connections ---
+    # The outputs list follows the order of the tuple returned by
+    # process_pdf: text, summary, audio, keywords, questions.
+    process_btn.click(
+        process_pdf,
+        inputs=[pdf_input],
+        outputs=[extracted_text, summary_box, audio_box, keywords_box, auto_q_box],
+    )
+    # The Q&A context is whatever the "Extracted Text" box currently holds,
+    # which includes any status message process_pdf placed there.
+    ask_btn.click(
+        answer_question,
+        inputs=[user_q, extracted_text],
+        outputs=[answer_box],
+    )
+# There is no __main__ guard: the app is launched whenever this module is
+# executed, including when it is imported.
+print("🚀 Launching AI PDF Assistant...")
+demo.launch()