Spaces:

BasitAliii
/

Smart-PDF-Summarizer

Sleeping

App Files Community

BasitAliii committed on Nov 2, 2025

Commit

dec4eb8

verified ·

1 Parent(s): cca0ee6

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -63

app.py CHANGED Viewed

@@ -24,23 +24,21 @@ for pkg in ["punkt", "punkt_tab"]:
 # ==========================================================
 DEVICE = -1  # CPU (-1), use 0 for GPU if available
 SUMMARIZER_MODEL = "facebook/bart-large-cnn"
-QA_MODEL = "deepset/roberta-base-squad2"
-print("Loading models... please wait ⏳")
 try:
     summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
-    qa_pipeline = pipeline("question-answering", model=QA_MODEL, device=DEVICE)
 except Exception as e:
     print("Model load error:", e)
     summarizer = None
-    qa_pipeline = None
 # ==========================================================
 # 🧩 Utility Functions
 # ==========================================================
 def clean_text(text: str) -> str:
     text = re.sub(r'\r\n?', '\n', text)
     text = re.sub(r'\n{2,}', '\n\n', text)
     text = re.sub(r'References[\s\S]*', '', text, flags=re.IGNORECASE)
@@ -50,6 +48,7 @@ def clean_text(text: str) -> str:
 def extract_text_from_pdf(path: str) -> str:
     try:
         text = ""
         with pdfplumber.open(path) as pdf:
@@ -63,10 +62,12 @@ def extract_text_from_pdf(path: str) -> str:
 def sentence_tokenize(text: str):
     return [s.strip() for s in nltk.tokenize.sent_tokenize(text) if len(s.strip()) > 10]
 def chunk_text(text: str, max_chars=1500):
     sents = sentence_tokenize(text)
     chunks, cur = [], ""
     for s in sents:
@@ -81,6 +82,7 @@ def chunk_text(text: str, max_chars=1500):
 def extract_keywords_tfidf(text: str, top_k=8):
     try:
         paras = [p.strip() for p in re.split(r'\n{2,}', text) if len(p.strip()) > 0]
         vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
@@ -97,10 +99,14 @@ def extract_keywords_tfidf(text: str, top_k=8):
 # ✍️ Adaptive Summarization
 # ==========================================================
 def summarize_long_text(text: str) -> str:
     if summarizer is None:
         return "Summarization model unavailable."
     text = clean_text(text)
     L = len(text)
     if L < 1500:
         max_len, min_len, chunk_size = 180, 60, 1400
     elif L < 5000:
@@ -130,6 +136,7 @@ def summarize_long_text(text: str) -> str:
 # 🔊 Text-to-Speech
 # ==========================================================
 def text_to_speech(text):
     if not text:
         return None
     try:
@@ -140,56 +147,34 @@ def text_to_speech(text):
         return None
-# ==========================================================
-# 🧠 Q&A
-# ==========================================================
-def generate_auto_questions(text: str, n=5):
-    sents = sentence_tokenize(text)
-    qs = []
-    for s in sents[:n]:
-        words = s.split()
-        if len(words) > 5:
-            qs.append(f"What is meant by: '{' '.join(words[:8])}...'?")
-    return qs
-def answer_question(question, context):
-    if qa_pipeline is None or not context:
-        return "Q&A model unavailable or no context."
-    try:
-        res = qa_pipeline(question=question, context=context)
-        return res.get("answer", "No answer found.")
-    except Exception:
-        return "Error while generating answer."
 # ==========================================================
 # 📄 PDF Handler
 # ==========================================================
 def process_pdf(pdf_file):
     if not pdf_file:
-        return "Please upload a PDF.", "", None, "", ""
     text = extract_text_from_pdf(pdf_file)
     if text.startswith("Error") or text.startswith("No text"):
-        return text, "", None, "", ""
     text = clean_text(text)
     summary = summarize_long_text(text)
     keywords = ", ".join(extract_keywords_tfidf(text))
     audio = text_to_speech(summary)
-    auto_qs = "\n".join(generate_auto_questions(text, n=6))
-    return text, summary, audio, keywords, auto_qs
 # ==========================================================
 # 🎨 Gradio UI
 # ==========================================================
-with gr.Blocks(title="AI PDF Assistant", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📘 AI PDF Assistant — Smart Chat & Summarizer")
-    gr.Markdown("Easily extract, summarize, and chat with your PDFs using AI.")
     with gr.Tab("📄 Analyze PDF"):
         with gr.Row():
             with gr.Column(scale=1):
@@ -201,41 +186,27 @@ with gr.Blocks(title="AI PDF Assistant", theme=gr.themes.Soft()) as demo:
                 audio_box = gr.Audio(label="Summary Audio", interactive=False)
                 keywords_box = gr.Textbox(label="Top Keywords", lines=2, interactive=False)
-    with gr.Tab("💬 Chat with PDF"):
-        gr.Markdown("### Auto-Generated Questions")
-        auto_q_box = gr.Textbox(label="Generated Questions", lines=6, interactive=False, placeholder="Questions will appear after PDF is processed.")
-        gr.Markdown("### Ask Your Own Question")
-        user_q = gr.Textbox(label="Your Question", placeholder="Type your question here...")
-        ask_btn = gr.Button("Ask", variant="primary")
-        answer_box = gr.Textbox(label="Answer", lines=4, interactive=False)
     with gr.Tab("ℹ️ About"):
         gr.Markdown("""
-## 📘 About AI PDF Assistant
-**AI PDF Assistant** helps you understand and interact with PDFs effortlessly.
-### Features
-- Extracts and cleans text
-- Generates adaptive summaries
-- Identifies keywords
-- Creates audio summaries
-- Auto-generates Q&A
-- Lets you chat with your PDF content
-Built with ❤️ using Hugging Face Transformers, gTTS, and Gradio.
         """)
     process_btn.click(
         process_pdf,
         inputs=[pdf_input],
-        outputs=[extracted_text, summary_box, audio_box, keywords_box, auto_q_box],
-    )
-    ask_btn.click(
-        answer_question,
-        inputs=[user_q, extracted_text],
-        outputs=[answer_box],
     )
-print("🚀 Launching AI PDF Assistant...")
-demo.launch()

 # ==========================================================
 DEVICE = -1  # CPU (-1), use 0 for GPU if available
 SUMMARIZER_MODEL = "facebook/bart-large-cnn"
+print("Loading summarization model... please wait ⏳")
 try:
     summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
 except Exception as e:
     print("Model load error:", e)
     summarizer = None
 # ==========================================================
 # 🧩 Utility Functions
 # ==========================================================
 def clean_text(text: str) -> str:
+    """Clean extracted PDF text."""
     text = re.sub(r'\r\n?', '\n', text)
     text = re.sub(r'\n{2,}', '\n\n', text)
     text = re.sub(r'References[\s\S]*', '', text, flags=re.IGNORECASE)
 def extract_text_from_pdf(path: str) -> str:
+    """Extract text from all pages of PDF."""
     try:
         text = ""
         with pdfplumber.open(path) as pdf:
 def sentence_tokenize(text: str):
+    """Split text into sentences."""
     return [s.strip() for s in nltk.tokenize.sent_tokenize(text) if len(s.strip()) > 10]
 def chunk_text(text: str, max_chars=1500):
+    """Split text into chunks for summarization."""
     sents = sentence_tokenize(text)
     chunks, cur = [], ""
     for s in sents:
 def extract_keywords_tfidf(text: str, top_k=8):
+    """Extract keywords using TF-IDF."""
     try:
         paras = [p.strip() for p in re.split(r'\n{2,}', text) if len(p.strip()) > 0]
         vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
 # ✍️ Adaptive Summarization
 # ==========================================================
 def summarize_long_text(text: str) -> str:
+    """Adaptive summarization based on PDF length."""
     if summarizer is None:
         return "Summarization model unavailable."
     text = clean_text(text)
     L = len(text)
+    # Dynamic chunking
     if L < 1500:
         max_len, min_len, chunk_size = 180, 60, 1400
     elif L < 5000:
 # 🔊 Text-to-Speech
 # ==========================================================
 def text_to_speech(text):
+    """Convert text to speech."""
     if not text:
         return None
     try:
         return None
 # ==========================================================
 # 📄 PDF Handler
 # ==========================================================
 def process_pdf(pdf_file):
+    """Main handler to process PDF."""
     if not pdf_file:
+        return "Please upload a PDF.", "", None, ""
     text = extract_text_from_pdf(pdf_file)
     if text.startswith("Error") or text.startswith("No text"):
+        return text, "", None, ""
     text = clean_text(text)
     summary = summarize_long_text(text)
     keywords = ", ".join(extract_keywords_tfidf(text))
     audio = text_to_speech(summary)
+    return text, summary, audio, keywords
 # ==========================================================
 # 🎨 Gradio UI
 # ==========================================================
+with gr.Blocks(title="AI PDF Summarizer", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📘 AI PDF Summarizer — Extract, Summarize & Listen")
+    gr.Markdown("Easily extract and summarize text from PDFs with AI, and listen to audio summaries.")
+    # --- Analyze PDF Tab ---
     with gr.Tab("📄 Analyze PDF"):
         with gr.Row():
             with gr.Column(scale=1):
                 audio_box = gr.Audio(label="Summary Audio", interactive=False)
                 keywords_box = gr.Textbox(label="Top Keywords", lines=2, interactive=False)
+    # --- About Tab ---
     with gr.Tab("ℹ️ About"):
         gr.Markdown("""
+## 📘 About AI PDF Summarizer
+**AI PDF Summarizer** helps you quickly understand the contents of any PDF using AI.
+### ✨ Features
+- Extracts and cleans text from PDFs
+- Creates adaptive, high-quality summaries
+- Identifies key terms and topics using TF-IDF
+- Generates audio summaries for listening convenience
+Built with ❤️ using **Hugging Face Transformers**, **Gradio**, and **gTTS**.
         """)
+    # --- Event Connections ---
     process_btn.click(
         process_pdf,
         inputs=[pdf_input],
+        outputs=[extracted_text, summary_box, audio_box, keywords_box],
     )
+print("🚀 Launching AI PDF Summarizer...")
+demo.launch(share=True, debug=True)