Spaces:

BasitAliii
/

Smart-PDF-Summarizer

Sleeping

App Files Files Community

BasitAliii committed on Nov 2, 2025

Commit

b8ffb22

verified ·

1 Parent(s): ca8c5b3

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -26

app.py CHANGED Viewed

@@ -9,9 +9,10 @@ from gtts import gTTS
 import nltk
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 # ==========================================================
-# 🧠 NLTK Setup (Fixed punkt_tab Issue)
 # ==========================================================
 for pkg in ["punkt", "punkt_tab"]:
     try:
@@ -26,11 +27,11 @@ DEVICE = -1  # CPU (-1), use 0 for GPU if available
 SUMMARIZER_MODEL = "facebook/bart-large-cnn"
 print("Loading summarization model... please wait ⏳")
 try:
     summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
 except Exception as e:
-    print("Model load error:", e)
     summarizer = None
@@ -48,7 +49,7 @@ def clean_text(text: str) -> str:
 def extract_text_from_pdf(path: str) -> str:
-    """Extract text from all pages of PDF."""
     try:
         text = ""
         with pdfplumber.open(path) as pdf:
@@ -106,7 +107,7 @@ def summarize_long_text(text: str) -> str:
     text = clean_text(text)
     L = len(text)
-    # Dynamic chunking
     if L < 1500:
         max_len, min_len, chunk_size = 180, 60, 1400
     elif L < 5000:
@@ -133,22 +134,31 @@ def summarize_long_text(text: str) -> str:
 # ==========================================================
-# 🔊 Text-to-Speech
 # ==========================================================
 def text_to_speech(text):
-    """Convert text to speech."""
     if not text:
         return None
     try:
-        t = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-        gTTS(text=text[:900], lang="en").save(t.name)
-        return t.name
-    except Exception:
         return None
 # ==========================================================
-# 📄 PDF Handler
 # ==========================================================
 def process_pdf(pdf_file):
     """Main handler to process PDF."""
@@ -168,40 +178,41 @@ def process_pdf(pdf_file):
 # ==========================================================
-# 🎨 Gradio UI
 # ==========================================================
 with gr.Blocks(title="AI PDF Summarizer", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📘 AI PDF Summarizer — Extract, Summarize & Listen")
-    gr.Markdown("Easily extract and summarize text from PDFs with AI, and listen to audio summaries.")
-    # --- Analyze PDF Tab ---
     with gr.Tab("📄 Analyze PDF"):
         with gr.Row():
             with gr.Column(scale=1):
-                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
                 process_btn = gr.Button("🚀 Process PDF", variant="primary")
             with gr.Column(scale=2):
-                extracted_text = gr.Textbox(label="Extracted Text", lines=8, interactive=False)
-                summary_box = gr.Textbox(label="Summary", lines=6, interactive=False)
-                audio_box = gr.Audio(label="Summary Audio", interactive=False)
-                keywords_box = gr.Textbox(label="Top Keywords", lines=2, interactive=False)
     # --- About Tab ---
     with gr.Tab("ℹ️ About"):
         gr.Markdown("""
 ## 📘 About AI PDF Summarizer
-**AI PDF Summarizer** helps you quickly understand the contents of any PDF using AI.
 ### ✨ Features
 - Extracts and cleans text from PDFs
-- Creates adaptive, high-quality summaries
-- Identifies key terms and topics using TF-IDF
-- Generates audio summaries for listening convenience
-Built with ❤️ using **Hugging Face Transformers**, **Gradio**, and **gTTS**.
         """)
-    # --- Event Connections ---
     process_btn.click(
         process_pdf,
         inputs=[pdf_input],

 import nltk
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
+from pydub import AudioSegment
 # ==========================================================
+# 🧠 NLTK Setup (Fixes punkt_tab issue)
 # ==========================================================
 for pkg in ["punkt", "punkt_tab"]:
     try:
 SUMMARIZER_MODEL = "facebook/bart-large-cnn"
 print("Loading summarization model... please wait ⏳")
 try:
     summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
+    print("✅ Summarizer loaded successfully.")
 except Exception as e:
+    print("❌ Model load error:", e)
     summarizer = None
 def extract_text_from_pdf(path: str) -> str:
+    """Extract text from all pages of a PDF."""
     try:
         text = ""
         with pdfplumber.open(path) as pdf:
     text = clean_text(text)
     L = len(text)
+    # Dynamic summarization scaling
     if L < 1500:
         max_len, min_len, chunk_size = 180, 60, 1400
     elif L < 5000:
 # ==========================================================
+# 🔊 Text-to-Speech (Fixed for Hugging Face)
 # ==========================================================
 def text_to_speech(text):
+    """Convert text to speech and ensure WAV output for Hugging Face playback."""
     if not text:
         return None
     try:
+        # Temporary paths
+        mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
+        wav_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
+        # Generate TTS (MP3)
+        gTTS(text=text[:900], lang="en").save(mp3_path)
+        # Convert to WAV for browser playback
+        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
+        return wav_path
+    except Exception as e:
+        print("TTS error:", e)
         return None
 # ==========================================================
+# 📄 PDF Processing
 # ==========================================================
 def process_pdf(pdf_file):
     """Main handler to process PDF."""
 # ==========================================================
+# 🎨 Gradio Interface
 # ==========================================================
 with gr.Blocks(title="AI PDF Summarizer", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📘 AI PDF Summarizer — Extract, Summarize & Listen")
+    gr.Markdown("Easily extract and summarize text from PDFs using AI, and listen to clear audio summaries.")
+    # --- Main Tab ---
     with gr.Tab("📄 Analyze PDF"):
         with gr.Row():
             with gr.Column(scale=1):
+                pdf_input = gr.File(label="📂 Upload PDF", file_types=[".pdf"], type="filepath")
                 process_btn = gr.Button("🚀 Process PDF", variant="primary")
             with gr.Column(scale=2):
+                extracted_text = gr.Textbox(label="🧾 Extracted Text", lines=10, interactive=False)
+                summary_box = gr.Textbox(label="🧠 Summary", lines=6, interactive=False)
+                audio_box = gr.Audio(label="🔊 Summary Audio (Playable)", type="filepath", interactive=False)
+                keywords_box = gr.Textbox(label="🏷️ Top Keywords", lines=2, interactive=False)
     # --- About Tab ---
     with gr.Tab("ℹ️ About"):
         gr.Markdown("""
 ## 📘 About AI PDF Summarizer
+**AI PDF Summarizer** helps you quickly understand long PDFs using Artificial Intelligence.
 ### ✨ Features
 - Extracts and cleans text from PDFs
+- Creates adaptive, context-aware summaries
+- Identifies top keywords using TF-IDF
+- Converts summaries into **natural-sounding speech** (WAV format for Spaces compatibility)
+Built with ❤️ using **Hugging Face Transformers**, **Gradio**, **gTTS**, and **pydub**.
         """)
+    # --- Button Functionality ---
     process_btn.click(
         process_pdf,
         inputs=[pdf_input],