File size: 7,825 Bytes
ddec509
 
 
 
 
 
 
 
 
 
 
b8ffb22
ddec509
 
b8ffb22
ddec509
 
 
 
 
 
 
 
 
 
b3fa4cf
ddec509
 
dec4eb8
ddec509
 
b8ffb22
ddec509
b8ffb22
ddec509
 
 
 
 
 
 
dec4eb8
ddec509
 
 
 
 
 
 
 
 
b8ffb22
ddec509
 
 
 
 
 
 
 
 
 
 
 
 
dec4eb8
ddec509
 
 
 
dec4eb8
ddec509
 
 
 
 
 
 
 
 
 
 
 
 
 
dec4eb8
ddec509
 
 
 
 
 
 
 
 
 
 
 
 
b3fa4cf
ddec509
 
dec4eb8
ddec509
 
dec4eb8
ddec509
 
dec4eb8
b8ffb22
ddec509
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8ffb22
ddec509
 
b8ffb22
ddec509
 
 
b8ffb22
 
 
 
 
 
 
 
 
 
 
 
 
ddec509
 
 
 
b8ffb22
ddec509
 
dec4eb8
ddec509
dec4eb8
ddec509
 
 
dec4eb8
ddec509
 
 
 
 
 
dec4eb8
ddec509
 
 
b8ffb22
ddec509
dec4eb8
 
b8ffb22
ddec509
b8ffb22
ddec509
 
 
b8ffb22
ddec509
b8ffb22
ddec509
b8ffb22
 
 
 
ddec509
dec4eb8
ddec509
 
dec4eb8
b8ffb22
dec4eb8
 
 
b8ffb22
 
 
dec4eb8
b8ffb22
ddec509
 
b8ffb22
ddec509
 
 
dec4eb8
ddec509
 
dec4eb8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import os
import re
import tempfile
from datetime import datetime
import gradio as gr
from transformers import pipeline
import pdfplumber
from gtts import gTTS
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from pydub import AudioSegment

# ==========================================================
# 🧠 NLTK Setup (Fixes punkt_tab issue)
# ==========================================================
# Newer NLTK releases split tokenizer data into "punkt_tab"; make sure
# both resources are present, downloading only what is missing.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)

# ==========================================================
# βš™οΈ Model Setup
# ==========================================================
DEVICE = -1  # CPU (-1), use 0 for GPU if available
SUMMARIZER_MODEL = "facebook/bart-large-cnn"

print("Loading summarization model... please wait ⏳")
try:
    summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
except Exception as e:
    # Keep the app usable without the model; downstream code checks for None.
    print("❌ Model load error:", e)
    summarizer = None
else:
    print("βœ… Summarizer loaded successfully.")


# ==========================================================
# 🧩 Utility Functions
# ==========================================================
def clean_text(text: str) -> str:
    """Clean extracted PDF text while preserving paragraph breaks.

    Normalizes line endings, drops everything from a "References" section
    onward, strips non-ASCII characters, and collapses runs of whitespace.
    Unlike a blanket ``\\s+`` collapse, blank-line paragraph separators are
    kept (capped at one blank line) because ``extract_keywords_tfidf``
    splits the cleaned text on ``\\n{2,}`` to build its per-paragraph corpus.

    Args:
        text: Raw text as extracted from a PDF.

    Returns:
        Cleaned text with single spaces and at most double newlines.
    """
    text = re.sub(r'\r\n?', '\n', text)                          # normalize CRLF/CR to LF
    text = re.sub(r'References[\s\S]*', '', text, flags=re.IGNORECASE)  # drop bibliography
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)                   # strip non-ASCII glyphs
    text = re.sub(r'[ \t]+', ' ', text)                          # collapse horizontal whitespace only
    text = re.sub(r' ?\n ?', '\n', text)                         # trim spaces hugging newlines
    text = re.sub(r'\n{2,}', '\n\n', text)                       # cap blank runs at one blank line
    return text.strip()


def extract_text_from_pdf(path: str) -> str:
    """Extract text from all pages of a PDF.

    Pages with no extractable text are skipped; the remaining page texts
    are joined with blank lines. On any failure an "Error ..." string is
    returned instead of raising (callers check the string prefix).
    """
    try:
        pages = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    pages.append(content)
        body = "\n\n".join(pages).strip()
        return body if body else "No text extracted from PDF."
    except Exception as e:
        return f"Error extracting text: {e}"


def sentence_tokenize(text: str):
    """Split text into sentences, discarding fragments of 10 chars or fewer."""
    sentences = []
    for sent in nltk.tokenize.sent_tokenize(text):
        stripped = sent.strip()
        if len(stripped) > 10:
            sentences.append(stripped)
    return sentences


def chunk_text(text: str, max_chars=1500):
    """Greedily pack sentences into chunks of fewer than max_chars characters.

    Sentences are appended to the current chunk while the combined length
    stays under ``max_chars``; otherwise the chunk is flushed and a new one
    started. A sentence longer than ``max_chars`` becomes its own chunk.

    Args:
        text: Text to split (sentence-tokenized internally).
        max_chars: Soft upper bound on chunk length in characters.

    Returns:
        List of non-empty chunk strings.
    """
    sents = sentence_tokenize(text)
    chunks, cur = [], ""
    for s in sents:
        if len(cur) + len(s) < max_chars:
            cur += (" " if cur else "") + s
        else:
            # Bug fix: when cur is empty (oversized first/next sentence),
            # the original appended an empty string chunk here.
            if cur:
                chunks.append(cur)
            cur = s
    if cur:
        chunks.append(cur)
    return chunks


def extract_keywords_tfidf(text: str, top_k=8):
    """Rank uni/bi-gram keywords by mean TF-IDF score across paragraphs.

    Paragraphs are the TF-IDF "documents" (split on blank lines). Returns
    the ``top_k`` highest-scoring terms, or an empty list if vectorization
    fails (e.g. empty or stop-word-only input) — a deliberate best-effort.
    """
    try:
        docs = [seg.strip() for seg in re.split(r'\n{2,}', text) if seg.strip()]
        vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        matrix = vec.fit_transform(docs)
        terms = vec.get_feature_names_out()
        mean_scores = np.asarray(matrix.mean(axis=0)).ravel()
        best = np.argsort(mean_scores)[::-1][:top_k]
        return [terms[i] for i in best]
    except Exception:
        return []


# ==========================================================
# ✍️ Adaptive Summarization
# ==========================================================
def summarize_long_text(text: str) -> str:
    """Adaptive summarization based on PDF length.

    Summary length and chunk size scale with the input size. Short texts
    are summarized in one pass; longer texts are summarized chunk-by-chunk
    (first 6 chunks) and the chunk summaries are condensed into one final
    summary.

    Args:
        text: Raw document text (re-cleaned here defensively).

    Returns:
        The summary string, or a human-readable status message when the
        model is unavailable or summarization fails.
    """
    if summarizer is None:
        return "Summarization model unavailable."

    text = clean_text(text)
    if not text:
        # Guard: calling the model on empty text would raise.
        return "No text to summarize."
    L = len(text)

    # Dynamic summarization scaling: longer inputs get longer summaries
    # and larger chunks.
    if L < 1500:
        max_len, min_len, chunk_size = 180, 60, 1400
    elif L < 5000:
        max_len, min_len, chunk_size = 250, 100, 1600
    elif L < 15000:
        max_len, min_len, chunk_size = 350, 150, 1800
    else:
        max_len, min_len, chunk_size = 500, 200, 2000

    if L <= chunk_size:
        return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]

    # Cap at 6 chunks to bound latency on very large PDFs.
    parts = chunk_text(text, max_chars=chunk_size)[:6]
    summaries = []
    for p in parts:
        try:
            summaries.append(summarizer(p, max_length=200, min_length=80, do_sample=False)[0]["summary_text"])
        except Exception:
            continue  # best-effort: skip chunks the model rejects

    if not summaries:
        # Guard: the original re-summarized an empty string here and raised.
        return "Summarization failed for this document."

    combined = " ".join(summaries)
    final = summarizer(combined, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
    return final


# ==========================================================
# πŸ”Š Text-to-Speech (Fixed for Hugging Face)
# ==========================================================
def text_to_speech(text):
    """Convert text to speech; return the path to a playable WAV file.

    gTTS only emits MP3, so speech is synthesized to a temporary MP3 and
    converted to WAV with pydub (WAV plays reliably in Hugging Face
    Spaces' browser audio widget).

    Args:
        text: Text to speak; only the first 900 characters are used to
            stay within gTTS request limits.

    Returns:
        Path to the WAV file, or None if text is empty or TTS fails.
    """
    if not text:
        return None
    mp3_path = None
    try:
        # Temporary paths (delete=False so the files survive the handle close)
        mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
        wav_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name

        # Generate TTS (MP3)
        gTTS(text=text[:900], lang="en").save(mp3_path)

        # Convert to WAV for browser playback
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")

        return wav_path
    except Exception as e:
        print("TTS error:", e)
        return None
    finally:
        # Bug fix: the intermediate MP3 was never removed, leaking a temp
        # file per call. Best-effort cleanup; the WAV is the deliverable.
        if mp3_path and os.path.exists(mp3_path):
            try:
                os.remove(mp3_path)
            except OSError:
                pass


# ==========================================================
# πŸ“„ PDF Processing
# ==========================================================
def process_pdf(pdf_file):
    """Main handler to process PDF.

    Returns a 4-tuple matching the Gradio outputs:
    (extracted text, summary, audio file path or None, keyword string).
    """
    if not pdf_file:
        return "Please upload a PDF.", "", None, ""

    raw_text = extract_text_from_pdf(pdf_file)
    # Extraction reports failures as strings with these prefixes.
    if raw_text.startswith(("Error", "No text")):
        return raw_text, "", None, ""

    cleaned = clean_text(raw_text)
    summary = summarize_long_text(cleaned)
    keyword_str = ", ".join(extract_keywords_tfidf(cleaned))
    audio_path = text_to_speech(summary)

    return cleaned, summary, audio_path, keyword_str


# ==========================================================
# 🎨 Gradio Interface
# ==========================================================
# Build the Gradio app: two tabs (the analyzer and an About page) plus the
# button wiring that connects the upload to process_pdf's 4-tuple return.
with gr.Blocks(title="AI PDF Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ“˜ AI PDF Summarizer β€” Extract, Summarize & Listen")
    gr.Markdown("Easily extract and summarize text from PDFs using AI, and listen to clear audio summaries.")

    # --- Main Tab ---
    with gr.Tab("πŸ“„ Analyze PDF"):
        with gr.Row():
            # Left column: input controls.
            with gr.Column(scale=1):
                pdf_input = gr.File(label="πŸ“‚ Upload PDF", file_types=[".pdf"], type="filepath")
                process_btn = gr.Button("πŸš€ Process PDF", variant="primary")

            # Right column: read-only result panes.
            with gr.Column(scale=2):
                extracted_text = gr.Textbox(label="🧾 Extracted Text", lines=10, interactive=False)
                summary_box = gr.Textbox(label="🧠 Summary", lines=6, interactive=False)
                audio_box = gr.Audio(label="πŸ”Š Summary Audio (Playable)", type="filepath", interactive=False)
                keywords_box = gr.Textbox(label="🏷️ Top Keywords", lines=2, interactive=False)

    # --- About Tab ---
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
## πŸ“˜ About AI PDF Summarizer
**AI PDF Summarizer** helps you quickly understand long PDFs using Artificial Intelligence.

### ✨ Features
- Extracts and cleans text from PDFs  
- Creates adaptive, context-aware summaries  
- Identifies top keywords using TF-IDF  
- Converts summaries into **natural-sounding speech** (WAV format for Spaces compatibility)  

Built with ❀️ using **Hugging Face Transformers**, **Gradio**, **gTTS**, and **pydub**.
        """)

    # --- Button Functionality ---
    # Outputs map positionally onto process_pdf's (text, summary, audio, keywords).
    process_btn.click(
        process_pdf,
        inputs=[pdf_input],
        outputs=[extracted_text, summary_box, audio_box, keywords_box],
    )

print("πŸš€ Launching AI PDF Summarizer...")
# NOTE(review): share=True opens a public tunnel and debug=True blocks the
# process; both are typical for Spaces/dev but confirm for production.
demo.launch(share=True, debug=True)