Spaces:
Sleeping
Sleeping
File size: 7,825 Bytes
ddec509 b8ffb22 ddec509 b8ffb22 ddec509 b3fa4cf ddec509 dec4eb8 ddec509 b8ffb22 ddec509 b8ffb22 ddec509 dec4eb8 ddec509 b8ffb22 ddec509 dec4eb8 ddec509 dec4eb8 ddec509 dec4eb8 ddec509 b3fa4cf ddec509 dec4eb8 ddec509 dec4eb8 ddec509 dec4eb8 b8ffb22 ddec509 b8ffb22 ddec509 b8ffb22 ddec509 b8ffb22 ddec509 b8ffb22 ddec509 dec4eb8 ddec509 dec4eb8 ddec509 dec4eb8 ddec509 dec4eb8 ddec509 b8ffb22 ddec509 dec4eb8 b8ffb22 ddec509 b8ffb22 ddec509 b8ffb22 ddec509 b8ffb22 ddec509 b8ffb22 ddec509 dec4eb8 ddec509 dec4eb8 b8ffb22 dec4eb8 b8ffb22 dec4eb8 b8ffb22 ddec509 b8ffb22 ddec509 dec4eb8 ddec509 dec4eb8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
import os
import re
import tempfile
from datetime import datetime
import gradio as gr
from transformers import pipeline
import pdfplumber
from gtts import gTTS
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from pydub import AudioSegment
# ==========================================================
# NLTK Setup (download tokenizer data only if missing;
# "punkt_tab" is required by newer NLTK releases)
# ==========================================================
for _pkg in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_pkg}")
    except LookupError:
        nltk.download(_pkg)
# ==========================================================
# Model Setup
# ==========================================================
DEVICE = -1  # CPU (-1); set to 0 to use the first GPU if available
SUMMARIZER_MODEL = "facebook/bart-large-cnn"

print("Loading summarization model... please wait")
try:
    summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
    # NOTE: the original success message was a single string literal that had
    # been split across two lines by a garbled emoji — a syntax error. Merged.
    print("Summarizer loaded successfully.")
except Exception as e:
    # Keep the app importable even if the model cannot be loaded;
    # downstream code checks `summarizer is None`.
    print("Model load error:", e)
    summarizer = None
# ==========================================================
# π§© Utility Functions
# ==========================================================
def clean_text(text: str) -> str:
    """Normalize extracted PDF text: unify newlines, drop everything from a
    'References' section onward, strip non-ASCII characters, and collapse
    all whitespace to single spaces."""
    cleaned = re.sub(r'\r\n?', '\n', text)
    cleaned = re.sub(r'\n{2,}', '\n\n', cleaned)
    cleaned = re.sub(r'References[\s\S]*', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'[^\x00-\x7F]+', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()
def extract_text_from_pdf(path: str) -> str:
    """Return the concatenated text of every page of the PDF at *path*.

    Pages are joined with blank lines. Returns a human-readable message
    (not an exception) when nothing is extracted or extraction fails.
    """
    try:
        pages = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    pages.append(content)
        combined = "\n\n".join(pages).strip()
        return combined if combined else "No text extracted from PDF."
    except Exception as e:
        return f"Error extracting text: {e}"
def sentence_tokenize(text: str):
    """Split *text* into sentences, keeping only those longer than 10 chars."""
    kept = []
    for raw in nltk.tokenize.sent_tokenize(text):
        sentence = raw.strip()
        if len(sentence) > 10:
            kept.append(sentence)
    return kept
def chunk_text(text: str, max_chars=1500):
    """Group sentences into chunks of roughly *max_chars* characters each,
    never splitting a sentence across chunks."""
    chunks = []
    current = ""
    for sentence in sentence_tokenize(text):
        if len(current) + len(sentence) < max_chars:
            current = sentence if not current else f"{current} {sentence}"
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks
def extract_keywords_tfidf(text: str, top_k=8):
    """Return up to *top_k* unigram/bigram keywords ranked by mean TF-IDF
    across paragraphs. Returns [] on any failure (e.g. empty vocabulary)."""
    try:
        paragraphs = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        matrix = vectorizer.fit_transform(paragraphs)
        terms = vectorizer.get_feature_names_out()
        mean_scores = np.asarray(matrix.mean(axis=0)).ravel()
        top_indices = np.argsort(mean_scores)[::-1][:top_k]
        return [terms[i] for i in top_indices]
    except Exception:
        return []
# ==========================================================
# βοΈ Adaptive Summarization
# ==========================================================
def summarize_long_text(text: str) -> str:
    """Summarize *text*, scaling summary length and chunking with input size.

    Short texts are summarized in one pass; long texts are split into up to
    six sentence-aligned chunks, each summarized, then the concatenated chunk
    summaries are summarized again. Returns a human-readable message (never
    raises) when the model is missing, the text is empty, or inference fails.
    """
    if summarizer is None:
        return "Summarization model unavailable."
    text = clean_text(text)
    if not text:
        # Guard: calling the model on an empty string would raise.
        return "No text to summarize."
    length = len(text)
    # Dynamic scaling: longer documents get longer summaries and bigger chunks.
    if length < 1500:
        max_len, min_len, chunk_size = 180, 60, 1400
    elif length < 5000:
        max_len, min_len, chunk_size = 250, 100, 1600
    elif length < 15000:
        max_len, min_len, chunk_size = 350, 150, 1800
    else:
        max_len, min_len, chunk_size = 500, 200, 2000
    try:
        if length <= chunk_size:
            return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
        # Cap at 6 chunks to bound inference time on very long PDFs.
        parts = chunk_text(text, max_chars=chunk_size)[:6]
        summaries = []
        for part in parts:
            try:
                summaries.append(summarizer(part, max_length=200, min_length=80, do_sample=False)[0]["summary_text"])
            except Exception:
                continue  # best effort: skip chunks the model rejects
        if not summaries:
            # Guard: previously an empty `combined` crashed the final pass.
            return "Summarization failed."
        combined = " ".join(summaries)
        return summarizer(combined, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
    except Exception as e:
        # The result is shown directly in the UI; a message beats a traceback.
        return f"Summarization error: {e}"
# ==========================================================
# π Text-to-Speech (Fixed for Hugging Face)
# ==========================================================
def text_to_speech(text):
    """Convert up to the first 900 chars of *text* to speech.

    Generates MP3 via gTTS, converts it to WAV (for in-browser playback on
    Spaces), and returns the WAV path — or None for empty input or on error.
    The intermediate MP3 temp file is always removed (the original leaked it
    on every call, including the exception path).
    """
    if not text:
        return None
    mp3_path = None
    try:
        mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
        wav_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
        # gTTS only emits MP3; cap length to keep the request reasonable.
        gTTS(text=text[:900], lang="en").save(mp3_path)
        # Re-encode to WAV for reliable browser playback.
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print("TTS error:", e)
        return None
    finally:
        # The MP3 is only an intermediate artifact — always clean it up.
        if mp3_path and os.path.exists(mp3_path):
            try:
                os.remove(mp3_path)
            except OSError:
                pass
# ==========================================================
# π PDF Processing
# ==========================================================
def process_pdf(pdf_file):
    """Full pipeline for one uploaded PDF.

    Returns a 4-tuple for the Gradio outputs:
    (extracted_text, summary, audio_path_or_None, keywords_csv).
    """
    if not pdf_file:
        return "Please upload a PDF.", "", None, ""
    raw = extract_text_from_pdf(pdf_file)
    # extract_text_from_pdf signals failure via message strings, not exceptions.
    if raw.startswith(("Error", "No text")):
        return raw, "", None, ""
    cleaned = clean_text(raw)
    summary = summarize_long_text(cleaned)
    keywords = ", ".join(extract_keywords_tfidf(cleaned))
    audio_path = text_to_speech(summary)
    return cleaned, summary, audio_path, keywords
# ==========================================================
# π¨ Gradio Interface
# ==========================================================
# ==========================================================
# Gradio Interface
# ==========================================================
with gr.Blocks(title="AI PDF Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π AI PDF Summarizer β Extract, Summarize & Listen")
    gr.Markdown("Easily extract and summarize text from PDFs using AI, and listen to clear audio summaries.")

    # --- Main Tab: upload on the left, results on the right ---
    with gr.Tab("π Analyze PDF"):
        with gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(label="π Upload PDF", file_types=[".pdf"], type="filepath")
                process_btn = gr.Button("π Process PDF", variant="primary")
            with gr.Column(scale=2):
                extracted_text = gr.Textbox(label="π§Ύ Extracted Text", lines=10, interactive=False)
                summary_box = gr.Textbox(label="π§ Summary", lines=6, interactive=False)
                audio_box = gr.Audio(label="π Summary Audio (Playable)", type="filepath", interactive=False)
                keywords_box = gr.Textbox(label="π·οΈ Top Keywords", lines=2, interactive=False)

    # --- About Tab ---
    with gr.Tab("βΉοΈ About"):
        gr.Markdown("""
## π About AI PDF Summarizer
**AI PDF Summarizer** helps you quickly understand long PDFs using Artificial Intelligence.
### β¨ Features
- Extracts and cleans text from PDFs
- Creates adaptive, context-aware summaries
- Identifies top keywords using TF-IDF
- Converts summaries into **natural-sounding speech** (WAV format for Spaces compatibility)
Built with β€οΈ using **Hugging Face Transformers**, **Gradio**, **gTTS**, and **pydub**.
""")

    # Wire the button to the processing pipeline.
    process_btn.click(
        process_pdf,
        inputs=[pdf_input],
        outputs=[extracted_text, summary_box, audio_box, keywords_box],
    )

print("π Launching AI PDF Summarizer...")
demo.launch(share=True, debug=True)
|