# AI PDF Summarizer — Hugging Face Spaces app
# (extract text from PDFs, summarize with BART, read the summary aloud)
| import os | |
| import re | |
| import tempfile | |
| from datetime import datetime | |
| import gradio as gr | |
| from transformers import pipeline | |
| import pdfplumber | |
| from gtts import gTTS | |
| import nltk | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from pydub import AudioSegment | |
# ==========================================================
# NLTK setup — make sure sentence-tokenizer data is present.
# "punkt_tab" is required by newer NLTK releases alongside "punkt".
# ==========================================================
for _pkg in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_pkg}")
    except LookupError:
        # Data pack missing locally — fetch it once at startup.
        nltk.download(_pkg)
# ==========================================================
# Model setup — load the summarization pipeline once at startup.
# ==========================================================
DEVICE = -1  # CPU (-1), use 0 for GPU if available
SUMMARIZER_MODEL = "facebook/bart-large-cnn"

print("Loading summarization model... please wait β³")
try:
    summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
except Exception as e:
    # Keep the app alive even if the model cannot load; downstream
    # code checks `summarizer is None` and degrades gracefully.
    print("β Model load error:", e)
    summarizer = None
else:
    print("β Summarizer loaded successfully.")
# ==========================================================
# Utility functions
# ==========================================================
def clean_text(text: str) -> str:
    """Normalize whitespace and strip noise from raw PDF text.

    Steps:
      1. Normalize CR/CRLF line endings to LF.
      2. Collapse runs of blank lines to a single paragraph break.
      3. Drop the bibliography: everything from a standalone
         "References" heading line to the end of the text.
      4. Replace non-ASCII characters with spaces (BART/gTTS inputs
         are kept ASCII-only here).
      5. Collapse all remaining whitespace to single spaces.

    Args:
        text: Raw text as extracted from a PDF.

    Returns:
        A single-line, single-spaced ASCII string (may be empty).
    """
    text = re.sub(r'\r\n?', '\n', text)
    text = re.sub(r'\n{2,}', '\n\n', text)
    # BUGFIX: the previous pattern (r'References[\s\S]*') matched the
    # word *anywhere* — e.g. "...references to prior work..." deleted
    # the rest of the document. Only cut at a heading line that
    # contains nothing but "References".
    text = re.sub(r'^\s*references\s*$[\s\S]*', '', text,
                  flags=re.IGNORECASE | re.MULTILINE)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def extract_text_from_pdf(path: str) -> str:
    """Extract text from every page of the PDF at *path*.

    Returns the concatenated page text, or a sentinel message string
    ("No text extracted..." / "Error extracting...") on failure —
    callers check these prefixes instead of catching exceptions.
    """
    try:
        pages = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    pages.append(content)
        merged = "\n\n".join(pages).strip()
        return merged if merged else "No text extracted from PDF."
    except Exception as e:
        return f"Error extracting text: {e}"
def sentence_tokenize(text: str):
    """Split *text* into trimmed sentences, dropping fragments of 10 chars or fewer."""
    sentences = []
    for raw in nltk.tokenize.sent_tokenize(text):
        candidate = raw.strip()
        if len(candidate) > 10:
            sentences.append(candidate)
    return sentences
def chunk_text(text: str, max_chars=1500):
    """Greedily pack sentences into chunks for summarization.

    Sentences are appended to the current chunk while the combined
    character count stays under *max_chars*; otherwise the chunk is
    flushed and a new one starts with the overflowing sentence.
    """
    chunks = []
    current = ""
    for sentence in sentence_tokenize(text):
        if len(current) + len(sentence) < max_chars:
            current = sentence if not current else f"{current} {sentence}"
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks
def extract_keywords_tfidf(text: str, top_k=8):
    """Return up to *top_k* uni/bigrams ranked by mean TF-IDF across paragraphs.

    Paragraphs are blank-line-separated spans of *text*. Returns an
    empty list on any failure (e.g. no usable paragraphs/vocabulary).
    """
    try:
        paragraphs = [block.strip() for block in re.split(r'\n{2,}', text) if block.strip()]
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        matrix = vectorizer.fit_transform(paragraphs)
        terms = vectorizer.get_feature_names_out()
        mean_scores = np.asarray(matrix.mean(axis=0)).ravel()
        ranked = np.argsort(mean_scores)[::-1][:top_k]
        return [terms[i] for i in ranked]
    except Exception:
        # Best-effort feature: keyword extraction is non-essential.
        return []
# ==========================================================
# Adaptive summarization
# ==========================================================
def summarize_long_text(text: str) -> str:
    """Summarize text of any length, scaling target length with input size.

    Short inputs are summarized in one pass. Longer inputs are split
    into sentence chunks, each chunk summarized independently, and the
    partial summaries condensed into one final summary.

    Returns the summary text, or a diagnostic message when the model is
    unavailable or every chunk fails.
    """
    if summarizer is None:
        return "Summarization model unavailable."
    text = clean_text(text)
    L = len(text)
    # Scale summary length and chunk size with document length.
    if L < 1500:
        max_len, min_len, chunk_size = 180, 60, 1400
    elif L < 5000:
        max_len, min_len, chunk_size = 250, 100, 1600
    elif L < 15000:
        max_len, min_len, chunk_size = 350, 150, 1800
    else:
        max_len, min_len, chunk_size = 500, 200, 2000
    # Single-pass path: the whole text fits in one model call.
    if L <= chunk_size:
        return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
    # Cap at 6 chunks to bound latency on very long documents.
    parts = chunk_text(text, max_chars=chunk_size)[:6]
    summaries = []
    for p in parts:
        try:
            summaries.append(summarizer(p, max_length=200, min_length=80, do_sample=False)[0]["summary_text"])
        except Exception:
            continue  # skip chunks the model cannot handle
    if not summaries:
        # BUGFIX: previously an empty `combined` string was fed back
        # into the summarizer, which raises on empty input.
        return "Summarization failed."
    combined = " ".join(summaries)
    try:
        return summarizer(combined, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
    except Exception:
        # Fall back to the concatenated chunk summaries rather than crash.
        return combined
# ==========================================================
# Text-to-speech (WAV output for in-browser playback on Spaces)
# ==========================================================
def text_to_speech(text):
    """Synthesize *text* to speech; return a WAV file path, or None on failure.

    gTTS produces MP3; browsers embedded in HF Spaces play WAV more
    reliably, so the MP3 is transcoded with pydub and then deleted.
    Only the first 900 characters are voiced to keep the gTTS request small.
    """
    if not text:
        return None
    mp3_path = None
    try:
        # BUGFIX: NamedTemporaryFile(delete=False) left an open file
        # handle leaking on every call (and an open handle blocks
        # rewriting the path on Windows). mkstemp + os.close avoids both.
        fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        # Generate TTS (MP3), then convert to WAV for browser playback.
        gTTS(text=text[:900], lang="en").save(mp3_path)
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print("TTS error:", e)
        return None
    finally:
        # The intermediate MP3 is no longer needed once exported
        # (previously it was never removed).
        if mp3_path and os.path.exists(mp3_path):
            os.remove(mp3_path)
# ==========================================================
# PDF processing pipeline
# ==========================================================
def process_pdf(pdf_file):
    """Gradio handler: run the full extract/summarize/keywords/audio pipeline.

    Returns a 4-tuple matching the output widgets:
    (extracted text, summary, audio file path or None, comma-joined keywords).
    """
    if not pdf_file:
        return "Please upload a PDF.", "", None, ""
    raw = extract_text_from_pdf(pdf_file)
    # extract_text_from_pdf signals failure via sentinel prefixes.
    if raw.startswith("Error") or raw.startswith("No text"):
        return raw, "", None, ""
    cleaned = clean_text(raw)
    summary = summarize_long_text(cleaned)
    keywords = ", ".join(extract_keywords_tfidf(cleaned))
    audio_path = text_to_speech(summary)
    return cleaned, summary, audio_path, keywords
# ==========================================================
# Gradio interface
# ==========================================================
# Build the two-tab UI: an analyzer tab (upload -> results) and an About tab.
with gr.Blocks(title="AI PDF Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π AI PDF Summarizer β Extract, Summarize & Listen")
    gr.Markdown("Easily extract and summarize text from PDFs using AI, and listen to clear audio summaries.")
    # --- Main Tab ---
    with gr.Tab("π Analyze PDF"):
        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath" so process_pdf receives a path string
                pdf_input = gr.File(label="π Upload PDF", file_types=[".pdf"], type="filepath")
                process_btn = gr.Button("π Process PDF", variant="primary")
            with gr.Column(scale=2):
                # Read-only output widgets; populated by process_pdf's 4-tuple
                extracted_text = gr.Textbox(label="π§Ύ Extracted Text", lines=10, interactive=False)
                summary_box = gr.Textbox(label="π§ Summary", lines=6, interactive=False)
                audio_box = gr.Audio(label="π Summary Audio (Playable)", type="filepath", interactive=False)
                keywords_box = gr.Textbox(label="π·οΈ Top Keywords", lines=2, interactive=False)
    # --- About Tab ---
    with gr.Tab("βΉοΈ About"):
        gr.Markdown("""
        ## π About AI PDF Summarizer
        **AI PDF Summarizer** helps you quickly understand long PDFs using Artificial Intelligence.
        ### β¨ Features
        - Extracts and cleans text from PDFs
        - Creates adaptive, context-aware summaries
        - Identifies top keywords using TF-IDF
        - Converts summaries into **natural-sounding speech** (WAV format for Spaces compatibility)
        Built with β€οΈ using **Hugging Face Transformers**, **Gradio**, **gTTS**, and **pydub**.
        """)
    # --- Button Functionality ---
    # Wire the button to the pipeline; outputs map 1:1 to process_pdf's return tuple.
    process_btn.click(
        process_pdf,
        inputs=[pdf_input],
        outputs=[extracted_text, summary_box, audio_box, keywords_box],
    )

print("π Launching AI PDF Summarizer...")
# NOTE(review): share=True is a no-op on Hugging Face Spaces — confirm it is
# intended for local runs; debug=True blocks and streams logs.
demo.launch(share=True, debug=True)