# AI PDF Summarizer — Hugging Face Spaces app
# (extract text from PDFs, summarize with BART, read the summary aloud)
| import os | |
| import re | |
| import tempfile | |
| from datetime import datetime | |
| import gradio as gr | |
| from transformers import pipeline | |
| import pdfplumber | |
| from gtts import gTTS | |
| import nltk | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from pydub import AudioSegment | |
# ==========================================================
# NLTK setup — make sure sentence-tokenizer data is present.
# "punkt_tab" is required by newer NLTK releases alongside "punkt".
# ==========================================================
for _pkg in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_pkg}")
    except LookupError:
        # Data pack missing locally — fetch it once at startup.
        nltk.download(_pkg)
# ==========================================================
# Model setup — load the summarization pipeline once at startup.
# ==========================================================
DEVICE = -1  # CPU (-1), use 0 for GPU if available
SUMMARIZER_MODEL = "facebook/bart-large-cnn"

print("Loading summarization model... please wait β³")
try:
    summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
except Exception as e:
    # Keep the app alive even if the model cannot load; downstream
    # code checks `summarizer is None` and degrades gracefully.
    print("β Model load error:", e)
    summarizer = None
else:
    print("β Summarizer loaded successfully.")
# ==========================================================
# Utility functions
# ==========================================================
def clean_text(text: str) -> str:
    """Normalize whitespace and strip noise from raw PDF text.

    Steps:
      1. Normalize CR/CRLF line endings to LF.
      2. Collapse runs of blank lines to a single paragraph break.
      3. Drop the bibliography: everything from a standalone
         "References" heading line to the end of the text.
      4. Replace non-ASCII characters with spaces (BART/gTTS inputs
         are kept ASCII-only here).
      5. Collapse all remaining whitespace to single spaces.

    Args:
        text: Raw text as extracted from a PDF.

    Returns:
        A single-line, single-spaced ASCII string (may be empty).
    """
    text = re.sub(r'\r\n?', '\n', text)
    text = re.sub(r'\n{2,}', '\n\n', text)
    # BUGFIX: the previous pattern (r'References[\s\S]*') matched the
    # word *anywhere* — e.g. "...references to prior work..." deleted
    # the rest of the document. Only cut at a heading line that
    # contains nothing but "References".
    text = re.sub(r'^\s*references\s*$[\s\S]*', '', text,
                  flags=re.IGNORECASE | re.MULTILINE)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def extract_text_from_pdf(path: str) -> str:
    """Extract text from every page of the PDF at *path*.

    Returns the concatenated page text, or a sentinel message string
    ("No text extracted..." / "Error extracting...") on failure —
    callers check these prefixes instead of catching exceptions.
    """
    try:
        pages = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    pages.append(content)
        merged = "\n\n".join(pages).strip()
        return merged if merged else "No text extracted from PDF."
    except Exception as e:
        return f"Error extracting text: {e}"
def sentence_tokenize(text: str):
    """Split *text* into trimmed sentences, dropping fragments of 10 chars or fewer."""
    sentences = []
    for raw in nltk.tokenize.sent_tokenize(text):
        candidate = raw.strip()
        if len(candidate) > 10:
            sentences.append(candidate)
    return sentences
def chunk_text(text: str, max_chars=1500):
    """Greedily pack sentences into chunks for summarization.

    Sentences are appended to the current chunk while the combined
    character count stays under *max_chars*; otherwise the chunk is
    flushed and a new one starts with the overflowing sentence.
    """
    chunks = []
    current = ""
    for sentence in sentence_tokenize(text):
        if len(current) + len(sentence) < max_chars:
            current = sentence if not current else f"{current} {sentence}"
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks
def extract_keywords_tfidf(text: str, top_k=8):
    """Return up to *top_k* uni/bigrams ranked by mean TF-IDF across paragraphs.

    Paragraphs are blank-line-separated spans of *text*. Returns an
    empty list on any failure (e.g. no usable paragraphs/vocabulary).
    """
    try:
        paragraphs = [block.strip() for block in re.split(r'\n{2,}', text) if block.strip()]
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        matrix = vectorizer.fit_transform(paragraphs)
        terms = vectorizer.get_feature_names_out()
        mean_scores = np.asarray(matrix.mean(axis=0)).ravel()
        ranked = np.argsort(mean_scores)[::-1][:top_k]
        return [terms[i] for i in ranked]
    except Exception:
        # Best-effort feature: keyword extraction is non-essential.
        return []
# ==========================================================
# Adaptive summarization
# ==========================================================
def summarize_long_text(text: str) -> str:
    """Summarize text of any length, scaling target length with input size.

    Short inputs are summarized in one pass. Longer inputs are split
    into sentence chunks, each chunk summarized independently, and the
    partial summaries condensed into one final summary.

    Returns the summary text, or a diagnostic message when the model is
    unavailable or every chunk fails.
    """
    if summarizer is None:
        return "Summarization model unavailable."
    text = clean_text(text)
    L = len(text)
    # Scale summary length and chunk size with document length.
    if L < 1500:
        max_len, min_len, chunk_size = 180, 60, 1400
    elif L < 5000:
        max_len, min_len, chunk_size = 250, 100, 1600
    elif L < 15000:
        max_len, min_len, chunk_size = 350, 150, 1800
    else:
        max_len, min_len, chunk_size = 500, 200, 2000
    # Single-pass path: the whole text fits in one model call.
    if L <= chunk_size:
        return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
    # Cap at 6 chunks to bound latency on very long documents.
    parts = chunk_text(text, max_chars=chunk_size)[:6]
    summaries = []
    for p in parts:
        try:
            summaries.append(summarizer(p, max_length=200, min_length=80, do_sample=False)[0]["summary_text"])
        except Exception:
            continue  # skip chunks the model cannot handle
    if not summaries:
        # BUGFIX: previously an empty `combined` string was fed back
        # into the summarizer, which raises on empty input.
        return "Summarization failed."
    combined = " ".join(summaries)
    try:
        return summarizer(combined, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
    except Exception:
        # Fall back to the concatenated chunk summaries rather than crash.
        return combined
# ==========================================================
# Text-to-speech (WAV output for in-browser playback on Spaces)
# ==========================================================
def text_to_speech(text):
    """Synthesize *text* to speech; return a WAV file path, or None on failure.

    gTTS produces MP3; browsers embedded in HF Spaces play WAV more
    reliably, so the MP3 is transcoded with pydub and then deleted.
    Only the first 900 characters are voiced to keep the gTTS request small.
    """
    if not text:
        return None
    mp3_path = None
    try:
        # BUGFIX: NamedTemporaryFile(delete=False) left an open file
        # handle leaking on every call (and an open handle blocks
        # rewriting the path on Windows). mkstemp + os.close avoids both.
        fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        # Generate TTS (MP3), then convert to WAV for browser playback.
        gTTS(text=text[:900], lang="en").save(mp3_path)
        AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print("TTS error:", e)
        return None
    finally:
        # The intermediate MP3 is no longer needed once exported
        # (previously it was never removed).
        if mp3_path and os.path.exists(mp3_path):
            os.remove(mp3_path)
# ==========================================================
# PDF processing pipeline
# ==========================================================
def process_pdf(pdf_file):
    """Gradio handler: run the full extract/summarize/keywords/audio pipeline.

    Returns a 4-tuple matching the output widgets:
    (extracted text, summary, audio file path or None, comma-joined keywords).
    """
    if not pdf_file:
        return "Please upload a PDF.", "", None, ""
    raw = extract_text_from_pdf(pdf_file)
    # extract_text_from_pdf signals failure via sentinel prefixes.
    if raw.startswith("Error") or raw.startswith("No text"):
        return raw, "", None, ""
    cleaned = clean_text(raw)
    summary = summarize_long_text(cleaned)
    keywords = ", ".join(extract_keywords_tfidf(cleaned))
    audio_path = text_to_speech(summary)
    return cleaned, summary, audio_path, keywords
# ==========================================================
# Gradio interface
# ==========================================================
# Build the two-tab UI: an analyzer tab (upload -> results) and an About tab.
with gr.Blocks(title="AI PDF Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π AI PDF Summarizer β Extract, Summarize & Listen")
    gr.Markdown("Easily extract and summarize text from PDFs using AI, and listen to clear audio summaries.")
    # --- Main Tab ---
    with gr.Tab("π Analyze PDF"):
        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath" so process_pdf receives a path string
                pdf_input = gr.File(label="π Upload PDF", file_types=[".pdf"], type="filepath")
                process_btn = gr.Button("π Process PDF", variant="primary")
            with gr.Column(scale=2):
                # Read-only output widgets; populated by process_pdf's 4-tuple
                extracted_text = gr.Textbox(label="π§Ύ Extracted Text", lines=10, interactive=False)
                summary_box = gr.Textbox(label="π§ Summary", lines=6, interactive=False)
                audio_box = gr.Audio(label="π Summary Audio (Playable)", type="filepath", interactive=False)
                keywords_box = gr.Textbox(label="π·οΈ Top Keywords", lines=2, interactive=False)
    # --- About Tab ---
    with gr.Tab("βΉοΈ About"):
        gr.Markdown("""
        ## π About AI PDF Summarizer
        **AI PDF Summarizer** helps you quickly understand long PDFs using Artificial Intelligence.
        ### β¨ Features
        - Extracts and cleans text from PDFs
        - Creates adaptive, context-aware summaries
        - Identifies top keywords using TF-IDF
        - Converts summaries into **natural-sounding speech** (WAV format for Spaces compatibility)
        Built with β€οΈ using **Hugging Face Transformers**, **Gradio**, **gTTS**, and **pydub**.
        """)
    # --- Button Functionality ---
    # Wire the button to the pipeline; outputs map 1:1 to process_pdf's return tuple.
    process_btn.click(
        process_pdf,
        inputs=[pdf_input],
        outputs=[extracted_text, summary_box, audio_box, keywords_box],
    )

print("π Launching AI PDF Summarizer...")
# NOTE(review): share=True is a no-op on Hugging Face Spaces — confirm it is
# intended for local runs; debug=True blocks and streams logs.
demo.launch(share=True, debug=True)