Spaces:
Running
Running
File size: 11,997 Bytes
9f1a341 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 |
import gradio as gr
import PyPDF2
import io
from transformers import pipeline, AutoTokenizer
import torch
import re
from typing import List, Tuple
import warnings
# Silence every warning category for the whole process (no message/module filter is applied).
warnings.simplefilter("ignore")
class PDFSummarizer:
    """Summarize PDF documents with a Hugging Face ``summarization`` pipeline.

    The workflow implemented by :meth:`process_pdf` is: extract text from the
    PDF, clean it, split it into word-count-bounded chunks, summarize each
    chunk, and (when there are more than two chunks) summarize the combined
    chunk summaries once more.
    """

    # NOTE(review): the leading "β" / "π" glyphs in the user-facing strings of
    # this class look like mis-decoded emoji. They are reproduced as found;
    # confirm the intended characters against the upstream copy of the file.

    # Preset name -> (max_length, min_length) handed to the summarizer for
    # every chunk. Any other preset name uses _DEFAULT_LENGTHS ("Comprehensive").
    _LENGTH_PRESETS = {
        "Brief (Quick)": (60, 20),
        "Detailed": (100, 40),
    }
    _DEFAULT_LENGTHS = (150, 60)

    def __init__(self):
        """Load the summarization pipeline and its tokenizer.

        The distilled BART checkpoint is tried first, on the GPU in float16
        when CUDA is available and on the CPU in float32 otherwise. If that
        fails for any reason, ``facebook/bart-large-cnn`` is loaded on the CPU
        instead.
        """
        self.model_name = "sshleifer/distilbart-cnn-12-6"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        on_gpu = self.device == "cuda"
        try:
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if on_gpu else -1,
                framework="pt",
                # Half precision only makes sense on the GPU.
                model_kwargs={"torch_dtype": torch.float16 if on_gpu else torch.float32},
            )
            # Tokenizer is kept available for length calculations.
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Fallback: the full-size BART checkpoint on the CPU. This is a
            # compatibility fallback, not a faster model.
            self.model_name = "facebook/bart-large-cnn"
            # Keep the recorded device in sync with where the pipeline runs.
            self.device = "cpu"
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Return the text of every non-empty page, each preceded by a page marker.

        Args:
            pdf_file: Raw PDF content as a bytes-like object (it is wrapped in
                ``io.BytesIO`` before being handed to ``PyPDF2.PdfReader``).

        Returns:
            The concatenated page texts, each introduced by a
            ``--- Page N ---`` line, with surrounding whitespace stripped.

        Raises:
            Exception: If reading or text extraction fails; the original
                error is attached as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            parts = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Guard against extractors that return None for empty pages.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    parts.append(f"\n--- Page {page_num} ---\n")
                    parts.append(page_text)
            return "".join(parts).strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

    def clean_text(self, text: str) -> str:
        """Strip page markers and unusual characters, then normalize whitespace.

        Whitespace is collapsed last, because the two earlier substitutions
        insert spaces of their own and would otherwise leave runs of spaces.

        Args:
            text: Raw text, typically the output of :meth:`extract_text_from_pdf`.

        Returns:
            The cleaned text with single spaces between words.
        """
        # Drop the "--- Page N ---" markers added during extraction.
        text = re.sub(r'---\s*Page\s+\d+\s*---', ' ', text)
        # Replace anything that is not a word character, whitespace or basic
        # punctuation with a space.
        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
        # Collapse every whitespace run (including newlines) to one space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512, max_chunks: int = 5) -> List[str]:
        """Group sentences into chunks of at most ``max_chunk_length`` words.

        Sentences are delimited by ``". "``. Length is estimated by counting
        whitespace-separated words rather than model tokens. A single sentence
        longer than the limit becomes its own oversized chunk.

        Args:
            text: Cleaned text to split.
            max_chunk_length: Maximum number of words per chunk.
            max_chunks: Maximum number of chunks returned; text beyond that
                is dropped to bound processing time.

        Returns:
            Up to ``max_chunks`` chunk strings; an empty list for empty input.
        """
        pieces = text.split('. ')
        last_index = len(pieces) - 1
        sentences = []
        for index, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                continue
            # split() consumed the ". " after every piece except the last one,
            # so only those pieces need their period restored.
            sentences.append(piece + "." if index < last_index else piece)

        chunks = []
        current = []
        current_words = 0
        for sentence in sentences:
            sentence_words = len(sentence.split())
            # Close the running chunk when this sentence would overflow it.
            if current and current_words + sentence_words > max_chunk_length:
                chunks.append(" ".join(current))
                current = []
                current_words = 0
            current.append(sentence)
            current_words += sentence_words
        if current:
            chunks.append(" ".join(current))
        return chunks[:max_chunks]

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize one chunk of text.

        Args:
            chunk: Text to summarize; over-long input is truncated by the pipeline.
            max_length: Upper bound on the generated summary length.
            min_length: Lower bound on the generated summary length.

        Returns:
            The summary text. On failure this is best-effort: a string
            starting with ``"Error summarizing chunk: "`` is returned instead
            of raising.
        """
        try:
            summary = self.summarizer(
                chunk,
                # Generation lengths must be integers; callers may derive them
                # from float arithmetic.
                max_length=int(max_length),
                min_length=int(min_length),
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=2,  # narrow beam search to keep generation fast
            )
            return summary[0]['summary_text']
        except Exception as e:
            return f"Error summarizing chunk: {str(e)}"

    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
        """Extract, clean, chunk and summarize a PDF.

        Args:
            pdf_file: Raw PDF content as a bytes-like object.
            summary_type: ``"Brief (Quick)"`` or ``"Detailed"``; any other
                value selects the longest ("Comprehensive") preset.

        Returns:
            A ``(summary, statistics_markdown, status)`` tuple. On any failure
            the first element carries the error message and the other two are
            empty strings.
        """
        try:
            raw_text = self.extract_text_from_pdf(pdf_file)
            if not raw_text.strip():
                return "β Error: No text could be extracted from the PDF.", "", ""

            cleaned_text = self.clean_text(raw_text)
            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)
            if word_count < 50:
                return "β Error: PDF contains too little text to summarize.", "", ""

            chunks = self.chunk_text(cleaned_text)
            max_len, min_len = self._LENGTH_PRESETS.get(summary_type, self._DEFAULT_LENGTHS)

            chunk_summaries = []
            for i, chunk in enumerate(chunks, start=1):
                print(f"Processing chunk {i}/{len(chunks)}")
                chunk_summaries.append(self.summarize_chunk(chunk, max_len, min_len))
            combined_summary = " ".join(chunk_summaries)

            if len(chunks) <= 2:
                # Few chunks: skip the second summarization pass for speed.
                final_summary = combined_summary
            else:
                final_summary = self.summarize_chunk(
                    combined_summary,
                    # int() keeps the length an integer (max_len * 1.5 is a float).
                    max_length=min(200, int(max_len * 1.5)),
                    min_length=min_len,
                )

            summary_words = len(final_summary.split())
            # An empty summary must not turn the whole run into a
            # ZeroDivisionError.
            ratio_text = f"{word_count / summary_words:.1f}:1" if summary_words else "n/a"

            # Built line by line so the text does not depend on source indentation.
            summary_stats = "\n".join([
                "",
                "π **Document Statistics:**",
                f"- Original word count: {word_count:,}",
                f"- Original character count: {char_count:,}",
                # The figure is the number of text chunks, not PDF pages.
                f"- Chunks processed: {len(chunks)}",
                f"- Summary word count: {summary_words:,}",
                f"- Compression ratio: {ratio_text}",
                "",
            ])
            return final_summary, summary_stats, "β Summary generated successfully!"
        except Exception as e:
            return f"β Error processing PDF: {str(e)}", "", ""
# Initialize the summarizer
# Built once at import time: the constructor loads the summarization model, and
# summarize_pdf_interface reuses this single module-level instance for every call.
pdf_summarizer = PDFSummarizer()
def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio callback: summarize the uploaded PDF.

    Args:
        pdf_file: Filesystem path of the uploaded PDF, or None when nothing
            has been uploaded.
        summary_type: Name of the summary-length preset chosen in the UI.

    Returns:
        A ``(summary, stats, status)`` tuple. When no file is given or an
        exception occurs, the message is placed in the first slot and the
        other two are empty strings.
    """
    # Guard clause: nothing to do without an upload.
    if pdf_file is None:
        return "β Please upload a PDF file.", "", ""

    try:
        # The upload arrives as a path, so read its bytes before processing.
        with open(pdf_file, 'rb') as handle:
            payload = handle.read()
        summary_text, stats_text, status_text = pdf_summarizer.process_pdf(payload, summary_type)
    except Exception as err:
        return f"β Error: {str(err)}", "", ""

    return summary_text, stats_text, status_text
# Build the Gradio UI
def create_interface():
    """Assemble the Gradio Blocks page for the PDF summarizer and return it.

    Layout: an introductory Markdown header, then a row with a narrow column
    (file upload, summary-length radio, generate button, status box) and a
    wide column (summary text box, statistics Markdown), followed by a
    Markdown section with tips. Both the button click and a change of the
    uploaded file call ``summarize_pdf_interface``.
    """
    page_css = """
.gradio-container {
max-width: 1200px !important;
}
.summary-box {
border-left: 4px solid #2196F3;
padding: 16px;
background-color: #f8f9fa;
}
"""
    intro_md = """
# β¨ Your AI-Powered PDF Assistant
### Transform lengthy documents into clear, concise summaries in seconds.
Upload any PDF file below and let our intelligent model do the work for you. Perfect for students, researchers, and professionals who need to quickly grasp the core ideas of any document.
---
## Key Features
- β‘ **Lightning-Fast Summaries**: Powered by a state-of-the-art AI model.
- π§ **Intelligent Text Chunking**: Handles long documents with ease by breaking them down into manageable pieces.
- π **Detailed Document Stats**: Get instant insights on word count, compression ratio, and more.
- π― **Customizable Summary Length**: Choose the level of detail that works best for you.
"""
    tips_md = """
---
## π‘ Tips for Optimal Results:
- **Quality Matters**: Ensure your PDF has selectable text (not just scanned images).
- **Length**: Works best with documents between 500-10,000 words.
- **Language**: Optimized for English content.
- **Format**: Clean, well-formatted PDFs produce the best summaries.
## π§ How It Works:
This application uses a fine-tuned **BART** model for summarization. The process involves:
1. Extracting text from your PDF.
2. Cleaning and preprocessing the text.
3. Intelligently chunking the document to handle long texts.
4. Generating a summary for each chunk and then combining them for a final, coherent result.
"""

    with gr.Blocks(title="π AI PDF Summarizer", theme=gr.themes.Soft(), css=page_css) as demo:
        gr.Markdown(intro_md)

        with gr.Row():
            # Narrow column: inputs and status.
            with gr.Column(scale=1):
                file_picker = gr.File(
                    label="π Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath",
                )
                length_choice = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="π Summary Length",
                    info="Choose how detailed you want the summary to be",
                )
                run_button = gr.Button("π Generate Summary", variant="primary", size="lg")
                status_box = gr.Textbox(label="π Status", interactive=False, max_lines=2)

            # Wide column: results.
            with gr.Column(scale=2):
                summary_box = gr.Textbox(
                    label="π Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"],
                )
                stats_panel = gr.Markdown(
                    label="π Document Statistics",
                    value="Upload a PDF to see statistics",
                )

        # Usage tips and a short description of the processing steps.
        gr.Markdown(tips_md)

        # Same handler for the explicit button click and for a new upload;
        # the click listener is registered first, then the change listener.
        for register in (run_button.click, file_picker.change):
            register(
                fn=summarize_pdf_interface,
                inputs=[file_picker, length_choice],
                outputs=[summary_box, stats_panel, status_box],
            )

    return demo
# Launch the application
# Build the UI and launch it only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
|