LovnishVerma commited on
Commit
28b7150
Β·
verified Β·
1 Parent(s): 52d9d60

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +331 -0
app.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio app that summarizes PDF documents and answers questions about them.

Pipeline: PyMuPDF text extraction -> word/char chunking -> sentence-transformer
embeddings with FAISS retrieval -> BART summarization / DistilBERT extractive QA.
"""
import fitz  # PyMuPDF — PDF parsing and text extraction
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import time
from typing import List, Tuple, Optional

# Load all three models once at import time so per-request calls don't pay
# the model-download/startup cost. A failure here aborts app startup —
# running without models would make every request fail anyway.
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    qa_pipeline = pipeline("question-answering",
                           model="distilbert-base-uncased-distilled-squad")
except Exception as e:
    print(f"Error loading models: {e}")
    raise  # fail fast: re-raise so the process exits instead of limping on
20
+
21
+
22
def extract_text_from_pdf(file_path: str) -> Tuple[str, int, bool]:
    """Extract all text from a PDF with page markers.

    Args:
        file_path: Filesystem path to the PDF document.

    Returns:
        Tuple of ``(text, page_count, is_text_rich)``:

        * ``text`` — concatenated page text, each page prefixed with a
          ``--- Page N ---`` marker, or an error message on failure.
        * ``page_count`` — number of pages in the document (0 on failure).
        * ``is_text_rich`` — True when more than 100 stripped characters
          were extracted; heuristic for "not a pure image scan".
    """
    try:
        doc = fitz.open(file_path)
        try:
            page_count = len(doc)
            # Collect per-page strings and join once: avoids quadratic
            # string concatenation on large documents.
            parts = [
                f"\n--- Page {page_num + 1} ---\n{page.get_text()}"
                for page_num, page in enumerate(doc)
            ]
            text = "".join(parts)
        finally:
            # Close even if a page fails mid-extraction; previously the
            # handle leaked when get_text() raised inside the loop.
            doc.close()

        # Check if text extraction was successful
        is_text_rich = len(text.strip()) > 100
        return text, page_count, is_text_rich

    except Exception as e:
        # Keep the (str, int, bool) shape so callers need no special-casing.
        return f"Error extracting text: {str(e)}", 0, False
41
+
42
+
43
def chunk_text(text: str, max_chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into word-based chunks with overlap between neighbors.

    Overlapping chunks preserve context that would otherwise be cut at a
    chunk boundary, which improves downstream retrieval quality.

    Args:
        text: Input text; split on whitespace.
        max_chunk_size: Maximum number of words per chunk. Must be > 0.
        overlap: Number of words shared between consecutive chunks.
            Must satisfy ``0 <= overlap < max_chunk_size``.

    Returns:
        List of chunk strings; ``[text]`` unchanged when the text fits in
        a single chunk (including empty text).

    Raises:
        ValueError: If the parameters are invalid. In particular,
            ``overlap >= max_chunk_size`` previously caused an infinite
            loop because the window never advanced.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")
    if overlap < 0 or overlap >= max_chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chunk_size")

    words = text.split()
    if len(words) <= max_chunk_size:
        return [text]

    chunks: List[str] = []
    start = 0
    while start < len(words):
        end = min(start + max_chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        # Step back by `overlap` words unless we just consumed the tail.
        start = end - overlap if end < len(words) else end

    return chunks
61
+
62
+
63
def build_faiss_index(chunks: List[str]) -> Tuple[faiss.IndexFlatL2, np.ndarray]:
    """Embed *chunks* and build an exact L2 FAISS index over them.

    Args:
        chunks: Text chunks to index.

    Returns:
        Tuple of ``(index, embeddings)`` where ``index`` is a flat L2
        FAISS index containing one vector per chunk and ``embeddings`` is
        the raw embedding matrix of shape ``(len(chunks), dim)``.

    Raises:
        RuntimeError: If encoding or indexing fails; chained to the
            original exception so the traceback is preserved.
    """
    try:
        # Encode all chunks in a single batch call.
        embeddings = embedding_model.encode(chunks, show_progress_bar=True)
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        # FAISS requires float32 vectors.
        index.add(embeddings.astype('float32'))
        return index, embeddings
    except Exception as e:
        # RuntimeError (a subclass of Exception, so existing callers'
        # handlers still match) chained with `from e` instead of a bare
        # Exception that discarded the original traceback.
        raise RuntimeError(f"Error building FAISS index: {str(e)}") from e
73
+
74
+
75
def retrieve_relevant_chunks(query: str, chunks: List[str], index: faiss.IndexFlatL2,
                             embeddings: np.ndarray, top_k: int = 5) -> List[str]:
    """Return the chunks most semantically similar to *query*.

    Args:
        query: Natural-language question or search phrase.
        chunks: The chunk list the index was built from.
        index: FAISS index over the chunk embeddings.
        embeddings: Chunk embedding matrix (kept for interface
            compatibility; the search itself only uses ``index``).
        top_k: Maximum number of chunks to return (capped at len(chunks)).

    Returns:
        Up to ``top_k`` chunk strings ranked by L2 distance, or a
        single-element list carrying an error message on failure.
    """
    try:
        query_vec = embedding_model.encode([query]).astype('float32')
        k = min(top_k, len(chunks))
        _, hit_ids = index.search(query_vec, k)
        # Guard against out-of-range ids before mapping back to text.
        return [chunks[idx] for idx in hit_ids[0] if idx < len(chunks)]
    except Exception as e:
        return [f"Error retrieving chunks: {str(e)}"]
86
+
87
+
88
def _summarize_chunk(chunk: str, position: int) -> str:
    """Summarize one text chunk; on failure return an inline error marker."""
    try:
        # Scale target lengths to the chunk so the model isn't asked to
        # produce a summary longer than a third of its input.
        max_len = min(150, len(chunk.split()) // 3)
        min_len = min(30, max_len // 3)

        result = summarizer(chunk, max_length=max_len,
                            min_length=min_len, do_sample=False)
        return result[0]['summary_text']
    except Exception as chunk_error:
        return f"[Error summarizing chunk {position + 1}: {str(chunk_error)}]"


def summarize_pdf(file_path: Optional[str]) -> str:
    """Generate an AI summary of a PDF document.

    Args:
        file_path: Path to an uploaded PDF, or None when nothing was
            uploaded yet.

    Returns:
        A formatted summary string (with page count and timing metadata),
        or a human-readable error/warning message. Never raises — all
        failures are reported in the returned string for the UI.
    """
    if not file_path:
        return "Please upload a PDF file first."

    if not os.path.exists(file_path):
        return "File not found. Please upload a valid PDF."

    start_time = time.time()

    # Extract text
    raw_text, page_count, is_text_rich = extract_text_from_pdf(file_path)

    if not is_text_rich:
        return f"⚠️ Warning: Limited text extracted from PDF ({page_count} pages). The file may contain mostly images or be corrupted.\n\nExtracted content:\n{raw_text[:500]}..."

    try:
        # Character-based windows with 50% overlap (step = max_chunk // 2)
        # so sentences cut at a boundary still appear whole in a neighbor.
        max_chunk = 1000
        chunks = [raw_text[i:i + max_chunk]
                  for i in range(0, len(raw_text), max_chunk // 2)]

        summary_parts = []
        for i, chunk in enumerate(chunks):
            if len(chunk.strip()) < 50:  # Skip very short chunks
                continue
            summary_parts.append(_summarize_chunk(chunk, i))

        processing_time = time.time() - start_time

        if summary_parts:
            final_summary = " ".join(summary_parts)

            # Header line with document/timing metadata above the summary.
            meta_info = f"πŸ“„ Document Summary ({page_count} pages, processed in {processing_time:.1f}s)\n" + \
                "="*60 + "\n\n"

            return meta_info + final_summary
        else:
            return "Unable to generate summary. The document may be too short or contain unsupported content."

    except Exception as e:
        return f"Error during summarization: {str(e)}"
146
+
147
+
148
def answer_question(file_path: Optional[str], question: str) -> str:
    """Answer a question about a PDF via semantic retrieval + extractive QA.

    Args:
        file_path: Path to an uploaded PDF, or None when nothing was
            uploaded yet.
        question: The user's natural-language question.

    Returns:
        A formatted answer with confidence and timing metadata, or a
        human-readable error message. Never raises — failures are
        reported in the returned string for the UI.
    """
    if not file_path:
        return "Please upload a PDF file first."

    if not question.strip():
        return "Please enter a question."

    if not os.path.exists(file_path):
        return "File not found. Please upload a valid PDF."

    try:
        start_time = time.time()

        # Extract and process text
        raw_text, page_count, is_text_rich = extract_text_from_pdf(file_path)

        # Bug fix: the original returned here even though its message said
        # "Attempting to answer based on available text...". Now we really
        # do attempt, and prefix the warning to whatever comes back.
        warning = ""
        if not is_text_rich:
            warning = "⚠️ Limited text available for Q&A. Extracted content may be insufficient.\n\nAttempting to answer based on available text...\n\n"

        # Create chunks with overlap for better context
        chunks = chunk_text(raw_text, max_chunk_size=400, overlap=50)

        if not chunks:
            return "No processable text found in the document."

        # Build search index
        index, embeddings = build_faiss_index(chunks)

        # Retrieve relevant chunks
        relevant_chunks = retrieve_relevant_chunks(
            question, chunks, index, embeddings, top_k=5)

        if not relevant_chunks:
            return "No relevant information found for your question."

        # Cap combined context length to stay within the QA model's limits.
        context = " ".join(relevant_chunks)[:2000]

        # Get answer
        try:
            result = qa_pipeline(question=question, context=context)
            answer = result['answer']
            confidence = result.get('score', 0)

            processing_time = time.time() - start_time

            # Format response with metadata
            response = f"🎯 Answer (confidence: {confidence:.2f}, {processing_time:.1f}s):\n"
            response += f"{answer}\n\n"

            if confidence < 0.5:
                response += "⚠️ Low confidence answer. The information might not be directly stated in the document."

            return warning + response

        except Exception as qa_error:
            return warning + f"Error generating answer: {str(qa_error)}\n\nRelevant context found:\n{context[:300]}..."

    except Exception as e:
        return f"Error processing question: {str(e)}"
209
+
210
+
211
def clear_inputs():
    """Reset the file input and text output to their empty states.

    Returns:
        Tuple of ``(None, "")`` — cleared file component value and an
        empty textbox value, matching Gradio's clear-event signature.
    """
    return (None, "")
214
+
215
+
216
# Enhanced Gradio UI: two tabs (summarization, Q&A) sharing the same
# upload pattern, plus a footer with usage tips. Built declaratively;
# `demo` is launched from the __main__ guard below.
with gr.Blocks(title="PDF Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸ“š Advanced PDF Analyzer
    Upload a PDF document to generate summaries or ask questions about its content.

    **Features:**
    - πŸ” Intelligent text extraction and processing
    - πŸ“ AI-powered document summarization
    - ❓ Question-answering with semantic search
    - πŸ“Š Processing metadata and confidence scores
    """)

    with gr.Tab("πŸ“ Document Summarization"):
        gr.Markdown("### Generate an AI summary of your PDF document")

        with gr.Row():
            # Left column: upload + action buttons.
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="Upload PDF Document",
                    file_types=[".pdf"],
                    type="filepath"  # handlers receive a path string, not bytes
                )
                summarize_button = gr.Button(
                    "πŸ“ Generate Summary", variant="primary")
                clear_summary_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")

            # Right column: wider read-only result area.
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="Document Summary",
                    lines=15,
                    placeholder="Upload a PDF and click 'Generate Summary' to see results here...",
                    show_copy_button=True
                )

        # Wire the summarize button to the backend handler.
        summarize_button.click(
            fn=summarize_pdf,
            inputs=pdf_input,
            outputs=summary_output
        )

        # Clear resets both the upload and the result textbox.
        clear_summary_btn.click(
            fn=lambda: (None, ""),
            outputs=[pdf_input, summary_output]
        )

    with gr.Tab("❓ Question & Answer"):
        gr.Markdown("### Ask questions about your PDF document")

        with gr.Row():
            with gr.Column(scale=1):
                # Separate upload from the summary tab — each tab keeps
                # its own file state.
                pdf_input_qa = gr.File(
                    label="Upload PDF Document",
                    file_types=[".pdf"],
                    type="filepath"
                )
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="e.g., What is the main topic of this document?",
                    lines=2
                )
                answer_button = gr.Button("🎯 Get Answer", variant="primary")
                clear_qa_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")

            with gr.Column(scale=2):
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=10,
                    placeholder="Upload a PDF, enter your question, and click 'Get Answer'...",
                    show_copy_button=True
                )

        # Example questions
        gr.Markdown("**Example questions:**")
        example_questions = [
            "What is the main topic of this document?",
            "Who are the key people mentioned?",
            "What are the main conclusions?",
            "Can you explain the methodology used?"
        ]

        with gr.Row():
            # One button per example; `q=eq` binds the question at
            # definition time (avoids the late-binding closure pitfall).
            # NOTE(review): loop index `i` is unused.
            for i, eq in enumerate(example_questions):
                gr.Button(eq, size="sm").click(
                    fn=lambda q=eq: q,
                    outputs=question_input
                )

        answer_button.click(
            fn=answer_question,
            inputs=[pdf_input_qa, question_input],
            outputs=answer_output
        )

        # Clear resets upload, question, and answer at once.
        clear_qa_btn.click(
            fn=lambda: (None, "", ""),
            outputs=[pdf_input_qa, question_input, answer_output]
        )

    # Footer with tips
    gr.Markdown("""
    ---
    ### πŸ’‘ Tips for better results:
    - **For summarization**: Works best with text-heavy documents (research papers, reports, articles)
    - **For Q&A**: Ask specific questions and ensure your PDF contains searchable text
    - **File size**: Larger documents may take longer to process
    - **Quality**: Scanned PDFs without OCR may have limited text extraction
    """)
324
+
325
if __name__ == "__main__":
    demo.launch(
        share=False,               # no public Gradio tunnel
        server_name="0.0.0.0",     # bind all interfaces (container/Spaces friendly)
        server_port=7860,          # Gradio's conventional default port
        show_error=True            # surface handler tracebacks in the UI
    )