Spaces:

SimranShaikh
/

pdf-processor-qa

Runtime error

App Files Files Community

SimranShaikh committed on Jun 29, 2025

Commit

52bcdc8

verified ·

1 Parent(s): 6b92160

commit

Browse files

Files changed (1) hide show

app.py +412 -48

app.py CHANGED Viewed

@@ -1,64 +1,428 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
 """
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+# app.py - Main Hugging Face Spaces Application
 import gradio as gr
+import PyPDF2
+import pdfplumber
+import fitz  # PyMuPDF
+import pandas as pd
+import re
+import logging
+import os
+import tempfile
+from typing import Dict, List, Tuple, Optional
+from pathlib import Path
+import json
+# Set up logging: configure the root logger at INFO level and create this
+# module's logger, which the extraction helpers below use to report
+# per-backend and per-page failures.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class PDFProcessorError(Exception):
+    """Raised by the extraction helpers when a PDF backend cannot read a file."""
+def enhanced_pdf_processor(file_path: str) -> Dict:
+    """
+    Extract text, tables and metadata from a PDF, trying several backends.
+
+    Backends are tried in the order PyMuPDF, pdfplumber, PyPDF2. The first one
+    whose stripped text is longer than 10 characters wins and the remaining
+    backends are skipped. If the winning backend was reached before pdfplumber
+    was tried, pdfplumber is additionally run once, best effort, purely to
+    collect tables; otherwise 'tables' would stay empty for every PDF that
+    PyMuPDF can read.
+
+    Args:
+        file_path: Path of the PDF on disk.
+
+    Returns:
+        A dict with the keys 'text', 'tables', 'metadata',
+        'extraction_method', 'success', 'error', 'file_info' and 'summary'.
+        Exceptions are caught and reported through 'success' / 'error'
+        instead of being raised to the caller.
+    """
+    results = {
+        'text': '',
+        'tables': [],
+        'metadata': {},
+        'extraction_method': 'unknown',
+        'success': False,
+        'error': None,
+        'file_info': {},
+        'summary': ''
+    }
+    try:
+        # Validate file
+        if not os.path.exists(file_path):
+            results['error'] = f"File does not exist: {file_path}"
+            return results
+        # Get file info
+        results['file_info'] = get_file_info(file_path)
+        # (backend name, extractor, results key that receives the extractor's
+        # second return value; None when the extractor returns text only)
+        extraction_methods = [
+            ('PyMuPDF', extract_with_pymupdf, 'metadata'),
+            ('pdfplumber', extract_with_pdfplumber, 'tables'),
+            ('PyPDF2', extract_with_pypdf2, None)
+        ]
+        attempted = []
+        for method_name, method_func, extra_key in extraction_methods:
+            attempted.append(method_name)
+            try:
+                logger.info("Trying extraction method: %s", method_name)
+                outcome = method_func(file_path)
+                if extra_key is None:
+                    text_result, extra = outcome, None
+                else:
+                    text_result, extra = outcome
+            except Exception as e:
+                # One backend failing is expected; fall through to the next.
+                logger.warning("%s failed: %s", method_name, e)
+                continue
+            # Treat near-empty output (e.g. scanned pages) as a miss.
+            if not text_result or len(text_result.strip()) <= 10:
+                continue
+            results['text'] = text_result
+            if extra_key is not None:
+                results[extra_key] = extra
+            results['extraction_method'] = method_name
+            results['success'] = True
+            break
+        if results['success']:
+            if 'pdfplumber' not in attempted:
+                # Only pdfplumber produces tables; run it for tables alone
+                # without letting a failure undo the successful text result.
+                try:
+                    _, results['tables'] = extract_with_pdfplumber(file_path)
+                except Exception as e:
+                    logger.warning("Table extraction skipped: %s", e)
+            # Generate summary if successful
+            results['summary'] = generate_document_summary(results['text'])
+        else:
+            results['error'] = "All extraction methods failed"
+    except Exception as e:
+        results['error'] = f"Processing error: {str(e)}"
+        logger.exception("PDF processing error: %s", e)
+    return results
+def extract_with_pypdf2(file_path: str) -> str:
+    """
+    Extract text from every page of a PDF using PyPDF2.
+
+    Each page's text is prefixed with a "--- Page N ---" marker. Pages whose
+    extraction fails are logged and skipped.
+
+    Args:
+        file_path: Path of the PDF on disk.
+
+    Returns:
+        The concatenated page text after clean_text().
+
+    Raises:
+        PDFProcessorError: If the PDF is encrypted and cannot be opened with
+            an empty password, or if reading the file fails for any other
+            reason (the original exception is chained as the cause).
+    """
+    page_chunks = []
+    try:
+        with open(file_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
+            if reader.is_encrypted:
+                # decrypt() reports a wrong password through a falsy return
+                # value rather than an exception, so the result is checked.
+                try:
+                    unlocked = reader.decrypt("")
+                except Exception as e:
+                    raise PDFProcessorError("PDF is encrypted") from e
+                if not unlocked:
+                    raise PDFProcessorError("PDF is encrypted")
+            for page_num, page in enumerate(reader.pages, start=1):
+                try:
+                    page_text = page.extract_text()
+                except Exception as e:
+                    logger.warning("Failed to extract page %s: %s", page_num, e)
+                    continue
+                if page_text:
+                    page_chunks.append(f"\n--- Page {page_num} ---\n{page_text}\n")
+        return clean_text("".join(page_chunks))
+    except PDFProcessorError:
+        # Already a domain error with a precise message; do not re-wrap it.
+        raise
+    except Exception as e:
+        raise PDFProcessorError(f"PyPDF2 extraction failed: {e}") from e
+def extract_with_pdfplumber(file_path: str) -> Tuple[str, List[Dict]]:
+    """
+    Pull page text and tables out of a PDF with pdfplumber.
+
+    Returns a pair (cleaned text, tables). Every table entry is a dict with
+    the keys 'page', 'table_number', 'data' and 'text_representation'; tables
+    with fewer than two rows are ignored. A page that fails is logged and
+    skipped; any other failure is re-raised as PDFProcessorError.
+    """
+    chunks = []
+    found_tables = []
+    try:
+        with pdfplumber.open(file_path) as pdf:
+            for page_no, page in enumerate(pdf.pages, start=1):
+                try:
+                    # Text of this page, if any
+                    body = page.extract_text()
+                    if body:
+                        chunks.append(f"\n--- Page {page_no} ---\n{body}\n")
+                    # Tables of this page, numbered from 1
+                    for table_no, rows in enumerate(page.extract_tables(), start=1):
+                        if not rows or len(rows) <= 1:
+                            continue
+                        found_tables.append({
+                            'page': page_no,
+                            'table_number': table_no,
+                            'data': rows,
+                            'text_representation': table_to_text(rows)
+                        })
+                except Exception as e:
+                    logger.warning(f"Failed to process page {page_no}: {e}")
+        return clean_text("".join(chunks)), found_tables
+    except Exception as e:
+        raise PDFProcessorError(f"pdfplumber extraction failed: {e}")
+def extract_with_pymupdf(file_path: str) -> Tuple[str, Dict]:
+    """
+    Extract text and document metadata using PyMuPDF.
+
+    Each page's text is prefixed with a "--- Page N ---" marker. Pages whose
+    extraction fails are logged and skipped. The document is opened as a
+    context manager so it is closed even when extraction aborts part-way.
+
+    Args:
+        file_path: Path of the PDF on disk.
+
+    Returns:
+        A pair (cleaned text, metadata). The metadata dict holds 'page_count',
+        'title', 'author', 'subject', 'creator' and 'creation_date', or only
+        'page_count' when the metadata could not be read.
+
+    Raises:
+        PDFProcessorError: If the document cannot be opened or processed (the
+            original exception is chained as the cause).
+    """
+    page_chunks = []
+    metadata = {}
+    try:
+        with fitz.open(file_path) as doc:
+            # Extract metadata
+            try:
+                doc_metadata = doc.metadata or {}
+                metadata = {
+                    'page_count': doc.page_count,
+                    'title': doc_metadata.get('title', ''),
+                    'author': doc_metadata.get('author', ''),
+                    'subject': doc_metadata.get('subject', ''),
+                    'creator': doc_metadata.get('creator', ''),
+                    'creation_date': doc_metadata.get('creationDate', '')
+                }
+            except Exception as e:
+                # Metadata is optional; keep going with the page count only.
+                logger.warning("Failed to read PDF metadata: %s", e)
+                metadata = {'page_count': doc.page_count}
+            # Extract text
+            for page_num in range(doc.page_count):
+                try:
+                    page_text = doc[page_num].get_text()
+                except Exception as e:
+                    logger.warning("Failed to extract page %s: %s", page_num + 1, e)
+                    continue
+                if page_text:
+                    page_chunks.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
+        return clean_text("".join(page_chunks)), metadata
+    except Exception as e:
+        raise PDFProcessorError(f"PyMuPDF extraction failed: {e}") from e
+def clean_text(text: str) -> str:
+    """
+    Normalise text extracted from a PDF.
+
+    Removes U+FFFD replacement characters, NUL bytes and zero-width spaces,
+    collapses runs of blank lines into a single blank line, collapses runs of
+    spaces into one space, and strips leading/trailing whitespace.
+
+    Args:
+        text: Raw extracted text; a falsy value yields "".
+
+    Returns:
+        The cleaned text.
+    """
+    if not text:
+        return ""
+    # Remove problematic characters first: deleting them after the whitespace
+    # pass would leave behind the double spaces / extra blank lines that they
+    # were separating.
+    text = text.translate(str.maketrans('', '', '\ufffd\x00\u200b'))
+    # Remove excessive whitespace
+    text = re.sub(r'\n\s*\n', '\n\n', text)
+    text = re.sub(r' +', ' ', text)
+    return text.strip()
+def table_to_text(table: List[List]) -> str:
+    """
+    Render a table as text, one row per line with cells joined by " | ".
+
+    None cells become empty strings; every other cell is converted with str()
+    (so a numeric 0 is kept as "0"), has embedded newlines replaced by spaces
+    so a row never spills over several lines, and is stripped. Empty rows and
+    rows whose cells are all empty are omitted.
+
+    Args:
+        table: Rows of cells; a falsy value yields "".
+
+    Returns:
+        The rows joined with newlines.
+    """
+    if not table:
+        return ""
+    text_lines = []
+    for row in table:
+        if not row:
+            continue
+        clean_row = [
+            "" if cell is None else str(cell).replace("\n", " ").strip()
+            for cell in row
+        ]
+        if any(clean_row):
+            text_lines.append(" | ".join(clean_row))
+    return "\n".join(text_lines)
+def get_file_info(file_path: str) -> Dict:
+    """
+    Describe a file on disk.
+
+    Returns a dict with 'name' (final path component), 'size' (bytes) and
+    'size_mb' (size in MiB rounded to two decimals), or an empty dict when
+    the file cannot be inspected.
+    """
+    try:
+        target = Path(file_path)
+        byte_count = target.stat().st_size
+        megabytes = round(byte_count / (1024 * 1024), 2)
+        return dict(name=target.name, size=byte_count, size_mb=megabytes)
+    except Exception:
+        return {}
+def generate_document_summary(text: str) -> str:
+    """
+    Build a short plain-text report about the extracted text.
+
+    The report lists character, word and line counts followed by a preview
+    made of the first three sentence fragments (split on '.', '!' and '?'),
+    cut to 300 characters plus "..." when longer. Empty input yields
+    "No text extracted".
+    """
+    if not text:
+        return "No text extracted"
+    # Counts shown in the statistics section
+    chars = len(text)
+    words = len(text.split())
+    lines = text.count('\n') + 1
+    # Preview: the leading sentence fragments, re-joined with ". "
+    leading = re.split(r'[.!?]+', text)[:3]
+    preview = '. '.join(leading).strip()
+    preview = (preview[:300] + "...") if len(preview) > 300 else preview
+    return f"""
+Document Statistics:
+- Characters: {chars:,}
+- Words: {words:,}
+- Lines: {lines:,}
+Preview:
+{preview}
 """
+def process_pdf_file(file) -> Tuple[str, str, str, str]:
+    """
+    Process an uploaded PDF file for the Gradio interface.
+
+    Args:
+        file: The upload as delivered by the file component. Accepted forms
+            are a filesystem path (str or os.PathLike), an object whose
+            `.name` is the path of an existing file, or a readable binary
+            file-like object. None means nothing was uploaded.
+
+    Returns:
+        A 4-tuple (status, file information, summary, display text). The
+        display text is the extracted text cut to 5000 characters, followed
+        by a short rendering of up to three tables. On failure the status
+        carries the error message and the other three entries are "".
+    """
+    if file is None:
+        return "No file uploaded", "", "", ""
+    tmp_file_path = None  # set only when this function creates its own copy
+    try:
+        if isinstance(file, (str, os.PathLike)):
+            # Upload already is a path on disk; no copy needed.
+            pdf_path = os.fspath(file)
+        elif isinstance(getattr(file, 'name', None), str) and os.path.exists(file.name):
+            # File wrapper that exposes the on-disk path as .name
+            pdf_path = file.name
+        else:
+            # Plain file-like object: spill it to a temporary file
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                tmp_file_path = tmp_file.name
+                tmp_file.write(file.read())
+            pdf_path = tmp_file_path
+        # Process the PDF
+        result = enhanced_pdf_processor(pdf_path)
+        if not result['success']:
+            error_msg = result.get('error', 'Unknown error')
+            return f"❌ Processing failed: {error_msg}", "", "", ""
+        # Format results for display
+        status = f"✅ Successfully processed using {result['extraction_method']}"
+        # File info
+        file_info = result.get('file_info', {})
+        info = f"""
+File: {file_info.get('name', 'Unknown')}
+Size: {file_info.get('size_mb', 0)} MB
+Pages: {result.get('metadata', {}).get('page_count', 'Unknown')}
 """
+        # Summary
+        summary = result.get('summary', 'No summary available')
+        # Full text (truncated for display)
+        full_text = result['text']
+        if len(full_text) > 5000:
+            display_text = full_text[:5000] + f"\n\n... (Text truncated. Total length: {len(full_text)} characters)"
+        else:
+            display_text = full_text
+        # Tables info
+        if result['tables']:
+            tables_info = f"\n\nTables found: {len(result['tables'])}"
+            for i, table in enumerate(result['tables'][:3]):  # Show first 3 tables
+                tables_info += f"\n\nTable {i+1} (Page {table['page']}):\n"
+                tables_info += table['text_representation'][:500]
+                if len(table['text_representation']) > 500:
+                    tables_info += "..."
+            display_text += tables_info
+        return status, info, summary, display_text
+    except Exception as e:
+        return f"❌ Error: {str(e)}", "", "", ""
+    finally:
+        # Clean up our temporary copy on every exit path, including errors.
+        if tmp_file_path is not None and os.path.exists(tmp_file_path):
+            os.unlink(tmp_file_path)
+def answer_question(text: str, question: str) -> str:
+    """
+    Simple keyword-based question answering.
+
+    Keywords are the words of the question longer than three characters,
+    lower-cased and taken without surrounding punctuation (so "budget?"
+    matches "budget"). The text is split into sentences on '.', '!' and '?';
+    each sentence is scored by how many keywords it contains, and the three
+    best-scoring sentences are returned, ties keeping document order.
+
+    Args:
+        text: Document text to search.
+        question: The user's question.
+
+    Returns:
+        A message quoting the most relevant sentences, a "couldn't find"
+        message when no sentence contains a keyword, or a prompt to supply
+        both inputs when either one is empty.
+    """
+    if not text or not question:
+        return "Please provide both text and a question."
+    # Extract keywords from question; \w+ drops punctuation glued to words
+    keywords = [word for word in re.findall(r'\w+', question.lower()) if len(word) > 3]
+    # Find relevant sentences
+    relevant_sentences = []
+    for sentence in re.split(r'[.!?]+', text):
+        sentence_lower = sentence.lower()
+        score = sum(1 for keyword in keywords if keyword in sentence_lower)
+        if score > 0:
+            relevant_sentences.append((sentence.strip(), score))
+    # Sort by relevance (stable, so ties stay in document order) and take top 3
+    relevant_sentences.sort(key=lambda x: x[1], reverse=True)
+    top_sentences = [sent[0] for sent in relevant_sentences[:3]]
+    if top_sentences:
+        return "Based on the document, here are the most relevant sections:\n\n" + "\n\n".join(top_sentences)
+    return "I couldn't find information related to your question in the document."
+# Module-level store for the text that questions are answered against; being
+# a module global, one value is shared by every caller in this process.
+extracted_text = ""
+def update_extracted_text(status, info, summary, full_text):
+    """
+    Remember `full_text` in the module-level `extracted_text` and pass all
+    four values through unchanged.
+
+    The event wiring in this file feeds this function from the
+    "Extracted Text" box, which process_pdf_file fills with text truncated to
+    5000 characters (plus table summaries), so that truncated display text is
+    what gets stored for question answering.
+    """
+    global extracted_text
+    passthrough = (status, info, summary, full_text)
+    extracted_text = passthrough[-1]
+    return passthrough
+def qa_interface(question):
+    """Answer `question` against the text currently held in `extracted_text`."""
+    # Read-only access to the module global, so no `global` statement is needed.
+    return answer_question(text=extracted_text, question=question)
+# Create Gradio interface: two tabs, one for uploading/processing a PDF and
+# one for asking questions about the text stored by the processing step.
+with gr.Blocks(title="PDF Processor & Q&A System") as app:
+    gr.Markdown("# 📄 PDF Processor & Question Answering System")
+    gr.Markdown("Upload a PDF file to extract text and ask questions about its content.")
+    # Tab 1: upload on the left, status and file details on the right, then
+    # the summary and extracted text underneath.
+    with gr.Tab("PDF Processing"):
+        with gr.Row():
+            with gr.Column():
+                file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+                process_btn = gr.Button("Process PDF", variant="primary")
+            with gr.Column():
+                status_output = gr.Textbox(label="Status", lines=2)
+                info_output = gr.Textbox(label="File Information", lines=4)
+        summary_output = gr.Textbox(label="Document Summary", lines=8)
+        text_output = gr.Textbox(label="Extracted Text", lines=15, max_lines=20)
+    # Tab 2: free-text question and the keyword-matched answer.
+    with gr.Tab("Question & Answer"):
+        gr.Markdown("Ask questions about the processed PDF content.")
+        with gr.Row():
+            question_input = gr.Textbox(label="Your Question", placeholder="What is this document about?")
+            ask_btn = gr.Button("Ask Question", variant="primary")
+        answer_output = gr.Textbox(label="Answer", lines=8)
+    # Event handlers
+    # Step 1 fills the four output boxes; step 2 then reads those same boxes
+    # back and stores the "Extracted Text" box content in the module-level
+    # `extracted_text`, which is what the Q&A tab searches.
+    process_btn.click(
+        fn=process_pdf_file,
+        inputs=[file_input],
+        outputs=[status_output, info_output, summary_output, text_output]
+    ).then(
+        fn=update_extracted_text,
+        inputs=[status_output, info_output, summary_output, text_output],
+        outputs=[status_output, info_output, summary_output, text_output]
+    )
+    ask_btn.click(
+        fn=qa_interface,
+        inputs=[question_input],
+        outputs=[answer_output]
+    )
+    # Example questions that can be loaded into the question box
+    gr.Examples(
+        examples=[
+            ["What is the main topic of this document?"],
+            ["What are the key findings?"],
+            ["Who are the authors?"],
+            ["What is the conclusion?"]
+        ],
+        inputs=[question_input]
+    )
 if __name__ == "__main__":
+    app.launch()