Upload 7 files
- .env +57 -0
- app.py +613 -0
- backend.py +368 -0
- ocr_service.py +531 -0
- readme.md +231 -0
- requirements.txt +40 -0
- test_setup.py +183 -0
.env
ADDED
@@ -0,0 +1,57 @@
# PDF OCR Service Environment Configuration

# ======================
# Azure Document Intelligence Configuration
# ======================
# Get these from Azure Portal > Document Intelligence resource
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://ocr-doc-ccib-service-01.cognitiveservices.azure.com/
AZURE_DOCUMENT_INTELLIGENCE_KEY=73200Rna0T57qmWbFu0qK9Z8h4eA1AFvyHAOhSoVXRAnSRXy0ZJtJQQJ99BFACqBBLyXJ3w3AAALACOGjs1l

# ======================
# Server Configuration
# ======================
# Gradio server settings
SERVER_HOST=127.0.0.1
SERVER_PORT=7860
SHARE_GRADIO=false

# ======================
# File Processing Limits
# ======================
# Maximum file size in MB
MAX_FILE_SIZE_MB=50

# Maximum number of history entries to keep
MAX_HISTORY_SIZE=100

# ======================
# Logging Configuration
# ======================
# Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
LOG_LEVEL=INFO

# ======================
# Instructions for Setup
# ======================
# 1. Copy this file and rename it to .env (remove .example)
# 2. Fill in your Azure Document Intelligence credentials above
# 3. Adjust other settings as needed for your environment
# 4. Never commit the .env file to version control
# 5. Add .env to your .gitignore file

# ======================
# How to get Azure credentials:
# ======================
# 1. Go to Azure Portal (portal.azure.com)
# 2. Create a new "Document Intelligence" resource
# 3. Choose a subscription, resource group, and region
# 4. Select pricing tier (F0 for free tier, S0 for standard)
# 5. After creation, go to "Keys and Endpoint" section
# 6. Copy the endpoint URL and one of the keys
# 7. Replace the values above with your actual credentials

# ======================
# Example values:
# ======================
# AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://myservice.cognitiveservices.azure.com/
# AZURE_DOCUMENT_INTELLIGENCE_KEY=1234567890abcdef1234567890abcdef
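
For reference, the application modules below read these settings at startup with python-dotenv. A minimal sketch of that pattern (assuming python-dotenv is installed and this .env file sits in the working directory; the variable names match the entries above):

import os
from dotenv import load_dotenv

load_dotenv()  # pulls the .env entries above into the process environment

endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')          # required for Azure OCR
key = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')                    # required for Azure OCR
server_host = os.getenv('SERVER_HOST', '127.0.0.1')                   # Gradio bind address
server_port = int(os.getenv('SERVER_PORT', 7860))                     # Gradio port, default 7860
share_gradio = os.getenv('SHARE_GRADIO', 'false').lower() == 'true'   # string flag -> bool
max_file_size_mb = int(os.getenv('MAX_FILE_SIZE_MB', 50))             # upload limit in MB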
app.py
ADDED
@@ -0,0 +1,613 @@
"""
Gradio UI for PDF OCR Service
User interface for PDF to text conversion with multiple OCR providers
"""

import gradio as gr
import os
import tempfile
import logging
from pathlib import Path
from datetime import datetime

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

from backend import BackendManager

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize backend manager
backend_manager = BackendManager()

# Check if python-docx is available
try:
    from docx import Document
    from docx.shared import Inches, Pt
    from docx.enum.table import WD_TABLE_ALIGNMENT
    HAS_DOCX_SUPPORT = True
    logger.info("DOCX export available")
except ImportError:
    HAS_DOCX_SUPPORT = False
    logger.info("DOCX export not available - install python-docx to enable")


def process_pdf_file(pdf_file, ocr_method, progress=gr.Progress()):
    """
    Process uploaded PDF file and return extracted text

    Args:
        pdf_file: Uploaded PDF file object
        ocr_method: Selected OCR method
        progress: Gradio progress tracker

    Returns:
        Tuple of (extracted_text, metadata_info, processing_status)
    """
    if pdf_file is None:
        return "No file uploaded.", "", "❌ Error: No file selected"

    temp_file_path = None
    try:
        progress(0.1, desc="Initializing...")

        # Handle Gradio file object - pdf_file.name contains the file path
        temp_file_path = pdf_file.name

        progress(0.3, desc="Processing PDF...")

        # Process the PDF
        result = backend_manager.process_pdf(temp_file_path, ocr_method)

        progress(0.9, desc="Finalizing...")
        progress(1.0, desc="Complete!")

        if result['success']:
            # Format metadata for display
            metadata_info = format_metadata(result['metadata'], result['method_used'])
            status = f"✅ Success: Processed using {result['method_used']}"
            return result['text'], metadata_info, status
        else:
            error_msg = result.get('error', 'Unknown error occurred')
            return f"Error: {error_msg}", "", f"❌ Processing failed: {error_msg}"

    except Exception as e:
        logger.error(f"UI processing error: {e}")
        return f"Error: {str(e)}", "", f"❌ Unexpected error: {str(e)}"


def format_metadata(metadata, method_used):
    """Format metadata for display"""
    if not metadata:
        return f"Method used: {method_used}"

    info_lines = [f"Method used: {method_used}"]

    if 'pages' in metadata:
        info_lines.append(f"Pages processed: {metadata['pages']}")

    if 'tables' in metadata:
        info_lines.append(f"Tables detected: {metadata['tables']}")

    if 'has_handwritten' in metadata:
        handwritten_status = "Yes" if metadata['has_handwritten'] else "No"
        info_lines.append(f"Handwritten content: {handwritten_status}")

    if 'processing_time_seconds' in metadata:
        info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")

    return "\n".join(info_lines)


def create_txt_file(text_content, metadata_info=""):
    """Create a TXT file from extracted text"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_file = tempfile.NamedTemporaryFile(
        suffix=f'_extracted_text_{timestamp}.txt',
        delete=False,
        mode='w',
        encoding='utf-8'
    )

    try:
        # Add header
        temp_file.write("PDF OCR Extraction Results\n")
        temp_file.write("=" * 50 + "\n\n")

        # Add metadata
        if metadata_info:
            temp_file.write("Processing Information:\n")
            temp_file.write("-" * 25 + "\n")
            temp_file.write(metadata_info + "\n\n")

        # Add timestamp
        temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        temp_file.write("=" * 50 + "\n\n")

        # Add main content
        temp_file.write("Extracted Text:\n")
        temp_file.write("-" * 15 + "\n\n")
        temp_file.write(text_content)

        temp_file.close()
        return temp_file.name

    except Exception as e:
        logger.error(f"Error creating TXT file: {e}")
        temp_file.close()
        raise


def create_docx_file(text_content, metadata_info=""):
    """Create a DOCX file with enhanced formatting and table preservation"""
    if not HAS_DOCX_SUPPORT:
        raise ImportError("python-docx not installed. Cannot create DOCX files.")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_file = tempfile.NamedTemporaryFile(
        suffix=f'_extracted_text_{timestamp}.docx',
        delete=False
    )
    temp_file.close()

    try:
        from docx import Document
        from docx.shared import Inches, Pt
        from docx.enum.text import WD_ALIGN_PARAGRAPH
        from docx.enum.table import WD_TABLE_ALIGNMENT

        # Create new Document with better styling
        doc = Document()

        # Set document margins
        sections = doc.sections
        for section in sections:
            section.top_margin = Inches(1)
            section.bottom_margin = Inches(1)
            section.left_margin = Inches(1)
            section.right_margin = Inches(1)

        # Add title
        title = doc.add_heading('PDF OCR Extraction Results', 0)
        title.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Add metadata section if available
        if metadata_info:
            doc.add_heading('Processing Information', level=1)
            metadata_para = doc.add_paragraph(metadata_info)
            metadata_para.style = 'Intense Quote'
            doc.add_page_break()

        # Enhanced content processing
        if '=== PAGE ' in text_content:
            # Process page-by-page with better formatting
            pages = text_content.split('=== PAGE ')

            for i, page_content in enumerate(pages):
                if i == 0 and not page_content.strip():
                    continue

                if i > 0:
                    # Add page header
                    page_num = page_content.split(' ===')[0] if ' ===' in page_content else str(i)
                    page_heading = doc.add_heading(f'Page {page_num}', level=1)
                    page_heading.alignment = WD_ALIGN_PARAGRAPH.LEFT

                    # Get content after page header
                    content = page_content.split('===\n', 1)[-1] if '===\n' in page_content else page_content
                else:
                    content = page_content

                # Process content with enhanced table handling
                _process_page_content_enhanced(doc, content)

        else:
            # Process as continuous content
            _process_page_content_enhanced(doc, text_content)

        # Add footer
        footer_section = doc.sections[0]
        footer = footer_section.footer
        footer_para = footer.paragraphs[0]
        footer_para.text = f"Generated by PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

        # Save document
        doc.save(temp_file.name)
        logger.info(f"Enhanced DOCX file created: {temp_file.name}")
        return temp_file.name

    except Exception as e:
        logger.error(f"Error creating enhanced DOCX file: {e}")
        try:
            os.unlink(temp_file.name)
        except:
            pass
        raise


def _process_page_content_enhanced(doc, content):
    """Process page content with enhanced table and formatting handling"""
    if not content.strip():
        return

    # Split content by tables and regular text
    sections = content.split('--- TABLE ')

    # Process first section (before any tables)
    if sections[0].strip():
        _add_formatted_text(doc, sections[0])

    # Process tables and subsequent content
    for i in range(1, len(sections)):
        table_section = sections[i]

        # Extract table content
        if '---' in table_section:
            parts = table_section.split('---', 1)
            table_header = f"TABLE {parts[0].strip()}"
            table_content = parts[1].strip() if len(parts) > 1 else ""

            # Add table header
            table_heading = doc.add_heading(table_header, level=3)

            # Process table content
            if table_content:
                table_lines = [line for line in table_content.split('\n') if line.strip()]
                if table_lines:
                    _create_formatted_table(doc, table_lines)

        # Add any remaining content after this table
        remaining_parts = table_section.split('---')[2:] if '---' in table_section else []
        if remaining_parts:
            remaining_text = '---'.join(remaining_parts)
            if remaining_text.strip():
                _add_formatted_text(doc, remaining_text)


def _add_formatted_text(doc, text):
    """Add formatted text with better paragraph handling"""
    if not text.strip():
        return

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]

    for para_text in paragraphs:
        if para_text.startswith('# '):
            # Main heading
            heading = doc.add_heading(para_text[2:], level=1)
        elif para_text.startswith('## '):
            # Sub heading
            heading = doc.add_heading(para_text[3:], level=2)
        elif para_text.startswith('### '):
            # Sub-sub heading
            heading = doc.add_heading(para_text[4:], level=3)
        else:
            # Regular paragraph
            para = doc.add_paragraph(para_text)
            # Add spacing for readability
            if len(para_text) < 80:
                para.paragraph_format.space_after = Pt(6)


def _create_formatted_table(doc, table_lines):
    """Create a properly formatted table from text lines"""
    if not table_lines:
        return

    # Parse table structure
    rows = []
    for line in table_lines:
        if '|' in line:
            # Split by | and clean up cells
            cells = [cell.strip() for cell in line.split('|')]
            # Remove empty cells at start/end
            while cells and not cells[0]:
                cells.pop(0)
            while cells and not cells[-1]:
                cells.pop()
            if cells:
                rows.append(cells)
        elif line.strip() and not line.startswith('-'):
            # Single column row or header
            rows.append([line.strip()])

    if not rows:
        # If no table structure found, add as preformatted text
        para = doc.add_paragraph(style='Normal')
        run = para.add_run('\n'.join(table_lines))
        run.font.name = 'Courier New'
        run.font.size = Pt(10)
        return

    # Determine number of columns
    max_cols = max(len(row) for row in rows) if rows else 1

    # Create table
    table = doc.add_table(rows=len(rows), cols=max_cols)
    table.style = 'Table Grid'
    table.alignment = WD_TABLE_ALIGNMENT.LEFT

    # Fill table cells
    for row_idx, row_data in enumerate(rows):
        table_row = table.rows[row_idx]

        for col_idx in range(max_cols):
            cell = table_row.cells[col_idx]

            if col_idx < len(row_data):
                cell.text = row_data[col_idx]
            else:
                cell.text = ""

            # Format header row differently
            if row_idx == 0:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        run.bold = True

            # Set font and alignment
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.size = Pt(10)
                    run.font.name = 'Calibri'

    # Add spacing after table
    doc.add_paragraph("")


def get_method_info(method):
    """Get information about selected OCR method"""
    method_descriptions = {
        "auto": "🤖 **Auto Selection**: Automatically chooses the best available method. Prefers Azure → Tesseract → PyMuPDF in order.",
        "azure": "☁️ **Azure Document Intelligence**: Advanced cloud-based OCR with excellent layout preservation, table detection, and handwriting recognition. Best quality but requires API credentials.",
        "tesseract": "🔍 **Tesseract OCR**: Open-source OCR engine with image preprocessing. Good for scanned documents and images. Works offline.",
        "pymupdf": "📄 **PyMuPDF**: Fast text extraction for PDFs with embedded text. Best for digital PDFs but limited OCR capabilities for scanned documents."
    }

    return method_descriptions.get(method, "Select a method to see details.")


def check_service_status():
    """Check and display service status"""
    available_methods = backend_manager.get_available_methods()

    status_lines = ["**Available OCR Methods:**"]

    if "azure" in available_methods:
        status_lines.append("✅ Azure Document Intelligence - Ready")
    else:
        status_lines.append("❌ Azure Document Intelligence - Not configured")

    if "tesseract" in available_methods:
        status_lines.append("✅ Tesseract OCR - Ready")
    else:
        status_lines.append("❌ Tesseract OCR - Not available")

    if "pymupdf" in available_methods:
        status_lines.append("✅ PyMuPDF - Ready")
    else:
        status_lines.append("❌ PyMuPDF - Not available")

    # Add DOCX support status
    if HAS_DOCX_SUPPORT:
        status_lines.append("✅ DOCX Export - Available")
    else:
        status_lines.append("❌ DOCX Export - Install python-docx to enable")

    return "\n".join(status_lines)


def process_and_prepare_downloads(pdf_file, method):
    """Process PDF and prepare both TXT and DOCX downloads if successful"""
    text, metadata, status = process_pdf_file(pdf_file, method)

    # Prepare downloads if processing was successful
    if text and not text.startswith("Error:") and not text.startswith("No file"):
        try:
            # Create TXT file
            txt_path = create_txt_file(text, metadata)

            # Create DOCX file if support is available
            if HAS_DOCX_SUPPORT:
                try:
                    docx_path = create_docx_file(text, metadata)
                    return (text, metadata, status,
                            gr.update(visible=True, value=txt_path),
                            gr.update(visible=True, value=docx_path))
                except Exception as docx_error:
                    logger.warning(f"DOCX creation failed: {docx_error}")
                    return (text, metadata, status,
                            gr.update(visible=True, value=txt_path),
                            gr.update(visible=False))
            else:
                return (text, metadata, status,
                        gr.update(visible=True, value=txt_path),
                        gr.update(visible=False))

        except Exception as file_error:
            logger.error(f"File creation error: {file_error}")
            return (text, metadata, status,
                    gr.update(visible=False),
                    gr.update(visible=False))
    else:
        return (text, metadata, status,
                gr.update(visible=False),
                gr.update(visible=False))


def create_interface():
    """Create and configure the Gradio interface"""

    with gr.Blocks(
        title="PDF OCR Service",
        theme=gr.themes.Soft(),
        css="""
        .main-header { text-align: center; margin-bottom: 2rem; }
        .method-info { background-color: #f8f9fa; padding: 1rem; border-radius: 0.5rem; margin: 1rem 0; }
        .status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
        """
    ) as interface:

        gr.HTML("""
        <div class="main-header">
            <h1>📄 PDF OCR Service</h1>
            <p>Convert PDF documents to text using advanced OCR technologies</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>📁 Upload & Configure</h3>")

                # File upload
                pdf_input = gr.File(
                    label="Upload PDF File",
                    file_types=[".pdf"],
                    file_count="single"
                )

                # OCR method selection
                method_choice = gr.Dropdown(
                    choices=["auto", "azure", "tesseract", "pymupdf"],
                    value="auto",
                    label="OCR Method",
                    info="Choose OCR method or use auto-selection"
                )

                # Method information display
                method_info = gr.Markdown(
                    value=get_method_info("auto"),
                    elem_classes=["method-info"]
                )

                # Process button
                process_btn = gr.Button(
                    "🚀 Process PDF",
                    variant="primary",
                    size="lg"
                )

                # Service status
                gr.HTML("<h4>🔧 Service Status</h4>")
                service_status = gr.Markdown(
                    value=check_service_status(),
                    elem_classes=["status-box"]
                )

                # Refresh status button
                refresh_btn = gr.Button("🔄 Refresh Status", size="sm")

            with gr.Column(scale=2):
                gr.HTML("<h3>📝 Results</h3>")

                # Processing status
                processing_status = gr.Textbox(
                    label="Processing Status",
                    interactive=False,
                    lines=1
                )

                # Extracted text output
                text_output = gr.Textbox(
                    label="Extracted Text",
                    placeholder="Processed text will appear here...",
                    lines=20,
                    max_lines=30,
                    interactive=False,
                    show_copy_button=True
                )

                # Metadata information
                metadata_output = gr.Textbox(
                    label="Processing Information",
                    interactive=False,
                    lines=4
                )

                # Download buttons
                with gr.Row():
                    download_txt_btn = gr.DownloadButton(
                        "📄 Download TXT",
                        visible=False,
                        variant="secondary"
                    )
                    download_docx_btn = gr.DownloadButton(
                        "📝 Download DOCX",
                        visible=False,
                        variant="secondary"
                    )

        # Add tips section
        gr.HTML("<h3>💡 Tips & Features</h3>")

        # Create tips content based on available features
        download_info = "Get results as formatted TXT files"
        if HAS_DOCX_SUPPORT:
            download_info += " and structured DOCX files"
        else:
            download_info += " (install python-docx for DOCX export)"

        tips_html = f"""
        <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem; margin: 1rem 0;">
            <ul>
                <li><strong>Auto method</strong> is recommended for most users - intelligently selects the best OCR method</li>
                <li><strong>Azure Document Intelligence</strong> provides the best quality for complex documents with tables and formatting</li>
                <li><strong>Tesseract</strong> works well for scanned documents and images with preprocessing</li>
                <li><strong>PyMuPDF</strong> is fastest for digital PDFs with embedded text</li>
                <li><strong>Download Options:</strong> {download_info}</li>
                <li><strong>Format Preservation:</strong> Original spacing and layout are maintained where possible</li>
                <li>Larger files may take longer to process - the progress bar shows current status</li>
                <li>Supported file types: PDF documents (up to 50MB by default)</li>
            </ul>
        </div>
        """

        gr.HTML(tips_html)

        # Event handlers
        method_choice.change(
            fn=get_method_info,
            inputs=[method_choice],
            outputs=[method_info]
        )

        refresh_btn.click(
            fn=check_service_status,
            outputs=[service_status]
        )

        process_btn.click(
            fn=process_and_prepare_downloads,
            inputs=[pdf_input, method_choice],
            outputs=[text_output, metadata_output, processing_status, download_txt_btn, download_docx_btn]
        )

    return interface


def launch_ui():
    """Launch the Gradio interface"""
    interface = create_interface()

    # Get configuration from environment
    server_port = int(os.getenv('SERVER_PORT', 7860))
    server_host = os.getenv('SERVER_HOST', '127.0.0.1')
    share_gradio = os.getenv('SHARE_GRADIO', 'false').lower() == 'true'

    logger.info(f"Starting Gradio UI on {server_host}:{server_port}")

    interface.launch(
        server_name=server_host,
        server_port=server_port,
        share=share_gradio,
        inbrowser=True,
        show_error=True
    )


if __name__ == "__main__":
    launch_ui()
backend.py
ADDED
@@ -0,0 +1,368 @@
"""
Backend Management Module
Coordinates between UI and OCR services, handles file management and processing logic
"""

import os
import logging
import tempfile
from typing import Dict, Any, List
from pathlib import Path
import hashlib
import json
from datetime import datetime

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

from ocr_service import OCRService

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class BackendManager:
    """Backend manager for PDF OCR processing"""

    def __init__(self):
        self.ocr_service = OCRService()
        self.processing_history = []
        self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100))

        # Create directories for temporary files and logs
        self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
        self.temp_dir.mkdir(exist_ok=True)

        logger.info("Backend manager initialized successfully")

    def process_pdf(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
        """
        Process PDF file and return results

        Args:
            pdf_path: Path to the PDF file
            method: OCR method to use

        Returns:
            Dict containing processing results
        """
        start_time = datetime.now()

        # Validate input
        if not os.path.exists(pdf_path):
            return {
                'success': False,
                'error': f"File not found: {pdf_path}",
                'text': '',
                'method_used': '',
                'metadata': {}
            }

        # Check file size (limit to 50MB by default)
        max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
        file_size = os.path.getsize(pdf_path)

        if file_size > max_file_size:
            return {
                'success': False,
                'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB",
                'text': '',
                'method_used': '',
                'metadata': {}
            }

        # Generate file hash for caching/tracking
        file_hash = self._calculate_file_hash(pdf_path)

        logger.info(f"Processing PDF: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
        logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")

        try:
            # Process the PDF
            result = self.ocr_service.convert_pdf_to_text(pdf_path, method)

            # Add processing metadata
            processing_time = (datetime.now() - start_time).total_seconds()

            result['metadata'].update({
                'file_hash': file_hash,
                'file_size_mb': round(file_size / (1024*1024), 2),
                'processing_time_seconds': round(processing_time, 2),
                'timestamp': start_time.isoformat()
            })

            # Log results
            if result['success']:
                text_length = len(result['text'])
                logger.info(f"Processing completed successfully in {processing_time:.2f}s")
                logger.info(f"Method used: {result['method_used']}")
                logger.info(f"Text extracted: {text_length} characters")

                # Add to processing history
                self._add_to_history({
                    'timestamp': start_time.isoformat(),
                    'file_hash': file_hash,
                    'method_used': result['method_used'],
                    'success': True,
                    'text_length': text_length,
                    'processing_time': processing_time
                })
            else:
                logger.error(f"Processing failed: {result.get('error', 'Unknown error')}")

                # Add to processing history
                self._add_to_history({
                    'timestamp': start_time.isoformat(),
                    'file_hash': file_hash,
                    'method_requested': method,
                    'success': False,
                    'error': result.get('error', 'Unknown error'),
                    'processing_time': processing_time
                })

            return result

        except Exception as e:
            logger.error(f"Unexpected error during processing: {e}")

            # Add to processing history
            processing_time = (datetime.now() - start_time).total_seconds()
            self._add_to_history({
                'timestamp': start_time.isoformat(),
                'file_hash': file_hash,
                'method_requested': method,
                'success': False,
                'error': str(e),
                'processing_time': processing_time
            })

            return {
                'success': False,
                'error': f"Processing error: {str(e)}",
                'text': '',
                'method_used': '',
                'metadata': {
                    'file_hash': file_hash,
                    'processing_time_seconds': round(processing_time, 2),
                    'timestamp': start_time.isoformat()
                }
            }

    def get_available_methods(self) -> List[str]:
        """Get list of available OCR methods"""
        methods = self.ocr_service.get_available_methods()
        logger.info(f"Available OCR methods: {methods}")
        return methods

    def get_service_status(self) -> Dict[str, Any]:
        """Get comprehensive service status"""
        available_methods = self.get_available_methods()

        status = {
            'service_healthy': True,
            'available_methods': available_methods,
            'azure_configured': 'azure' in available_methods,
            'tesseract_available': 'tesseract' in available_methods,
            'pymupdf_available': 'pymupdf' in available_methods,
            'total_processed': len(self.processing_history),
            'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
            'temp_dir': str(self.temp_dir),
            'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50))
        }

        return status

    def get_processing_history(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get recent processing history"""
        return self.processing_history[-limit:]

    def cleanup_temp_files(self):
        """Clean up temporary files"""
        try:
            temp_files = list(self.temp_dir.glob('*'))
            cleaned_count = 0

            for temp_file in temp_files:
                try:
                    # Remove files older than 1 hour
                    if temp_file.is_file() and temp_file.stat().st_mtime < (datetime.now().timestamp() - 3600):
                        temp_file.unlink()
                        cleaned_count += 1
                except Exception as e:
                    logger.warning(f"Could not remove temp file {temp_file}: {e}")

            if cleaned_count > 0:
                logger.info(f"Cleaned up {cleaned_count} temporary files")

        except Exception as e:
            logger.error(f"Error during cleanup: {e}")

    def validate_pdf_file(self, file_path: str) -> Dict[str, Any]:
        """
        Validate PDF file before processing

        Args:
            file_path: Path to the PDF file

        Returns:
            Dict with validation results
        """
        validation_result = {
            'valid': False,
            'error': None,
            'warnings': [],
            'file_info': {}
        }

        try:
            # Check if file exists
            if not os.path.exists(file_path):
                validation_result['error'] = "File does not exist"
                return validation_result

            # Check file extension
            if not file_path.lower().endswith('.pdf'):
                validation_result['warnings'].append("File does not have .pdf extension")

            # Check file size
            file_size = os.path.getsize(file_path)
            max_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024

            if file_size > max_size:
                validation_result['error'] = f"File too large ({file_size/(1024*1024):.1f}MB > {max_size/(1024*1024)}MB)"
                return validation_result

            if file_size == 0:
                validation_result['error'] = "File is empty"
                return validation_result

            # Try to open with PyMuPDF to check if it's a valid PDF
            try:
                import fitz
                doc = fitz.open(file_path)
                page_count = len(doc)
                doc.close()

                if page_count == 0:
                    validation_result['warnings'].append("PDF contains no pages")

                validation_result['file_info'] = {
                    'size_mb': round(file_size / (1024*1024), 2),
                    'pages': page_count
                }

            except Exception as pdf_error:
                validation_result['error'] = f"Invalid PDF file: {str(pdf_error)}"
                return validation_result

            validation_result['valid'] = True

        except Exception as e:
            validation_result['error'] = f"Validation error: {str(e)}"

        return validation_result

    def _calculate_file_hash(self, file_path: str) -> str:
        """Calculate SHA-256 hash of file"""
        sha256_hash = hashlib.sha256()

        try:
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    sha256_hash.update(chunk)
            return sha256_hash.hexdigest()
        except Exception as e:
            logger.error(f"Error calculating file hash: {e}")
            return f"error_{datetime.now().timestamp()}"

    def _add_to_history(self, entry: Dict[str, Any]):
        """Add entry to processing history"""
        self.processing_history.append(entry)

        # Limit history size
        if len(self.processing_history) > self.max_history_size:
            self.processing_history = self.processing_history[-self.max_history_size:]

    def export_processing_history(self, file_path: str = None) -> str:
        """Export processing history to JSON file"""
        if file_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            file_path = self.temp_dir / f"processing_history_{timestamp}.json"

        try:
            history_data = {
                'exported_at': datetime.now().isoformat(),
                'total_entries': len(self.processing_history),
                'service_status': self.get_service_status(),
                'history': self.processing_history
            }

            with open(file_path, 'w') as f:
                json.dump(history_data, f, indent=2)

            logger.info(f"Processing history exported to: {file_path}")
            return str(file_path)

        except Exception as e:
            logger.error(f"Error exporting history: {e}")
            raise

    def get_statistics(self) -> Dict[str, Any]:
        """Get processing statistics"""
        if not self.processing_history:
            return {
                'total_processed': 0,
                'success_rate': 0,
                'average_processing_time': 0,
                'most_used_method': 'N/A',
                'total_text_extracted': 0
            }

        total_processed = len(self.processing_history)
        successful = [h for h in self.processing_history if h.get('success', False)]
        success_rate = (len(successful) / total_processed) * 100 if total_processed > 0 else 0

        # Calculate average processing time
        processing_times = [h.get('processing_time', 0) for h in self.processing_history if 'processing_time' in h]
        avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0

        # Find most used method
        methods = [h.get('method_used', 'unknown') for h in successful]
        most_used_method = max(set(methods), key=methods.count) if methods else 'N/A'

        # Calculate total text extracted
        total_text = sum(h.get('text_length', 0) for h in successful)

        return {
            'total_processed': total_processed,
            'success_rate': round(success_rate, 2),
            'average_processing_time': round(avg_processing_time, 2),
            'most_used_method': most_used_method,
            'total_text_extracted': total_text,
            'successful_processes': len(successful),
            'failed_processes': total_processed - len(successful)
        }


# Initialize global backend manager instance
_backend_manager = None

def get_backend_manager() -> BackendManager:
    """Get global backend manager instance"""
    global _backend_manager
    if _backend_manager is None:
        _backend_manager = BackendManager()
    return _backend_manager


if __name__ == "__main__":
    # Test the backend manager
    manager = BackendManager()

    print("Backend Manager Test")
    print("===================")
    print(f"Available methods: {manager.get_available_methods()}")
    print(f"Service status: {manager.get_service_status()}")
    print(f"Statistics: {manager.get_statistics()}")
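
A minimal usage sketch for BackendManager (the sample.pdf path is a placeholder for illustration, not a file included in this upload):

from backend import BackendManager

manager = BackendManager()
result = manager.process_pdf("sample.pdf", method="auto")  # returns the dict documented in process_pdf()

if result['success']:
    print(result['method_used'])               # e.g. azure_document_intelligence
    print(result['metadata'].get('pages'))     # page count, when the OCR backend reports it
    print(result['text'][:200])                # start of the extracted text
else:
    print("Processing failed:", result['error'])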
ocr_service.py
ADDED
@@ -0,0 +1,531 @@
"""
OCR Service Module - FIXED VERSION
Handles PDF to text conversion using Azure Document Intelligence with fallback methods
"""

import os
import logging
from typing import Optional, Dict, Any, Tuple
import tempfile
from pathlib import Path

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Azure Document Intelligence
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.exceptions import AzureError

# Fallback OCR libraries
try:
    import pytesseract
    from PIL import Image
    import cv2
    import numpy as np
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False

import fitz  # PyMuPDF

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class OCRService:
    """Main OCR service with multiple providers and fallback mechanisms"""

    def __init__(self):
        self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
        self.azure_key = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')

        # Initialize Azure client if credentials are available
        self.azure_client = None
        if self.azure_endpoint and self.azure_key:
            try:
                self.azure_client = DocumentIntelligenceClient(
                    endpoint=self.azure_endpoint,
                    credential=AzureKeyCredential(self.azure_key)
                )
                logger.info("Azure Document Intelligence client initialized successfully")
            except Exception as e:
                logger.error(f"Failed to initialize Azure client: {e}")
        else:
            logger.warning("Azure credentials not found. Azure OCR will be unavailable.")

    def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
        """
        Convert PDF to text using specified method

        Args:
            pdf_path: Path to the PDF file
            method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')

        Returns:
            Dict containing text content, metadata, and processing info
        """
        result = {
            'success': False,
            'text': '',
            'method_used': '',
            'metadata': {},
            'error': None
        }

        if not os.path.exists(pdf_path):
            result['error'] = f"PDF file not found: {pdf_path}"
            return result

        # Auto method selection
        if method == "auto":
            if self.azure_client:
                method = "azure"
            elif self._check_tesseract_available():
                method = "tesseract"
            else:
                method = "pymupdf"

        # Try primary method
        try:
            if method == "azure" and self.azure_client:
                result = self._azure_ocr(pdf_path)
            elif method == "tesseract":
                result = self._tesseract_ocr(pdf_path)
            elif method == "pymupdf":
                result = self._pymupdf_extract(pdf_path)
            else:
                result['error'] = f"Method '{method}' not available or not configured"

        except Exception as e:
            logger.error(f"Primary method '{method}' failed: {e}")
            result['error'] = str(e)

        # Fallback mechanism
        if not result['success']:
            logger.info("Primary method failed, trying fallback methods...")
            result = self._try_fallback_methods(pdf_path, exclude_method=method)

        return result

    def _azure_ocr(self, pdf_path: str) -> Dict[str, Any]:
        """Azure Document Intelligence OCR with enhanced layout preservation"""
        result = {
            'success': False,
            'text': '',
            'method_used': 'azure_document_intelligence',
            'metadata': {},
            'error': None
        }

        try:
            with open(pdf_path, 'rb') as pdf_file:
                file_content = pdf_file.read()

                # Try different API call patterns for different SDK versions
                try:
                    # Pattern 1: body + content_type (most common for current SDK)
                    poller = self.azure_client.begin_analyze_document(
                        "prebuilt-layout",
                        body=file_content,
                        content_type="application/pdf"
                    )
                except TypeError:
                    try:
                        # Pattern 2: model_id + body
                        poller = self.azure_client.begin_analyze_document(
                            model_id="prebuilt-layout",
                            body=file_content
                        )
                    except TypeError:
                        # Pattern 3: document parameter (older SDK)
                        pdf_file.seek(0)
                        poller = self.azure_client.begin_analyze_document(
                            "prebuilt-layout",
                            document=pdf_file
                        )

            analysis_result = poller.result()

            # Enhanced format preservation with better structure
            formatted_text = self._format_azure_result_enhanced(analysis_result)

            result.update({
                'success': True,
                'text': formatted_text,
                'metadata': {
                    'pages': len(analysis_result.pages) if analysis_result.pages else 0,
                    'tables': len(analysis_result.tables) if analysis_result.tables else 0,
                    'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
                    'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
                    'azure_analysis': analysis_result  # Pass full result for DOCX formatting
                }
            })

            logger.info("Azure OCR completed successfully with enhanced formatting")

        except Exception as e:
            logger.error(f"Azure OCR error: {e}")
            result['error'] = f"Azure OCR error: {e}"

        return result

    def _format_azure_result_enhanced(self, analysis_result) -> str:
        """Enhanced formatting that preserves more layout structure"""
        formatted_parts = []

        if not analysis_result.pages:
            return ""

        for page_num, page in enumerate(analysis_result.pages, 1):
            formatted_parts.append(f"\n=== PAGE {page_num} ===\n")

            # Collect all content with positions for better ordering
            content_items = []

            # Add paragraphs if available (better than individual lines)
            if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
                page_paragraphs = [p for p in analysis_result.paragraphs if
                                   p.bounding_regions and
                                   p.bounding_regions[0].page_number == page_num]

                for para in page_paragraphs:
                    if para.content.strip():
                        y_pos = para.bounding_regions[0].polygon[1] if para.bounding_regions[0].polygon else 0
                        content_items.append({
                            'type': 'paragraph',
                            'content': para.content.strip(),
                            'y_pos': y_pos,
                            'role': getattr(para, 'role', 'paragraph')
                        })

            # Add lines if paragraphs not available
            elif page.lines:
                for line in page.lines:
                    if line.content.strip():
                        y_pos = line.polygon[1] if line.polygon else 0
                        content_items.append({
                            'type': 'line',
                            'content': line.content.strip(),
                            'y_pos': y_pos,
                            'role': 'text'
                        })

            # Sort content by vertical position (top to bottom)
            content_items.sort(key=lambda x: x['y_pos'])

            # Add formatted content
            for item in content_items:
                if item['role'] == 'title':
                    formatted_parts.append(f"\n# {item['content']}\n")
                elif item['role'] == 'sectionHeading':
                    formatted_parts.append(f"\n## {item['content']}\n")
                else:
                    formatted_parts.append(item['content'])
                    formatted_parts.append("")  # Add line break

            # Add tables for this page
            if analysis_result.tables:
                page_tables = [t for t in analysis_result.tables if any(
                    cell.bounding_regions and
                    cell.bounding_regions[0].page_number == page_num
                    for cell in t.cells
                )]

                for table_idx, table in enumerate(page_tables):
                    formatted_parts.append(f"\n--- TABLE {table_idx + 1} ---")
                    table_text = self._format_table_enhanced(table)
                    formatted_parts.append(table_text)
                    formatted_parts.append("")

        return '\n'.join(formatted_parts)

    def _format_table_enhanced(self, table) -> str:
        """Enhanced table formatting with better structure"""
        if not table.cells:
            return ""

        # Create matrix
        max_row = max(cell.row_index for cell in table.cells) + 1
        max_col = max(cell.column_index for cell in table.cells) + 1

        table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]

        # Fill matrix with cell content
        for cell in table.cells:
            content = (cell.content or "").strip()
            table_matrix[cell.row_index][cell.column_index] = content

        # Calculate column widths
        col_widths = [0] * max_col
        for row in table_matrix:
            for col_idx, cell in enumerate(row):
                col_widths[col_idx] = max(col_widths[col_idx], len(cell))
|
| 266 |
+
|
| 267 |
+
# Format as aligned table
|
| 268 |
+
formatted_rows = []
|
| 269 |
+
for row_idx, row in enumerate(table_matrix):
|
| 270 |
+
formatted_cells = []
|
| 271 |
+
for col_idx, cell in enumerate(row):
|
| 272 |
+
width = max(col_widths[col_idx], 3) # Minimum width
|
| 273 |
+
formatted_cells.append(cell.ljust(width))
|
| 274 |
+
|
| 275 |
+
formatted_row = " | ".join(formatted_cells)
|
| 276 |
+
formatted_rows.append(formatted_row)
|
| 277 |
+
|
| 278 |
+
# Add separator after header row
|
| 279 |
+
if row_idx == 0 and max_row > 1:
|
| 280 |
+
separator = " | ".join(["-" * max(col_widths[i], 3) for i in range(max_col)])
|
| 281 |
+
formatted_rows.append(separator)
|
| 282 |
+
|
| 283 |
+
return "\n".join(formatted_rows)
|
| 284 |
+
|
| 285 |
+
def _format_azure_result(self, analysis_result) -> str:
|
| 286 |
+
"""Format Azure Document Intelligence result preserving layout"""
|
| 287 |
+
formatted_text = []
|
| 288 |
+
|
| 289 |
+
if analysis_result.pages:
|
| 290 |
+
for page_num, page in enumerate(analysis_result.pages, 1):
|
| 291 |
+
formatted_text.append(f"\n--- Page {page_num} ---\n")
|
| 292 |
+
|
| 293 |
+
# Sort lines by vertical position for better reading order
|
| 294 |
+
if page.lines:
|
| 295 |
+
sorted_lines = sorted(page.lines, key=lambda line: (
|
| 296 |
+
line.polygon[1] if line.polygon else 0, # Y coordinate
|
| 297 |
+
line.polygon[0] if line.polygon else 0 # X coordinate
|
| 298 |
+
))
|
| 299 |
+
|
| 300 |
+
for line in sorted_lines:
|
| 301 |
+
formatted_text.append(line.content)
|
| 302 |
+
|
| 303 |
+
# Add tables if present
|
| 304 |
+
if analysis_result.tables:
|
| 305 |
+
page_tables = [t for t in analysis_result.tables if any(
|
| 306 |
+
cell.bounding_regions and
|
| 307 |
+
cell.bounding_regions[0].page_number == page_num
|
| 308 |
+
for cell in t.cells
|
| 309 |
+
)]
|
| 310 |
+
|
| 311 |
+
for table_idx, table in enumerate(page_tables):
|
| 312 |
+
formatted_text.append(f"\n--- Table {table_idx + 1} ---")
|
| 313 |
+
formatted_text.append(self._format_table(table))
|
| 314 |
+
|
| 315 |
+
return '\n'.join(formatted_text)
|
| 316 |
+
|
| 317 |
+
def _format_table(self, table) -> str:
|
| 318 |
+
"""Format table from Azure Document Intelligence"""
|
| 319 |
+
if not table.cells:
|
| 320 |
+
return ""
|
| 321 |
+
|
| 322 |
+
# Create matrix
|
| 323 |
+
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 324 |
+
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 325 |
+
|
| 326 |
+
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 327 |
+
|
| 328 |
+
for cell in table.cells:
|
| 329 |
+
table_matrix[cell.row_index][cell.column_index] = cell.content or ""
|
| 330 |
+
|
| 331 |
+
# Format as text table
|
| 332 |
+
formatted_rows = []
|
| 333 |
+
for row in table_matrix:
|
| 334 |
+
formatted_rows.append(" | ".join(row))
|
| 335 |
+
|
| 336 |
+
return "\n".join(formatted_rows)
|
| 337 |
+
|
| 338 |
+
def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
|
| 339 |
+
"""Tesseract OCR with image preprocessing - FIXED VERSION"""
|
| 340 |
+
result = {
|
| 341 |
+
'success': False,
|
| 342 |
+
'text': '',
|
| 343 |
+
'method_used': 'tesseract',
|
| 344 |
+
'metadata': {},
|
| 345 |
+
'error': None
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
if not TESSERACT_AVAILABLE:
|
| 349 |
+
result['error'] = "Tesseract not available"
|
| 350 |
+
return result
|
| 351 |
+
|
| 352 |
+
pdf_document = None
|
| 353 |
+
try:
|
| 354 |
+
# Convert PDF to images
|
| 355 |
+
pdf_document = fitz.open(pdf_path)
|
| 356 |
+
page_count = len(pdf_document) # Get count before processing
|
| 357 |
+
all_text = []
|
| 358 |
+
|
| 359 |
+
for page_num in range(page_count):
|
| 360 |
+
page = pdf_document.load_page(page_num)
|
| 361 |
+
|
| 362 |
+
# Render page to image
|
| 363 |
+
mat = fitz.Matrix(2.0, 2.0) # High resolution
|
| 364 |
+
pix = page.get_pixmap(matrix=mat)
|
| 365 |
+
img_data = pix.tobytes("png")
|
| 366 |
+
|
| 367 |
+
# Convert to PIL Image
|
| 368 |
+
temp_img_path = None
|
| 369 |
+
try:
|
| 370 |
+
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
|
| 371 |
+
temp_img.write(img_data)
|
| 372 |
+
temp_img_path = temp_img.name
|
| 373 |
+
|
| 374 |
+
# Preprocess image for better OCR
|
| 375 |
+
processed_img = self._preprocess_image(temp_img_path)
|
| 376 |
+
|
| 377 |
+
# OCR with custom config
|
| 378 |
+
custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
|
| 379 |
+
text = pytesseract.image_to_string(processed_img, config=custom_config, lang='eng')
|
| 380 |
+
|
| 381 |
+
all_text.append(f"\n--- Page {page_num + 1} ---\n")
|
| 382 |
+
all_text.append(text)
|
| 383 |
+
|
| 384 |
+
finally:
|
| 385 |
+
# Clean up temp image file
|
| 386 |
+
if temp_img_path and os.path.exists(temp_img_path):
|
| 387 |
+
try:
|
| 388 |
+
os.unlink(temp_img_path)
|
| 389 |
+
except:
|
| 390 |
+
pass
|
| 391 |
+
|
| 392 |
+
result.update({
|
| 393 |
+
'success': True,
|
| 394 |
+
'text': '\n'.join(all_text),
|
| 395 |
+
'metadata': {'pages': page_count}
|
| 396 |
+
})
|
| 397 |
+
|
| 398 |
+
logger.info("Tesseract OCR completed successfully")
|
| 399 |
+
|
| 400 |
+
except Exception as e:
|
| 401 |
+
logger.error(f"Tesseract OCR error: {e}")
|
| 402 |
+
result['error'] = f"Tesseract OCR error: {e}"
|
| 403 |
+
finally:
|
| 404 |
+
# FIXED: Ensure document is properly closed
|
| 405 |
+
if pdf_document is not None:
|
| 406 |
+
try:
|
| 407 |
+
pdf_document.close()
|
| 408 |
+
except:
|
| 409 |
+
pass
|
| 410 |
+
|
| 411 |
+
return result
|
| 412 |
+
|
| 413 |
+
def _preprocess_image(self, image_path: str) -> np.ndarray:
|
| 414 |
+
"""Preprocess image for better OCR accuracy"""
|
| 415 |
+
# Read image
|
| 416 |
+
img = cv2.imread(image_path)
|
| 417 |
+
|
| 418 |
+
# Convert to grayscale
|
| 419 |
+
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
| 420 |
+
|
| 421 |
+
# Noise removal
|
| 422 |
+
denoised = cv2.medianBlur(gray, 3)
|
| 423 |
+
|
| 424 |
+
# Threshold to get binary image
|
| 425 |
+
_, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
| 426 |
+
|
| 427 |
+
return binary
|
| 428 |
+
|
| 429 |
+
def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
|
| 430 |
+
"""PyMuPDF text extraction - FIXED VERSION"""
|
| 431 |
+
result = {
|
| 432 |
+
'success': False,
|
| 433 |
+
'text': '',
|
| 434 |
+
'method_used': 'pymupdf',
|
| 435 |
+
'metadata': {},
|
| 436 |
+
'error': None
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
pdf_document = None
|
| 440 |
+
try:
|
| 441 |
+
pdf_document = fitz.open(pdf_path)
|
| 442 |
+
page_count = len(pdf_document) # FIXED: Get count first before processing
|
| 443 |
+
all_text = []
|
| 444 |
+
|
| 445 |
+
for page_num in range(page_count):
|
| 446 |
+
page = pdf_document.load_page(page_num)
|
| 447 |
+
text = page.get_text()
|
| 448 |
+
|
| 449 |
+
all_text.append(f"\n--- Page {page_num + 1} ---\n")
|
| 450 |
+
all_text.append(text)
|
| 451 |
+
|
| 452 |
+
result.update({
|
| 453 |
+
'success': True,
|
| 454 |
+
'text': '\n'.join(all_text),
|
| 455 |
+
'metadata': {'pages': page_count} # FIXED: Use stored count
|
| 456 |
+
})
|
| 457 |
+
|
| 458 |
+
logger.info("PyMuPDF extraction completed successfully")
|
| 459 |
+
|
| 460 |
+
except Exception as e:
|
| 461 |
+
logger.error(f"PyMuPDF error: {e}")
|
| 462 |
+
result['error'] = f"PyMuPDF error: {e}"
|
| 463 |
+
finally:
|
| 464 |
+
# FIXED: Ensure document is properly closed
|
| 465 |
+
if pdf_document is not None:
|
| 466 |
+
try:
|
| 467 |
+
pdf_document.close()
|
| 468 |
+
except:
|
| 469 |
+
pass
|
| 470 |
+
|
| 471 |
+
return result
|
| 472 |
+
|
| 473 |
+
def _try_fallback_methods(self, pdf_path: str, exclude_method: str = None) -> Dict[str, Any]:
|
| 474 |
+
"""Try fallback OCR methods"""
|
| 475 |
+
fallback_methods = []
|
| 476 |
+
|
| 477 |
+
# Order of fallback preference
|
| 478 |
+
if exclude_method != "azure" and self.azure_client:
|
| 479 |
+
fallback_methods.append("azure")
|
| 480 |
+
if exclude_method != "tesseract" and self._check_tesseract_available():
|
| 481 |
+
fallback_methods.append("tesseract")
|
| 482 |
+
if exclude_method != "pymupdf":
|
| 483 |
+
fallback_methods.append("pymupdf")
|
| 484 |
+
|
| 485 |
+
for method in fallback_methods:
|
| 486 |
+
logger.info(f"Trying fallback method: {method}")
|
| 487 |
+
try:
|
| 488 |
+
if method == "azure":
|
| 489 |
+
result = self._azure_ocr(pdf_path)
|
| 490 |
+
elif method == "tesseract":
|
| 491 |
+
result = self._tesseract_ocr(pdf_path)
|
| 492 |
+
elif method == "pymupdf":
|
| 493 |
+
result = self._pymupdf_extract(pdf_path)
|
| 494 |
+
|
| 495 |
+
if result['success']:
|
| 496 |
+
result['method_used'] += '_fallback'
|
| 497 |
+
return result
|
| 498 |
+
|
| 499 |
+
except Exception as e:
|
| 500 |
+
logger.error(f"Fallback method {method} failed: {e}")
|
| 501 |
+
continue
|
| 502 |
+
|
| 503 |
+
return {
|
| 504 |
+
'success': False,
|
| 505 |
+
'text': '',
|
| 506 |
+
'method_used': 'all_methods_failed',
|
| 507 |
+
'metadata': {},
|
| 508 |
+
'error': 'All OCR methods failed'
|
| 509 |
+
}
|
| 510 |
+
|
| 511 |
+
def _check_tesseract_available(self) -> bool:
|
| 512 |
+
"""Check if Tesseract is available"""
|
| 513 |
+
if not TESSERACT_AVAILABLE:
|
| 514 |
+
return False
|
| 515 |
+
try:
|
| 516 |
+
pytesseract.get_tesseract_version()
|
| 517 |
+
return True
|
| 518 |
+
except:
|
| 519 |
+
return False
|
| 520 |
+
|
| 521 |
+
def get_available_methods(self) -> list:
|
| 522 |
+
"""Get list of available OCR methods"""
|
| 523 |
+
methods = []
|
| 524 |
+
|
| 525 |
+
if self.azure_client:
|
| 526 |
+
methods.append("azure")
|
| 527 |
+
if self._check_tesseract_available():
|
| 528 |
+
methods.append("tesseract")
|
| 529 |
+
methods.append("pymupdf") # Always available
|
| 530 |
+
|
| 531 |
+
return methods
|
readme.md
ADDED
@@ -0,0 +1,231 @@
# PDF OCR Service

A comprehensive PDF to text conversion service with multiple OCR providers and a user-friendly web interface.

## Features

- 🔄 **Multiple OCR Methods**: Azure Document Intelligence, Tesseract OCR, and PyMuPDF
- 📄 **Format Preservation**: Maintains original spacing and layout from PDFs
- 🛡️ **Fallback Mechanisms**: Automatically tries alternative methods if the primary one fails
- 🌐 **Web Interface**: Clean, intuitive Gradio-based UI
- 📊 **Processing Analytics**: Track processing history and statistics
- ⚡ **High Performance**: Optimized for speed and accuracy

## Architecture

The service consists of three main components:

1. **`ocr_service.py`** - Core OCR processing with Azure, Tesseract, and PyMuPDF
2. **`backend.py`** - Backend management, file handling, and coordination
3. **`app.py`** - Gradio web interface for user interaction

## Quick Start

### 1. Install Dependencies

```bash
# Install Python dependencies
pip install -r requirements.txt

# Install system dependencies (Ubuntu/Debian)
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
sudo apt-get install -y libgl1-mesa-glx libglib2.0-0

# For macOS
brew install tesseract

# For Windows
# Download Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
# Add it to the PATH environment variable
```

### 2. Configure Environment

```bash
# Copy environment template
cp .env.example .env

# Edit .env file with your settings
nano .env
```

**Required Configuration:**
- Set the Azure Document Intelligence endpoint and key (for best quality)
- Adjust file size limits and server settings as needed

### 3. Run the Service

```bash
# Start the web interface
python app.py

# Or run individual components
python backend.py      # Test backend functionality
python ocr_service.py  # Test OCR service
```

The service will be available at `http://localhost:7860`

## Azure Document Intelligence Setup

1. **Create Azure Resource**
   - Go to [Azure Portal](https://portal.azure.com)
   - Create a new "Document Intelligence" resource
   - Choose subscription, resource group, and region
   - Select pricing tier (F0 for free, S0 for standard)

2. **Get Credentials**
   - Navigate to the "Keys and Endpoint" section
   - Copy the endpoint URL and API key
   - Add them to your `.env` file:

   ```bash
   AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
   AZURE_DOCUMENT_INTELLIGENCE_KEY=your-api-key-here
   ```

## OCR Methods

### Azure Document Intelligence (Recommended)
- **Best Quality**: Advanced layout analysis and text extraction
- **Features**: Table detection, handwriting recognition, form understanding
- **Use Case**: Complex documents, forms, tables, mixed content
- **Requirements**: Azure subscription and API key

### Tesseract OCR
- **Good Quality**: Open-source OCR with preprocessing
- **Features**: Multiple language support, image enhancement
- **Use Case**: Scanned documents, images, simple PDFs
- **Requirements**: Tesseract installation

### PyMuPDF
- **Fast Processing**: Direct text extraction from digital PDFs
- **Features**: Fastest processing, embedded text extraction
- **Use Case**: Digital PDFs with embedded text
- **Requirements**: No additional setup needed
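
To see which of these engines are actually usable in a given environment, query the backend at runtime. A minimal sketch using the `BackendManager` API (the same object used in the Python API example below):

```python
from backend import BackendManager

manager = BackendManager()

# Lists the engines that are configured and importable on this machine,
# e.g. ['azure', 'tesseract', 'pymupdf'] when everything is set up.
print("Available OCR methods:", manager.get_available_methods())
```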

## Usage Examples

### Web Interface
1. Open `http://localhost:7860` in your browser
2. Upload a PDF file
3. Select OCR method (or use "auto")
4. Click "Process PDF"
5. Download extracted text

### Python API
```python
from backend import BackendManager

# Initialize backend
manager = BackendManager()

# Process PDF
result = manager.process_pdf('document.pdf', method='auto')

if result['success']:
    print("Extracted Text:")
    print(result['text'])
    print(f"Method used: {result['method_used']}")
    print(f"Pages: {result['metadata']['pages']}")
else:
    print(f"Error: {result['error']}")
```
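
You can also request a specific engine instead of `auto`. If that engine fails, the service falls back automatically and appends `_fallback` to `method_used`, so you can tell which engine actually produced the text. A small illustrative sketch (the file name is just an example):

```python
# Force Tesseract; the service still falls back if it fails
result = manager.process_pdf('scanned_form.pdf', method='tesseract')

if result['success'] and result['method_used'].endswith('_fallback'):
    print(f"Tesseract failed; text came from: {result['method_used']}")
```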

## Configuration Options

### File Processing
- `MAX_FILE_SIZE_MB`: Maximum file size (default: 50 MB)
- `PROCESSING_TIMEOUT`: Processing timeout in seconds
- `MAX_CONCURRENT_TASKS`: Concurrent processing limit

### OCR Settings
- `DEFAULT_OCR_METHOD`: Default method (auto/azure/tesseract/pymupdf)
- `AZURE_OCR_MODEL`: Azure model (prebuilt-layout/prebuilt-read)
- `TESSERACT_LANGUAGES`: Tesseract language packs

### Server Settings
- `SERVER_HOST`: Web server host (default: 127.0.0.1)
- `SERVER_PORT`: Web server port (default: 7860)
- `SHARE_GRADIO`: Enable public sharing
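
Putting these together, a typical `.env` fragment might look like the following (values are illustrative; only the keys you need have to be set):

```bash
# Example .env fragment; adjust values for your deployment
MAX_FILE_SIZE_MB=50
DEFAULT_OCR_METHOD=auto
SERVER_HOST=127.0.0.1
SERVER_PORT=7860
SHARE_GRADIO=false
```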

## Troubleshooting

### Common Issues

1. **Azure OCR not working**
   - Verify the endpoint URL and API key
   - Check the Azure subscription status
   - Ensure the resource region matches the endpoint

2. **Tesseract not found**
   - Install the Tesseract OCR system package
   - Verify the installation: `tesseract --version`
   - Check the PATH environment variable

3. **Large file processing fails**
   - Increase `MAX_FILE_SIZE_MB` in `.env`
   - Check available memory and disk space
   - Consider splitting large PDFs

4. **Poor OCR quality**
   - Try different OCR methods
   - Use Azure for best quality
   - Ensure good PDF scan quality

### Performance Optimization

- **Use Azure Document Intelligence** for best accuracy
- **Enable image preprocessing** for scanned documents
- **Increase DPI settings** for better image quality (see the sketch below)
- **Configure memory limits** based on available resources
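
The render resolution fed to Tesseract comes from the `fitz.Matrix(2.0, 2.0)` call in `ocr_service.py`; a sketch of the one-line tweak (higher scale factors give sharper input images at the cost of memory and time):

```python
# In _tesseract_ocr(): render pages at a higher scale before OCR
mat = fitz.Matrix(3.0, 3.0)   # this repo currently uses fitz.Matrix(2.0, 2.0)
pix = page.get_pixmap(matrix=mat)
```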

## File Structure

```
pdf-ocr-service/
├── ocr_service.py     # Core OCR processing
├── backend.py         # Backend management
├── app.py             # Gradio web interface
├── requirements.txt   # Python dependencies
├── .env               # Environment configuration
├── README.md          # This file
├── logs/              # Log files (created automatically)
├── temp/              # Temporary files (created automatically)
└── cache/             # Cache directory (optional)
```

## Security Considerations

- Never commit the `.env` file to version control
- Use secure methods to store API keys in production
- Enable file validation to prevent malicious uploads
- Consider rate limiting for public deployments
- Clean up temporary files regularly

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests if applicable
5. Submit a pull request

## License

This project is licensed under the MIT License. See the LICENSE file for details.

## Support

- Check the troubleshooting section above
- Review the Azure Document Intelligence documentation
- Open an issue for bug reports or feature requests

## Changelog

### Version 1.0.0
- Initial release
- Azure Document Intelligence integration
- Multiple OCR fallback methods
- Gradio web interface
- Processing history and analytics
requirements.txt
ADDED
@@ -0,0 +1,40 @@
# PDF OCR Service Requirements

# Core web framework and UI
gradio>=4.0.0

# Environment configuration
python-dotenv>=1.0.0

# Azure Document Intelligence
azure-ai-documentintelligence>=1.0.0b1
azure-core>=1.28.0

# OCR and image processing
pytesseract>=0.3.10
Pillow>=10.0.0
opencv-python>=4.8.0
numpy>=1.24.0

# PDF processing
PyMuPDF>=1.23.0

# Document export formats
python-docx>=0.8.11

# System dependencies information (install separately):
#
# For Ubuntu/Debian:
# sudo apt-get update
# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
# sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
#
# For CentOS/RHEL:
# sudo yum install -y tesseract tesseract-langpack-eng
#
# For macOS:
# brew install tesseract
#
# For Windows:
# Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
# Add Tesseract to the PATH environment variable
test_setup.py
ADDED
@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""
Simple test script to verify the PDF OCR Service setup.
Run this to check if everything is working properly.
"""

import sys
import os
from pathlib import Path

def test_imports():
    """Test if all required modules can be imported"""
    print("🧪 Testing imports...")

    required_imports = [
        ('dotenv', 'python-dotenv'),
        ('gradio', 'gradio'),
        ('azure.ai.documentintelligence', 'azure-ai-documentintelligence'),
        ('azure.core', 'azure-core'),
        ('fitz', 'PyMuPDF'),
        ('PIL', 'Pillow'),
        ('cv2', 'opencv-python'),
        ('numpy', 'numpy'),
    ]

    optional_imports = [
        ('pytesseract', 'pytesseract'),
        ('docx', 'python-docx'),
    ]

    all_good = True

    # Test required imports
    for module, package in required_imports:
        try:
            __import__(module)
            print(f"✅ {package}")
        except ImportError:
            print(f"❌ {package} - Run: pip install {package}")
            all_good = False

    # Test optional imports
    for module, package in optional_imports:
        try:
            __import__(module)
            print(f"✅ {package} (optional)")
        except ImportError:
            print(f"⚠️ {package} (optional) - Run: pip install {package}")

    return all_good

def test_files():
    """Test if all required files exist"""
    print("\n📁 Testing files...")

    required_files = ['ocr_service.py', 'backend.py', 'requirements.txt', '.env']

    # Check for UI file (either ui.py or app.py)
    ui_file = None
    if Path('ui.py').exists():
        ui_file = 'ui.py'
    elif Path('app.py').exists():
        ui_file = 'app.py'

    all_good = True
    for file in required_files:
        if Path(file).exists():
            print(f"✅ {file}")
        else:
            print(f"❌ {file} missing")
            all_good = False

    # Check UI file
    if ui_file:
        print(f"✅ {ui_file} (UI file)")
    else:
        print("❌ UI file missing (need either ui.py or app.py)")
        all_good = False

    return all_good

def test_env_config():
    """Test environment configuration"""
    print("\n🔧 Testing environment...")

    try:
        from dotenv import load_dotenv
        load_dotenv()

        endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
        key = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')

        if endpoint and key:
            if endpoint.startswith('https://') and endpoint.endswith('/'):
                print("✅ Azure endpoint configured properly")
            else:
                print("⚠️ Azure endpoint format may be incorrect")

            if len(key) > 20:
                print("✅ Azure key configured")
            else:
                print("⚠️ Azure key may be incorrect")

            return True
        else:
            print("⚠️ Azure credentials not configured")
            print("   Update your .env file with valid credentials")
            return False

    except ImportError:
        print("❌ python-dotenv not available")
        return False

def test_service():
    """Test if the service can be imported and initialized"""
    print("\n🚀 Testing service initialization...")

    try:
        from backend import BackendManager
        manager = BackendManager()

        methods = manager.get_available_methods()
        print("✅ Service initialized successfully")
        print(f"   Available methods: {methods}")

        if 'azure' in methods:
            print("✅ Azure OCR ready")
        else:
            print("⚠️ Azure OCR not available (check credentials)")

        return True

    except Exception as e:
        print(f"❌ Service initialization failed: {e}")
        return False

def main():
    """Run all tests"""
    print("🧪 PDF OCR Service Setup Test")
    print("=" * 40)

    tests = [
        ("Import test", test_imports),
        ("File test", test_files),
        ("Environment test", test_env_config),
        ("Service test", test_service),
    ]

    results = {}
    for test_name, test_func in tests:
        print(f"\n{'=' * 40}")
        print(f"{test_name.upper()}")
        print('=' * 40)
        results[test_name] = test_func()

    # Summary
    print(f"\n{'=' * 40}")
    print("TEST SUMMARY")
    print('=' * 40)

    all_passed = True
    for test_name, passed in results.items():
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"{status} {test_name}")
        if not passed:
            all_passed = False

    print('=' * 40)
    if all_passed:
        print("🎉 All tests passed! You can run the service with:")
        print("   python app.py")
    else:
        print("⚠️ Some tests failed. Please fix the issues above.")
        print("\nQuick fixes:")
        print("1. Install missing packages: pip install -r requirements.txt")
        print("2. Configure your .env file with Azure credentials")
        print("3. Ensure all files are present")

    return all_passed

if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)