Spaces:
Running
Running
Upload 10 files
Browse files- .gitattributes +35 -35
- app.py +635 -783
- backend.py +556 -413
- ocr_service.py +790 -324
- readme.md +196 -157
- requirements.txt +24 -8
.gitattributes
CHANGED
|
@@ -1,35 +1,35 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -1,18 +1,16 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Gradio UI for PDF OCR Service - Enhanced with Header/Footer Removal
|
| 3 |
-
User interface for PDF to text conversion with multiple OCR providers and preprocessing options
|
| 4 |
-
"""
|
| 5 |
import re
|
| 6 |
import gradio as gr
|
| 7 |
import os
|
| 8 |
import tempfile
|
| 9 |
import logging
|
|
|
|
| 10 |
from pathlib import Path
|
| 11 |
from datetime import datetime
|
| 12 |
import cv2
|
| 13 |
import numpy as np
|
| 14 |
from PIL import Image
|
| 15 |
import fitz # PyMuPDF
|
|
|
|
| 16 |
|
| 17 |
# Load environment variables
|
| 18 |
from dotenv import load_dotenv
|
|
@@ -28,179 +26,330 @@ logger = logging.getLogger(__name__)
|
|
| 28 |
backend_manager = BackendManager()
|
| 29 |
|
| 30 |
# Check if python-docx is available
|
| 31 |
-
from docx.shared import Pt
|
| 32 |
-
from docx.enum.table import WD_TABLE_ALIGNMENT
|
| 33 |
try:
|
| 34 |
from docx import Document
|
| 35 |
-
from docx.shared import Inches
|
|
|
|
| 36 |
HAS_DOCX_SUPPORT = True
|
| 37 |
logger.info("DOCX export available")
|
| 38 |
except ImportError:
|
| 39 |
HAS_DOCX_SUPPORT = False
|
| 40 |
logger.info("DOCX export not available - install python-docx to enable")
|
| 41 |
|
| 42 |
-
# Global variables for crop
|
| 43 |
-
|
| 44 |
-
'
|
| 45 |
-
'
|
| 46 |
-
'
|
| 47 |
-
'
|
|
|
|
| 48 |
}
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
def
|
| 52 |
-
"""
|
| 53 |
if pdf_file is None:
|
| 54 |
-
return None
|
| 55 |
|
| 56 |
try:
|
| 57 |
-
|
| 58 |
-
doc = fitz.open(pdf_path)
|
| 59 |
-
|
| 60 |
-
if page_num >= len(doc):
|
| 61 |
-
page_num = 0
|
| 62 |
-
|
| 63 |
-
page = doc.load_page(page_num)
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
return img_array
|
| 78 |
except Exception as e:
|
| 79 |
-
logger.error(f"Error
|
| 80 |
return None
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
if pdf_file is None:
|
| 86 |
return None
|
| 87 |
|
| 88 |
try:
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
-
#
|
| 94 |
-
|
| 95 |
-
# RGBA to RGB
|
| 96 |
-
img_array = img_array[:, :, :3]
|
| 97 |
-
|
| 98 |
-
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
|
| 99 |
-
height, width = img_bgr.shape[:2]
|
| 100 |
-
|
| 101 |
-
# Calculate crop areas
|
| 102 |
-
top_px = int(height * top_crop / 100)
|
| 103 |
-
bottom_px = int(height * bottom_crop / 100)
|
| 104 |
-
left_px = int(width * left_crop / 100)
|
| 105 |
-
right_px = int(width * right_crop / 100)
|
| 106 |
-
|
| 107 |
-
# Store current settings
|
| 108 |
-
current_crop_settings.update({
|
| 109 |
-
'top': top_px,
|
| 110 |
-
'bottom': bottom_px,
|
| 111 |
-
'left': left_px,
|
| 112 |
-
'right': right_px
|
| 113 |
-
})
|
| 114 |
-
|
| 115 |
-
# Create overlay
|
| 116 |
-
overlay = img_bgr.copy()
|
| 117 |
-
|
| 118 |
-
# Draw crop areas in red (areas to be removed)
|
| 119 |
-
if top_px > 0:
|
| 120 |
-
cv2.rectangle(overlay, (0, 0), (width, top_px), (0, 0, 255), -1)
|
| 121 |
-
if bottom_px > 0:
|
| 122 |
-
cv2.rectangle(overlay, (0, height - bottom_px), (width, height), (0, 0, 255), -1)
|
| 123 |
-
if left_px > 0:
|
| 124 |
-
cv2.rectangle(overlay, (0, 0), (left_px, height), (0, 0, 255), -1)
|
| 125 |
-
if right_px > 0:
|
| 126 |
-
cv2.rectangle(overlay, (width - right_px, 0), (width, height), (0, 0, 255), -1)
|
| 127 |
-
|
| 128 |
-
# Draw content area outline in green
|
| 129 |
-
content_top = top_px
|
| 130 |
-
content_bottom = height - bottom_px
|
| 131 |
-
content_left = left_px
|
| 132 |
-
content_right = width - right_px
|
| 133 |
-
|
| 134 |
-
if content_right > content_left and content_bottom > content_top:
|
| 135 |
-
cv2.rectangle(overlay, (content_left, content_top), (content_right, content_bottom), (0, 255, 0), 3)
|
| 136 |
-
|
| 137 |
-
# Blend overlay with original
|
| 138 |
-
result = cv2.addWeighted(img_bgr, 0.7, overlay, 0.3, 0)
|
| 139 |
-
|
| 140 |
-
# Add text annotations
|
| 141 |
-
cv2.putText(result, "RED: Areas to remove", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
|
| 142 |
-
cv2.putText(result, "GREEN: Content area", (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
|
| 143 |
-
|
| 144 |
-
# Convert back to RGB for display
|
| 145 |
-
result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
|
| 146 |
-
|
| 147 |
-
return result_rgb
|
| 148 |
|
| 149 |
except Exception as e:
|
| 150 |
logger.error(f"Error updating crop preview: {e}")
|
| 151 |
return None
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
Process
|
| 158 |
-
"""
|
| 159 |
if pdf_file is None:
|
| 160 |
-
return "No file uploaded.", "", "
|
| 161 |
|
| 162 |
-
temp_file_path = None
|
| 163 |
try:
|
| 164 |
-
progress(0.1, desc="Initializing...")
|
| 165 |
-
|
| 166 |
-
# Handle Gradio file object
|
| 167 |
-
temp_file_path = pdf_file.name
|
| 168 |
|
| 169 |
-
# Prepare preprocessing options
|
| 170 |
preprocessing_options = {
|
| 171 |
'enable_header_footer_removal': enable_header_footer_removal,
|
| 172 |
-
'
|
| 173 |
-
|
| 174 |
-
'bottom': crop_bottom,
|
| 175 |
-
'left': crop_left,
|
| 176 |
-
'right': crop_right
|
| 177 |
-
}
|
| 178 |
}
|
| 179 |
|
| 180 |
-
progress(0.3, desc="Processing
|
| 181 |
|
| 182 |
-
# Process the PDF with
|
| 183 |
-
result = backend_manager.
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
progress(0.9, desc="Finalizing...")
|
| 186 |
progress(1.0, desc="Complete!")
|
| 187 |
|
| 188 |
if result['success']:
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
else:
|
| 194 |
error_msg = result.get('error', 'Unknown error occurred')
|
| 195 |
-
return f"Error: {error_msg}", "",
|
| 196 |
|
| 197 |
except Exception as e:
|
| 198 |
-
logger.error(f"
|
| 199 |
-
return f"Error: {str(e)}", "",
|
| 200 |
-
|
| 201 |
|
| 202 |
-
def
|
| 203 |
-
"""
|
| 204 |
if not metadata:
|
| 205 |
return f"Method used: {method_used}"
|
| 206 |
|
|
@@ -209,750 +358,453 @@ def format_metadata(metadata, method_used):
|
|
| 209 |
if 'pages' in metadata:
|
| 210 |
info_lines.append(f"Pages processed: {metadata['pages']}")
|
| 211 |
|
| 212 |
-
if '
|
| 213 |
-
info_lines.append(
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
-
if '
|
| 216 |
-
|
| 217 |
-
info_lines.append(f"Handwritten content: {handwritten_status}")
|
| 218 |
|
| 219 |
-
if '
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
| 222 |
|
| 223 |
if 'processing_time_seconds' in metadata:
|
| 224 |
info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
|
| 225 |
|
| 226 |
return "\n".join(info_lines)
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
encoding='utf-8'
|
| 237 |
)
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
temp_file.write("PDF OCR Extraction Results\n")
|
| 242 |
-
temp_file.write("=" * 50 + "\n\n")
|
| 243 |
-
|
| 244 |
-
# Add metadata
|
| 245 |
-
if metadata_info:
|
| 246 |
-
temp_file.write("Processing Information:\n")
|
| 247 |
-
temp_file.write("-" * 25 + "\n")
|
| 248 |
-
temp_file.write(metadata_info + "\n\n")
|
| 249 |
-
|
| 250 |
-
# Add timestamp
|
| 251 |
-
temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 252 |
-
temp_file.write("=" * 50 + "\n\n")
|
| 253 |
-
|
| 254 |
-
# Add main content with clean table processing
|
| 255 |
-
temp_file.write("Extracted Text:\n")
|
| 256 |
-
temp_file.write("-" * 15 + "\n\n")
|
| 257 |
-
|
| 258 |
-
# Process content to clean up table duplications
|
| 259 |
-
cleaned_content = _clean_text_content_for_txt(text_content)
|
| 260 |
-
temp_file.write(cleaned_content)
|
| 261 |
-
|
| 262 |
-
temp_file.close()
|
| 263 |
-
return temp_file.name
|
| 264 |
-
|
| 265 |
-
except Exception as e:
|
| 266 |
-
logger.error(f"Error creating TXT file: {e}")
|
| 267 |
-
temp_file.close()
|
| 268 |
-
raise
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
def _clean_text_content_for_txt(content):
|
| 272 |
-
"""Clean text content for TXT export, removing table duplications"""
|
| 273 |
-
if not content.strip():
|
| 274 |
-
return content
|
| 275 |
-
|
| 276 |
-
# Split by pages first
|
| 277 |
-
if '=== PAGE ' in content:
|
| 278 |
-
pages = content.split('=== PAGE ')
|
| 279 |
-
cleaned_pages = []
|
| 280 |
-
|
| 281 |
-
for i, page_content in enumerate(pages):
|
| 282 |
-
if i == 0 and not page_content.strip():
|
| 283 |
-
continue
|
| 284 |
-
|
| 285 |
-
if i > 0:
|
| 286 |
-
# Add page header
|
| 287 |
-
page_num = page_content.split(' ===')[0] if ' ===' in page_content else str(i)
|
| 288 |
-
cleaned_pages.append(f"\n--- Page {page_num} ---\n")
|
| 289 |
-
|
| 290 |
-
# Get content after page header
|
| 291 |
-
content_part = page_content.split('===\n', 1)[-1] if '===\n' in page_content else page_content
|
| 292 |
-
else:
|
| 293 |
-
content_part = page_content
|
| 294 |
-
|
| 295 |
-
# Clean this page's content
|
| 296 |
-
cleaned_page = _clean_page_content_for_txt(content_part)
|
| 297 |
-
if cleaned_page.strip():
|
| 298 |
-
cleaned_pages.append(cleaned_page)
|
| 299 |
-
|
| 300 |
-
return '\n'.join(cleaned_pages)
|
| 301 |
-
else:
|
| 302 |
-
# No page structure, clean as single content
|
| 303 |
-
return _clean_page_content_for_txt(content)
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
def _clean_page_content_for_txt(content):
|
| 307 |
-
"""Clean a single page's content for TXT export"""
|
| 308 |
-
if not content.strip():
|
| 309 |
-
return ""
|
| 310 |
-
|
| 311 |
-
import re
|
| 312 |
-
|
| 313 |
-
# Split content by table markers
|
| 314 |
-
parts = re.split(r'\n?--- TABLE \d+ ---\n?', content)
|
| 315 |
-
|
| 316 |
-
cleaned_parts = []
|
| 317 |
-
table_count = 0
|
| 318 |
-
|
| 319 |
-
# Find all table sections
|
| 320 |
-
table_matches = re.finditer(r'\n?--- TABLE (\d+) ---\n?(.*?)(?=\n?--- TABLE \d+ ---|$)', content, re.DOTALL)
|
| 321 |
-
table_contents = {}
|
| 322 |
-
|
| 323 |
-
for match in table_matches:
|
| 324 |
-
table_num = match.group(1)
|
| 325 |
-
table_content = match.group(2).strip()
|
| 326 |
-
table_contents[int(table_num)] = table_content
|
| 327 |
-
|
| 328 |
-
# Process each part
|
| 329 |
-
for i, part in enumerate(parts):
|
| 330 |
-
if part.strip():
|
| 331 |
-
# Clean the text part
|
| 332 |
-
cleaned_part = _clean_text_part(part)
|
| 333 |
-
if cleaned_part.strip():
|
| 334 |
-
cleaned_parts.append(cleaned_part)
|
| 335 |
-
|
| 336 |
-
# Add table if this part was followed by one
|
| 337 |
-
if i < len(parts) - 1: # Not the last part
|
| 338 |
-
table_count += 1
|
| 339 |
-
if table_count in table_contents:
|
| 340 |
-
table_header = f"\n--- TABLE {table_count} ---\n"
|
| 341 |
-
table_text = _format_table_for_txt(table_contents[table_count])
|
| 342 |
-
cleaned_parts.append(table_header + table_text)
|
| 343 |
-
|
| 344 |
-
return '\n'.join(cleaned_parts)
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
def _clean_text_part(text_part):
|
| 348 |
-
"""Clean a text part of any remaining table content"""
|
| 349 |
-
if not text_part.strip():
|
| 350 |
-
return ""
|
| 351 |
-
|
| 352 |
-
import re
|
| 353 |
-
|
| 354 |
-
# Remove any stray table markers
|
| 355 |
-
cleaned = re.sub(r'\n?--- TABLE \d+ ---\n?', '', text_part)
|
| 356 |
-
cleaned = re.sub(r'\n?--- Table \d+ ---\n?', '', cleaned)
|
| 357 |
-
|
| 358 |
-
# Split into lines and filter out table-like content
|
| 359 |
-
lines = cleaned.split('\n')
|
| 360 |
-
filtered_lines = []
|
| 361 |
-
|
| 362 |
-
for line in lines:
|
| 363 |
-
line = line.strip()
|
| 364 |
-
if not line:
|
| 365 |
-
filtered_lines.append('') # Keep empty lines for spacing
|
| 366 |
-
continue
|
| 367 |
-
|
| 368 |
-
# Skip lines that look like table content (multiple | separators)
|
| 369 |
-
if line.count('|') >= 2:
|
| 370 |
-
continue
|
| 371 |
-
|
| 372 |
-
# Skip separator lines
|
| 373 |
-
if line.replace('-', '').replace(' ', '').replace('|', '') == '':
|
| 374 |
-
continue
|
| 375 |
-
|
| 376 |
-
filtered_lines.append(line)
|
| 377 |
-
|
| 378 |
-
# Remove excessive empty lines
|
| 379 |
-
result_lines = []
|
| 380 |
-
prev_empty = False
|
| 381 |
-
|
| 382 |
-
for line in filtered_lines:
|
| 383 |
-
if line == '':
|
| 384 |
-
if not prev_empty:
|
| 385 |
-
result_lines.append(line)
|
| 386 |
-
prev_empty = True
|
| 387 |
-
else:
|
| 388 |
-
result_lines.append(line)
|
| 389 |
-
prev_empty = False
|
| 390 |
-
|
| 391 |
-
return '\n'.join(result_lines)
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
def _format_table_for_txt(table_content):
|
| 395 |
-
"""Format table content nicely for TXT output"""
|
| 396 |
-
if not table_content.strip():
|
| 397 |
-
return ""
|
| 398 |
-
|
| 399 |
-
lines = [line.strip() for line in table_content.split('\n') if line.strip()]
|
| 400 |
-
|
| 401 |
-
# Look for table structure
|
| 402 |
-
table_lines = []
|
| 403 |
-
for line in lines:
|
| 404 |
-
if '|' in line:
|
| 405 |
-
# Clean up the table line
|
| 406 |
-
cells = [cell.strip() for cell in line.split('|')]
|
| 407 |
-
# Remove empty cells at start/end
|
| 408 |
-
while cells and not cells[0]:
|
| 409 |
-
cells.pop(0)
|
| 410 |
-
while cells and not cells[-1]:
|
| 411 |
-
cells.pop()
|
| 412 |
-
if cells:
|
| 413 |
-
table_lines.append(cells)
|
| 414 |
-
|
| 415 |
-
if not table_lines:
|
| 416 |
-
return table_content # Return as is if no table structure found
|
| 417 |
-
|
| 418 |
-
# Calculate column widths
|
| 419 |
-
if table_lines:
|
| 420 |
-
max_cols = max(len(row) for row in table_lines)
|
| 421 |
-
col_widths = [0] * max_cols
|
| 422 |
-
|
| 423 |
-
for row in table_lines:
|
| 424 |
-
for i in range(min(len(row), max_cols)):
|
| 425 |
-
col_widths[i] = max(col_widths[i], len(row[i]) if i < len(row) else 0)
|
| 426 |
-
|
| 427 |
-
# Format table with proper alignment
|
| 428 |
-
formatted_lines = []
|
| 429 |
-
for i, row in enumerate(table_lines):
|
| 430 |
-
formatted_row = []
|
| 431 |
-
for j in range(max_cols):
|
| 432 |
-
cell_content = row[j] if j < len(row) else ""
|
| 433 |
-
width = max(col_widths[j], 3)
|
| 434 |
-
formatted_row.append(cell_content.ljust(width))
|
| 435 |
-
|
| 436 |
-
formatted_lines.append(" | ".join(formatted_row))
|
| 437 |
-
|
| 438 |
-
# Add separator after header row
|
| 439 |
-
if i == 0 and len(table_lines) > 1:
|
| 440 |
-
separator = " | ".join(["-" * max(col_widths[k], 3) for k in range(max_cols)])
|
| 441 |
-
formatted_lines.append(separator)
|
| 442 |
-
|
| 443 |
-
return '\n'.join(formatted_lines)
|
| 444 |
-
|
| 445 |
-
return table_content
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
def create_docx_file(text_content, metadata_info=""):
|
| 449 |
-
"""Create DOCX file with enhanced table handling - NO separator rows"""
|
| 450 |
-
if not HAS_DOCX_SUPPORT:
|
| 451 |
-
raise ImportError("python-docx not installed. Cannot create DOCX files.")
|
| 452 |
-
|
| 453 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 454 |
-
temp_file = tempfile.NamedTemporaryFile(
|
| 455 |
-
suffix=f'_extracted_text_{timestamp}.docx',
|
| 456 |
-
delete=False
|
| 457 |
-
)
|
| 458 |
-
temp_file.close()
|
| 459 |
-
|
| 460 |
-
try:
|
| 461 |
-
from docx import Document
|
| 462 |
-
from docx.shared import Inches, Pt
|
| 463 |
-
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
| 464 |
-
from docx.enum.table import WD_TABLE_ALIGNMENT
|
| 465 |
-
|
| 466 |
-
doc = Document()
|
| 467 |
-
|
| 468 |
-
# Set margins
|
| 469 |
-
sections = doc.sections
|
| 470 |
-
for section in sections:
|
| 471 |
-
section.top_margin = Inches(1)
|
| 472 |
-
section.bottom_margin = Inches(1)
|
| 473 |
-
section.left_margin = Inches(1)
|
| 474 |
-
section.right_margin = Inches(1)
|
| 475 |
-
|
| 476 |
-
# Title
|
| 477 |
-
title = doc.add_heading('PDF OCR Extraction Results', 0)
|
| 478 |
-
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 479 |
-
|
| 480 |
-
# Metadata
|
| 481 |
-
if metadata_info:
|
| 482 |
-
doc.add_heading('Processing Information', level=1)
|
| 483 |
-
metadata_para = doc.add_paragraph(metadata_info)
|
| 484 |
-
metadata_para.style = 'Intense Quote'
|
| 485 |
-
doc.add_page_break()
|
| 486 |
-
|
| 487 |
-
# Enhanced content processing
|
| 488 |
-
_add_enhanced_content_to_docx(doc, text_content)
|
| 489 |
-
|
| 490 |
-
# Footer
|
| 491 |
-
footer_section = doc.sections[0]
|
| 492 |
-
footer = footer_section.footer
|
| 493 |
-
footer_para = footer.paragraphs[0]
|
| 494 |
-
footer_para.text = f"Generated by PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
| 495 |
-
|
| 496 |
-
doc.save(temp_file.name)
|
| 497 |
-
logger.info(f"Enhanced DOCX file created: {temp_file.name}")
|
| 498 |
-
return temp_file.name
|
| 499 |
-
|
| 500 |
-
except Exception as e:
|
| 501 |
-
logger.error(f"Error creating DOCX file: {e}")
|
| 502 |
try:
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
if line.startswith('=== PAGE '):
|
| 523 |
-
if current_table_content:
|
| 524 |
-
_add_enhanced_table(doc, current_table_content)
|
| 525 |
-
current_table_content = []
|
| 526 |
-
in_table = False
|
| 527 |
-
|
| 528 |
-
page_num = line.replace('=== PAGE ', '').replace(' ===', '')
|
| 529 |
-
doc.add_heading(f'Page {page_num}', level=1)
|
| 530 |
-
continue
|
| 531 |
-
|
| 532 |
-
# Handle table start
|
| 533 |
-
if line.startswith('--- TABLE '):
|
| 534 |
-
if current_table_content:
|
| 535 |
-
_add_enhanced_table(doc, current_table_content)
|
| 536 |
-
|
| 537 |
-
current_table_content = []
|
| 538 |
-
in_table = True
|
| 539 |
-
table_num = line.replace('--- TABLE ', '').replace(' ---', '')
|
| 540 |
-
current_table_content.append(f"Table {table_num}")
|
| 541 |
-
continue
|
| 542 |
-
|
| 543 |
-
# Handle content
|
| 544 |
-
if in_table:
|
| 545 |
-
if line and not line.startswith('==='):
|
| 546 |
-
current_table_content.append(line)
|
| 547 |
-
else:
|
| 548 |
-
# Regular text
|
| 549 |
-
if line:
|
| 550 |
-
if line.startswith('# '):
|
| 551 |
-
doc.add_heading(line[2:], level=1)
|
| 552 |
-
elif line.startswith('## '):
|
| 553 |
-
doc.add_heading(line[3:], level=2)
|
| 554 |
-
elif line.startswith('### '):
|
| 555 |
-
doc.add_heading(line[4:], level=3)
|
| 556 |
-
else:
|
| 557 |
-
doc.add_paragraph(line)
|
| 558 |
-
else:
|
| 559 |
-
# Empty line - add small space
|
| 560 |
-
doc.add_paragraph("")
|
| 561 |
-
|
| 562 |
-
# Handle any remaining table
|
| 563 |
-
if current_table_content:
|
| 564 |
-
_add_enhanced_table(doc, current_table_content)
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
def _add_enhanced_table(doc, table_content):
|
| 568 |
-
"""Add table with enhanced processing - REMOVES separator rows"""
|
| 569 |
-
if not table_content:
|
| 570 |
-
return
|
| 571 |
-
|
| 572 |
-
# First line should be table title
|
| 573 |
-
if table_content:
|
| 574 |
-
doc.add_heading(table_content[0], level=3)
|
| 575 |
-
table_lines = table_content[1:]
|
| 576 |
else:
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
# Find lines that contain pipes (table rows) and FILTER OUT separator rows
|
| 583 |
-
table_rows = []
|
| 584 |
-
for line in table_lines:
|
| 585 |
-
if '|' in line and line.strip():
|
| 586 |
-
# CRITICAL: Skip separator rows (lines that are mostly dashes)
|
| 587 |
-
line_content = line.replace('|', '').replace(' ', '')
|
| 588 |
-
if line_content.replace('-', '') == '':
|
| 589 |
-
continue # Skip this separator row
|
| 590 |
-
|
| 591 |
-
# Split and clean
|
| 592 |
-
cells = [cell.strip() for cell in line.split('|')]
|
| 593 |
-
# Remove empty cells at edges
|
| 594 |
-
while cells and not cells[0]:
|
| 595 |
-
cells.pop(0)
|
| 596 |
-
while cells and not cells[-1]:
|
| 597 |
-
cells.pop()
|
| 598 |
-
if cells:
|
| 599 |
-
table_rows.append(cells)
|
| 600 |
-
|
| 601 |
-
if not table_rows:
|
| 602 |
-
# No table structure, add as text
|
| 603 |
-
for line in table_lines:
|
| 604 |
-
if line.strip():
|
| 605 |
-
doc.add_paragraph(line)
|
| 606 |
-
return
|
| 607 |
-
|
| 608 |
-
# Create table
|
| 609 |
-
max_cols = max(len(row) for row in table_rows)
|
| 610 |
-
table = doc.add_table(rows=len(table_rows), cols=max_cols)
|
| 611 |
-
table.style = 'Table Grid'
|
| 612 |
-
|
| 613 |
-
# Fill table
|
| 614 |
-
for row_idx, row_data in enumerate(table_rows):
|
| 615 |
-
table_row = table.rows[row_idx]
|
| 616 |
-
for col_idx in range(max_cols):
|
| 617 |
-
cell = table_row.cells[col_idx]
|
| 618 |
-
if col_idx < len(row_data):
|
| 619 |
-
cell.text = row_data[col_idx]
|
| 620 |
-
|
| 621 |
-
# Bold first row
|
| 622 |
-
if row_idx == 0:
|
| 623 |
-
for paragraph in cell.paragraphs:
|
| 624 |
-
for run in paragraph.runs:
|
| 625 |
-
run.bold = True
|
| 626 |
-
|
| 627 |
-
doc.add_paragraph("") # Space after table
|
| 628 |
-
|
| 629 |
|
| 630 |
-
def
|
| 631 |
-
"""Get information about selected OCR method"""
|
| 632 |
method_descriptions = {
|
| 633 |
-
"auto": "
|
| 634 |
-
"azure": "
|
| 635 |
-
"tesseract": "
|
| 636 |
-
"pymupdf": "
|
| 637 |
}
|
| 638 |
|
| 639 |
return method_descriptions.get(method, "Select a method to see details.")
|
| 640 |
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
"""Check and display service status"""
|
| 644 |
available_methods = backend_manager.get_available_methods()
|
| 645 |
|
| 646 |
-
status_lines = ["**Available OCR Methods:**"]
|
| 647 |
|
| 648 |
if "azure" in available_methods:
|
| 649 |
-
status_lines.append("
|
| 650 |
else:
|
| 651 |
-
status_lines.append("
|
| 652 |
|
| 653 |
if "tesseract" in available_methods:
|
| 654 |
-
status_lines.append("
|
| 655 |
else:
|
| 656 |
-
status_lines.append("
|
| 657 |
|
| 658 |
if "pymupdf" in available_methods:
|
| 659 |
-
status_lines.append("
|
| 660 |
else:
|
| 661 |
-
status_lines.append("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
|
| 663 |
-
# Add DOCX support status
|
| 664 |
if HAS_DOCX_SUPPORT:
|
| 665 |
-
status_lines.append("
|
| 666 |
else:
|
| 667 |
-
status_lines.append("
|
| 668 |
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
def process_and_prepare_downloads(pdf_file, method, enable_header_footer_removal, crop_top, crop_bottom, crop_left, crop_right):
|
| 672 |
-
"""Process PDF and prepare both TXT and DOCX downloads if successful"""
|
| 673 |
-
text, metadata, status = process_pdf_file(pdf_file, method, enable_header_footer_removal, crop_top, crop_bottom, crop_left, crop_right)
|
| 674 |
|
| 675 |
-
|
| 676 |
-
if text and not text.startswith("Error:") and not text.startswith("No file"):
|
| 677 |
-
try:
|
| 678 |
-
# Create TXT file
|
| 679 |
-
txt_path = create_txt_file(text, metadata)
|
| 680 |
-
|
| 681 |
-
# Create DOCX file if support is available
|
| 682 |
-
if HAS_DOCX_SUPPORT:
|
| 683 |
-
try:
|
| 684 |
-
docx_path = create_docx_file(text, metadata)
|
| 685 |
-
return (text, metadata, status,
|
| 686 |
-
gr.update(visible=True, value=txt_path),
|
| 687 |
-
gr.update(visible=True, value=docx_path))
|
| 688 |
-
except Exception as docx_error:
|
| 689 |
-
logger.warning(f"DOCX creation failed: {docx_error}")
|
| 690 |
-
return (text, metadata, status,
|
| 691 |
-
gr.update(visible=True, value=txt_path),
|
| 692 |
-
gr.update(visible=False))
|
| 693 |
-
else:
|
| 694 |
-
return (text, metadata, status,
|
| 695 |
-
gr.update(visible=True, value=txt_path),
|
| 696 |
-
gr.update(visible=False))
|
| 697 |
-
|
| 698 |
-
except Exception as file_error:
|
| 699 |
-
logger.error(f"File creation error: {file_error}")
|
| 700 |
-
return (text, metadata, status,
|
| 701 |
-
gr.update(visible=False),
|
| 702 |
-
gr.update(visible=False))
|
| 703 |
-
else:
|
| 704 |
-
return (text, metadata, status,
|
| 705 |
-
gr.update(visible=False),
|
| 706 |
-
gr.update(visible=False))
|
| 707 |
-
|
| 708 |
|
| 709 |
-
def
|
| 710 |
-
"""Create
|
| 711 |
|
| 712 |
with gr.Blocks(
|
| 713 |
-
title="PDF OCR Service - Enhanced",
|
| 714 |
theme=gr.themes.Soft(),
|
| 715 |
css="""
|
| 716 |
.main-header { text-align: center; margin-bottom: 2rem; }
|
| 717 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
.status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
|
| 719 |
-
.preprocessing-box { border: 2px solid #28a745; padding: 1rem; border-radius: 0.5rem; background-color: #f8fff8; }
|
| 720 |
"""
|
| 721 |
) as interface:
|
| 722 |
|
| 723 |
gr.HTML("""
|
| 724 |
<div class="main-header">
|
| 725 |
-
<h1>
|
| 726 |
-
<p>Convert PDF documents to text using
|
| 727 |
</div>
|
| 728 |
""")
|
| 729 |
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
|
|
|
|
|
|
|
|
|
| 740 |
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 758 |
|
| 759 |
enable_header_footer_removal = gr.Checkbox(
|
| 760 |
-
label="Enable Header/Footer Removal",
|
| 761 |
value=False,
|
| 762 |
-
info="Remove headers and footers
|
| 763 |
)
|
| 764 |
|
| 765 |
-
#
|
| 766 |
with gr.Group(visible=False) as crop_controls:
|
| 767 |
-
gr.HTML("<h5>
|
| 768 |
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
|
| 778 |
-
|
| 779 |
-
minimum=0,
|
| 780 |
-
maximum=30,
|
| 781 |
-
value=5,
|
| 782 |
-
step=0.5,
|
| 783 |
-
label="Bottom Crop %",
|
| 784 |
-
info="Percentage of page height to remove from bottom"
|
| 785 |
-
)
|
| 786 |
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 803 |
)
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
size="lg"
|
| 812 |
-
)
|
| 813 |
-
|
| 814 |
-
# Service status
|
| 815 |
-
gr.HTML("<h4>🔧 Service Status</h4>")
|
| 816 |
-
service_status = gr.Markdown(
|
| 817 |
-
value=check_service_status(),
|
| 818 |
-
elem_classes=["status-box"]
|
| 819 |
-
)
|
| 820 |
-
|
| 821 |
-
# Refresh status button
|
| 822 |
-
refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
|
| 823 |
|
| 824 |
with gr.Column(scale=2):
|
| 825 |
-
gr.
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
text_output = gr.Textbox(
|
| 844 |
-
label="Extracted Text",
|
| 845 |
-
placeholder="Processed text will appear here...",
|
| 846 |
-
lines=20,
|
| 847 |
-
max_lines=30,
|
| 848 |
-
interactive=False,
|
| 849 |
-
show_copy_button=True
|
| 850 |
-
)
|
| 851 |
-
|
| 852 |
-
# Metadata information
|
| 853 |
-
metadata_output = gr.Textbox(
|
| 854 |
-
label="Processing Information",
|
| 855 |
-
interactive=False,
|
| 856 |
-
lines=4
|
| 857 |
-
)
|
| 858 |
-
|
| 859 |
-
# Download buttons
|
| 860 |
-
with gr.Row():
|
| 861 |
-
download_txt_btn = gr.DownloadButton(
|
| 862 |
-
"📄 Download TXT",
|
| 863 |
-
visible=False,
|
| 864 |
-
variant="secondary"
|
| 865 |
)
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
|
|
|
|
|
|
| 870 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
|
| 872 |
-
#
|
| 873 |
-
gr.HTML("<h3>💡 Tips & Features</h3>")
|
| 874 |
-
|
| 875 |
-
# Create tips content based on available features
|
| 876 |
-
download_info = "Get results as formatted TXT files"
|
| 877 |
-
if HAS_DOCX_SUPPORT:
|
| 878 |
-
download_info += " and structured DOCX files with clean table formatting"
|
| 879 |
-
else:
|
| 880 |
-
download_info += " (install python-docx for DOCX export)"
|
| 881 |
-
|
| 882 |
-
tips_html = f"""
|
| 883 |
-
<div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem; margin: 1rem 0;">
|
| 884 |
-
<ul>
|
| 885 |
-
<li><strong>Auto method</strong> is recommended for most users - intelligently selects the best OCR method</li>
|
| 886 |
-
<li><strong>Header/Footer Removal:</strong> Clean up scanned documents by removing headers and footers</li>
|
| 887 |
-
<li><strong>Fixed Removal:</strong> Remove specific pixel amounts from top/bottom of each page</li>
|
| 888 |
-
<li><strong>Smart Crop:</strong> Use visual preview to set exact crop areas</li>
|
| 889 |
-
<li><strong>Table Processing:</strong> Enhanced table detection with clean formatting (no separator lines)</li>
|
| 890 |
-
<li><strong>Download Options:</strong> {download_info}</li>
|
| 891 |
-
<li><strong>Azure Document Intelligence</strong> provides the best quality for complex documents</li>
|
| 892 |
-
<li>Larger files may take longer to process - progress bar shows current status</li>
|
| 893 |
-
<li>Supported file types: PDF documents (up to 50MB by default)</li>
|
| 894 |
-
</ul>
|
| 895 |
-
</div>
|
| 896 |
-
"""
|
| 897 |
|
| 898 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 899 |
|
| 900 |
-
#
|
| 901 |
method_choice.change(
|
| 902 |
-
fn=
|
| 903 |
inputs=[method_choice],
|
| 904 |
outputs=[method_info]
|
| 905 |
)
|
| 906 |
|
|
|
|
| 907 |
enable_header_footer_removal.change(
|
| 908 |
-
fn=lambda enabled:
|
| 909 |
gr.update(visible=enabled),
|
| 910 |
-
gr.update(visible=enabled
|
| 911 |
-
|
| 912 |
-
gr.update(visible=enabled and "crop")
|
| 913 |
-
),
|
| 914 |
inputs=[enable_header_footer_removal],
|
| 915 |
-
outputs=[crop_controls,
|
| 916 |
)
|
| 917 |
-
|
| 918 |
-
#
|
| 919 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 920 |
crop_input.change(
|
| 921 |
-
fn=
|
| 922 |
-
inputs=[
|
| 923 |
outputs=[crop_preview]
|
| 924 |
)
|
| 925 |
|
| 926 |
-
#
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
)
|
| 932 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
refresh_btn.click(
|
| 934 |
-
fn=
|
| 935 |
outputs=[service_status]
|
| 936 |
)
|
| 937 |
|
|
|
|
| 938 |
process_btn.click(
|
| 939 |
-
fn=
|
| 940 |
-
inputs=[pdf_input, method_choice, enable_header_footer_removal,
|
| 941 |
-
|
|
|
|
|
|
|
|
|
|
| 942 |
)
|
| 943 |
|
| 944 |
return interface
|
| 945 |
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
|
|
|
|
|
|
|
|
|
| 956 |
|
| 957 |
if __name__ == "__main__":
|
| 958 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import gradio as gr
|
| 3 |
import os
|
| 4 |
import tempfile
|
| 5 |
import logging
|
| 6 |
+
import json
|
| 7 |
from pathlib import Path
|
| 8 |
from datetime import datetime
|
| 9 |
import cv2
|
| 10 |
import numpy as np
|
| 11 |
from PIL import Image
|
| 12 |
import fitz # PyMuPDF
|
| 13 |
+
from typing import Dict, List, Tuple, Optional
|
| 14 |
|
| 15 |
# Load environment variables
|
| 16 |
from dotenv import load_dotenv
|
|
|
|
| 26 |
backend_manager = BackendManager()
|
| 27 |
|
| 28 |
# Check if python-docx is available
|
|
|
|
|
|
|
| 29 |
try:
|
| 30 |
from docx import Document
|
| 31 |
+
from docx.shared import Inches, Pt
|
| 32 |
+
from docx.enum.table import WD_TABLE_ALIGNMENT
|
| 33 |
HAS_DOCX_SUPPORT = True
|
| 34 |
logger.info("DOCX export available")
|
| 35 |
except ImportError:
|
| 36 |
HAS_DOCX_SUPPORT = False
|
| 37 |
logger.info("DOCX export not available - install python-docx to enable")
|
| 38 |
|
| 39 |
+
# Global variables for enhanced crop management
|
| 40 |
+
current_pdf_data = {
|
| 41 |
+
'path': None,
|
| 42 |
+
'page_count': 0,
|
| 43 |
+
'page_images': {},
|
| 44 |
+
'crop_settings': {},
|
| 45 |
+
'default_crop_all': True
|
| 46 |
}
|
| 47 |
|
| 48 |
+
class PDFPageManager:
|
| 49 |
+
"""Manages PDF page previews and crop settings with enhanced resolution - FIXED VERSION"""
|
| 50 |
+
|
| 51 |
+
def __init__(self):
|
| 52 |
+
self.pdf_doc = None
|
| 53 |
+
self.page_images = {}
|
| 54 |
+
self.crop_settings = {}
|
| 55 |
+
self.current_page = 0
|
| 56 |
+
self.high_res_scale = 2.0 # Reduced from 3.0 for better performance
|
| 57 |
+
|
| 58 |
+
def load_pdf(self, pdf_path: str) -> Dict:
|
| 59 |
+
"""Load PDF and generate high-resolution page previews - FIXED"""
|
| 60 |
+
try:
|
| 61 |
+
if self.pdf_doc:
|
| 62 |
+
self.pdf_doc.close()
|
| 63 |
+
|
| 64 |
+
self.pdf_doc = fitz.open(pdf_path)
|
| 65 |
+
page_count = len(self.pdf_doc)
|
| 66 |
+
|
| 67 |
+
# Generate high-resolution previews for all pages
|
| 68 |
+
self.page_images = {}
|
| 69 |
+
for page_num in range(page_count):
|
| 70 |
+
self.page_images[page_num] = self._generate_high_res_preview(page_num)
|
| 71 |
+
|
| 72 |
+
# Initialize default crop settings for all pages
|
| 73 |
+
self.crop_settings = {
|
| 74 |
+
i: {'top': 0, 'bottom': 0, 'left': 0, 'right': 0, 'custom': False}
|
| 75 |
+
for i in range(page_count)
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
logger.info(f"PDF loaded successfully: {page_count} pages")
|
| 79 |
+
|
| 80 |
+
return {
|
| 81 |
+
'success': True,
|
| 82 |
+
'page_count': page_count,
|
| 83 |
+
'pages': list(range(page_count))
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
except Exception as e:
|
| 87 |
+
logger.error(f"Error loading PDF: {e}")
|
| 88 |
+
return {'success': False, 'error': str(e)}
|
| 89 |
+
|
| 90 |
+
def _generate_high_res_preview(self, page_num: int) -> np.ndarray:
|
| 91 |
+
"""Generate high-resolution preview for better crop visualization - FIXED"""
|
| 92 |
+
try:
|
| 93 |
+
if not self.pdf_doc:
|
| 94 |
+
return None
|
| 95 |
+
|
| 96 |
+
page = self.pdf_doc.load_page(page_num)
|
| 97 |
+
|
| 98 |
+
# Use high resolution matrix for better quality
|
| 99 |
+
mat = fitz.Matrix(self.high_res_scale, self.high_res_scale)
|
| 100 |
+
pix = page.get_pixmap(matrix=mat)
|
| 101 |
+
img_data = pix.tobytes("png")
|
| 102 |
+
|
| 103 |
+
# Convert to PIL Image and then to numpy array
|
| 104 |
+
import io
|
| 105 |
+
pil_image = Image.open(io.BytesIO(img_data))
|
| 106 |
+
img_array = np.array(pil_image)
|
| 107 |
+
|
| 108 |
+
# Convert RGBA to RGB if needed
|
| 109 |
+
if len(img_array.shape) == 3 and img_array.shape[2] == 4:
|
| 110 |
+
img_array = img_array[:, :, :3]
|
| 111 |
+
|
| 112 |
+
return img_array
|
| 113 |
+
|
| 114 |
+
except Exception as e:
|
| 115 |
+
logger.error(f"Error generating preview for page {page_num}: {e}")
|
| 116 |
+
return None
|
| 117 |
+
|
| 118 |
+
def update_crop_visualization(self, page_num: int, crop_coords: Dict) -> np.ndarray:
|
| 119 |
+
"""Update crop visualization with enhanced preview - FIXED"""
|
| 120 |
+
if page_num not in self.page_images or self.page_images[page_num] is None:
|
| 121 |
+
logger.warning(f"No image available for page {page_num}")
|
| 122 |
+
return None
|
| 123 |
+
|
| 124 |
+
try:
|
| 125 |
+
img_array = self.page_images[page_num].copy()
|
| 126 |
+
height, width = img_array.shape[:2]
|
| 127 |
+
|
| 128 |
+
# Convert coordinates from percentages to pixels
|
| 129 |
+
x1 = int(crop_coords.get('left', 0) * width / 100)
|
| 130 |
+
y1 = int(crop_coords.get('top', 0) * height / 100)
|
| 131 |
+
x2 = width - int(crop_coords.get('right', 0) * width / 100)
|
| 132 |
+
y2 = height - int(crop_coords.get('bottom', 0) * height / 100)
|
| 133 |
+
|
| 134 |
+
# Ensure coordinates are valid
|
| 135 |
+
x1 = max(0, min(x1, width))
|
| 136 |
+
x2 = max(0, min(x2, width))
|
| 137 |
+
y1 = max(0, min(y1, height))
|
| 138 |
+
y2 = max(0, min(y2, height))
|
| 139 |
+
|
| 140 |
+
# Create overlay
|
| 141 |
+
overlay = img_array.copy()
|
| 142 |
+
|
| 143 |
+
# Draw crop areas in semi-transparent red (areas to be removed)
|
| 144 |
+
alpha = 0.3
|
| 145 |
+
if crop_coords.get('top', 0) > 0 and y1 > 0:
|
| 146 |
+
cv2.rectangle(overlay, (0, 0), (width, y1), (255, 0, 0), -1)
|
| 147 |
+
if crop_coords.get('bottom', 0) > 0 and y2 < height:
|
| 148 |
+
cv2.rectangle(overlay, (0, y2), (width, height), (255, 0, 0), -1)
|
| 149 |
+
if crop_coords.get('left', 0) > 0 and x1 > 0:
|
| 150 |
+
cv2.rectangle(overlay, (0, 0), (x1, height), (255, 0, 0), -1)
|
| 151 |
+
if crop_coords.get('right', 0) > 0 and x2 < width:
|
| 152 |
+
cv2.rectangle(overlay, (x2, 0), (width, height), (255, 0, 0), -1)
|
| 153 |
+
|
| 154 |
+
# Draw content area outline in green
|
| 155 |
+
if x2 > x1 and y2 > y1:
|
| 156 |
+
thickness = max(2, int(self.high_res_scale * 2))
|
| 157 |
+
cv2.rectangle(overlay, (x1, y1), (x2, y2), (0, 255, 0), thickness)
|
| 158 |
+
|
| 159 |
+
# Blend overlay with original
|
| 160 |
+
result = cv2.addWeighted(img_array, 1-alpha, overlay, alpha, 0)
|
| 161 |
+
|
| 162 |
+
# Add informative text with better scaling
|
| 163 |
+
font_scale = max(0.8, self.high_res_scale / 3)
|
| 164 |
+
thickness = max(1, int(self.high_res_scale))
|
| 165 |
+
text_color = (255, 255, 255)
|
| 166 |
+
background_color = (0, 0, 0)
|
| 167 |
+
|
| 168 |
+
# Add text with background for better visibility
|
| 169 |
+
texts = [
|
| 170 |
+
f"Page {page_num + 1}",
|
| 171 |
+
"RED: Remove areas",
|
| 172 |
+
"GREEN: Content area",
|
| 173 |
+
f"Crop: T{crop_coords.get('top', 0):.1f}% B{crop_coords.get('bottom', 0):.1f}% L{crop_coords.get('left', 0):.1f}% R{crop_coords.get('right', 0):.1f}%"
|
| 174 |
+
]
|
| 175 |
+
|
| 176 |
+
y_offset = 30
|
| 177 |
+
for i, text in enumerate(texts):
|
| 178 |
+
y_pos = y_offset + (i * 30)
|
| 179 |
+
# Add background rectangle for text
|
| 180 |
+
(text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
|
| 181 |
+
cv2.rectangle(result, (10, y_pos - text_height - 5), (text_width + 20, y_pos + 5), background_color, -1)
|
| 182 |
+
cv2.putText(result, text, (15, y_pos), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, thickness)
|
| 183 |
+
|
| 184 |
+
return result
|
| 185 |
+
|
| 186 |
+
except Exception as e:
|
| 187 |
+
logger.error(f"Error updating crop visualization: {e}")
|
| 188 |
+
return self.page_images[page_num] if page_num in self.page_images else None
|
| 189 |
+
|
| 190 |
+
def set_crop_for_page(self, page_num: int, crop_coords: Dict):
|
| 191 |
+
"""Set crop coordinates for specific page - FIXED"""
|
| 192 |
+
if page_num in self.crop_settings:
|
| 193 |
+
self.crop_settings[page_num].update(crop_coords)
|
| 194 |
+
self.crop_settings[page_num]['custom'] = True
|
| 195 |
+
logger.info(f"Set crop for page {page_num}: {crop_coords}")
|
| 196 |
+
|
| 197 |
+
def set_crop_for_all_pages(self, crop_coords: Dict):
|
| 198 |
+
"""Apply same crop settings to all pages - FIXED"""
|
| 199 |
+
for page_num in self.crop_settings:
|
| 200 |
+
if not self.crop_settings[page_num].get('custom', False):
|
| 201 |
+
self.crop_settings[page_num].update(crop_coords)
|
| 202 |
+
logger.info(f"Applied crop to all non-custom pages: {crop_coords}")
|
| 203 |
+
|
| 204 |
+
def get_crop_settings_for_processing(self) -> Dict:
|
| 205 |
+
"""Get crop settings in format expected by backend - FIXED"""
|
| 206 |
+
return {
|
| 207 |
+
'per_page_crops': self.crop_settings,
|
| 208 |
+
'has_custom_crops': any(page.get('custom', False) for page in self.crop_settings.values()),
|
| 209 |
+
'enhanced_resolution': True,
|
| 210 |
+
'resolution_scale': self.high_res_scale
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
def close(self):
|
| 214 |
+
"""Clean up resources"""
|
| 215 |
+
if self.pdf_doc:
|
| 216 |
+
self.pdf_doc.close()
|
| 217 |
+
self.pdf_doc = None
|
| 218 |
+
self.page_images.clear()
|
| 219 |
+
self.crop_settings.clear()
|
| 220 |
+
|
| 221 |
+
# Global page manager instance
|
| 222 |
+
pdf_manager = PDFPageManager()
|
| 223 |
|
| 224 |
+
def load_pdf_for_preview(pdf_file):
|
| 225 |
+
"""Load PDF and return page thumbnails for selection - FIXED"""
|
| 226 |
if pdf_file is None:
|
| 227 |
+
return None, gr.update(choices=[], value=None), gr.update(visible=False), "No PDF loaded"
|
| 228 |
|
| 229 |
try:
|
| 230 |
+
result = pdf_manager.load_pdf(pdf_file.name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
+
if result['success']:
|
| 233 |
+
# Create page choices for dropdown
|
| 234 |
+
page_choices = [f"Page {i+1}" for i in range(result['page_count'])]
|
| 235 |
+
|
| 236 |
+
# Get first page preview with default crop
|
| 237 |
+
first_page_preview = pdf_manager.update_crop_visualization(0, {
|
| 238 |
+
'top': 0, 'bottom': 0, 'left': 0, 'right': 0
|
| 239 |
+
}) if 0 in pdf_manager.page_images else None
|
| 240 |
+
|
| 241 |
+
status = f"PDF loaded successfully: {result['page_count']} pages"
|
| 242 |
+
|
| 243 |
+
return (first_page_preview,
|
| 244 |
+
gr.update(choices=page_choices, value=page_choices[0] if page_choices else None, visible=True),
|
| 245 |
+
gr.update(visible=True),
|
| 246 |
+
status)
|
| 247 |
+
else:
|
| 248 |
+
return None, gr.update(choices=[], value=None, visible=False), gr.update(visible=False), f"Error: {result['error']}"
|
| 249 |
+
|
| 250 |
+
except Exception as e:
|
| 251 |
+
logger.error(f"Error in load_pdf_for_preview: {e}")
|
| 252 |
+
return None, gr.update(choices=[], value=None, visible=False), gr.update(visible=False), f"Error loading PDF: {str(e)}"
|
| 253 |
+
|
| 254 |
+
def change_preview_page(page_selection, crop_top, crop_bottom, crop_left, crop_right):
|
| 255 |
+
"""Change preview to selected page with current crop settings - FIXED"""
|
| 256 |
+
if not page_selection:
|
| 257 |
+
return None
|
| 258 |
+
|
| 259 |
+
try:
|
| 260 |
+
page_num = int(page_selection.split()[1]) - 1 # Extract page number
|
| 261 |
+
|
| 262 |
+
# Get current crop settings for this page
|
| 263 |
+
crop_coords = {
|
| 264 |
+
'top': crop_top,
|
| 265 |
+
'bottom': crop_bottom,
|
| 266 |
+
'left': crop_left,
|
| 267 |
+
'right': crop_right
|
| 268 |
+
}
|
| 269 |
|
| 270 |
+
# Update visualization
|
| 271 |
+
preview_image = pdf_manager.update_crop_visualization(page_num, crop_coords)
|
| 272 |
+
return preview_image
|
| 273 |
|
|
|
|
| 274 |
except Exception as e:
|
| 275 |
+
logger.error(f"Error changing preview page: {e}")
|
| 276 |
return None
|
| 277 |
|
| 278 |
+
def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_left, crop_right, apply_to_all):
|
| 279 |
+
"""Update crop preview with interactive feedback - FIXED"""
|
| 280 |
+
if not page_selection or not pdf_manager.pdf_doc:
|
|
|
|
| 281 |
return None
|
| 282 |
|
| 283 |
try:
|
| 284 |
+
page_num = int(page_selection.split()[1]) - 1
|
| 285 |
+
|
| 286 |
+
crop_coords = {
|
| 287 |
+
'top': crop_top,
|
| 288 |
+
'bottom': crop_bottom,
|
| 289 |
+
'left': crop_left,
|
| 290 |
+
'right': crop_right
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
# Apply to current page or all pages based on setting
|
| 294 |
+
if apply_to_all:
|
| 295 |
+
pdf_manager.set_crop_for_all_pages(crop_coords)
|
| 296 |
+
else:
|
| 297 |
+
pdf_manager.set_crop_for_page(page_num, crop_coords)
|
| 298 |
|
| 299 |
+
# Return updated preview
|
| 300 |
+
return pdf_manager.update_crop_visualization(page_num, crop_coords)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
except Exception as e:
|
| 303 |
logger.error(f"Error updating crop preview: {e}")
|
| 304 |
return None
|
| 305 |
|
| 306 |
+
def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer_removal,
|
| 307 |
+
crop_top, crop_bottom, crop_left, crop_right,
|
| 308 |
+
apply_to_all_pages, current_page_selection,
|
| 309 |
+
progress=gr.Progress()):
|
| 310 |
+
"""Process PDF with HTML enhancement and improved table handling - FIXED"""
|
|
|
|
| 311 |
if pdf_file is None:
|
| 312 |
+
return "No file uploaded.", "", "", "Error: No file selected"
|
| 313 |
|
|
|
|
| 314 |
try:
|
| 315 |
+
progress(0.1, desc="Initializing HTML-enhanced processing...")
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
+
# Prepare enhanced preprocessing options
|
| 318 |
preprocessing_options = {
|
| 319 |
'enable_header_footer_removal': enable_header_footer_removal,
|
| 320 |
+
'enhanced_crop_processing': True,
|
| 321 |
+
'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
}
|
| 323 |
|
| 324 |
+
progress(0.3, desc="Processing with HTML enhancement...")
|
| 325 |
|
| 326 |
+
# Process the PDF with enhanced preprocessing
|
| 327 |
+
result = backend_manager.process_pdf_with_enhanced_resolution(
|
| 328 |
+
pdf_file.name, ocr_method, preprocessing_options
|
| 329 |
+
)
|
| 330 |
|
| 331 |
+
progress(0.9, desc="Finalizing HTML processing...")
|
| 332 |
progress(1.0, desc="Complete!")
|
| 333 |
|
| 334 |
if result['success']:
|
| 335 |
+
metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
|
| 336 |
+
status = f"Success: Processed using {result['method_used']} with HTML enhancement"
|
| 337 |
+
|
| 338 |
+
# Return text, HTML, metadata, and status
|
| 339 |
+
return (result['text'],
|
| 340 |
+
result.get('html', ''),
|
| 341 |
+
metadata_info,
|
| 342 |
+
status)
|
| 343 |
else:
|
| 344 |
error_msg = result.get('error', 'Unknown error occurred')
|
| 345 |
+
return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
|
| 346 |
|
| 347 |
except Exception as e:
|
| 348 |
+
logger.error(f"HTML-enhanced processing error: {e}")
|
| 349 |
+
return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
|
|
|
|
| 350 |
|
| 351 |
+
def format_enhanced_metadata(metadata, method_used):
|
| 352 |
+
"""Enhanced metadata formatting with HTML processing info"""
|
| 353 |
if not metadata:
|
| 354 |
return f"Method used: {method_used}"
|
| 355 |
|
|
|
|
| 358 |
if 'pages' in metadata:
|
| 359 |
info_lines.append(f"Pages processed: {metadata['pages']}")
|
| 360 |
|
| 361 |
+
if metadata.get('enhanced_processing', False):
|
| 362 |
+
info_lines.append("Enhanced processing: Enabled")
|
| 363 |
+
|
| 364 |
+
if metadata.get('html_processing', False):
|
| 365 |
+
info_lines.append("HTML generation: Enabled")
|
| 366 |
|
| 367 |
+
if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
|
| 368 |
+
info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
|
|
|
|
| 369 |
|
| 370 |
+
if 'custom_crops_applied' in metadata:
|
| 371 |
+
info_lines.append(f"Custom crops per page: {metadata['custom_crops_applied']}")
|
| 372 |
+
|
| 373 |
+
if 'tables' in metadata:
|
| 374 |
+
info_lines.append(f"Tables detected: {metadata['tables']}")
|
| 375 |
|
| 376 |
if 'processing_time_seconds' in metadata:
|
| 377 |
info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
|
| 378 |
|
| 379 |
return "\n".join(info_lines)
|
| 380 |
|
| 381 |
+
def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
|
| 382 |
+
crop_top, crop_bottom, crop_left, crop_right,
|
| 383 |
+
apply_to_all_pages, current_page_selection):
|
| 384 |
+
"""Prepare enhanced downloads with HTML processing"""
|
| 385 |
+
text, html, metadata, status = process_pdf_with_html_enhancement(
|
| 386 |
+
pdf_file, method, enable_header_footer_removal,
|
| 387 |
+
crop_top, crop_bottom, crop_left, crop_right,
|
| 388 |
+
apply_to_all_pages, current_page_selection
|
|
|
|
| 389 |
)
|
| 390 |
|
| 391 |
+
# Prepare downloads if processing was successful
|
| 392 |
+
if text and not text.startswith("Error:") and not text.startswith("No file"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
try:
|
| 394 |
+
# Create enhanced download files
|
| 395 |
+
download_files = backend_manager.create_enhanced_downloads(text, html, metadata)
|
| 396 |
+
|
| 397 |
+
# Prepare gradio updates for download buttons
|
| 398 |
+
updates = [
|
| 399 |
+
text, metadata, status, # Display outputs
|
| 400 |
+
gr.update(visible=True, value=download_files.get('txt')) if 'txt' in download_files else gr.update(visible=False),
|
| 401 |
+
gr.update(visible=True, value=download_files.get('docx')) if 'docx' in download_files else gr.update(visible=False),
|
| 402 |
+
gr.update(visible=True, value=download_files.get('html')) if 'html' in download_files else gr.update(visible=False)
|
| 403 |
+
]
|
| 404 |
+
|
| 405 |
+
return tuple(updates)
|
| 406 |
+
|
| 407 |
+
except Exception as file_error:
|
| 408 |
+
logger.error(f"Enhanced file creation error: {file_error}")
|
| 409 |
+
return (text, metadata, status,
|
| 410 |
+
gr.update(visible=False),
|
| 411 |
+
gr.update(visible=False),
|
| 412 |
+
gr.update(visible=False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
else:
|
| 414 |
+
return (text, metadata, status,
|
| 415 |
+
gr.update(visible=False),
|
| 416 |
+
gr.update(visible=False),
|
| 417 |
+
gr.update(visible=False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
|
| 419 |
+
def get_enhanced_method_info(method):
|
| 420 |
+
"""Get information about selected OCR method with HTML processing"""
|
| 421 |
method_descriptions = {
|
| 422 |
+
"auto": "**Auto Selection**: Automatically chooses the best available method with HTML processing and enhanced table handling.",
|
| 423 |
+
"azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with HTML generation, layout preservation, and smart table detection.",
|
| 424 |
+
"tesseract": "**Tesseract OCR**: Open-source OCR with HTML output, enhanced image preprocessing, and resolution scaling.",
|
| 425 |
+
"pymupdf": "**PyMuPDF**: Fast extraction enhanced with HTML processing and improved formatting preservation."
|
| 426 |
}
|
| 427 |
|
| 428 |
return method_descriptions.get(method, "Select a method to see details.")
|
| 429 |
|
| 430 |
+
def check_enhanced_service_status():
|
| 431 |
+
"""Check and display enhanced service status"""
|
|
|
|
| 432 |
available_methods = backend_manager.get_available_methods()
|
| 433 |
|
| 434 |
+
status_lines = ["**Available OCR Methods (Enhanced with HTML Processing):**"]
|
| 435 |
|
| 436 |
if "azure" in available_methods:
|
| 437 |
+
status_lines.append("✓ Azure Document Intelligence - Ready (HTML + Tables)")
|
| 438 |
else:
|
| 439 |
+
status_lines.append("✗ Azure Document Intelligence - Not configured")
|
| 440 |
|
| 441 |
if "tesseract" in available_methods:
|
| 442 |
+
status_lines.append("✓ Tesseract OCR - Ready (HTML Enhanced)")
|
| 443 |
else:
|
| 444 |
+
status_lines.append("✗ Tesseract OCR - Not available")
|
| 445 |
|
| 446 |
if "pymupdf" in available_methods:
|
| 447 |
+
status_lines.append("✓ PyMuPDF - Ready (HTML Enhanced)")
|
| 448 |
else:
|
| 449 |
+
status_lines.append("✗ PyMuPDF - Not available")
|
| 450 |
+
|
| 451 |
+
# Add enhanced features status
|
| 452 |
+
status_lines.append("✓ HTML Processing - Available")
|
| 453 |
+
status_lines.append("✓ Enhanced Table Handling - Available")
|
| 454 |
+
status_lines.append("✓ Smart Text Preservation - Available")
|
| 455 |
+
status_lines.append("✓ Multi-Page Crop Preview - Available")
|
| 456 |
+
status_lines.append("✓ Per-Page Crop Customization - Available")
|
| 457 |
|
|
|
|
| 458 |
if HAS_DOCX_SUPPORT:
|
| 459 |
+
status_lines.append("✓ Enhanced DOCX Export - Available")
|
| 460 |
else:
|
| 461 |
+
status_lines.append("✗ Enhanced DOCX Export - Install python-docx to enable")
|
| 462 |
|
| 463 |
+
status_lines.append("✓ HTML File Export - Available")
|
| 464 |
+
status_lines.append("✓ Enhanced Text Export - Available")
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
+
return "\n".join(status_lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
+
def create_enhanced_interface():
    """Create enhanced Gradio interface with improved layout and HTML processing.

    Builds the full gr.Blocks app: instructions, configuration panel (upload,
    OCR method, crop controls), results/downloads area, service status, and
    all event wiring. Returns the Blocks instance (not yet launched).

    NOTE: component creation order defines the on-screen layout — do not
    reorder the `with` blocks below.
    """
    with gr.Blocks(
        title="PDF OCR Service - Enhanced with HTML Processing",
        theme=gr.themes.Soft(),
        css="""
        .main-header { text-align: center; margin-bottom: 2rem; }
        .config-panel { border: 2px solid #007bff; padding: 1.5rem; border-radius: 0.8rem; background-color: #f8f9fa; margin-bottom: 1rem; }
        .instructions-panel { border: 2px solid #28a745; padding: 1.5rem; border-radius: 0.8rem; background-color: #f0fff0; margin-bottom: 1rem; }
        .crop-controls { border: 2px solid #ffc107; padding: 1rem; border-radius: 0.5rem; background-color: #fffef7; }
        .page-preview { border: 2px solid #17a2b8; padding: 1rem; border-radius: 0.5rem; background-color: #f0f8ff; }
        .results-panel { border: 2px solid #6f42c1; padding: 1rem; border-radius: 0.5rem; background-color: #f8f5ff; }
        .status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
        """
    ) as interface:

        gr.HTML("""
        <div class="main-header">
            <h1>PDF OCR Service - Enhanced with HTML Processing</h1>
            <p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, and format preservation</p>
        </div>
        """)

        # Instructions at the top
        with gr.Group(elem_classes=["instructions-panel"]):
            gr.HTML("<h3>Instructions & Features</h3>")
            gr.HTML("""
            <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
            <h4>How to Use:</h4>
            <ol>
            <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
            <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
            <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
            <li><strong>Process:</strong> Click the process button to extract text with HTML enhancement</li>
            <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format</li>
            </ol>

            <h4>Enhanced Features:</h4>
            <ul>
            <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
            <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
            <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads</li>
            <li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
            <li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
            <li><strong>Page Numbers:</strong> Automatic page numbering in extracted content</li>
            <li><strong>Proper Indentation:</strong> Preserved spacing and formatting</li>
            </ul>
            </div>
            """)

        # Configuration Panel - Top Left
        with gr.Group(elem_classes=["config-panel"]):
            gr.HTML("<h3>Configuration Panel</h3>")

            with gr.Row():
                with gr.Column(scale=1):
                    # File upload
                    pdf_input = gr.File(
                        label="Upload PDF File",
                        file_types=[".pdf"],
                        file_count="single"
                    )

                    # PDF loading status
                    pdf_load_status = gr.Textbox(
                        label="PDF Status",
                        interactive=False,
                        lines=1,
                        value="No PDF loaded"
                    )

                with gr.Column(scale=1):
                    # OCR method selection
                    method_choice = gr.Dropdown(
                        choices=["auto", "azure", "tesseract", "pymupdf"],
                        value="auto",
                        label="OCR Method",
                        info="Choose OCR method (all enhanced with HTML processing)"
                    )

                    # Method information display
                    method_info = gr.Markdown(
                        value=get_enhanced_method_info("auto"),
                        elem_classes=["method-info"]
                    )

            # Enhanced Header/Footer Removal Section
            with gr.Group(elem_classes=["crop-controls"]):
                gr.HTML("<h4>Header/Footer Removal & Crop Settings</h4>")

                enable_header_footer_removal = gr.Checkbox(
                    label="Enable Enhanced Header/Footer Removal",
                    value=False,
                    info="Remove headers and footers with high-resolution processing"
                )

                # Multi-page controls (hidden until the checkbox above is enabled)
                with gr.Group(visible=False) as crop_controls:
                    gr.HTML("<h5>Multi-Page Crop Control</h5>")

                    with gr.Row():
                        # Page selection
                        page_selector = gr.Dropdown(
                            label="Select Page for Preview",
                            choices=[],
                            value=None,
                            info="Choose page to preview and customize crop settings",
                            visible=False
                        )

                        # Apply to all pages toggle
                        apply_to_all_pages = gr.Checkbox(
                            label="Apply crop settings to all pages",
                            value=True,
                            info="When enabled, changes apply to all pages"
                        )

                    gr.HTML("<h5>Crop Areas (% of page)</h5>")

                    with gr.Row():
                        crop_top = gr.Slider(
                            minimum=0,
                            maximum=40,
                            value=8,
                            step=0.5,
                            label="Top Crop %"
                        )

                        crop_bottom = gr.Slider(
                            minimum=0,
                            maximum=40,
                            value=8,
                            step=0.5,
                            label="Bottom Crop %"
                        )

                    with gr.Row():
                        crop_left = gr.Slider(
                            minimum=0,
                            maximum=30,
                            value=3,
                            step=0.5,
                            label="Left Crop %"
                        )

                        crop_right = gr.Slider(
                            minimum=0,
                            maximum=30,
                            value=3,
                            step=0.5,
                            label="Right Crop %"
                        )

                    # Quick preset buttons
                    with gr.Row():
                        preset_light = gr.Button("Light Crop (5%)", size="sm")
                        preset_medium = gr.Button("Medium Crop (10%)", size="sm")
                        preset_heavy = gr.Button("Heavy Crop (15%)", size="sm")
                        preset_reset = gr.Button("Reset", size="sm")

            # Process button
            process_btn = gr.Button(
                "Process PDF with HTML Enhancement",
                variant="primary",
                size="lg"
            )

        # Results and Preview Section
        with gr.Row():
            with gr.Column(scale=1):
                # Enhanced crop preview with multi-page support
                with gr.Group(visible=False, elem_classes=["page-preview"]) as preview_group:
                    gr.HTML("<h4>Page Preview with Crop Visualization</h4>")
                    crop_preview = gr.Image(
                        label="High-Resolution Page Preview",
                        interactive=False,
                        height=500,
                        show_label=False
                    )

                    gr.HTML("""
                    <p style="font-size: 0.9em; color: #666; text-align: center;">
                    <strong>Red areas:</strong> Will be removed | <strong>Green outline:</strong> Content area |
                    <strong>Enhanced:</strong> 2x resolution processing
                    </p>
                    """)

            with gr.Column(scale=2):
                with gr.Group(elem_classes=["results-panel"]):
                    gr.HTML("<h3>Results & Downloads</h3>")

                    # Processing status
                    processing_status = gr.Textbox(
                        label="Processing Status",
                        interactive=False,
                        lines=1
                    )

                    # Extracted text output
                    text_output = gr.Textbox(
                        label="Extracted Text (Enhanced with Proper Formatting and Page Numbers)",
                        placeholder="Processed text with HTML enhancement and preserved formatting will appear here...",
                        lines=20,
                        max_lines=30,
                        interactive=False,
                        show_copy_button=True
                    )

                    # Metadata information
                    metadata_output = gr.Textbox(
                        label="Processing Information",
                        interactive=False,
                        lines=4
                    )

                    # Enhanced download buttons (revealed once processing succeeds)
                    with gr.Row():
                        download_txt_btn = gr.DownloadButton(
                            "Download Enhanced TXT",
                            visible=False,
                            variant="secondary"
                        )
                        download_docx_btn = gr.DownloadButton(
                            "Download Enhanced DOCX",
                            visible=False,
                            variant="secondary"
                        )
                        download_html_btn = gr.DownloadButton(
                            "Download HTML File",
                            visible=False,
                            variant="secondary"
                        )

        # Service Status at the bottom
        with gr.Group(elem_classes=["status-box"]):
            gr.HTML("<h4>Service Status</h4>")
            service_status = gr.Markdown(
                value=check_enhanced_service_status()
            )

            # Refresh status button
            refresh_btn = gr.Button("Refresh Status", size="sm")

        # Event handlers with enhanced functionality

        # PDF upload handler
        pdf_input.change(
            fn=load_pdf_for_preview,
            inputs=[pdf_input],
            outputs=[crop_preview, page_selector, crop_controls, pdf_load_status]
        )

        # Method info handler
        method_choice.change(
            fn=get_enhanced_method_info,
            inputs=[method_choice],
            outputs=[method_info]
        )

        # Header/footer removal handler: toggles both the crop controls and preview
        enable_header_footer_removal.change(
            fn=lambda enabled: [
                gr.update(visible=enabled),
                gr.update(visible=enabled)
            ],
            inputs=[enable_header_footer_removal],
            outputs=[crop_controls, preview_group]
        )

        # Page selection handler
        page_selector.change(
            fn=change_preview_page,
            inputs=[page_selector, crop_top, crop_bottom, crop_left, crop_right],
            outputs=[crop_preview]
        )

        # Crop parameter handlers - update preview in real-time
        for crop_input in [crop_top, crop_bottom, crop_left, crop_right, apply_to_all_pages]:
            crop_input.change(
                fn=update_crop_preview_interactive,
                inputs=[page_selector, crop_top, crop_bottom, crop_left, crop_right, apply_to_all_pages],
                outputs=[crop_preview]
            )

        # Preset button handlers
        def apply_preset(top, bottom, left, right):
            # Identity mapping: returns one value per crop slider, in output order.
            return top, bottom, left, right

        preset_light.click(
            fn=lambda: apply_preset(5, 5, 2, 2),
            outputs=[crop_top, crop_bottom, crop_left, crop_right]
        )

        preset_medium.click(
            fn=lambda: apply_preset(10, 10, 5, 5),
            outputs=[crop_top, crop_bottom, crop_left, crop_right]
        )

        preset_heavy.click(
            fn=lambda: apply_preset(15, 15, 8, 8),
            outputs=[crop_top, crop_bottom, crop_left, crop_right]
        )

        preset_reset.click(
            fn=lambda: apply_preset(0, 0, 0, 0),
            outputs=[crop_top, crop_bottom, crop_left, crop_right]
        )

        # Status refresh handler
        refresh_btn.click(
            fn=check_enhanced_service_status,
            outputs=[service_status]
        )

        # Main processing handler with enhanced downloads
        process_btn.click(
            fn=prepare_enhanced_downloads,
            inputs=[pdf_input, method_choice, enable_header_footer_removal,
                    crop_top, crop_bottom, crop_left, crop_right,
                    apply_to_all_pages, page_selector],
            outputs=[text_output, metadata_output, processing_status,
                     download_txt_btn, download_docx_btn, download_html_btn]
        )

    return interface
|
| 794 |
|
| 795 |
+
def launch_enhanced_ui():
    """Build and serve the enhanced Gradio UI, releasing PDF resources on exit.

    Blocks until the server stops; `pdf_manager` is always closed afterwards,
    even when construction or launch raises.
    """
    try:
        app = create_enhanced_interface()
        launch_kwargs = {
            "server_name": "0.0.0.0",
            "server_port": 7860,
            "share": False,
            "show_error": True,
        }
        app.launch(**launch_kwargs)
    finally:
        # Clean up cached PDF handles no matter how the launch ends.
        pdf_manager.close()
|
| 808 |
|
| 809 |
if __name__ == "__main__":
    # Script entry point: start the Gradio server on 0.0.0.0:7860.
    launch_enhanced_ui()
|
backend.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Backend Management Module -
|
| 3 |
Coordinates between UI and OCR services, handles file management and preprocessing
|
| 4 |
"""
|
| 5 |
import re
|
|
@@ -26,8 +26,385 @@ logging.basicConfig(level=logging.INFO)
|
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
class BackendManager:
|
| 30 |
-
"""
|
| 31 |
|
| 32 |
def __init__(self):
|
| 33 |
self.ocr_service = OCRService()
|
|
@@ -38,12 +415,12 @@ class BackendManager:
|
|
| 38 |
self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
|
| 39 |
self.temp_dir.mkdir(exist_ok=True)
|
| 40 |
|
| 41 |
-
logger.info("Enhanced backend manager initialized successfully")
|
| 42 |
|
| 43 |
-
def
|
| 44 |
-
|
| 45 |
"""
|
| 46 |
-
Process PDF
|
| 47 |
|
| 48 |
Args:
|
| 49 |
pdf_path: Path to the PDF file
|
|
@@ -51,7 +428,7 @@ class BackendManager:
|
|
| 51 |
preprocessing_options: Dictionary containing preprocessing settings
|
| 52 |
|
| 53 |
Returns:
|
| 54 |
-
Dict containing processing results
|
| 55 |
"""
|
| 56 |
start_time = datetime.now()
|
| 57 |
|
|
@@ -61,11 +438,12 @@ class BackendManager:
|
|
| 61 |
'success': False,
|
| 62 |
'error': f"File not found: {pdf_path}",
|
| 63 |
'text': '',
|
|
|
|
| 64 |
'method_used': '',
|
| 65 |
'metadata': {}
|
| 66 |
}
|
| 67 |
|
| 68 |
-
# Check file size
|
| 69 |
max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
|
| 70 |
file_size = os.path.getsize(pdf_path)
|
| 71 |
|
|
@@ -74,14 +452,15 @@ class BackendManager:
|
|
| 74 |
'success': False,
|
| 75 |
'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB",
|
| 76 |
'text': '',
|
|
|
|
| 77 |
'method_used': '',
|
| 78 |
'metadata': {}
|
| 79 |
}
|
| 80 |
|
| 81 |
-
# Generate file hash for
|
| 82 |
file_hash = self._calculate_file_hash(pdf_path)
|
| 83 |
|
| 84 |
-
logger.info(f"Processing PDF: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
|
| 85 |
logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
|
| 86 |
|
| 87 |
# Handle preprocessing if enabled
|
|
@@ -89,18 +468,17 @@ class BackendManager:
|
|
| 89 |
preprocessing_applied = False
|
| 90 |
|
| 91 |
if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False):
|
| 92 |
-
logger.info("Applying
|
| 93 |
try:
|
| 94 |
-
processed_pdf_path = self.
|
| 95 |
preprocessing_applied = True
|
| 96 |
-
logger.info("
|
| 97 |
except Exception as e:
|
| 98 |
logger.error(f"Preprocessing failed: {e}")
|
| 99 |
-
# Continue with original file if preprocessing fails
|
| 100 |
processed_pdf_path = pdf_path
|
| 101 |
|
| 102 |
try:
|
| 103 |
-
# Process
|
| 104 |
result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
|
| 105 |
|
| 106 |
# Add processing metadata
|
|
@@ -111,15 +489,12 @@ class BackendManager:
|
|
| 111 |
'file_size_mb': round(file_size / (1024*1024), 2),
|
| 112 |
'processing_time_seconds': round(processing_time, 2),
|
| 113 |
'timestamp': start_time.isoformat(),
|
|
|
|
|
|
|
| 114 |
'header_footer_removed': preprocessing_applied,
|
| 115 |
'preprocessing_options': preprocessing_options if preprocessing_applied else None
|
| 116 |
})
|
| 117 |
|
| 118 |
-
# Post-process for better table handling if needed
|
| 119 |
-
if result['success'] and result['text']:
|
| 120 |
-
result['text'] = self._post_process_extracted_text(result['text'])
|
| 121 |
-
result['metadata']['post_processed'] = True
|
| 122 |
-
|
| 123 |
# Cleanup temporary preprocessed file
|
| 124 |
if preprocessing_applied and processed_pdf_path != pdf_path:
|
| 125 |
try:
|
|
@@ -130,14 +505,17 @@ class BackendManager:
|
|
| 130 |
# Log results
|
| 131 |
if result['success']:
|
| 132 |
text_length = len(result['text'])
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
| 135 |
logger.info(f"Method used: {result['method_used']}")
|
| 136 |
logger.info(f"Text extracted: {text_length} characters")
|
|
|
|
| 137 |
if table_count > 0:
|
| 138 |
logger.info(f"Tables detected: {table_count}")
|
| 139 |
if preprocessing_applied:
|
| 140 |
-
logger.info("
|
| 141 |
|
| 142 |
# Add to processing history
|
| 143 |
self._add_to_history({
|
|
@@ -148,10 +526,12 @@ class BackendManager:
|
|
| 148 |
'text_length': text_length,
|
| 149 |
'table_count': table_count,
|
| 150 |
'processing_time': processing_time,
|
| 151 |
-
'preprocessing_applied': preprocessing_applied
|
|
|
|
|
|
|
| 152 |
})
|
| 153 |
else:
|
| 154 |
-
logger.error(f"
|
| 155 |
|
| 156 |
# Add to processing history
|
| 157 |
self._add_to_history({
|
|
@@ -161,15 +541,16 @@ class BackendManager:
|
|
| 161 |
'success': False,
|
| 162 |
'error': result.get('error', 'Unknown error'),
|
| 163 |
'processing_time': processing_time,
|
| 164 |
-
'preprocessing_applied': preprocessing_applied
|
|
|
|
| 165 |
})
|
| 166 |
|
| 167 |
return result
|
| 168 |
|
| 169 |
except Exception as e:
|
| 170 |
-
logger.error(f"Unexpected error during processing: {e}")
|
| 171 |
|
| 172 |
-
# Cleanup
|
| 173 |
if preprocessing_applied and processed_pdf_path != pdf_path:
|
| 174 |
try:
|
| 175 |
os.unlink(processed_pdf_path)
|
|
@@ -184,57 +565,36 @@ class BackendManager:
|
|
| 184 |
'method_requested': method,
|
| 185 |
'success': False,
|
| 186 |
'error': str(e),
|
| 187 |
-
'processing_time': processing_time
|
|
|
|
| 188 |
})
|
| 189 |
|
| 190 |
return {
|
| 191 |
'success': False,
|
| 192 |
-
'error': f"
|
| 193 |
'text': '',
|
|
|
|
| 194 |
'method_used': '',
|
| 195 |
'metadata': {
|
| 196 |
'file_hash': file_hash,
|
| 197 |
'processing_time_seconds': round(processing_time, 2),
|
| 198 |
-
'timestamp': start_time.isoformat()
|
|
|
|
| 199 |
}
|
| 200 |
}
|
| 201 |
|
| 202 |
-
def
|
| 203 |
-
"""
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
options: Preprocessing options
|
| 209 |
-
|
| 210 |
-
Returns:
|
| 211 |
-
Path to preprocessed PDF file
|
| 212 |
-
"""
|
| 213 |
-
removal_method = options.get('removal_method', 'fixed')
|
| 214 |
|
| 215 |
# Create temporary file for processed PDF
|
| 216 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 217 |
-
temp_pdf_path = self.temp_dir / f"
|
| 218 |
-
|
| 219 |
-
if removal_method == 'fixed':
|
| 220 |
-
return self._apply_fixed_removal(pdf_path, str(temp_pdf_path), options)
|
| 221 |
-
elif removal_method == 'crop':
|
| 222 |
-
return self._apply_crop_removal(pdf_path, str(temp_pdf_path), options)
|
| 223 |
-
else:
|
| 224 |
-
raise ValueError(f"Unknown removal method: {removal_method}")
|
| 225 |
-
|
| 226 |
-
def _apply_crop_removal(self, input_pdf: str, output_pdf: str, options: Dict[str, Any]) -> str:
|
| 227 |
-
"""Apply percentage-based crop removal"""
|
| 228 |
-
crop_settings = options.get('crop_settings', {})
|
| 229 |
-
top_percent = crop_settings.get('top', 0)
|
| 230 |
-
bottom_percent = crop_settings.get('bottom', 0)
|
| 231 |
-
left_percent = crop_settings.get('left', 0)
|
| 232 |
-
right_percent = crop_settings.get('right', 0)
|
| 233 |
|
| 234 |
-
|
| 235 |
-
return input_pdf # No processing needed
|
| 236 |
-
|
| 237 |
-
doc = fitz.open(input_pdf)
|
| 238 |
new_doc = fitz.open()
|
| 239 |
|
| 240 |
try:
|
|
@@ -242,7 +602,17 @@ class BackendManager:
|
|
| 242 |
page = doc.load_page(page_num)
|
| 243 |
page_rect = page.rect
|
| 244 |
|
| 245 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
width = page_rect.width
|
| 247 |
height = page_rect.height
|
| 248 |
|
|
@@ -259,319 +629,109 @@ class BackendManager:
|
|
| 259 |
page_rect.y1 - crop_bottom
|
| 260 |
)
|
| 261 |
|
| 262 |
-
#
|
| 263 |
-
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
#
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
-
new_doc.save(
|
| 274 |
-
logger.info(f"
|
| 275 |
|
|
|
|
|
|
|
|
|
|
| 276 |
finally:
|
| 277 |
doc.close()
|
| 278 |
new_doc.close()
|
| 279 |
|
| 280 |
-
return
|
| 281 |
|
| 282 |
-
def
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
Args:
|
| 287 |
-
text: Raw extracted text
|
| 288 |
-
|
| 289 |
-
Returns:
|
| 290 |
-
Cleaned and formatted text with zero duplicates
|
| 291 |
-
"""
|
| 292 |
-
if not text or not text.strip():
|
| 293 |
-
return text
|
| 294 |
-
|
| 295 |
-
import re
|
| 296 |
-
|
| 297 |
-
# Step 1: Split by page markers first to handle each page individually
|
| 298 |
-
if '=== PAGE ' in text:
|
| 299 |
-
pages = re.split(r'(=== PAGE \d+ ===)', text)
|
| 300 |
-
processed_pages = []
|
| 301 |
-
|
| 302 |
-
for i, page_part in enumerate(pages):
|
| 303 |
-
if not page_part.strip():
|
| 304 |
-
continue
|
| 305 |
-
|
| 306 |
-
if page_part.startswith('=== PAGE '):
|
| 307 |
-
# This is a page marker, keep it
|
| 308 |
-
processed_pages.append(page_part)
|
| 309 |
-
else:
|
| 310 |
-
# This is page content, process it
|
| 311 |
-
cleaned_content = self._clean_page_content(page_part)
|
| 312 |
-
if cleaned_content.strip():
|
| 313 |
-
processed_pages.append(cleaned_content)
|
| 314 |
-
|
| 315 |
-
return '\n'.join(processed_pages)
|
| 316 |
-
else:
|
| 317 |
-
# Single page content
|
| 318 |
-
return self._clean_page_content(text)
|
| 319 |
-
|
| 320 |
-
    def _clean_page_content(self, content: str) -> str:
        """Clean individual page content removing all duplicates and artifacts.

        Pipeline: (1) locate `--- TABLE n ---` sections, (2) swap them for
        placeholders so text cleanup cannot touch table rows, (3) normalize
        whitespace per line while preserving indentation, (4) collapse runs of
        blank lines to at most one, (5) restore each table (cleaned via
        _clean_table_content) in original order.
        """
        if not content.strip():
            return content

        import re

        # Step 1: Identify and preserve table sections
        # Non-greedy body ends at the next table marker, a page marker, or EOF.
        table_pattern = r'(--- TABLE \d+ ---\n.*?)(?=\n--- TABLE \d+ ---|\n=== PAGE |\Z)'
        table_sections = {}
        table_positions = []

        for match in re.finditer(table_pattern, content, re.DOTALL):
            start_pos = match.start()
            end_pos = match.end()
            table_content = match.group(1)
            # Keyed by start offset so restoration below keeps document order.
            table_sections[start_pos] = table_content
            table_positions.append((start_pos, end_pos))

        # Step 2: Extract pure text content (excluding table regions)
        text_content = content

        # Remove table sections from text processing.
        # Reverse order so earlier offsets stay valid while splicing.
        for start_pos, end_pos in sorted(table_positions, reverse=True):
            text_content = text_content[:start_pos] + '\n<<<TABLE_PLACEHOLDER>>>\n' + text_content[end_pos:]

        # Step 3: Clean the text content
        lines = text_content.split('\n')
        cleaned_lines = []

        for line in lines:
            if line.strip() == '<<<TABLE_PLACEHOLDER>>>':
                cleaned_lines.append(line)  # Preserve placeholder
                continue

            # Remove excessive whitespace but preserve structure
            if line.strip():
                # Clean up multiple spaces but preserve indentation
                leading_spaces = len(line) - len(line.lstrip())
                content_part = re.sub(r'\s+', ' ', line.strip())
                cleaned_line = ' ' * leading_spaces + content_part
                cleaned_lines.append(cleaned_line)
            else:
                cleaned_lines.append('')

        # Step 4: Remove excessive empty lines
        result_lines = []
        empty_count = 0

        for line in cleaned_lines:
            if not line.strip() and line != '<<<TABLE_PLACEHOLDER>>>':
                empty_count += 1
                if empty_count <= 1:  # Allow max 1 empty line between content
                    result_lines.append('')
            else:
                empty_count = 0
                result_lines.append(line)

        # Step 5: Restore table sections with enhanced cleaning
        processed_text = '\n'.join(result_lines)

        # Replace placeholders with cleaned table content.
        # NOTE(review): the replace target requires a newline on both sides of
        # the placeholder; a table at the very start/end of the page may leave
        # its placeholder unreplaced — TODO confirm against real inputs.
        for start_pos in sorted(table_sections.keys()):
            table_content = table_sections[start_pos]
            # ENHANCED: Clean table content to remove separator rows
            cleaned_table_content = self._clean_table_content(table_content)
            processed_text = processed_text.replace('\n<<<TABLE_PLACEHOLDER>>>\n', f'\n{cleaned_table_content}\n', 1)

        return processed_text
|
| 389 |
-
|
| 390 |
-
def _clean_table_content(self, table_content: str) -> str:
|
| 391 |
-
"""Clean table content removing separator rows and duplicates"""
|
| 392 |
-
lines = table_content.split('\n')
|
| 393 |
-
cleaned_lines = []
|
| 394 |
-
|
| 395 |
-
for line in lines:
|
| 396 |
-
line_stripped = line.strip()
|
| 397 |
-
|
| 398 |
-
# Keep table headers
|
| 399 |
-
if line_stripped.startswith('--- TABLE '):
|
| 400 |
-
cleaned_lines.append(line_stripped)
|
| 401 |
-
continue
|
| 402 |
-
|
| 403 |
-
# CRITICAL: Skip separator rows (lines that are mostly dashes and pipes)
|
| 404 |
-
if line_stripped:
|
| 405 |
-
# Remove pipes and spaces, check if remaining content is just dashes
|
| 406 |
-
content_check = line_stripped.replace('|', '').replace(' ', '')
|
| 407 |
-
if content_check.replace('-', '') == '':
|
| 408 |
-
# This is a separator row, skip it
|
| 409 |
-
continue
|
| 410 |
-
|
| 411 |
-
# Keep actual content rows
|
| 412 |
-
cleaned_lines.append(line_stripped)
|
| 413 |
-
|
| 414 |
-
return '\n'.join(cleaned_lines)
|
| 415 |
-
|
| 416 |
-
def extract_table_data(self, text: str) -> Dict[str, Any]:
|
| 417 |
-
"""
|
| 418 |
-
Extract structured table data from processed text - NO duplicates
|
| 419 |
-
|
| 420 |
-
Args:
|
| 421 |
-
text: Processed text containing table markers
|
| 422 |
-
|
| 423 |
-
Returns:
|
| 424 |
-
Dictionary containing extracted table information
|
| 425 |
-
"""
|
| 426 |
-
import re
|
| 427 |
-
|
| 428 |
-
tables = {}
|
| 429 |
-
|
| 430 |
-
# More precise pattern to avoid overlapping matches
|
| 431 |
-
table_pattern = r'--- TABLE (\d+) ---\n(.*?)(?=\n--- TABLE \d+ ---|$|\n=== PAGE)'
|
| 432 |
-
|
| 433 |
-
matches = re.finditer(table_pattern, text, re.DOTALL)
|
| 434 |
-
|
| 435 |
-
for match in matches:
|
| 436 |
-
table_num = int(match.group(1))
|
| 437 |
-
table_content = match.group(2).strip()
|
| 438 |
-
|
| 439 |
-
# Only process if we haven't seen this table number before
|
| 440 |
-
if table_num not in tables:
|
| 441 |
-
table_data = self._parse_table_content(table_content)
|
| 442 |
-
tables[table_num] = table_data
|
| 443 |
-
|
| 444 |
-
return {
|
| 445 |
-
'table_count': len(tables),
|
| 446 |
-
'tables': tables,
|
| 447 |
-
'has_tables': len(tables) > 0
|
| 448 |
-
}
|
| 449 |
-
|
| 450 |
-
def _parse_table_content(self, content: str) -> Dict[str, Any]:
|
| 451 |
-
"""Parse individual table content into structured data - improved with separator filtering"""
|
| 452 |
-
lines = [line.strip() for line in content.split('\n') if line.strip()]
|
| 453 |
-
|
| 454 |
-
table_data = {
|
| 455 |
-
'rows': [],
|
| 456 |
-
'columns': 0,
|
| 457 |
-
'has_header': False
|
| 458 |
-
}
|
| 459 |
-
|
| 460 |
-
seen_rows = set() # Track seen row content to avoid duplicates
|
| 461 |
-
|
| 462 |
-
for i, line in enumerate(lines):
|
| 463 |
-
# ENHANCED: Skip separator lines more comprehensively
|
| 464 |
-
line_content = line.replace('|', '').replace(' ', '')
|
| 465 |
-
if line_content.replace('-', '') == '':
|
| 466 |
-
continue # Skip separator rows
|
| 467 |
-
|
| 468 |
-
if '|' in line:
|
| 469 |
-
# Split by | and clean up cells
|
| 470 |
-
cells = [cell.strip() for cell in line.split('|')]
|
| 471 |
-
# Remove empty cells at start/end
|
| 472 |
-
while cells and not cells[0]:
|
| 473 |
-
cells.pop(0)
|
| 474 |
-
while cells and not cells[-1]:
|
| 475 |
-
cells.pop()
|
| 476 |
-
|
| 477 |
-
if cells:
|
| 478 |
-
# Create a key for duplicate detection
|
| 479 |
-
row_key = '|'.join(cells).lower().strip()
|
| 480 |
-
|
| 481 |
-
# Only add if we haven't seen this exact row before
|
| 482 |
-
if row_key not in seen_rows:
|
| 483 |
-
table_data['rows'].append(cells)
|
| 484 |
-
table_data['columns'] = max(table_data['columns'], len(cells))
|
| 485 |
-
seen_rows.add(row_key)
|
| 486 |
-
|
| 487 |
-
# Assume first unique row is header
|
| 488 |
-
if len(table_data['rows']) == 1:
|
| 489 |
-
table_data['has_header'] = True
|
| 490 |
-
|
| 491 |
-
return table_data
|
| 492 |
-
|
| 493 |
-
def validate_pdf_file(self, file_path: str) -> Dict[str, Any]:
|
| 494 |
-
"""
|
| 495 |
-
Validate PDF file before processing - enhanced validation
|
| 496 |
-
|
| 497 |
-
Args:
|
| 498 |
-
file_path: Path to the PDF file
|
| 499 |
-
|
| 500 |
-
Returns:
|
| 501 |
-
Dict with validation results
|
| 502 |
-
"""
|
| 503 |
-
validation_result = {
|
| 504 |
-
'valid': False,
|
| 505 |
-
'error': None,
|
| 506 |
-
'warnings': [],
|
| 507 |
-
'file_info': {}
|
| 508 |
-
}
|
| 509 |
|
| 510 |
try:
|
| 511 |
-
#
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
# Check file size
|
| 521 |
-
file_size = os.path.getsize(file_path)
|
| 522 |
-
max_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
|
| 523 |
-
|
| 524 |
-
if file_size > max_size:
|
| 525 |
-
validation_result['error'] = f"File too large ({file_size/(1024*1024):.1f}MB > {max_size/(1024*1024)}MB)"
|
| 526 |
-
return validation_result
|
| 527 |
-
|
| 528 |
-
if file_size == 0:
|
| 529 |
-
validation_result['error'] = "File is empty"
|
| 530 |
-
return validation_result
|
| 531 |
-
|
| 532 |
-
# Try to open with PyMuPDF to check if it's a valid PDF
|
| 533 |
try:
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
doc.close()
|
| 545 |
-
|
| 546 |
-
if page_count == 0:
|
| 547 |
-
validation_result['warnings'].append("PDF contains no pages")
|
| 548 |
-
|
| 549 |
-
validation_result['file_info'] = {
|
| 550 |
-
'size_mb': round(file_size / (1024*1024), 2),
|
| 551 |
-
'pages': page_count
|
| 552 |
-
}
|
| 553 |
-
|
| 554 |
-
except Exception as pdf_error:
|
| 555 |
-
validation_result['error'] = f"Invalid or corrupted PDF file: {str(pdf_error)}"
|
| 556 |
-
return validation_result
|
| 557 |
|
| 558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
|
| 560 |
except Exception as e:
|
| 561 |
-
|
|
|
|
| 562 |
|
| 563 |
-
return
|
| 564 |
|
| 565 |
def get_available_methods(self) -> List[str]:
|
| 566 |
"""Get list of available OCR methods"""
|
| 567 |
methods = self.ocr_service.get_available_methods()
|
| 568 |
-
logger.info(f"Available OCR methods: {methods}")
|
| 569 |
return methods
|
| 570 |
|
| 571 |
def get_service_status(self) -> Dict[str, Any]:
|
| 572 |
-
"""Get comprehensive service status"""
|
| 573 |
available_methods = self.get_available_methods()
|
| 574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
status = {
|
| 576 |
'service_healthy': True,
|
| 577 |
'available_methods': available_methods,
|
|
@@ -582,37 +742,16 @@ class BackendManager:
|
|
| 582 |
'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
|
| 583 |
'temp_dir': str(self.temp_dir),
|
| 584 |
'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
|
| 585 |
-
'
|
| 586 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
}
|
| 588 |
|
| 589 |
return status
|
| 590 |
|
| 591 |
-
def get_processing_history(self, limit: int = 10) -> List[Dict[str, Any]]:
|
| 592 |
-
"""Get recent processing history"""
|
| 593 |
-
return self.processing_history[-limit:]
|
| 594 |
-
|
| 595 |
-
def cleanup_temp_files(self):
|
| 596 |
-
"""Clean up temporary files"""
|
| 597 |
-
try:
|
| 598 |
-
temp_files = list(self.temp_dir.glob('*'))
|
| 599 |
-
cleaned_count = 0
|
| 600 |
-
|
| 601 |
-
for temp_file in temp_files:
|
| 602 |
-
try:
|
| 603 |
-
# Remove files older than 1 hour
|
| 604 |
-
if temp_file.is_file() and temp_file.stat().st_mtime < (datetime.now().timestamp() - 3600):
|
| 605 |
-
temp_file.unlink()
|
| 606 |
-
cleaned_count += 1
|
| 607 |
-
except Exception as e:
|
| 608 |
-
logger.warning(f"Could not remove temp file {temp_file}: {e}")
|
| 609 |
-
|
| 610 |
-
if cleaned_count > 0:
|
| 611 |
-
logger.info(f"Cleaned up {cleaned_count} temporary files")
|
| 612 |
-
|
| 613 |
-
except Exception as e:
|
| 614 |
-
logger.error(f"Error during cleanup: {e}")
|
| 615 |
-
|
| 616 |
def _calculate_file_hash(self, file_path: str) -> str:
|
| 617 |
"""Calculate SHA-256 hash of file"""
|
| 618 |
sha256_hash = hashlib.sha256()
|
|
@@ -634,32 +773,29 @@ class BackendManager:
|
|
| 634 |
if len(self.processing_history) > self.max_history_size:
|
| 635 |
self.processing_history = self.processing_history[-self.max_history_size:]
|
| 636 |
|
| 637 |
-
def
|
| 638 |
-
"""
|
| 639 |
-
if file_path is None:
|
| 640 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 641 |
-
file_path = self.temp_dir / f"processing_history_{timestamp}.json"
|
| 642 |
-
|
| 643 |
try:
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
'total_entries': len(self.processing_history),
|
| 647 |
-
'service_status': self.get_service_status(),
|
| 648 |
-
'history': self.processing_history
|
| 649 |
-
}
|
| 650 |
-
|
| 651 |
-
with open(file_path, 'w') as f:
|
| 652 |
-
json.dump(history_data, f, indent=2)
|
| 653 |
|
| 654 |
-
|
| 655 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
|
|
|
|
|
|
|
|
|
|
| 657 |
except Exception as e:
|
| 658 |
-
logger.error(f"Error
|
| 659 |
-
raise
|
| 660 |
|
| 661 |
-
def
|
| 662 |
-
"""Get processing statistics"""
|
| 663 |
if not self.processing_history:
|
| 664 |
return {
|
| 665 |
'total_processed': 0,
|
|
@@ -668,27 +804,31 @@ class BackendManager:
|
|
| 668 |
'most_used_method': 'N/A',
|
| 669 |
'total_text_extracted': 0,
|
| 670 |
'total_tables_processed': 0,
|
| 671 |
-
'preprocessing_usage': 0
|
|
|
|
|
|
|
| 672 |
}
|
| 673 |
|
| 674 |
total_processed = len(self.processing_history)
|
| 675 |
successful = [h for h in self.processing_history if h.get('success', False)]
|
| 676 |
success_rate = (len(successful) / total_processed) * 100 if total_processed > 0 else 0
|
| 677 |
|
| 678 |
-
# Calculate
|
| 679 |
processing_times = [h.get('processing_time', 0) for h in self.processing_history if 'processing_time' in h]
|
| 680 |
avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0
|
| 681 |
|
| 682 |
-
# Find most used method
|
| 683 |
methods = [h.get('method_used', 'unknown') for h in successful]
|
| 684 |
most_used_method = max(set(methods), key=methods.count) if methods else 'N/A'
|
| 685 |
|
| 686 |
-
# Calculate total text and tables extracted
|
| 687 |
total_text = sum(h.get('text_length', 0) for h in successful)
|
| 688 |
total_tables = sum(h.get('table_count', 0) for h in successful)
|
| 689 |
|
| 690 |
-
# Calculate preprocessing usage
|
| 691 |
preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 692 |
|
| 693 |
return {
|
| 694 |
'total_processed': total_processed,
|
|
@@ -699,15 +839,18 @@ class BackendManager:
|
|
| 699 |
'total_tables_processed': total_tables,
|
| 700 |
'successful_processes': len(successful),
|
| 701 |
'failed_processes': total_processed - len(successful),
|
| 702 |
-
'preprocessing_usage': preprocessing_usage
|
|
|
|
|
|
|
|
|
|
| 703 |
}
|
| 704 |
|
| 705 |
|
| 706 |
-
#
|
| 707 |
_backend_manager = None
|
| 708 |
|
| 709 |
def get_backend_manager() -> BackendManager:
|
| 710 |
-
"""Get global backend manager instance"""
|
| 711 |
global _backend_manager
|
| 712 |
if _backend_manager is None:
|
| 713 |
_backend_manager = BackendManager()
|
|
@@ -715,11 +858,11 @@ def get_backend_manager() -> BackendManager:
|
|
| 715 |
|
| 716 |
|
| 717 |
if __name__ == "__main__":
|
| 718 |
-
# Test the backend manager
|
| 719 |
manager = BackendManager()
|
| 720 |
|
| 721 |
-
print("Enhanced Backend Manager Test")
|
| 722 |
-
print("=
|
| 723 |
print(f"Available methods: {manager.get_available_methods()}")
|
| 724 |
print(f"Service status: {manager.get_service_status()}")
|
| 725 |
-
print(f"
|
|
|
|
| 1 |
"""
|
| 2 |
+
Backend Management Module - FIXED VERSION with Corrected Crop Processing
|
| 3 |
Coordinates between UI and OCR services, handles file management and preprocessing
|
| 4 |
"""
|
| 5 |
import re
|
|
|
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
|
| 29 |
+
class DocumentExporter:
    """Advanced document export with HTML-based formatting.

    Stateless collection of static factory methods; each writes a
    timestamped temporary file (``delete=False``) and returns its path.
    Callers are responsible for removing the files when done.
    """

    @staticmethod
    def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
        """Write extracted text to a UTF-8 ``.txt`` temp file and return its path.

        Args:
            text_content: The formatted plain text to embed.
            html_content: Accepted for signature parity with the other
                exporters; not used by the TXT writer.
            metadata_info: Optional processing summary placed in the header.

        Returns:
            Path of the created temporary file.

        Raises:
            OSError (or any write error): re-raised after the handle is closed.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_file = tempfile.NamedTemporaryFile(
            suffix=f'_extracted_text_{timestamp}.txt',
            delete=False,
            mode='w',
            encoding='utf-8'
        )

        try:
            # Header banner
            temp_file.write("PDF OCR Extraction Results - Enhanced with HTML Processing\n")
            temp_file.write("=" * 70 + "\n\n")

            # Optional processing metadata
            if metadata_info:
                temp_file.write("Processing Information:\n")
                temp_file.write("-" * 25 + "\n")
                temp_file.write(metadata_info + "\n\n")

            # Generation timestamp
            temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            temp_file.write("=" * 70 + "\n\n")

            # Main body
            temp_file.write("Extracted Text (Formatted):\n")
            temp_file.write("-" * 30 + "\n\n")
            temp_file.write(text_content)

            temp_file.close()
            return temp_file.name

        except Exception as e:
            logger.error(f"Error creating enhanced TXT file: {e}")
            temp_file.close()
            raise

    @staticmethod
    def create_enhanced_docx_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
        """Create an enhanced DOCX file from HTML content with spacing/indentation.

        If ``html_content`` contains a ``<table>``, the HTML is parsed and
        converted structurally; otherwise ``text_content`` is rendered with
        heuristic heading/indent handling.

        Returns:
            Path of the created ``.docx`` temporary file.

        Raises:
            ImportError: if python-docx is not installed.
            Exception: any document-build failure (temp file is removed first).
        """
        try:
            from docx import Document
            from docx.shared import Inches, Pt, RGBColor
            from docx.enum.text import WD_ALIGN_PARAGRAPH
            from docx.enum.table import WD_TABLE_ALIGNMENT
            from html.parser import HTMLParser

            class EnhancedDOCXHTMLParser(HTMLParser):
                """Streams the service's HTML into DOCX paragraphs/tables,
                preserving indentation encoded as inline margin-left styles."""

                def __init__(self, doc):
                    super().__init__()
                    self.doc = doc
                    self.current_paragraph = None
                    self.in_table = False
                    self.table_data = []
                    self.current_table_row = []
                    self.is_title = False
                    self.is_heading = False
                    self.is_bullet_point = False

                def handle_starttag(self, tag, attrs):
                    attr_dict = dict(attrs)
                    class_attr = attr_dict.get('class', '')
                    style_attr = attr_dict.get('style', '')

                    # NOTE: 'page-header' must be tested BEFORE the plain
                    # 'page' class — 'page' is a substring of 'page-header',
                    # so the previous ordering shadowed this branch and page
                    # headers were rendered as blank separators instead.
                    if tag == 'div' and 'page-header' in class_attr:
                        self.current_paragraph = self.doc.add_heading(level=1)
                        self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

                    elif tag == 'div' and 'page' in class_attr:
                        # Minimal page separation (paragraph spacing only,
                        # no hard page break)
                        if hasattr(self, 'has_content'):
                            self.doc.add_paragraph()
                            self.doc.add_paragraph()
                        self.has_content = True

                    elif tag == 'div' and 'title' in class_attr:
                        self.current_paragraph = self.doc.add_heading(level=1)
                        self.is_title = True
                        self._apply_spacing_from_style(style_attr)

                    elif tag == 'div' and 'section-heading' in class_attr:
                        self.current_paragraph = self.doc.add_heading(level=2)
                        self.is_heading = True
                        self._apply_spacing_from_style(style_attr)

                    elif tag == 'div' and 'paragraph' in class_attr:
                        self.current_paragraph = self.doc.add_paragraph()
                        self.is_bullet_point = 'bullet-point' in class_attr
                        self._apply_spacing_from_style(style_attr)

                    elif tag == 'table':
                        self.in_table = True
                        self.table_data = []

                    elif tag == 'tr':
                        self.current_table_row = []

                    elif tag in ('th', 'td'):
                        pass  # Cell text is collected in handle_data

                    elif tag == 'br':
                        if self.current_paragraph:
                            self.current_paragraph.add_run().add_break()

                def _apply_spacing_from_style(self, style_attr):
                    """Translate inline margin-left (em) into DOCX left indent."""
                    if not self.current_paragraph:
                        return

                    margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
                    if margin_match:
                        em_value = float(margin_match.group(1))
                        # Convert em to inches (1em ≈ 12pt, 72pt = 1 inch)
                        indent_inches = (em_value * 12) / 72
                        self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)

                    # Hanging indent so bullet glyphs outdent from the text
                    if self.is_bullet_point:
                        self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)

                    # Readability defaults
                    self.current_paragraph.paragraph_format.line_spacing = 1.15
                    self.current_paragraph.paragraph_format.space_after = Pt(6)

                def handle_endtag(self, tag):
                    if tag == 'div' and (self.is_title or self.is_heading):
                        self.is_title = False
                        self.is_heading = False
                        self.current_paragraph = None

                    elif tag == 'div' and self.current_paragraph and not self.in_table:
                        self.is_bullet_point = False
                        self.current_paragraph = None

                    elif tag == 'table':
                        self.in_table = False
                        self._create_enhanced_docx_table()

                    elif tag == 'tr' and self.current_table_row:
                        self.table_data.append(self.current_table_row[:])
                        self.current_table_row = []

                def handle_data(self, data):
                    if data.strip():
                        # Convert non-breaking spaces (used by the HTML
                        # renderer to encode indentation) back to regular
                        # spaces. The previous replace(' ', ' ') was a
                        # mangled no-op.
                        data = data.replace('\u00a0', ' ')

                        if self.in_table:
                            self.current_table_row.append(data.strip())
                        elif self.current_paragraph is not None:
                            run = self.current_paragraph.add_run(data)
                            if self.is_title:
                                run.bold = True
                                run.font.size = Pt(16)
                            elif self.is_heading:
                                run.bold = True
                                run.font.size = Pt(14)
                            else:
                                run.font.size = Pt(11)

                def _create_enhanced_docx_table(self):
                    """Flush collected rows into a bordered DOCX table."""
                    if not self.table_data:
                        return

                    rows = len(self.table_data)
                    cols = max(len(row) for row in self.table_data) if self.table_data else 1

                    table = self.doc.add_table(rows=rows, cols=cols)
                    table.style = 'Table Grid'
                    table.alignment = WD_TABLE_ALIGNMENT.LEFT
                    table.autofit = False

                    for row_idx, row_data in enumerate(self.table_data):
                        table_row = table.rows[row_idx]
                        for col_idx, cell_data in enumerate(row_data):
                            if col_idx < len(table_row.cells):
                                cell = table_row.cells[col_idx]
                                cell.text = str(cell_data)

                                if row_idx == 0:
                                    # Bold, centered header row
                                    for paragraph in cell.paragraphs:
                                        for run in paragraph.runs:
                                            run.bold = True
                                            run.font.size = Pt(10)
                                        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                                else:
                                    for paragraph in cell.paragraphs:
                                        for run in paragraph.runs:
                                            run.font.size = Pt(10)

                                # NOTE(review): vertical_alignment expects a
                                # WD_CELL_VERTICAL_ALIGNMENT member; LEFT (0)
                                # happens to coincide with TOP (0). Kept as-is
                                # to preserve behavior — confirm intent.
                                cell.vertical_alignment = WD_ALIGN_PARAGRAPH.LEFT

                    # Spacing after the table
                    self.doc.add_paragraph()

            # --- Build the document ---
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            temp_file = tempfile.NamedTemporaryFile(
                suffix=f'_extracted_document_{timestamp}.docx',
                delete=False
            )
            temp_file.close()

            doc = Document()

            # Page margins
            for section in doc.sections:
                section.top_margin = Inches(1)
                section.bottom_margin = Inches(1)
                section.left_margin = Inches(1)
                section.right_margin = Inches(1)

            # Title
            title = doc.add_heading('PDF OCR Extraction Results', 0)
            title.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # Subtitle
            subtitle_para = doc.add_paragraph()
            subtitle_run = subtitle_para.add_run('Enhanced with HTML Processing and Preserved Formatting')
            subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            subtitle_run.italic = True
            subtitle_run.font.size = Pt(12)
            subtitle_run.font.color.rgb = RGBColor(102, 102, 102)

            # Metadata section
            if metadata_info:
                doc.add_heading('Processing Information', level=1)
                meta_para = doc.add_paragraph()
                meta_run = meta_para.add_run(metadata_info)
                meta_run.font.size = Pt(10)
                meta_para.style = 'Intense Quote'
                doc.add_paragraph()  # spacing

            doc.add_heading('Extracted Content', level=1)

            if html_content and '<table' in html_content:
                # Structured path: parse HTML and convert to DOCX
                parser = EnhancedDOCXHTMLParser(doc)
                parser.feed(html_content)
            else:
                # Fallback: render plain text with heading heuristics.
                for para in text_content.split('\n\n'):
                    if not para.strip():
                        continue
                    stripped = para.strip()
                    if stripped.startswith('==='):
                        # Page headers with minimal separation
                        page_header = doc.add_heading(stripped, level=1)
                        page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    elif stripped.startswith('##'):
                        # Section headings — must be tested BEFORE '#', or
                        # this branch is unreachable (original bug).
                        doc.add_heading(stripped.lstrip('#').strip(), level=2)
                    elif stripped.startswith('#'):
                        # Titles
                        doc.add_heading(stripped.lstrip('#').strip(), level=1)
                    else:
                        # Regular paragraphs, preserving leading-space indent
                        for line in para.split('\n'):
                            if line.strip():
                                para_element = doc.add_paragraph()

                                leading_spaces = len(line) - len(line.lstrip())
                                if leading_spaces > 0:
                                    indent_level = leading_spaces // 2  # 2 spaces = 1 level
                                    para_element.paragraph_format.left_indent = Inches(0.5 * indent_level)

                                run = para_element.add_run(line.strip())
                                run.font.size = Pt(11)

                                para_element.paragraph_format.line_spacing = 1.15
                                para_element.paragraph_format.space_after = Pt(3)

            # Footer
            footer_para = doc.sections[0].footer.paragraphs[0]
            footer_para.text = f"Generated by Enhanced PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
            footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            footer_run = footer_para.runs[0]
            footer_run.font.size = Pt(9)
            footer_run.font.color.rgb = RGBColor(128, 128, 128)

            doc.save(temp_file.name)
            logger.info(f"Enhanced DOCX file with proper spacing created: {temp_file.name}")
            return temp_file.name

        except ImportError:
            raise ImportError("python-docx not installed. Cannot create DOCX files.")
        except Exception as e:
            logger.error(f"Error creating enhanced DOCX file: {e}")
            try:
                os.unlink(temp_file.name)
            except Exception:
                pass
            raise

    @staticmethod
    def create_html_file(html_content: str, metadata_info: str = "") -> str:
        """Create a standalone, styled HTML temp file and return its path.

        Injects extra CSS after the first ``<style>`` tag and wraps the body
        in a header/container layout, optionally embedding ``metadata_info``.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_file = tempfile.NamedTemporaryFile(
            suffix=f'_extracted_document_{timestamp}.html',
            delete=False,
            mode='w',
            encoding='utf-8'
        )

        try:
            # Augment the document stylesheet
            enhanced_html = html_content.replace(
                '<style>',
                '''<style>
                body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; margin: 20px; background-color: #f9f9f9; }
                .container { max-width: 1200px; margin: 0 auto; background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
                .header { text-align: center; margin-bottom: 30px; border-bottom: 3px solid #2c3e50; padding-bottom: 20px; }
                .metadata { background-color: #ecf0f1; padding: 15px; border-radius: 5px; margin-bottom: 25px; border-left: 4px solid #3498db; }
                '''
            )

            # Wrap the body content in a container with a page header
            if '<body>' in enhanced_html:
                enhanced_html = enhanced_html.replace(
                    '<body>',
                    '''<body>
                    <div class="container">
                        <div class="header">
                            <h1>PDF OCR Extraction Results</h1>
                            <p>Enhanced with HTML Processing and Format Preservation</p>
                        </div>''' +
                    (f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
                )
                enhanced_html = enhanced_html.replace('</body>', '</div></body>')

            temp_file.write(enhanced_html)
            temp_file.close()
            return temp_file.name

        except Exception as e:
            logger.error(f"Error creating HTML file: {e}")
            temp_file.close()
            raise
|
| 404 |
+
|
| 405 |
+
|
| 406 |
class BackendManager:
|
| 407 |
+
"""Enhanced backend manager with FIXED crop processing and advanced export capabilities"""
|
| 408 |
|
| 409 |
def __init__(self):
|
| 410 |
self.ocr_service = OCRService()
|
|
|
|
| 415 |
self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
|
| 416 |
self.temp_dir.mkdir(exist_ok=True)
|
| 417 |
|
| 418 |
+
logger.info("Enhanced backend manager with fixed crop processing initialized successfully")
|
| 419 |
|
| 420 |
+
def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
|
| 421 |
+
preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 422 |
"""
|
| 423 |
+
Process PDF with enhanced resolution and HTML generation
|
| 424 |
|
| 425 |
Args:
|
| 426 |
pdf_path: Path to the PDF file
|
|
|
|
| 428 |
preprocessing_options: Dictionary containing preprocessing settings
|
| 429 |
|
| 430 |
Returns:
|
| 431 |
+
Dict containing processing results with HTML content
|
| 432 |
"""
|
| 433 |
start_time = datetime.now()
|
| 434 |
|
|
|
|
| 438 |
'success': False,
|
| 439 |
'error': f"File not found: {pdf_path}",
|
| 440 |
'text': '',
|
| 441 |
+
'html': '',
|
| 442 |
'method_used': '',
|
| 443 |
'metadata': {}
|
| 444 |
}
|
| 445 |
|
| 446 |
+
# Check file size
|
| 447 |
max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
|
| 448 |
file_size = os.path.getsize(pdf_path)
|
| 449 |
|
|
|
|
| 452 |
'success': False,
|
| 453 |
'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB",
|
| 454 |
'text': '',
|
| 455 |
+
'html': '',
|
| 456 |
'method_used': '',
|
| 457 |
'metadata': {}
|
| 458 |
}
|
| 459 |
|
| 460 |
+
# Generate file hash for tracking
|
| 461 |
file_hash = self._calculate_file_hash(pdf_path)
|
| 462 |
|
| 463 |
+
logger.info(f"Processing PDF with enhanced resolution: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
|
| 464 |
logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
|
| 465 |
|
| 466 |
# Handle preprocessing if enabled
|
|
|
|
| 468 |
preprocessing_applied = False
|
| 469 |
|
| 470 |
if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False):
|
| 471 |
+
logger.info("Applying enhanced preprocessing...")
|
| 472 |
try:
|
| 473 |
+
processed_pdf_path = self._apply_enhanced_preprocessing(pdf_path, preprocessing_options)
|
| 474 |
preprocessing_applied = True
|
| 475 |
+
logger.info("Enhanced preprocessing completed successfully")
|
| 476 |
except Exception as e:
|
| 477 |
logger.error(f"Preprocessing failed: {e}")
|
|
|
|
| 478 |
processed_pdf_path = pdf_path
|
| 479 |
|
| 480 |
try:
|
| 481 |
+
# Process with enhanced OCR
|
| 482 |
result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
|
| 483 |
|
| 484 |
# Add processing metadata
|
|
|
|
| 489 |
'file_size_mb': round(file_size / (1024*1024), 2),
|
| 490 |
'processing_time_seconds': round(processing_time, 2),
|
| 491 |
'timestamp': start_time.isoformat(),
|
| 492 |
+
'enhanced_processing': True,
|
| 493 |
+
'html_processing': True,
|
| 494 |
'header_footer_removed': preprocessing_applied,
|
| 495 |
'preprocessing_options': preprocessing_options if preprocessing_applied else None
|
| 496 |
})
|
| 497 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
# Cleanup temporary preprocessed file
|
| 499 |
if preprocessing_applied and processed_pdf_path != pdf_path:
|
| 500 |
try:
|
|
|
|
| 505 |
# Log results
|
| 506 |
if result['success']:
|
| 507 |
text_length = len(result['text'])
|
| 508 |
+
has_html = bool(result.get('html'))
|
| 509 |
+
table_count = result['text'].count('Table ') if 'Table ' in result['text'] else 0
|
| 510 |
+
|
| 511 |
+
logger.info(f"Enhanced processing completed successfully in {processing_time:.2f}s")
|
| 512 |
logger.info(f"Method used: {result['method_used']}")
|
| 513 |
logger.info(f"Text extracted: {text_length} characters")
|
| 514 |
+
logger.info(f"HTML generated: {has_html}")
|
| 515 |
if table_count > 0:
|
| 516 |
logger.info(f"Tables detected: {table_count}")
|
| 517 |
if preprocessing_applied:
|
| 518 |
+
logger.info("Enhanced preprocessing applied")
|
| 519 |
|
| 520 |
# Add to processing history
|
| 521 |
self._add_to_history({
|
|
|
|
| 526 |
'text_length': text_length,
|
| 527 |
'table_count': table_count,
|
| 528 |
'processing_time': processing_time,
|
| 529 |
+
'preprocessing_applied': preprocessing_applied,
|
| 530 |
+
'html_generated': has_html,
|
| 531 |
+
'enhanced_processing': True
|
| 532 |
})
|
| 533 |
else:
|
| 534 |
+
logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
|
| 535 |
|
| 536 |
# Add to processing history
|
| 537 |
self._add_to_history({
|
|
|
|
| 541 |
'success': False,
|
| 542 |
'error': result.get('error', 'Unknown error'),
|
| 543 |
'processing_time': processing_time,
|
| 544 |
+
'preprocessing_applied': preprocessing_applied,
|
| 545 |
+
'enhanced_processing': True
|
| 546 |
})
|
| 547 |
|
| 548 |
return result
|
| 549 |
|
| 550 |
except Exception as e:
|
| 551 |
+
logger.error(f"Unexpected error during enhanced processing: {e}")
|
| 552 |
|
| 553 |
+
# Cleanup
|
| 554 |
if preprocessing_applied and processed_pdf_path != pdf_path:
|
| 555 |
try:
|
| 556 |
os.unlink(processed_pdf_path)
|
|
|
|
| 565 |
'method_requested': method,
|
| 566 |
'success': False,
|
| 567 |
'error': str(e),
|
| 568 |
+
'processing_time': processing_time,
|
| 569 |
+
'enhanced_processing': True
|
| 570 |
})
|
| 571 |
|
| 572 |
return {
|
| 573 |
'success': False,
|
| 574 |
+
'error': f"Enhanced processing error: {str(e)}",
|
| 575 |
'text': '',
|
| 576 |
+
'html': '',
|
| 577 |
'method_used': '',
|
| 578 |
'metadata': {
|
| 579 |
'file_hash': file_hash,
|
| 580 |
'processing_time_seconds': round(processing_time, 2),
|
| 581 |
+
'timestamp': start_time.isoformat(),
|
| 582 |
+
'enhanced_processing': True
|
| 583 |
}
|
| 584 |
}
|
| 585 |
|
| 586 |
+
def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
|
| 587 |
+
"""Apply enhanced preprocessing with high-resolution crop handling - FIXED"""
|
| 588 |
+
crop_settings = options.get('crop_settings', {})
|
| 589 |
+
per_page_crops = crop_settings.get('per_page_crops', {})
|
| 590 |
+
enhanced_resolution = crop_settings.get('enhanced_resolution', True)
|
| 591 |
+
resolution_scale = crop_settings.get('resolution_scale', 2.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
|
| 593 |
# Create temporary file for processed PDF
|
| 594 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 595 |
+
temp_pdf_path = self.temp_dir / f"enhanced_preprocessed_{timestamp}.pdf"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
|
| 597 |
+
doc = fitz.open(pdf_path)
|
|
|
|
|
|
|
|
|
|
| 598 |
new_doc = fitz.open()
|
| 599 |
|
| 600 |
try:
|
|
|
|
| 602 |
page = doc.load_page(page_num)
|
| 603 |
page_rect = page.rect
|
| 604 |
|
| 605 |
+
# Get crop settings for this page - FIXED indexing
|
| 606 |
+
page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
|
| 607 |
+
'top': 0, 'bottom': 0, 'left': 0, 'right': 0
|
| 608 |
+
}))
|
| 609 |
+
|
| 610 |
+
top_percent = page_crop.get('top', 0)
|
| 611 |
+
bottom_percent = page_crop.get('bottom', 0)
|
| 612 |
+
left_percent = page_crop.get('left', 0)
|
| 613 |
+
right_percent = page_crop.get('right', 0)
|
| 614 |
+
|
| 615 |
+
# Calculate crop amounts
|
| 616 |
width = page_rect.width
|
| 617 |
height = page_rect.height
|
| 618 |
|
|
|
|
| 629 |
page_rect.y1 - crop_bottom
|
| 630 |
)
|
| 631 |
|
| 632 |
+
# Ensure the rectangle is valid
|
| 633 |
+
if new_rect.width <= 0 or new_rect.height <= 0:
|
| 634 |
+
logger.warning(f"Invalid crop rectangle for page {page_num}, using original page")
|
| 635 |
+
new_rect = page_rect
|
| 636 |
|
| 637 |
+
# Create new page with enhanced resolution if enabled
|
| 638 |
+
if enhanced_resolution:
|
| 639 |
+
# Use high resolution for better quality
|
| 640 |
+
new_page = new_doc.new_page(
|
| 641 |
+
width=new_rect.width,
|
| 642 |
+
height=new_rect.height
|
| 643 |
+
)
|
| 644 |
+
|
| 645 |
+
# Copy content with proper transformation
|
| 646 |
+
mat = fitz.Matrix(1, 1).prescale(resolution_scale, resolution_scale)
|
| 647 |
+
new_page.show_pdf_page(
|
| 648 |
+
new_page.rect,
|
| 649 |
+
doc,
|
| 650 |
+
page_num,
|
| 651 |
+
clip=new_rect
|
| 652 |
+
)
|
| 653 |
+
else:
|
| 654 |
+
# Standard resolution
|
| 655 |
+
new_page = new_doc.new_page(width=new_rect.width, height=new_rect.height)
|
| 656 |
+
new_page.show_pdf_page(
|
| 657 |
+
new_page.rect,
|
| 658 |
+
doc,
|
| 659 |
+
page_num,
|
| 660 |
+
clip=new_rect
|
| 661 |
+
)
|
| 662 |
+
|
| 663 |
+
logger.debug(f"Page {page_num}: Applied crop T{top_percent}% B{bottom_percent}% L{left_percent}% R{right_percent}%")
|
| 664 |
|
| 665 |
+
new_doc.save(str(temp_pdf_path))
|
| 666 |
+
logger.info(f"Enhanced preprocessing applied with {resolution_scale}x resolution to {len(doc)} pages")
|
| 667 |
|
| 668 |
+
except Exception as e:
|
| 669 |
+
logger.error(f"Error in enhanced preprocessing: {e}")
|
| 670 |
+
raise
|
| 671 |
finally:
|
| 672 |
doc.close()
|
| 673 |
new_doc.close()
|
| 674 |
|
| 675 |
+
return str(temp_pdf_path)
|
| 676 |
|
| 677 |
+
def create_enhanced_downloads(self, text_content: str, html_content: str,
|
| 678 |
+
metadata_info: str = "") -> Dict[str, str]:
|
| 679 |
+
"""Create enhanced download files with HTML processing"""
|
| 680 |
+
download_files = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 681 |
|
| 682 |
try:
|
| 683 |
+
# Create enhanced TXT file
|
| 684 |
+
txt_path = DocumentExporter.create_enhanced_txt_file(
|
| 685 |
+
text_content, html_content, metadata_info
|
| 686 |
+
)
|
| 687 |
+
download_files['txt'] = txt_path
|
| 688 |
+
logger.info(f"Enhanced TXT file created: {txt_path}")
|
| 689 |
+
|
| 690 |
+
# Create enhanced DOCX file if possible
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
try:
|
| 692 |
+
docx_path = DocumentExporter.create_enhanced_docx_file(
|
| 693 |
+
text_content, html_content, metadata_info
|
| 694 |
+
)
|
| 695 |
+
download_files['docx'] = docx_path
|
| 696 |
+
logger.info(f"Enhanced DOCX file created: {docx_path}")
|
| 697 |
+
except ImportError:
|
| 698 |
+
logger.warning("python-docx not available. DOCX creation skipped.")
|
| 699 |
+
except Exception as e:
|
| 700 |
+
logger.error(f"DOCX creation failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
|
| 702 |
+
# Create standalone HTML file
|
| 703 |
+
try:
|
| 704 |
+
html_path = DocumentExporter.create_html_file(
|
| 705 |
+
html_content, metadata_info
|
| 706 |
+
)
|
| 707 |
+
download_files['html'] = html_path
|
| 708 |
+
logger.info(f"HTML file created: {html_path}")
|
| 709 |
+
except Exception as e:
|
| 710 |
+
logger.error(f"HTML file creation failed: {e}")
|
| 711 |
|
| 712 |
except Exception as e:
|
| 713 |
+
logger.error(f"Error creating enhanced downloads: {e}")
|
| 714 |
+
raise
|
| 715 |
|
| 716 |
+
return download_files
|
| 717 |
|
| 718 |
def get_available_methods(self) -> List[str]:
|
| 719 |
"""Get list of available OCR methods"""
|
| 720 |
methods = self.ocr_service.get_available_methods()
|
| 721 |
+
logger.info(f"Available enhanced OCR methods: {methods}")
|
| 722 |
return methods
|
| 723 |
|
| 724 |
def get_service_status(self) -> Dict[str, Any]:
|
| 725 |
+
"""Get comprehensive service status with enhanced features"""
|
| 726 |
available_methods = self.get_available_methods()
|
| 727 |
|
| 728 |
+
# Check DOCX support
|
| 729 |
+
try:
|
| 730 |
+
import docx
|
| 731 |
+
docx_available = True
|
| 732 |
+
except ImportError:
|
| 733 |
+
docx_available = False
|
| 734 |
+
|
| 735 |
status = {
|
| 736 |
'service_healthy': True,
|
| 737 |
'available_methods': available_methods,
|
|
|
|
| 742 |
'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
|
| 743 |
'temp_dir': str(self.temp_dir),
|
| 744 |
'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
|
| 745 |
+
'enhanced_processing': True,
|
| 746 |
+
'html_processing': True,
|
| 747 |
+
'docx_export_available': docx_available,
|
| 748 |
+
'enhanced_crop_processing': True,
|
| 749 |
+
'multi_resolution_support': True,
|
| 750 |
+
'crop_processing_fixed': True
|
| 751 |
}
|
| 752 |
|
| 753 |
return status
|
| 754 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 755 |
def _calculate_file_hash(self, file_path: str) -> str:
|
| 756 |
"""Calculate SHA-256 hash of file"""
|
| 757 |
sha256_hash = hashlib.sha256()
|
|
|
|
| 773 |
if len(self.processing_history) > self.max_history_size:
|
| 774 |
self.processing_history = self.processing_history[-self.max_history_size:]
|
| 775 |
|
| 776 |
+
def cleanup_temp_files(self):
|
| 777 |
+
"""Clean up temporary files"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 778 |
try:
|
| 779 |
+
temp_files = list(self.temp_dir.glob('*'))
|
| 780 |
+
cleaned_count = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
|
| 782 |
+
for temp_file in temp_files:
|
| 783 |
+
try:
|
| 784 |
+
# Remove files older than 1 hour
|
| 785 |
+
if temp_file.is_file() and temp_file.stat().st_mtime < (datetime.now().timestamp() - 3600):
|
| 786 |
+
temp_file.unlink()
|
| 787 |
+
cleaned_count += 1
|
| 788 |
+
except Exception as e:
|
| 789 |
+
logger.warning(f"Could not remove temp file {temp_file}: {e}")
|
| 790 |
|
| 791 |
+
if cleaned_count > 0:
|
| 792 |
+
logger.info(f"Cleaned up {cleaned_count} temporary files")
|
| 793 |
+
|
| 794 |
except Exception as e:
|
| 795 |
+
logger.error(f"Error during cleanup: {e}")
|
|
|
|
| 796 |
|
| 797 |
+
def get_enhanced_statistics(self) -> Dict[str, Any]:
|
| 798 |
+
"""Get enhanced processing statistics"""
|
| 799 |
if not self.processing_history:
|
| 800 |
return {
|
| 801 |
'total_processed': 0,
|
|
|
|
| 804 |
'most_used_method': 'N/A',
|
| 805 |
'total_text_extracted': 0,
|
| 806 |
'total_tables_processed': 0,
|
| 807 |
+
'preprocessing_usage': 0,
|
| 808 |
+
'html_generation_rate': 0,
|
| 809 |
+
'enhanced_processing_usage': 0
|
| 810 |
}
|
| 811 |
|
| 812 |
total_processed = len(self.processing_history)
|
| 813 |
successful = [h for h in self.processing_history if h.get('success', False)]
|
| 814 |
success_rate = (len(successful) / total_processed) * 100 if total_processed > 0 else 0
|
| 815 |
|
| 816 |
+
# Calculate statistics
|
| 817 |
processing_times = [h.get('processing_time', 0) for h in self.processing_history if 'processing_time' in h]
|
| 818 |
avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0
|
| 819 |
|
|
|
|
| 820 |
methods = [h.get('method_used', 'unknown') for h in successful]
|
| 821 |
most_used_method = max(set(methods), key=methods.count) if methods else 'N/A'
|
| 822 |
|
|
|
|
| 823 |
total_text = sum(h.get('text_length', 0) for h in successful)
|
| 824 |
total_tables = sum(h.get('table_count', 0) for h in successful)
|
| 825 |
|
|
|
|
| 826 |
preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
|
| 827 |
+
html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
|
| 828 |
+
enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
|
| 829 |
+
|
| 830 |
+
html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
|
| 831 |
+
enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
|
| 832 |
|
| 833 |
return {
|
| 834 |
'total_processed': total_processed,
|
|
|
|
| 839 |
'total_tables_processed': total_tables,
|
| 840 |
'successful_processes': len(successful),
|
| 841 |
'failed_processes': total_processed - len(successful),
|
| 842 |
+
'preprocessing_usage': preprocessing_usage,
|
| 843 |
+
'html_generation_rate': round(html_generation_rate, 2),
|
| 844 |
+
'enhanced_processing_usage': enhanced_processing,
|
| 845 |
+
'enhanced_processing_rate': round(enhanced_processing_rate, 2)
|
| 846 |
}
|
| 847 |
|
| 848 |
|
| 849 |
+
# Global backend manager instance
|
| 850 |
_backend_manager = None
|
| 851 |
|
| 852 |
def get_backend_manager() -> BackendManager:
|
| 853 |
+
"""Get global enhanced backend manager instance"""
|
| 854 |
global _backend_manager
|
| 855 |
if _backend_manager is None:
|
| 856 |
_backend_manager = BackendManager()
|
|
|
|
| 858 |
|
| 859 |
|
| 860 |
if __name__ == "__main__":
|
| 861 |
+
# Test the enhanced backend manager
|
| 862 |
manager = BackendManager()
|
| 863 |
|
| 864 |
+
print("Enhanced Backend Manager with Fixed Crop Processing Test")
|
| 865 |
+
print("=" * 60)
|
| 866 |
print(f"Available methods: {manager.get_available_methods()}")
|
| 867 |
print(f"Service status: {manager.get_service_status()}")
|
| 868 |
+
print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
|
ocr_service.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
"""
|
| 2 |
-
OCR Service Module - FIXED VERSION
|
| 3 |
-
Handles PDF to text conversion
|
| 4 |
"""
|
| 5 |
import re
|
| 6 |
import os
|
| 7 |
import logging
|
| 8 |
-
from typing import Optional, Dict, Any, Tuple
|
| 9 |
import tempfile
|
| 10 |
from pathlib import Path
|
| 11 |
|
|
@@ -35,8 +35,708 @@ logging.basicConfig(level=logging.INFO)
|
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
class OCRService:
|
| 39 |
-
"""Main OCR service with
|
| 40 |
|
| 41 |
def __init__(self):
|
| 42 |
self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
|
|
@@ -58,18 +758,19 @@ class OCRService:
|
|
| 58 |
|
| 59 |
def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
|
| 60 |
"""
|
| 61 |
-
Convert PDF to text using specified method
|
| 62 |
|
| 63 |
Args:
|
| 64 |
pdf_path: Path to the PDF file
|
| 65 |
method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')
|
| 66 |
|
| 67 |
Returns:
|
| 68 |
-
Dict containing text content, metadata, and processing info
|
| 69 |
"""
|
| 70 |
result = {
|
| 71 |
'success': False,
|
| 72 |
'text': '',
|
|
|
|
| 73 |
'method_used': '',
|
| 74 |
'metadata': {},
|
| 75 |
'error': None
|
|
@@ -91,7 +792,7 @@ class OCRService:
|
|
| 91 |
# Try primary method
|
| 92 |
try:
|
| 93 |
if method == "azure" and self.azure_client:
|
| 94 |
-
result = self.
|
| 95 |
elif method == "tesseract":
|
| 96 |
result = self._tesseract_ocr(pdf_path)
|
| 97 |
elif method == "pymupdf":
|
|
@@ -110,11 +811,12 @@ class OCRService:
|
|
| 110 |
|
| 111 |
return result
|
| 112 |
|
| 113 |
-
def
|
| 114 |
-
"""Azure Document Intelligence OCR with
|
| 115 |
result = {
|
| 116 |
'success': False,
|
| 117 |
'text': '',
|
|
|
|
| 118 |
'method_used': 'azure_document_intelligence',
|
| 119 |
'metadata': {},
|
| 120 |
'error': None
|
|
@@ -124,9 +826,8 @@ class OCRService:
|
|
| 124 |
with open(pdf_path, 'rb') as pdf_file:
|
| 125 |
file_content = pdf_file.read()
|
| 126 |
|
| 127 |
-
# Try different API call patterns
|
| 128 |
try:
|
| 129 |
-
# Pattern 1: body + content_type (most common for current SDK)
|
| 130 |
poller = self.azure_client.begin_analyze_document(
|
| 131 |
"prebuilt-layout",
|
| 132 |
body=file_content,
|
|
@@ -134,13 +835,11 @@ class OCRService:
|
|
| 134 |
)
|
| 135 |
except TypeError:
|
| 136 |
try:
|
| 137 |
-
# Pattern 2: model_id + body
|
| 138 |
poller = self.azure_client.begin_analyze_document(
|
| 139 |
model_id="prebuilt-layout",
|
| 140 |
body=file_content
|
| 141 |
)
|
| 142 |
except TypeError:
|
| 143 |
-
# Pattern 3: document parameter (older SDK)
|
| 144 |
pdf_file.seek(0)
|
| 145 |
poller = self.azure_client.begin_analyze_document(
|
| 146 |
"prebuilt-layout",
|
|
@@ -149,22 +848,29 @@ class OCRService:
|
|
| 149 |
|
| 150 |
analysis_result = poller.result()
|
| 151 |
|
| 152 |
-
#
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
result.update({
|
| 156 |
'success': True,
|
| 157 |
'text': formatted_text,
|
|
|
|
| 158 |
'metadata': {
|
| 159 |
'pages': len(analysis_result.pages) if analysis_result.pages else 0,
|
| 160 |
'tables': len(analysis_result.tables) if analysis_result.tables else 0,
|
| 161 |
'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
|
| 162 |
'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
|
| 163 |
-
'
|
|
|
|
|
|
|
|
|
|
| 164 |
}
|
| 165 |
})
|
| 166 |
|
| 167 |
-
logger.info("Azure OCR
|
| 168 |
|
| 169 |
except Exception as e:
|
| 170 |
logger.error(f"Azure OCR error: {e}")
|
|
@@ -172,281 +878,12 @@ class OCRService:
|
|
| 172 |
|
| 173 |
return result
|
| 174 |
|
| 175 |
-
def _format_azure_result_enhanced(self, analysis_result) -> str:
|
| 176 |
-
"""FIXED: Enhanced formatting that eliminates ALL duplication at the source"""
|
| 177 |
-
formatted_parts = []
|
| 178 |
-
|
| 179 |
-
if not analysis_result.pages:
|
| 180 |
-
return ""
|
| 181 |
-
|
| 182 |
-
for page_num, page in enumerate(analysis_result.pages, 1):
|
| 183 |
-
formatted_parts.append(f"\n=== PAGE {page_num} ===\n")
|
| 184 |
-
|
| 185 |
-
# Get all tables for this page first
|
| 186 |
-
page_tables = []
|
| 187 |
-
table_regions = []
|
| 188 |
-
|
| 189 |
-
if analysis_result.tables:
|
| 190 |
-
for table_idx, table in enumerate(analysis_result.tables):
|
| 191 |
-
if any(cell.bounding_regions and
|
| 192 |
-
cell.bounding_regions[0].page_number == page_num
|
| 193 |
-
for cell in table.cells):
|
| 194 |
-
page_tables.append((table_idx, table))
|
| 195 |
-
|
| 196 |
-
# Calculate table bounding region
|
| 197 |
-
if table.bounding_regions:
|
| 198 |
-
table_regions.append(table.bounding_regions[0])
|
| 199 |
-
|
| 200 |
-
# CRITICAL FIX: Use ONLY paragraphs OR lines, never both
|
| 201 |
-
content_items = []
|
| 202 |
-
|
| 203 |
-
# Priority 1: Use paragraphs if available (they contain consolidated content)
|
| 204 |
-
if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
|
| 205 |
-
page_paragraphs = [p for p in analysis_result.paragraphs if
|
| 206 |
-
p.bounding_regions and
|
| 207 |
-
p.bounding_regions[0].page_number == page_num]
|
| 208 |
-
|
| 209 |
-
# Use paragraph content ONLY - don't use lines at all
|
| 210 |
-
for para in page_paragraphs:
|
| 211 |
-
if para.content.strip() and not self._is_content_in_table(para, table_regions):
|
| 212 |
-
y_pos = para.bounding_regions[0].polygon[1] if para.bounding_regions[0].polygon else 0
|
| 213 |
-
content_items.append({
|
| 214 |
-
'type': 'paragraph',
|
| 215 |
-
'content': para.content.strip(),
|
| 216 |
-
'y_pos': y_pos,
|
| 217 |
-
'role': getattr(para, 'role', 'paragraph')
|
| 218 |
-
})
|
| 219 |
-
|
| 220 |
-
# Priority 2: Only if NO paragraphs available, use lines
|
| 221 |
-
elif page.lines:
|
| 222 |
-
# Deduplicate lines first - group by approximate position
|
| 223 |
-
unique_lines = []
|
| 224 |
-
seen_content = set()
|
| 225 |
-
|
| 226 |
-
for line in page.lines:
|
| 227 |
-
line_content = line.content.strip().lower()
|
| 228 |
-
if (line_content and
|
| 229 |
-
line_content not in seen_content and
|
| 230 |
-
not self._is_content_in_table_by_line(line, table_regions)):
|
| 231 |
-
|
| 232 |
-
seen_content.add(line_content)
|
| 233 |
-
y_pos = line.polygon[1] if line.polygon else 0
|
| 234 |
-
unique_lines.append({
|
| 235 |
-
'type': 'line',
|
| 236 |
-
'content': line.content.strip(),
|
| 237 |
-
'y_pos': y_pos,
|
| 238 |
-
'role': 'text'
|
| 239 |
-
})
|
| 240 |
-
|
| 241 |
-
content_items.extend(unique_lines)
|
| 242 |
-
|
| 243 |
-
# Add table positions to content items
|
| 244 |
-
for table_idx, table in page_tables:
|
| 245 |
-
if table.bounding_regions:
|
| 246 |
-
table_y_pos = table.bounding_regions[0].polygon[1] if table.bounding_regions[0].polygon else 9999
|
| 247 |
-
content_items.append({
|
| 248 |
-
'type': 'table',
|
| 249 |
-
'content': table,
|
| 250 |
-
'y_pos': table_y_pos,
|
| 251 |
-
'table_idx': table_idx
|
| 252 |
-
})
|
| 253 |
-
|
| 254 |
-
# Sort all content by vertical position
|
| 255 |
-
content_items.sort(key=lambda x: x['y_pos'])
|
| 256 |
-
|
| 257 |
-
# FINAL DEDUPLICATION: Remove content that appears multiple times
|
| 258 |
-
seen_text_content = set()
|
| 259 |
-
final_content = []
|
| 260 |
-
|
| 261 |
-
for item in content_items:
|
| 262 |
-
if item['type'] == 'table':
|
| 263 |
-
final_content.append(item)
|
| 264 |
-
else:
|
| 265 |
-
# Check for text duplication
|
| 266 |
-
text_key = item['content'].lower().strip()
|
| 267 |
-
if text_key not in seen_text_content:
|
| 268 |
-
seen_text_content.add(text_key)
|
| 269 |
-
final_content.append(item)
|
| 270 |
-
|
| 271 |
-
# Add formatted content
|
| 272 |
-
for item in final_content:
|
| 273 |
-
if item['type'] == 'table':
|
| 274 |
-
formatted_parts.append(f"\n--- TABLE {item['table_idx'] + 1} ---")
|
| 275 |
-
table_text = self._format_table_enhanced(item['content'])
|
| 276 |
-
formatted_parts.append(table_text)
|
| 277 |
-
formatted_parts.append("")
|
| 278 |
-
else:
|
| 279 |
-
# Add text content
|
| 280 |
-
if item['role'] == 'title':
|
| 281 |
-
formatted_parts.append(f"\n# {item['content']}\n")
|
| 282 |
-
elif item['role'] == 'sectionHeading':
|
| 283 |
-
formatted_parts.append(f"\n## {item['content']}\n")
|
| 284 |
-
else:
|
| 285 |
-
formatted_parts.append(item['content'])
|
| 286 |
-
|
| 287 |
-
# Clean up excessive empty lines
|
| 288 |
-
result = '\n'.join(formatted_parts)
|
| 289 |
-
result = re.sub(r'\n{3,}', '\n\n', result) # Max 2 consecutive newlines
|
| 290 |
-
return result
|
| 291 |
-
|
| 292 |
-
def _is_content_in_table(self, content_item, table_regions):
|
| 293 |
-
"""Check if content overlaps with any table region"""
|
| 294 |
-
if not table_regions or not content_item.bounding_regions:
|
| 295 |
-
return False
|
| 296 |
-
|
| 297 |
-
content_region = content_item.bounding_regions[0]
|
| 298 |
-
if not content_region.polygon:
|
| 299 |
-
return False
|
| 300 |
-
|
| 301 |
-
content_y1 = content_region.polygon[1] # Top Y
|
| 302 |
-
content_y2 = content_region.polygon[5] # Bottom Y
|
| 303 |
-
content_x1 = content_region.polygon[0] # Left X
|
| 304 |
-
content_x2 = content_region.polygon[2] # Right X
|
| 305 |
-
|
| 306 |
-
for table_region in table_regions:
|
| 307 |
-
if not table_region.polygon:
|
| 308 |
-
continue
|
| 309 |
-
|
| 310 |
-
table_y1 = table_region.polygon[1] # Top Y
|
| 311 |
-
table_y2 = table_region.polygon[5] # Bottom Y
|
| 312 |
-
table_x1 = table_region.polygon[0] # Left X
|
| 313 |
-
table_x2 = table_region.polygon[2] # Right X
|
| 314 |
-
|
| 315 |
-
# Check for overlap with some tolerance
|
| 316 |
-
y_overlap = not (content_y2 < table_y1 - 10 or content_y1 > table_y2 + 10)
|
| 317 |
-
x_overlap = not (content_x2 < table_x1 - 10 or content_x1 > table_x2 + 10)
|
| 318 |
-
|
| 319 |
-
if y_overlap and x_overlap:
|
| 320 |
-
return True
|
| 321 |
-
|
| 322 |
-
return False
|
| 323 |
-
|
| 324 |
-
def _is_content_in_table_by_line(self, line, table_regions):
|
| 325 |
-
"""Check if line content overlaps with any table region"""
|
| 326 |
-
if not table_regions or not line.polygon:
|
| 327 |
-
return False
|
| 328 |
-
|
| 329 |
-
line_y1 = line.polygon[1] # Top Y
|
| 330 |
-
line_y2 = line.polygon[5] # Bottom Y
|
| 331 |
-
line_x1 = line.polygon[0] # Left X
|
| 332 |
-
line_x2 = line.polygon[2] # Right X
|
| 333 |
-
|
| 334 |
-
for table_region in table_regions:
|
| 335 |
-
if not table_region.polygon:
|
| 336 |
-
continue
|
| 337 |
-
|
| 338 |
-
table_y1 = table_region.polygon[1] # Top Y
|
| 339 |
-
table_y2 = table_region.polygon[5] # Bottom Y
|
| 340 |
-
table_x1 = table_region.polygon[0] # Left X
|
| 341 |
-
table_x2 = table_region.polygon[2] # Right X
|
| 342 |
-
|
| 343 |
-
# Check for overlap with tolerance
|
| 344 |
-
y_overlap = not (line_y2 < table_y1 - 10 or line_y1 > table_y2 + 10)
|
| 345 |
-
x_overlap = not (line_x2 < table_x1 - 10 or line_x1 > table_x2 + 10)
|
| 346 |
-
|
| 347 |
-
if y_overlap and x_overlap:
|
| 348 |
-
return True
|
| 349 |
-
|
| 350 |
-
return False
|
| 351 |
-
|
| 352 |
-
def _format_table_enhanced(self, table) -> str:
|
| 353 |
-
"""Enhanced table formatting with better structure"""
|
| 354 |
-
if not table.cells:
|
| 355 |
-
return ""
|
| 356 |
-
|
| 357 |
-
# Create matrix
|
| 358 |
-
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 359 |
-
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 360 |
-
|
| 361 |
-
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 362 |
-
|
| 363 |
-
# Fill matrix with cell content
|
| 364 |
-
for cell in table.cells:
|
| 365 |
-
content = (cell.content or "").strip()
|
| 366 |
-
table_matrix[cell.row_index][cell.column_index] = content
|
| 367 |
-
|
| 368 |
-
# Calculate column widths
|
| 369 |
-
col_widths = [0] * max_col
|
| 370 |
-
for row in table_matrix:
|
| 371 |
-
for col_idx, cell in enumerate(row):
|
| 372 |
-
col_widths[col_idx] = max(col_widths[col_idx], len(cell))
|
| 373 |
-
|
| 374 |
-
# Format as aligned table
|
| 375 |
-
formatted_rows = []
|
| 376 |
-
for row_idx, row in enumerate(table_matrix):
|
| 377 |
-
formatted_cells = []
|
| 378 |
-
for col_idx, cell in enumerate(row):
|
| 379 |
-
width = max(col_widths[col_idx], 3) # Minimum width
|
| 380 |
-
formatted_cells.append(cell.ljust(width))
|
| 381 |
-
|
| 382 |
-
formatted_row = " | ".join(formatted_cells)
|
| 383 |
-
formatted_rows.append(formatted_row)
|
| 384 |
-
|
| 385 |
-
# Add separator after header row
|
| 386 |
-
if row_idx == 0 and max_row > 1:
|
| 387 |
-
separator = " | ".join(["-" * max(col_widths[i], 3) for i in range(max_col)])
|
| 388 |
-
formatted_rows.append(separator)
|
| 389 |
-
|
| 390 |
-
return "\n".join(formatted_rows)
|
| 391 |
-
|
| 392 |
-
def _format_azure_result(self, analysis_result) -> str:
|
| 393 |
-
"""Format Azure Document Intelligence result preserving layout"""
|
| 394 |
-
formatted_text = []
|
| 395 |
-
|
| 396 |
-
if analysis_result.pages:
|
| 397 |
-
for page_num, page in enumerate(analysis_result.pages, 1):
|
| 398 |
-
formatted_text.append(f"\n--- Page {page_num} ---\n")
|
| 399 |
-
|
| 400 |
-
# Sort lines by vertical position for better reading order
|
| 401 |
-
if page.lines:
|
| 402 |
-
sorted_lines = sorted(page.lines, key=lambda line: (
|
| 403 |
-
line.polygon[1] if line.polygon else 0, # Y coordinate
|
| 404 |
-
line.polygon[0] if line.polygon else 0 # X coordinate
|
| 405 |
-
))
|
| 406 |
-
|
| 407 |
-
for line in sorted_lines:
|
| 408 |
-
formatted_text.append(line.content)
|
| 409 |
-
|
| 410 |
-
# Add tables if present
|
| 411 |
-
if analysis_result.tables:
|
| 412 |
-
page_tables = [t for t in analysis_result.tables if any(
|
| 413 |
-
cell.bounding_regions and
|
| 414 |
-
cell.bounding_regions[0].page_number == page_num
|
| 415 |
-
for cell in t.cells
|
| 416 |
-
)]
|
| 417 |
-
|
| 418 |
-
for table_idx, table in enumerate(page_tables):
|
| 419 |
-
formatted_text.append(f"\n--- Table {table_idx + 1} ---")
|
| 420 |
-
formatted_text.append(self._format_table(table))
|
| 421 |
-
|
| 422 |
-
return '\n'.join(formatted_text)
|
| 423 |
-
|
| 424 |
-
def _format_table(self, table) -> str:
|
| 425 |
-
"""Format table from Azure Document Intelligence"""
|
| 426 |
-
if not table.cells:
|
| 427 |
-
return ""
|
| 428 |
-
|
| 429 |
-
# Create matrix
|
| 430 |
-
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 431 |
-
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 432 |
-
|
| 433 |
-
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 434 |
-
|
| 435 |
-
for cell in table.cells:
|
| 436 |
-
table_matrix[cell.row_index][cell.column_index] = cell.content or ""
|
| 437 |
-
|
| 438 |
-
# Format as text table
|
| 439 |
-
formatted_rows = []
|
| 440 |
-
for row in table_matrix:
|
| 441 |
-
formatted_rows.append(" | ".join(row))
|
| 442 |
-
|
| 443 |
-
return "\n".join(formatted_rows)
|
| 444 |
-
|
| 445 |
def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
|
| 446 |
-
"""Tesseract OCR with
|
| 447 |
result = {
|
| 448 |
'success': False,
|
| 449 |
'text': '',
|
|
|
|
| 450 |
'method_used': 'tesseract',
|
| 451 |
'metadata': {},
|
| 452 |
'error': None
|
|
@@ -458,57 +895,72 @@ class OCRService:
|
|
| 458 |
|
| 459 |
pdf_document = None
|
| 460 |
try:
|
| 461 |
-
# Convert PDF to images
|
| 462 |
pdf_document = fitz.open(pdf_path)
|
| 463 |
-
page_count = len(pdf_document)
|
| 464 |
all_text = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
for page_num in range(page_count):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
page = pdf_document.load_page(page_num)
|
| 468 |
|
| 469 |
# Render page to image
|
| 470 |
-
mat = fitz.Matrix(2.0, 2.0)
|
| 471 |
pix = page.get_pixmap(matrix=mat)
|
| 472 |
img_data = pix.tobytes("png")
|
| 473 |
|
| 474 |
-
# Convert to PIL Image
|
| 475 |
temp_img_path = None
|
| 476 |
try:
|
| 477 |
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
|
| 478 |
temp_img.write(img_data)
|
| 479 |
temp_img_path = temp_img.name
|
| 480 |
|
| 481 |
-
# Preprocess image for better OCR
|
| 482 |
processed_img = self._preprocess_image(temp_img_path)
|
| 483 |
|
| 484 |
-
# OCR with custom config
|
| 485 |
custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
|
| 486 |
text = pytesseract.image_to_string(processed_img, config=custom_config, lang='eng')
|
| 487 |
|
| 488 |
-
all_text.append(f"\n--- Page {page_num + 1} ---\n")
|
| 489 |
all_text.append(text)
|
| 490 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
finally:
|
| 492 |
-
# Clean up temp image file
|
| 493 |
if temp_img_path and os.path.exists(temp_img_path):
|
| 494 |
try:
|
| 495 |
os.unlink(temp_img_path)
|
| 496 |
except:
|
| 497 |
pass
|
| 498 |
|
|
|
|
|
|
|
| 499 |
result.update({
|
| 500 |
'success': True,
|
| 501 |
'text': '\n'.join(all_text),
|
| 502 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
})
|
| 504 |
|
| 505 |
-
logger.info("Tesseract OCR completed successfully")
|
| 506 |
|
| 507 |
except Exception as e:
|
| 508 |
logger.error(f"Tesseract OCR error: {e}")
|
| 509 |
result['error'] = f"Tesseract OCR error: {e}"
|
| 510 |
finally:
|
| 511 |
-
# FIXED: Ensure document is properly closed
|
| 512 |
if pdf_document is not None:
|
| 513 |
try:
|
| 514 |
pdf_document.close()
|
|
@@ -517,27 +969,12 @@ class OCRService:
|
|
| 517 |
|
| 518 |
return result
|
| 519 |
|
| 520 |
-
def _preprocess_image(self, image_path: str) -> np.ndarray:
|
| 521 |
-
"""Preprocess image for better OCR accuracy"""
|
| 522 |
-
# Read image
|
| 523 |
-
img = cv2.imread(image_path)
|
| 524 |
-
|
| 525 |
-
# Convert to grayscale
|
| 526 |
-
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
| 527 |
-
|
| 528 |
-
# Noise removal
|
| 529 |
-
denoised = cv2.medianBlur(gray, 3)
|
| 530 |
-
|
| 531 |
-
# Threshold to get binary image
|
| 532 |
-
_, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
| 533 |
-
|
| 534 |
-
return binary
|
| 535 |
-
|
| 536 |
def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
|
| 537 |
-
"""PyMuPDF text extraction
|
| 538 |
result = {
|
| 539 |
'success': False,
|
| 540 |
'text': '',
|
|
|
|
| 541 |
'method_used': 'pymupdf',
|
| 542 |
'metadata': {},
|
| 543 |
'error': None
|
|
@@ -546,29 +983,50 @@ class OCRService:
|
|
| 546 |
pdf_document = None
|
| 547 |
try:
|
| 548 |
pdf_document = fitz.open(pdf_path)
|
| 549 |
-
page_count = len(pdf_document)
|
| 550 |
all_text = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
for page_num in range(page_count):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
page = pdf_document.load_page(page_num)
|
| 554 |
text = page.get_text()
|
| 555 |
|
| 556 |
-
all_text.append(f"\n--- Page {page_num + 1} ---\n")
|
| 557 |
all_text.append(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
|
| 559 |
result.update({
|
| 560 |
'success': True,
|
| 561 |
'text': '\n'.join(all_text),
|
| 562 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
})
|
| 564 |
|
| 565 |
-
logger.info("PyMuPDF extraction completed successfully")
|
| 566 |
|
| 567 |
except Exception as e:
|
| 568 |
logger.error(f"PyMuPDF error: {e}")
|
| 569 |
result['error'] = f"PyMuPDF error: {e}"
|
| 570 |
finally:
|
| 571 |
-
# FIXED: Ensure document is properly closed
|
| 572 |
if pdf_document is not None:
|
| 573 |
try:
|
| 574 |
pdf_document.close()
|
|
@@ -577,11 +1035,18 @@ class OCRService:
|
|
| 577 |
|
| 578 |
return result
|
| 579 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
def _try_fallback_methods(self, pdf_path: str, exclude_method: str = None) -> Dict[str, Any]:
|
| 581 |
"""Try fallback OCR methods"""
|
| 582 |
fallback_methods = []
|
| 583 |
|
| 584 |
-
# Order of fallback preference
|
| 585 |
if exclude_method != "azure" and self.azure_client:
|
| 586 |
fallback_methods.append("azure")
|
| 587 |
if exclude_method != "tesseract" and self._check_tesseract_available():
|
|
@@ -593,7 +1058,7 @@ class OCRService:
|
|
| 593 |
logger.info(f"Trying fallback method: {method}")
|
| 594 |
try:
|
| 595 |
if method == "azure":
|
| 596 |
-
result = self.
|
| 597 |
elif method == "tesseract":
|
| 598 |
result = self._tesseract_ocr(pdf_path)
|
| 599 |
elif method == "pymupdf":
|
|
@@ -610,6 +1075,7 @@ class OCRService:
|
|
| 610 |
return {
|
| 611 |
'success': False,
|
| 612 |
'text': '',
|
|
|
|
| 613 |
'method_used': 'all_methods_failed',
|
| 614 |
'metadata': {},
|
| 615 |
'error': 'All OCR methods failed'
|
|
@@ -633,6 +1099,6 @@ class OCRService:
|
|
| 633 |
methods.append("azure")
|
| 634 |
if self._check_tesseract_available():
|
| 635 |
methods.append("tesseract")
|
| 636 |
-
methods.append("pymupdf")
|
| 637 |
|
| 638 |
return methods
|
|
|
|
| 1 |
"""
|
| 2 |
+
OCR Service Module - FIXED VERSION with Improved Text Formatting and Page Numbers
|
| 3 |
+
Handles PDF to text conversion with proper indentation, spacing, and page numbering
|
| 4 |
"""
|
| 5 |
import re
|
| 6 |
import os
|
| 7 |
import logging
|
| 8 |
+
from typing import Optional, Dict, Any, Tuple, List
|
| 9 |
import tempfile
|
| 10 |
from pathlib import Path
|
| 11 |
|
|
|
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
|
| 38 |
+
class HTMLProcessor:
|
| 39 |
+
"""Process OCR results through HTML for better formatting preservation - FIXED VERSION"""
|
| 40 |
+
|
| 41 |
+
@staticmethod
def create_html_from_azure_result(analysis_result) -> str:
    """Create structured HTML from Azure Document Intelligence result with proper spacing and page numbers.

    The HTML is an intermediate representation: it is converted back to
    plain text later by html_to_formatted_text(), so the CSS below only
    matters when the HTML itself is viewed directly.

    Args:
        analysis_result: Azure Document Intelligence analyze result;
            reads .pages, .tables and (optionally) .paragraphs.

    Returns:
        A complete HTML document as a single string, one ``.page`` div
        per PDF page, content ordered top-to-bottom then left-to-right.
    """
    html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
    html_parts.append('<style>')
    # Monospace + pre-wrap so the OCR'd spacing survives in the HTML view.
    html_parts.append('''
        body {
            font-family: 'Consolas', 'Courier New', monospace;
            line-height: 1.6;
            margin: 20px;
            white-space: pre-wrap;
            font-size: 11pt;
            background-color: #fafafa;
        }
        .page {
            margin-bottom: 30px;
            border: 1px solid #ddd;
            padding: 20px;
            background-color: white;
            border-radius: 5px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
        }
        .page-header {
            font-weight: bold;
            color: #2c3e50;
            margin-bottom: 15px;
            text-align: center;
            border-bottom: 2px solid #3498db;
            padding-bottom: 8px;
            font-size: 14pt;
            text-transform: uppercase;
            letter-spacing: 1px;
        }
        .paragraph {
            margin-bottom: 0.8em;
            white-space: pre-wrap;
            font-family: 'Consolas', 'Courier New', monospace;
            line-height: 1.4;
        }
        .title {
            font-size: 1.4em;
            font-weight: bold;
            margin: 15px 0 12px 0;
            color: #2c3e50;
            border-left: 4px solid #3498db;
            padding-left: 10px;
        }
        .section-heading {
            font-size: 1.2em;
            font-weight: bold;
            margin: 12px 0 8px 0;
            color: #34495e;
            border-left: 3px solid #95a5a6;
            padding-left: 8px;
        }
        .table-container {
            margin: 15px 0;
            font-family: 'Consolas', 'Courier New', monospace;
            background-color: #f8f9fa;
            padding: 10px;
            border-radius: 5px;
            border: 1px solid #dee2e6;
        }
        .table {
            border-collapse: collapse;
            width: 100%;
            margin: 8px 0;
            font-family: 'Consolas', 'Courier New', monospace;
            font-size: 10pt;
            background-color: white;
        }
        .table th, .table td {
            border: 1px solid #bdc3c7;
            padding: 6px 10px;
            text-align: left;
            white-space: pre-wrap;
            vertical-align: top;
        }
        .table th {
            background-color: #ecf0f1;
            font-weight: bold;
            color: #2c3e50;
        }
        .table tr:nth-child(even) {
            background-color: #f8f9fa;
        }
        .indented {
            display: inline-block;
            white-space: pre-wrap;
        }
        .bullet-point {
            position: relative;
            padding-left: 1.2em;
            margin-bottom: 0.3em;
        }
        .bullet-point:before {
            content: "•";
            position: absolute;
            left: 0;
            color: #3498db;
            font-weight: bold;
        }
        .spaced {
            margin-top: 10px;
        }
        .page-number {
            position: relative;
            float: right;
            background-color: #3498db;
            color: white;
            padding: 2px 8px;
            border-radius: 3px;
            font-size: 9pt;
            margin-top: -5px;
        }
    ''')
    html_parts.append('</style></head><body>')

    # No pages at all -> emit a minimal, still well-formed document.
    if not analysis_result.pages:
        html_parts.append('<p>No content found</p></body></html>')
        return '\n'.join(html_parts)

    for page_num, page in enumerate(analysis_result.pages, 1):
        html_parts.append(f'<div class="page">')
        html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')

        # Process content with proper ordering and spacing preservation.
        # Sorting by (y, x) approximates natural reading order.
        content_items = HTMLProcessor._extract_page_content(page, analysis_result, page_num)
        content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))

        # Generate HTML for each content item with preserved spacing
        for item in content_items:
            if item['type'] == 'table':
                html_parts.append(HTMLProcessor._table_to_html(item['content'], item['table_idx']))
            else:
                html_parts.append(HTMLProcessor._text_to_html(item))

        html_parts.append('</div>')

    html_parts.append('</body></html>')
    return '\n'.join(html_parts)
|
| 182 |
+
|
| 183 |
+
@staticmethod
def _extract_page_content(page, analysis_result, page_num):
    """Extract and organize page content without losing text with proper spacing.

    Collects, for one page: table items (positioned by their bounding
    polygon) plus text items taken from ``analysis_result.paragraphs``
    when present, otherwise from ``page.lines``. Text that overlaps a
    table region by 70% or more is dropped, since the table renderer
    already emits that content.

    Returns a list of dicts with at least 'type', 'y_pos' and 'x_pos'
    keys, ready to be sorted into reading order by the caller.
    """
    content_items = []

    # First, collect all tables for this page
    page_tables = []
    table_regions = []

    if analysis_result.tables:
        for table_idx, table in enumerate(analysis_result.tables):
            if HTMLProcessor._is_table_on_page(table, page_num):
                page_tables.append((table_idx, table))
                # Store table regions for overlap detection
                if table.bounding_regions:
                    table_regions.append({
                        'polygon': table.bounding_regions[0].polygon,
                        'table_idx': table_idx
                    })

    # Add table items to content
    for table_idx, table in page_tables:
        if table.bounding_regions and table.bounding_regions[0].polygon:
            polygon = table.bounding_regions[0].polygon
            y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7])  # Top Y
            x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])  # Left X

            content_items.append({
                'type': 'table',
                'content': table,
                'table_idx': table_idx,
                'y_pos': y_pos,
                'x_pos': x_pos
            })

    # Calculate page margins for proper indentation detection
    page_left_margin = HTMLProcessor._calculate_page_margins(page, analysis_result, page_num)

    # Process text content - use paragraphs if available, otherwise lines
    if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
        # Use paragraphs (better content grouping)
        page_paragraphs = [p for p in analysis_result.paragraphs if
                           p.bounding_regions and
                           p.bounding_regions[0].page_number == page_num]

        for para in page_paragraphs:
            if para.content.strip():
                # Check if this paragraph overlaps significantly with any table
                overlap_ratio = HTMLProcessor._calculate_table_overlap(para, table_regions)

                # Only exclude if heavily overlapping (>70%) with a table
                if overlap_ratio < 0.7:
                    polygon = para.bounding_regions[0].polygon
                    y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
                    x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0

                    # Calculate proper indentation based on page margins
                    indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, para.content)

                    content_items.append({
                        'type': 'paragraph',
                        'content': para.content.strip(),
                        'role': getattr(para, 'role', 'paragraph'),
                        'y_pos': y_pos,
                        'x_pos': x_pos,
                        'indent_level': indent_info['level'],
                        'indent_pixels': indent_info['pixels'],
                        'is_bullet': indent_info['is_bullet'],
                        'preserve_spacing': True
                    })

    elif page.lines:
        # Use lines as fallback with enhanced spacing preservation
        processed_lines = HTMLProcessor._process_lines_content_with_spacing(page.lines, table_regions, page_left_margin)
        content_items.extend(processed_lines)

    return content_items
|
| 260 |
+
|
| 261 |
+
@staticmethod
|
| 262 |
+
def _is_table_on_page(table, page_num):
|
| 263 |
+
"""Check if table belongs to the specified page"""
|
| 264 |
+
if not table.cells:
|
| 265 |
+
return False
|
| 266 |
+
|
| 267 |
+
for cell in table.cells:
|
| 268 |
+
if (cell.bounding_regions and
|
| 269 |
+
cell.bounding_regions[0].page_number == page_num):
|
| 270 |
+
return True
|
| 271 |
+
return False
|
| 272 |
+
|
| 273 |
+
@staticmethod
|
| 274 |
+
def _calculate_table_overlap(content_item, table_regions):
|
| 275 |
+
"""Calculate overlap ratio between content and tables (FIXED)"""
|
| 276 |
+
if not table_regions or not content_item.bounding_regions:
|
| 277 |
+
return 0.0
|
| 278 |
+
|
| 279 |
+
content_polygon = content_item.bounding_regions[0].polygon
|
| 280 |
+
if not content_polygon or len(content_polygon) < 8:
|
| 281 |
+
return 0.0
|
| 282 |
+
|
| 283 |
+
# Content bounding box
|
| 284 |
+
content_x1 = min(content_polygon[0], content_polygon[2], content_polygon[4], content_polygon[6])
|
| 285 |
+
content_x2 = max(content_polygon[0], content_polygon[2], content_polygon[4], content_polygon[6])
|
| 286 |
+
content_y1 = min(content_polygon[1], content_polygon[3], content_polygon[5], content_polygon[7])
|
| 287 |
+
content_y2 = max(content_polygon[1], content_polygon[3], content_polygon[5], content_polygon[7])
|
| 288 |
+
|
| 289 |
+
content_area = (content_x2 - content_x1) * (content_y2 - content_y1)
|
| 290 |
+
if content_area <= 0:
|
| 291 |
+
return 0.0
|
| 292 |
+
|
| 293 |
+
max_overlap_ratio = 0.0
|
| 294 |
+
|
| 295 |
+
for table_region in table_regions:
|
| 296 |
+
table_polygon = table_region['polygon']
|
| 297 |
+
if not table_polygon or len(table_polygon) < 8:
|
| 298 |
+
continue
|
| 299 |
+
|
| 300 |
+
# Table bounding box
|
| 301 |
+
table_x1 = min(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
|
| 302 |
+
table_x2 = max(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
|
| 303 |
+
table_y1 = min(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
|
| 304 |
+
table_y2 = max(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
|
| 305 |
+
|
| 306 |
+
# Calculate intersection
|
| 307 |
+
intersect_x1 = max(content_x1, table_x1)
|
| 308 |
+
intersect_x2 = min(content_x2, table_x2)
|
| 309 |
+
intersect_y1 = max(content_y1, table_y1)
|
| 310 |
+
intersect_y2 = min(content_y2, table_y2)
|
| 311 |
+
|
| 312 |
+
if intersect_x2 > intersect_x1 and intersect_y2 > intersect_y1:
|
| 313 |
+
intersect_area = (intersect_x2 - intersect_x1) * (intersect_y2 - intersect_y1)
|
| 314 |
+
overlap_ratio = intersect_area / content_area
|
| 315 |
+
max_overlap_ratio = max(max_overlap_ratio, overlap_ratio)
|
| 316 |
+
|
| 317 |
+
return max_overlap_ratio
|
| 318 |
+
|
| 319 |
+
@staticmethod
|
| 320 |
+
def _calculate_page_margins(page, analysis_result, page_num):
|
| 321 |
+
"""Calculate page margins to determine proper indentation baseline"""
|
| 322 |
+
left_positions = []
|
| 323 |
+
|
| 324 |
+
# Collect x positions from paragraphs if available
|
| 325 |
+
if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
|
| 326 |
+
page_paragraphs = [p for p in analysis_result.paragraphs if
|
| 327 |
+
p.bounding_regions and
|
| 328 |
+
p.bounding_regions[0].page_number == page_num]
|
| 329 |
+
|
| 330 |
+
for para in page_paragraphs:
|
| 331 |
+
if para.bounding_regions and para.bounding_regions[0].polygon:
|
| 332 |
+
polygon = para.bounding_regions[0].polygon
|
| 333 |
+
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
|
| 334 |
+
left_positions.append(x_pos)
|
| 335 |
+
|
| 336 |
+
# Fallback to lines if no paragraphs
|
| 337 |
+
elif page.lines:
|
| 338 |
+
for line in page.lines:
|
| 339 |
+
if line.polygon:
|
| 340 |
+
x_pos = min(line.polygon[0], line.polygon[2], line.polygon[4], line.polygon[6])
|
| 341 |
+
left_positions.append(x_pos)
|
| 342 |
+
|
| 343 |
+
# Find the most common left margin (baseline)
|
| 344 |
+
if left_positions:
|
| 345 |
+
left_positions.sort()
|
| 346 |
+
# Take the most frequent left position as the main margin
|
| 347 |
+
from collections import Counter
|
| 348 |
+
position_counts = Counter([round(pos, -1) for pos in left_positions]) # Round to nearest 10
|
| 349 |
+
base_margin = position_counts.most_common(1)[0][0]
|
| 350 |
+
return base_margin
|
| 351 |
+
|
| 352 |
+
return 50 # Default margin if no content found
|
| 353 |
+
|
| 354 |
+
@staticmethod
|
| 355 |
+
def _calculate_precise_indentation(x_pos, base_margin, content):
|
| 356 |
+
"""Calculate precise indentation based on x position and content analysis"""
|
| 357 |
+
# Calculate indent distance from base margin
|
| 358 |
+
indent_distance = max(0, x_pos - base_margin)
|
| 359 |
+
|
| 360 |
+
# Define indentation levels based on distance
|
| 361 |
+
# Each level represents approximately 0.5 inch or 36 points
|
| 362 |
+
level_threshold = 30 # Reduced threshold for better sensitivity
|
| 363 |
+
indent_level = int(indent_distance / level_threshold)
|
| 364 |
+
|
| 365 |
+
# Detect bullet points or numbered lists
|
| 366 |
+
is_bullet = False
|
| 367 |
+
content_stripped = content.strip()
|
| 368 |
+
|
| 369 |
+
# Common bullet point patterns
|
| 370 |
+
bullet_patterns = [
|
| 371 |
+
r'^\s*[•·▪▫◦‣⁃]\s+', # Bullet symbols
|
| 372 |
+
r'^\s*[\-\*\+]\s+', # Dash, asterisk, plus
|
| 373 |
+
r'^\s*\d+[\.\)]\s+', # Numbered lists (1. or 1))
|
| 374 |
+
r'^\s*[a-zA-Z][\.\)]\s+', # Lettered lists (a. or a))
|
| 375 |
+
r'^\s*[ivxlcdm]+[\.\)]\s+', # Roman numerals
|
| 376 |
+
]
|
| 377 |
+
|
| 378 |
+
for pattern in bullet_patterns:
|
| 379 |
+
if re.match(pattern, content_stripped, re.IGNORECASE):
|
| 380 |
+
is_bullet = True
|
| 381 |
+
break
|
| 382 |
+
|
| 383 |
+
return {
|
| 384 |
+
'level': min(indent_level, 6), # Cap at level 6
|
| 385 |
+
'pixels': indent_distance,
|
| 386 |
+
'is_bullet': is_bullet
|
| 387 |
+
}
|
| 388 |
+
|
| 389 |
+
@staticmethod
def _process_lines_content_with_spacing(lines, table_regions, page_left_margin):
    """Build positioned content items from raw OCR lines.

    De-duplicates repeated lines (case-insensitive) and drops lines that
    sit mostly (>= 70%) inside a detected table region, since tables are
    rendered separately.
    """
    items = []
    seen = set()

    for line in lines:
        text = line.content.strip()
        if not text:
            continue

        # Skip exact duplicates (case-insensitive).
        key = text.lower()
        if key in seen:
            continue
        seen.add(key)

        # Lines heavily covered by a table are emitted by the table renderer.
        if HTMLProcessor._calculate_line_table_overlap(line, table_regions) >= 0.7:
            continue

        poly = line.polygon
        if poly:
            y_pos = min(poly[1], poly[3], poly[5], poly[7])
            x_pos = min(poly[0], poly[2], poly[4], poly[6])
        else:
            y_pos = x_pos = 0

        indent = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, line.content)

        items.append({
            'type': 'line',
            'content': text,
            'role': 'text',
            'y_pos': y_pos,
            'x_pos': x_pos,
            'indent_level': indent['level'],
            'indent_pixels': indent['pixels'],
            'is_bullet': indent['is_bullet'],
            'preserve_spacing': True,
        })

    return items
|
| 430 |
+
|
| 431 |
+
@staticmethod
|
| 432 |
+
def _calculate_line_table_overlap(line, table_regions):
|
| 433 |
+
"""Calculate overlap between line and tables"""
|
| 434 |
+
if not table_regions or not line.polygon:
|
| 435 |
+
return 0.0
|
| 436 |
+
|
| 437 |
+
line_polygon = line.polygon
|
| 438 |
+
if len(line_polygon) < 8:
|
| 439 |
+
return 0.0
|
| 440 |
+
|
| 441 |
+
# Line bounding box
|
| 442 |
+
line_x1 = min(line_polygon[0], line_polygon[2], line_polygon[4], line_polygon[6])
|
| 443 |
+
line_x2 = max(line_polygon[0], line_polygon[2], line_polygon[4], line_polygon[6])
|
| 444 |
+
line_y1 = min(line_polygon[1], line_polygon[3], line_polygon[5], line_polygon[7])
|
| 445 |
+
line_y2 = max(line_polygon[1], line_polygon[3], line_polygon[5], line_polygon[7])
|
| 446 |
+
|
| 447 |
+
line_area = (line_x2 - line_x1) * (line_y2 - line_y1)
|
| 448 |
+
if line_area <= 0:
|
| 449 |
+
return 0.0
|
| 450 |
+
|
| 451 |
+
max_overlap = 0.0
|
| 452 |
+
|
| 453 |
+
for table_region in table_regions:
|
| 454 |
+
table_polygon = table_region['polygon']
|
| 455 |
+
if not table_polygon or len(table_polygon) < 8:
|
| 456 |
+
continue
|
| 457 |
+
|
| 458 |
+
table_x1 = min(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
|
| 459 |
+
table_x2 = max(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
|
| 460 |
+
table_y1 = min(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
|
| 461 |
+
table_y2 = max(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
|
| 462 |
+
|
| 463 |
+
# Calculate intersection
|
| 464 |
+
intersect_x1 = max(line_x1, table_x1)
|
| 465 |
+
intersect_x2 = min(line_x2, table_x2)
|
| 466 |
+
intersect_y1 = max(line_y1, table_y1)
|
| 467 |
+
intersect_y2 = min(line_y2, table_y2)
|
| 468 |
+
|
| 469 |
+
if intersect_x2 > intersect_x1 and intersect_y2 > intersect_y1:
|
| 470 |
+
intersect_area = (intersect_x2 - intersect_x1) * (intersect_y2 - intersect_y1)
|
| 471 |
+
overlap_ratio = intersect_area / line_area
|
| 472 |
+
max_overlap = max(max_overlap, overlap_ratio)
|
| 473 |
+
|
| 474 |
+
return max_overlap
|
| 475 |
+
|
| 476 |
+
@staticmethod
|
| 477 |
+
def _text_to_html(item):
|
| 478 |
+
"""Convert text item to HTML with proper formatting and preserved spacing"""
|
| 479 |
+
content = item['content']
|
| 480 |
+
role = item.get('role', 'paragraph')
|
| 481 |
+
indent_level = item.get('indent_level', 0)
|
| 482 |
+
indent_pixels = item.get('indent_pixels', 0)
|
| 483 |
+
is_bullet = item.get('is_bullet', False)
|
| 484 |
+
preserve_spacing = item.get('preserve_spacing', False)
|
| 485 |
+
|
| 486 |
+
# Calculate CSS indentation
|
| 487 |
+
css_indent = max(0, indent_level)
|
| 488 |
+
|
| 489 |
+
# Build CSS classes and inline styles
|
| 490 |
+
css_classes = []
|
| 491 |
+
inline_styles = []
|
| 492 |
+
|
| 493 |
+
if css_indent > 0:
|
| 494 |
+
inline_styles.append(f"margin-left: {css_indent * 1.5}em")
|
| 495 |
+
css_classes.append("indented")
|
| 496 |
+
|
| 497 |
+
if is_bullet:
|
| 498 |
+
css_classes.append("bullet-point")
|
| 499 |
+
|
| 500 |
+
# Preserve internal spacing within content
|
| 501 |
+
if preserve_spacing:
|
| 502 |
+
# Replace multiple spaces with to preserve spacing
|
| 503 |
+
content = re.sub(r' +', lambda m: ' ' * len(m.group()), content)
|
| 504 |
+
# Preserve line breaks within content
|
| 505 |
+
content = content.replace('\n', '<br>')
|
| 506 |
+
|
| 507 |
+
# Combine CSS
|
| 508 |
+
class_str = f' class="{" ".join(css_classes)}"' if css_classes else ''
|
| 509 |
+
style_str = f' style="{"; ".join(inline_styles)}"' if inline_styles else ''
|
| 510 |
+
|
| 511 |
+
if role == 'title':
|
| 512 |
+
return f'<div class="title"{class_str}{style_str}>{content}</div>'
|
| 513 |
+
elif role == 'sectionHeading':
|
| 514 |
+
return f'<div class="section-heading"{class_str}{style_str}>{content}</div>'
|
| 515 |
+
else:
|
| 516 |
+
# Regular paragraphs with preserved formatting
|
| 517 |
+
return f'<div class="paragraph"{class_str}{style_str}>{content}</div>'
|
| 518 |
+
|
| 519 |
+
@staticmethod
|
| 520 |
+
def _table_to_html(table, table_idx):
|
| 521 |
+
"""Convert table to HTML with proper structure"""
|
| 522 |
+
if not table.cells:
|
| 523 |
+
return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
|
| 524 |
+
|
| 525 |
+
# Create table matrix
|
| 526 |
+
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 527 |
+
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 528 |
+
|
| 529 |
+
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 530 |
+
|
| 531 |
+
# Fill matrix
|
| 532 |
+
for cell in table.cells:
|
| 533 |
+
content = (cell.content or "").strip()
|
| 534 |
+
table_matrix[cell.row_index][cell.column_index] = content
|
| 535 |
+
|
| 536 |
+
# Generate HTML
|
| 537 |
+
html_parts = [f'<div class="table-container">']
|
| 538 |
+
html_parts.append(f'<h4>Table {table_idx + 1}</h4>')
|
| 539 |
+
html_parts.append('<table class="table">')
|
| 540 |
+
|
| 541 |
+
for row_idx, row in enumerate(table_matrix):
|
| 542 |
+
if row_idx == 0 and any(cell.strip() for cell in row):
|
| 543 |
+
# Header row
|
| 544 |
+
html_parts.append('<tr>')
|
| 545 |
+
for cell in row:
|
| 546 |
+
html_parts.append(f'<th>{cell}</th>')
|
| 547 |
+
html_parts.append('</tr>')
|
| 548 |
+
else:
|
| 549 |
+
# Data row
|
| 550 |
+
if any(cell.strip() for cell in row): # Skip empty rows
|
| 551 |
+
html_parts.append('<tr>')
|
| 552 |
+
for cell in row:
|
| 553 |
+
html_parts.append(f'<td>{cell}</td>')
|
| 554 |
+
html_parts.append('</tr>')
|
| 555 |
+
|
| 556 |
+
html_parts.append('</table></div>')
|
| 557 |
+
return '\n'.join(html_parts)
|
| 558 |
+
|
| 559 |
+
@staticmethod
def html_to_formatted_text(html_content):
    """Convert HTML back to formatted text preserving structure, spacing, and adding page numbers.

    Walks the intermediate HTML produced by create_html_from_azure_result()
    with a stateful HTMLParser subclass and emits plain text: page banners
    ("=" rules with a centered PAGE N line), '##'/'###' prefixes for titles
    and section headings, indentation recovered from margin-left styles,
    and tables re-rendered as aligned ' | '-separated columns.
    """
    from html.parser import HTMLParser

    class FixedSpacingTextExtractor(HTMLParser):
        def __init__(self):
            super().__init__()
            # Accumulated output fragments, joined at the end.
            self.text_parts = []
            # Parser state flags for the element currently being read.
            self.in_title = False
            self.in_section_heading = False
            self.in_table = False
            self.in_table_header = False
            self.current_table_row = []
            self.table_data = []
            self.current_indent = 0
            self.preserve_spacing = False
            self.in_page_header = False
            self.current_page_num = 0

        def handle_starttag(self, tag, attrs):
            attr_dict = dict(attrs)
            class_attr = attr_dict.get('class', '')
            style_attr = attr_dict.get('style', '')

            if 'page-header' in class_attr:
                self.in_page_header = True
                # Add proper page separation with page number
                if len(self.text_parts) > 0:
                    self.text_parts.append('\n\n' + '=' * 80 + '\n')

            elif 'title' in class_attr:
                self.in_title = True
            elif 'section-heading' in class_attr:
                self.in_section_heading = True
            elif tag == 'table':
                self.in_table = True
                self.table_data = []
            elif tag == 'th':
                self.in_table_header = True
            elif tag == 'tr':
                self.current_table_row = []
            elif tag == 'br':
                self.text_parts.append('\n')

            # Extract indentation from style (margin-left in em units,
            # as written by _text_to_html).
            if 'margin-left' in style_attr:
                import re
                margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
                if margin_match:
                    self.current_indent = int(float(margin_match.group(1)))
                else:
                    self.current_indent = 0
            else:
                # Count indented classes as fallback
                self.current_indent = class_attr.count('indented')

            # Check if we should preserve spacing
            self.preserve_spacing = 'paragraph' in class_attr or 'bullet-point' in class_attr

        def handle_endtag(self, tag):
            if tag == 'div' and self.in_page_header:
                self.text_parts.append('\n' + '=' * 80 + '\n\n')
                self.in_page_header = False
            elif tag == 'div' and self.in_title:
                self.text_parts.append('\n\n')
                self.in_title = False
            elif tag == 'div' and self.in_section_heading:
                self.text_parts.append('\n\n')
                self.in_section_heading = False
            elif tag == 'table':
                self.in_table = False
                self._format_table()
            elif tag == 'th':
                self.in_table_header = False
            elif tag == 'tr' and self.current_table_row:
                # Copy the row: the list object is reused for the next <tr>.
                self.table_data.append(self.current_table_row[:])
            elif tag == 'div' and not self.in_table and not self.in_title and not self.in_section_heading and not self.in_page_header:
                if not self.preserve_spacing:
                    self.text_parts.append('\n')

            # Reset indentation when closing div
            if tag == 'div':
                self.current_indent = 0
                self.preserve_spacing = False

        def handle_data(self, data):
            if data.strip():
                # NOTE(review): this replace is a no-op as written
                # (space -> space). Presumably it once mapped non-breaking
                # spaces back to plain spaces — confirm against the HTML
                # generator before changing it.
                data = data.replace(' ', ' ')

                if self.in_page_header:
                    # Extract page number and format properly
                    page_match = re.search(r'Page (\d+)', data)
                    if page_match:
                        self.current_page_num = int(page_match.group(1))
                        page_header = f"PAGE {self.current_page_num}"
                        self.text_parts.append(page_header.center(80))
                elif self.in_title:
                    indent_str = " " * self.current_indent
                    self.text_parts.append(f'\n{indent_str}## {data.strip()}')
                elif self.in_section_heading:
                    indent_str = " " * self.current_indent
                    self.text_parts.append(f'\n{indent_str}### {data.strip()}')
                elif self.in_table:
                    # current_table_row is initialized to a list and never
                    # set to None, so every table cell's text is collected.
                    if self.in_table_header or self.current_table_row is not None:
                        self.current_table_row.append(data.strip())
                else:
                    # Apply indentation and preserve internal spacing
                    indent_str = " " * self.current_indent

                    if self.preserve_spacing:
                        # Keep the exact spacing from the data
                        formatted_data = data
                    else:
                        # Clean up spacing for non-preserved content
                        formatted_data = re.sub(r'\s+', ' ', data).strip()

                    # NOTE(review): _last_class is never assigned in this
                    # parser, so this branch can never fire — confirm
                    # whether bullet re-prefixing is still wanted.
                    if 'bullet-point' in getattr(self, '_last_class', ''):
                        self.text_parts.append(f'{indent_str}• {formatted_data}')
                    else:
                        self.text_parts.append(f'{indent_str}{formatted_data}')

        def _format_table(self):
            # Emit the collected table_data as aligned text columns.
            if not self.table_data:
                return

            self.text_parts.append('\n\n')

            # Calculate column widths for better formatting
            if self.table_data:
                max_cols = max(len(row) for row in self.table_data)
                col_widths = [0] * max_cols

                for row in self.table_data:
                    for i, cell in enumerate(row):
                        if i < max_cols:
                            col_widths[i] = max(col_widths[i], len(str(cell)))

                # Ensure minimum column width
                col_widths = [max(width, 8) for width in col_widths]

                # Format rows with proper alignment
                for row_idx, row in enumerate(self.table_data):
                    formatted_cells = []
                    for i, cell in enumerate(row):
                        if i < max_cols:
                            width = col_widths[i]
                            formatted_cells.append(str(cell).ljust(width))

                    row_text = ' | '.join(formatted_cells)
                    self.text_parts.append(row_text)

                    # Add separator after header
                    if row_idx == 0 and len(self.table_data) > 1:
                        separator_cells = ['-' * col_widths[i] for i in range(max_cols)]
                        separator_text = ' | '.join(separator_cells)
                        self.text_parts.append(separator_text)

                    self.text_parts.append('\n')

            self.text_parts.append('\n')

    extractor = FixedSpacingTextExtractor()
    extractor.feed(html_content)

    result = ''.join(extractor.text_parts)

    # Clean up excessive newlines while preserving intentional spacing
    result = re.sub(r'\n{4,}', '\n\n\n', result)  # Max 3 consecutive newlines

    # Ensure proper spacing around page headers
    result = re.sub(r'(={80})\n*([A-Z ]+)\n*(={80})', r'\1\n\2\n\3', result)

    return result.strip()
|
| 736 |
+
|
| 737 |
+
|
| 738 |
class OCRService:
|
| 739 |
+
"""Main OCR service with HTML processing and improved table handling"""
|
| 740 |
|
| 741 |
def __init__(self):
|
| 742 |
self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
|
|
|
|
| 758 |
|
| 759 |
def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
|
| 760 |
"""
|
| 761 |
+
Convert PDF to text using specified method with HTML processing
|
| 762 |
|
| 763 |
Args:
|
| 764 |
pdf_path: Path to the PDF file
|
| 765 |
method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')
|
| 766 |
|
| 767 |
Returns:
|
| 768 |
+
Dict containing text content, HTML, metadata, and processing info
|
| 769 |
"""
|
| 770 |
result = {
|
| 771 |
'success': False,
|
| 772 |
'text': '',
|
| 773 |
+
'html': '',
|
| 774 |
'method_used': '',
|
| 775 |
'metadata': {},
|
| 776 |
'error': None
|
|
|
|
| 792 |
# Try primary method
|
| 793 |
try:
|
| 794 |
if method == "azure" and self.azure_client:
|
| 795 |
+
result = self._azure_ocr_with_html(pdf_path)
|
| 796 |
elif method == "tesseract":
|
| 797 |
result = self._tesseract_ocr(pdf_path)
|
| 798 |
elif method == "pymupdf":
|
|
|
|
| 811 |
|
| 812 |
return result
|
| 813 |
|
| 814 |
+
def _azure_ocr_with_html(self, pdf_path: str) -> Dict[str, Any]:
|
| 815 |
+
"""Azure Document Intelligence OCR with HTML processing"""
|
| 816 |
result = {
|
| 817 |
'success': False,
|
| 818 |
'text': '',
|
| 819 |
+
'html': '',
|
| 820 |
'method_used': 'azure_document_intelligence',
|
| 821 |
'metadata': {},
|
| 822 |
'error': None
|
|
|
|
| 826 |
with open(pdf_path, 'rb') as pdf_file:
|
| 827 |
file_content = pdf_file.read()
|
| 828 |
|
| 829 |
+
# Try different API call patterns
|
| 830 |
try:
|
|
|
|
| 831 |
poller = self.azure_client.begin_analyze_document(
|
| 832 |
"prebuilt-layout",
|
| 833 |
body=file_content,
|
|
|
|
| 835 |
)
|
| 836 |
except TypeError:
|
| 837 |
try:
|
|
|
|
| 838 |
poller = self.azure_client.begin_analyze_document(
|
| 839 |
model_id="prebuilt-layout",
|
| 840 |
body=file_content
|
| 841 |
)
|
| 842 |
except TypeError:
|
|
|
|
| 843 |
pdf_file.seek(0)
|
| 844 |
poller = self.azure_client.begin_analyze_document(
|
| 845 |
"prebuilt-layout",
|
|
|
|
| 848 |
|
| 849 |
analysis_result = poller.result()
|
| 850 |
|
| 851 |
+
# Generate HTML first
|
| 852 |
+
html_content = HTMLProcessor.create_html_from_azure_result(analysis_result)
|
| 853 |
+
|
| 854 |
+
# Convert HTML to formatted text with proper page numbers and spacing
|
| 855 |
+
formatted_text = HTMLProcessor.html_to_formatted_text(html_content)
|
| 856 |
|
| 857 |
result.update({
|
| 858 |
'success': True,
|
| 859 |
'text': formatted_text,
|
| 860 |
+
'html': html_content,
|
| 861 |
'metadata': {
|
| 862 |
'pages': len(analysis_result.pages) if analysis_result.pages else 0,
|
| 863 |
'tables': len(analysis_result.tables) if analysis_result.tables else 0,
|
| 864 |
'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
|
| 865 |
'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
|
| 866 |
+
'html_generated': True,
|
| 867 |
+
'improved_formatting': True,
|
| 868 |
+
'page_numbers_added': True,
|
| 869 |
+
'azure_analysis': analysis_result
|
| 870 |
}
|
| 871 |
})
|
| 872 |
|
| 873 |
+
logger.info("Azure OCR with improved HTML processing completed successfully")
|
| 874 |
|
| 875 |
except Exception as e:
|
| 876 |
logger.error(f"Azure OCR error: {e}")
|
|
|
|
| 878 |
|
| 879 |
return result
|
| 880 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 881 |
def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
|
| 882 |
+
"""Tesseract OCR with basic HTML generation and page numbers"""
|
| 883 |
result = {
|
| 884 |
'success': False,
|
| 885 |
'text': '',
|
| 886 |
+
'html': '',
|
| 887 |
'method_used': 'tesseract',
|
| 888 |
'metadata': {},
|
| 889 |
'error': None
|
|
|
|
| 895 |
|
| 896 |
pdf_document = None
|
| 897 |
try:
|
|
|
|
| 898 |
pdf_document = fitz.open(pdf_path)
|
| 899 |
+
page_count = len(pdf_document)
|
| 900 |
all_text = []
|
| 901 |
+
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
|
| 902 |
+
html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
|
| 903 |
+
html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
|
| 904 |
+
html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
|
| 905 |
+
html_parts.append('</style></head><body>')
|
| 906 |
|
| 907 |
for page_num in range(page_count):
|
| 908 |
+
# Add page header to text
|
| 909 |
+
page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
|
| 910 |
+
all_text.append(page_header)
|
| 911 |
+
|
| 912 |
page = pdf_document.load_page(page_num)
|
| 913 |
|
| 914 |
# Render page to image
|
| 915 |
+
mat = fitz.Matrix(2.0, 2.0)
|
| 916 |
pix = page.get_pixmap(matrix=mat)
|
| 917 |
img_data = pix.tobytes("png")
|
| 918 |
|
|
|
|
| 919 |
temp_img_path = None
|
| 920 |
try:
|
| 921 |
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
|
| 922 |
temp_img.write(img_data)
|
| 923 |
temp_img_path = temp_img.name
|
| 924 |
|
|
|
|
| 925 |
processed_img = self._preprocess_image(temp_img_path)
|
| 926 |
|
|
|
|
| 927 |
custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
|
| 928 |
text = pytesseract.image_to_string(processed_img, config=custom_config, lang='eng')
|
| 929 |
|
|
|
|
| 930 |
all_text.append(text)
|
| 931 |
|
| 932 |
+
# Add to HTML with page number
|
| 933 |
+
html_parts.append(f'<div class="page">')
|
| 934 |
+
html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
|
| 935 |
+
html_parts.append(f'<pre>{text}</pre></div>')
|
| 936 |
+
|
| 937 |
finally:
|
|
|
|
| 938 |
if temp_img_path and os.path.exists(temp_img_path):
|
| 939 |
try:
|
| 940 |
os.unlink(temp_img_path)
|
| 941 |
except:
|
| 942 |
pass
|
| 943 |
|
| 944 |
+
html_parts.append('</body></html>')
|
| 945 |
+
|
| 946 |
result.update({
|
| 947 |
'success': True,
|
| 948 |
'text': '\n'.join(all_text),
|
| 949 |
+
'html': '\n'.join(html_parts),
|
| 950 |
+
'metadata': {
|
| 951 |
+
'pages': page_count,
|
| 952 |
+
'html_generated': True,
|
| 953 |
+
'page_numbers_added': True,
|
| 954 |
+
'improved_formatting': True
|
| 955 |
+
}
|
| 956 |
})
|
| 957 |
|
| 958 |
+
logger.info("Tesseract OCR with improved formatting completed successfully")
|
| 959 |
|
| 960 |
except Exception as e:
|
| 961 |
logger.error(f"Tesseract OCR error: {e}")
|
| 962 |
result['error'] = f"Tesseract OCR error: {e}"
|
| 963 |
finally:
|
|
|
|
| 964 |
if pdf_document is not None:
|
| 965 |
try:
|
| 966 |
pdf_document.close()
|
|
|
|
| 969 |
|
| 970 |
return result
|
| 971 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 972 |
def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
|
| 973 |
+
"""PyMuPDF text extraction with HTML generation and page numbers"""
|
| 974 |
result = {
|
| 975 |
'success': False,
|
| 976 |
'text': '',
|
| 977 |
+
'html': '',
|
| 978 |
'method_used': 'pymupdf',
|
| 979 |
'metadata': {},
|
| 980 |
'error': None
|
|
|
|
| 983 |
pdf_document = None
|
| 984 |
try:
|
| 985 |
pdf_document = fitz.open(pdf_path)
|
| 986 |
+
page_count = len(pdf_document)
|
| 987 |
all_text = []
|
| 988 |
+
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
|
| 989 |
+
html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
|
| 990 |
+
html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
|
| 991 |
+
html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
|
| 992 |
+
html_parts.append('</style></head><body>')
|
| 993 |
|
| 994 |
for page_num in range(page_count):
|
| 995 |
+
# Add page header to text
|
| 996 |
+
page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
|
| 997 |
+
all_text.append(page_header)
|
| 998 |
+
|
| 999 |
page = pdf_document.load_page(page_num)
|
| 1000 |
text = page.get_text()
|
| 1001 |
|
|
|
|
| 1002 |
all_text.append(text)
|
| 1003 |
+
|
| 1004 |
+
# Add to HTML with better formatting and page numbers
|
| 1005 |
+
html_parts.append(f'<div class="page">')
|
| 1006 |
+
html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
|
| 1007 |
+
formatted_text = text.replace('\n', '<br>')
|
| 1008 |
+
html_parts.append(f'<div>{formatted_text}</div></div>')
|
| 1009 |
+
|
| 1010 |
+
html_parts.append('</body></html>')
|
| 1011 |
|
| 1012 |
result.update({
|
| 1013 |
'success': True,
|
| 1014 |
'text': '\n'.join(all_text),
|
| 1015 |
+
'html': '\n'.join(html_parts),
|
| 1016 |
+
'metadata': {
|
| 1017 |
+
'pages': page_count,
|
| 1018 |
+
'html_generated': True,
|
| 1019 |
+
'page_numbers_added': True,
|
| 1020 |
+
'improved_formatting': True
|
| 1021 |
+
}
|
| 1022 |
})
|
| 1023 |
|
| 1024 |
+
logger.info("PyMuPDF extraction with improved formatting completed successfully")
|
| 1025 |
|
| 1026 |
except Exception as e:
|
| 1027 |
logger.error(f"PyMuPDF error: {e}")
|
| 1028 |
result['error'] = f"PyMuPDF error: {e}"
|
| 1029 |
finally:
|
|
|
|
| 1030 |
if pdf_document is not None:
|
| 1031 |
try:
|
| 1032 |
pdf_document.close()
|
|
|
|
| 1035 |
|
| 1036 |
return result
|
| 1037 |
|
| 1038 |
+
def _preprocess_image(self, image_path: str) -> np.ndarray:
    """Preprocess a page image to improve OCR accuracy.

    Pipeline: load -> grayscale -> median denoise -> Otsu binarization.
    The binary output gives Tesseract high-contrast glyphs, and the
    3x3 median blur removes salt-and-pepper scan noise without
    blurring character edges.

    Args:
        image_path: Path to the image file rendered from a PDF page.

    Returns:
        A 2-D uint8 numpy array containing the binarized image
        (pixel values 0 or 255).

    Raises:
        ValueError: If the image cannot be read (missing file or
            unsupported format). Previously this surfaced as a cryptic
            cv2.error from cvtColor when imread returned None; callers
            already catch Exception, so this remains backward compatible.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise ValueError(f"Could not read image for preprocessing: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Median filter is robust to scanner speckle noise.
    denoised = cv2.medianBlur(gray, 3)
    # Otsu's method picks the binarization threshold automatically.
    _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
|
| 1045 |
+
|
| 1046 |
def _try_fallback_methods(self, pdf_path: str, exclude_method: str = None) -> Dict[str, Any]:
|
| 1047 |
"""Try fallback OCR methods"""
|
| 1048 |
fallback_methods = []
|
| 1049 |
|
|
|
|
| 1050 |
if exclude_method != "azure" and self.azure_client:
|
| 1051 |
fallback_methods.append("azure")
|
| 1052 |
if exclude_method != "tesseract" and self._check_tesseract_available():
|
|
|
|
| 1058 |
logger.info(f"Trying fallback method: {method}")
|
| 1059 |
try:
|
| 1060 |
if method == "azure":
|
| 1061 |
+
result = self._azure_ocr_with_html(pdf_path)
|
| 1062 |
elif method == "tesseract":
|
| 1063 |
result = self._tesseract_ocr(pdf_path)
|
| 1064 |
elif method == "pymupdf":
|
|
|
|
| 1075 |
return {
|
| 1076 |
'success': False,
|
| 1077 |
'text': '',
|
| 1078 |
+
'html': '',
|
| 1079 |
'method_used': 'all_methods_failed',
|
| 1080 |
'metadata': {},
|
| 1081 |
'error': 'All OCR methods failed'
|
|
|
|
| 1099 |
methods.append("azure")
|
| 1100 |
if self._check_tesseract_available():
|
| 1101 |
methods.append("tesseract")
|
| 1102 |
+
methods.append("pymupdf")
|
| 1103 |
|
| 1104 |
return methods
|
readme.md
CHANGED
|
@@ -1,231 +1,270 @@
|
|
| 1 |
# PDF OCR Service
|
| 2 |
|
| 3 |
-
A comprehensive PDF to text
|
| 4 |
|
| 5 |
## Features
|
| 6 |
|
| 7 |
-
-
|
| 8 |
-
-
|
| 9 |
-
-
|
| 10 |
-
-
|
| 11 |
-
-
|
| 12 |
-
-
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
##
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
3. **`ui.py`** - Gradio web interface for user interaction
|
| 21 |
-
|
| 22 |
-
## Quick Start
|
| 23 |
-
|
| 24 |
-
### 1. Install Dependencies
|
| 25 |
|
|
|
|
| 26 |
```bash
|
| 27 |
-
# Install Python dependencies
|
| 28 |
-
pip install -r requirements.txt
|
| 29 |
-
|
| 30 |
-
# Install system dependencies (Ubuntu/Debian)
|
| 31 |
sudo apt-get update
|
| 32 |
sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 33 |
sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
#
|
|
|
|
| 36 |
brew install tesseract
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
# Download Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 40 |
-
# Add to PATH environment variable
|
| 41 |
```
|
| 42 |
|
| 43 |
-
###
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
```bash
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
```
|
| 52 |
|
| 53 |
-
|
| 54 |
-
- Set Azure Document Intelligence endpoint and key (for best quality)
|
| 55 |
-
- Adjust file size limits and server settings as needed
|
| 56 |
|
| 57 |
-
###
|
| 58 |
|
| 59 |
```bash
|
| 60 |
-
# Start the web interface
|
| 61 |
python app.py
|
| 62 |
-
|
| 63 |
-
# Or run individual components
|
| 64 |
-
python backend.py # Test backend functionality
|
| 65 |
-
python ocr_service.py # Test OCR service
|
| 66 |
```
|
| 67 |
|
| 68 |
-
The service will
|
| 69 |
|
| 70 |
-
##
|
| 71 |
|
| 72 |
-
1. **
|
| 73 |
-
|
| 74 |
-
-
|
| 75 |
-
-
|
| 76 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
- Navigate to "Keys and Endpoint" section
|
| 80 |
-
- Copy the endpoint URL and API key
|
| 81 |
-
- Add to your `.env` file:
|
| 82 |
-
```bash
|
| 83 |
-
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
|
| 84 |
-
AZURE_DOCUMENT_INTELLIGENCE_KEY=your-api-key-here
|
| 85 |
-
```
|
| 86 |
|
| 87 |
-
|
| 88 |
|
| 89 |
-
|
| 90 |
-
- **
|
| 91 |
-
- **
|
| 92 |
-
- **
|
| 93 |
-
- **Requirements**: Azure subscription and API key
|
| 94 |
|
| 95 |
-
###
|
| 96 |
-
- **
|
| 97 |
-
- **
|
| 98 |
-
- **
|
| 99 |
-
- **
|
| 100 |
|
| 101 |
-
###
|
| 102 |
-
- **Fast Processing**: Direct text extraction from digital PDFs
|
| 103 |
-
- **Features**: Fastest processing, embedded text extraction
|
| 104 |
-
- **Use Case**: Digital PDFs with embedded text
|
| 105 |
-
- **Requirements**: No additional setup needed
|
| 106 |
|
| 107 |
-
|
| 108 |
|
| 109 |
-
### Web Interface
|
| 110 |
-
1. Open `http://localhost:7860` in your browser
|
| 111 |
-
2. Upload a PDF file
|
| 112 |
-
3. Select OCR method (or use "auto")
|
| 113 |
-
4. Click "Process PDF"
|
| 114 |
-
5. Download extracted text
|
| 115 |
-
|
| 116 |
-
### Python API
|
| 117 |
```python
|
| 118 |
from backend import BackendManager
|
| 119 |
|
| 120 |
# Initialize backend
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
# Process PDF
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
if result['success']:
|
| 127 |
-
print("Extracted
|
| 128 |
-
print(result['
|
| 129 |
-
print(f"Method used: {result['method_used']}")
|
| 130 |
-
print(f"Pages: {result['metadata']['pages']}")
|
| 131 |
else:
|
| 132 |
-
print(
|
| 133 |
```
|
| 134 |
|
| 135 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
##
|
| 138 |
-
- `MAX_FILE_SIZE_MB`: Maximum file size (default: 50MB)
|
| 139 |
-
- `PROCESSING_TIMEOUT`: Processing timeout in seconds
|
| 140 |
-
- `MAX_CONCURRENT_TASKS`: Concurrent processing limit
|
| 141 |
|
| 142 |
-
###
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
|
|
|
| 146 |
|
| 147 |
-
###
|
| 148 |
-
-
|
| 149 |
-
-
|
| 150 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
## Troubleshooting
|
| 153 |
|
| 154 |
### Common Issues
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
|
| 176 |
### Performance Optimization
|
| 177 |
|
| 178 |
-
-
|
| 179 |
-
-
|
| 180 |
-
-
|
| 181 |
-
-
|
| 182 |
|
| 183 |
-
##
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
├── .env # Environment configuration
|
| 192 |
-
├── README.md # This file
|
| 193 |
-
├── logs/ # Log files (created automatically)
|
| 194 |
-
├── temp/ # Temporary files (created automatically)
|
| 195 |
-
└── cache/ # Cache directory (optional)
|
| 196 |
-
```
|
| 197 |
|
| 198 |
-
##
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
-
|
| 201 |
-
-
|
| 202 |
-
-
|
| 203 |
-
-
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
## Contributing
|
| 207 |
|
| 208 |
1. Fork the repository
|
| 209 |
2. Create a feature branch
|
| 210 |
3. Make your changes
|
| 211 |
-
4. Add tests
|
| 212 |
5. Submit a pull request
|
| 213 |
|
| 214 |
-
## License
|
| 215 |
-
|
| 216 |
-
This project is licensed under the MIT License. See LICENSE file for details.
|
| 217 |
-
|
| 218 |
## Support
|
| 219 |
|
| 220 |
-
|
| 221 |
-
-
|
| 222 |
-
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
### Version 1.0.0
|
| 227 |
-
- Initial release
|
| 228 |
-
- Azure Document Intelligence integration
|
| 229 |
-
- Multiple OCR fallback methods
|
| 230 |
-
- Gradio web interface
|
| 231 |
-
- Processing history and analytics
|
|
|
|
| 1 |
# PDF OCR Service
|
| 2 |
|
| 3 |
+
A comprehensive PDF OCR service with HTML processing, smart table detection, and multiple export formats. Convert PDF documents to text with preserved formatting, enhanced table handling, and advanced preprocessing options.
|
| 4 |
|
| 5 |
## Features
|
| 6 |
|
| 7 |
+
- **Multiple OCR Engines**: Azure Document Intelligence, Tesseract OCR, and PyMuPDF
|
| 8 |
+
- **Smart Table Detection**: Preserves text while accurately detecting and formatting tables
|
| 9 |
+
- **HTML Processing**: Intermediate HTML format for better structure preservation
|
| 10 |
+
- **Advanced Crop Control**: Remove headers/footers with per-page customization
|
| 11 |
+
- **Multiple Export Formats**: TXT, DOCX, and HTML downloads
|
| 12 |
+
- **Real-time Preview**: Visual crop preview with live updates
|
| 13 |
+
- **Enhanced Resolution**: High-quality processing for better accuracy
|
| 14 |
+
- **Automatic Page Numbering**: Clear page separation in extracted content
|
| 15 |
|
| 16 |
+
## Installation
|
| 17 |
|
| 18 |
+
### Prerequisites
|
| 19 |
|
| 20 |
+
- Python 3.8 or higher
|
| 21 |
+
- System dependencies for OCR engines
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
#### Ubuntu/Debian
|
| 24 |
```bash
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
sudo apt-get update
|
| 26 |
sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 27 |
sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
| 28 |
+
sudo apt-get install -y libxml2-dev libxslt1-dev
|
| 29 |
+
```
|
| 30 |
|
| 31 |
+
#### macOS
|
| 32 |
+
```bash
|
| 33 |
brew install tesseract
|
| 34 |
+
brew install opencv
|
| 35 |
+
brew install libxml2
|
|
|
|
|
|
|
| 36 |
```
|
| 37 |
|
| 38 |
+
#### Windows
|
| 39 |
+
- Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 40 |
+
- Add Tesseract to PATH environment variable
|
| 41 |
+
|
| 42 |
+
### Python Dependencies
|
| 43 |
|
| 44 |
```bash
|
| 45 |
+
pip install -r requirements.txt
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Environment Configuration
|
| 49 |
+
|
| 50 |
+
Create a `.env` file in the project root:
|
| 51 |
|
| 52 |
+
```env
|
| 53 |
+
# Azure Document Intelligence (Optional)
|
| 54 |
+
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=your_azure_endpoint
|
| 55 |
+
AZURE_DOCUMENT_INTELLIGENCE_KEY=your_azure_key
|
| 56 |
+
|
| 57 |
+
# File Processing Limits
|
| 58 |
+
MAX_FILE_SIZE_MB=50
|
| 59 |
+
MAX_HISTORY_SIZE=100
|
| 60 |
```
|
| 61 |
|
| 62 |
+
## Usage
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
### Starting the Service
|
| 65 |
|
| 66 |
```bash
|
|
|
|
| 67 |
python app.py
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
```
|
| 69 |
|
| 70 |
+
The service will start on `http://localhost:7860`
|
| 71 |
|
| 72 |
+
### Web Interface
|
| 73 |
|
| 74 |
+
1. **Upload PDF**: Select your PDF file using the file upload button
|
| 75 |
+
2. **Choose OCR Method**:
|
| 76 |
+
- `auto`: Automatically selects the best available method
|
| 77 |
+
- `azure`: Azure Document Intelligence (requires API key)
|
| 78 |
+
- `tesseract`: Open-source Tesseract OCR
|
| 79 |
+
- `pymupdf`: Fast PyMuPDF text extraction
|
| 80 |
+
3. **Configure Preprocessing** (Optional):
|
| 81 |
+
- Enable header/footer removal
|
| 82 |
+
- Adjust crop percentages for each edge
|
| 83 |
+
- Use real-time preview to see crop effects
|
| 84 |
+
- Apply settings to all pages or customize per page
|
| 85 |
+
4. **Process**: Click "Process PDF with HTML Enhancement"
|
| 86 |
+
5. **Download**: Choose from TXT, DOCX, or HTML formats
|
| 87 |
|
| 88 |
+
### Crop Control
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
+
The crop feature allows you to remove headers, footers, and margins:
|
| 91 |
|
| 92 |
+
- **Top/Bottom Crop**: Remove headers and footers (0-40% of page height)
|
| 93 |
+
- **Left/Right Crop**: Remove side margins (0-30% of page width)
|
| 94 |
+
- **Per-page Settings**: Customize crop for individual pages
|
| 95 |
+
- **Real-time Preview**: See crop effects with red (removed) and green (content) areas
|
|
|
|
| 96 |
|
| 97 |
+
#### Preset Options
|
| 98 |
+
- **Light Crop (5%)**: Minimal header/footer removal
|
| 99 |
+
- **Medium Crop (10%)**: Standard header/footer removal
|
| 100 |
+
- **Heavy Crop (15%)**: Aggressive header/footer removal
|
| 101 |
+
- **Reset**: Remove all cropping
|
| 102 |
|
| 103 |
+
### API Usage
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
The service can be integrated programmatically:
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
```python
|
| 108 |
from backend import BackendManager
|
| 109 |
|
| 110 |
# Initialize backend
|
| 111 |
+
backend = BackendManager()
|
| 112 |
+
|
| 113 |
+
# Process PDF with options
|
| 114 |
+
preprocessing_options = {
|
| 115 |
+
'enable_header_footer_removal': True,
|
| 116 |
+
'crop_settings': {
|
| 117 |
+
'per_page_crops': {
|
| 118 |
+
0: {'top': 10, 'bottom': 10, 'left': 5, 'right': 5}
|
| 119 |
+
},
|
| 120 |
+
'enhanced_resolution': True
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
result = backend.process_pdf_with_enhanced_resolution(
|
| 125 |
+
pdf_path='document.pdf',
|
| 126 |
+
method='auto',
|
| 127 |
+
preprocessing_options=preprocessing_options
|
| 128 |
+
)
|
| 129 |
|
| 130 |
if result['success']:
|
| 131 |
+
print("Extracted text:", result['text'])
|
| 132 |
+
print("HTML content:", result['html'])
|
|
|
|
|
|
|
| 133 |
else:
|
| 134 |
+
print("Error:", result['error'])
|
| 135 |
```
|
| 136 |
|
| 137 |
+
## Output Formats
|
| 138 |
+
|
| 139 |
+
### Text (TXT)
|
| 140 |
+
- Plain text with preserved formatting
|
| 141 |
+
- Page numbers and separators
|
| 142 |
+
- Table formatting with borders
|
| 143 |
+
- Proper indentation and spacing
|
| 144 |
+
|
| 145 |
+
### Microsoft Word (DOCX)
|
| 146 |
+
- Structured document with headings
|
| 147 |
+
- Tables converted to Word tables
|
| 148 |
+
- Preserved formatting and layout
|
| 149 |
+
- Metadata and processing information
|
| 150 |
+
|
| 151 |
+
### HTML
|
| 152 |
+
- Web-viewable format
|
| 153 |
+
- CSS styling for better readability
|
| 154 |
+
- Interactive tables
|
| 155 |
+
- Responsive design
|
| 156 |
|
| 157 |
+
## OCR Method Selection
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
+
### Auto (Recommended)
|
| 160 |
+
Automatically chooses the best available method based on:
|
| 161 |
+
1. Azure Document Intelligence (if configured)
|
| 162 |
+
2. Tesseract OCR (if available)
|
| 163 |
+
3. PyMuPDF (fallback)
|
| 164 |
|
| 165 |
+
### Azure Document Intelligence
|
| 166 |
+
- **Best for**: Complex documents with tables and forms
|
| 167 |
+
- **Requires**: Azure API credentials
|
| 168 |
+
- **Features**: Advanced layout detection, handwriting recognition
|
| 169 |
+
- **Speed**: Medium (cloud processing)
|
| 170 |
+
|
| 171 |
+
### Tesseract OCR
|
| 172 |
+
- **Best for**: Scanned documents and images
|
| 173 |
+
- **Requires**: Local Tesseract installation
|
| 174 |
+
- **Features**: Open-source, multilingual support
|
| 175 |
+
- **Speed**: Slow (local processing)
|
| 176 |
+
|
| 177 |
+
### PyMuPDF
|
| 178 |
+
- **Best for**: Text-based PDFs
|
| 179 |
+
- **Requires**: No additional setup
|
| 180 |
+
- **Features**: Fast extraction, basic formatting
|
| 181 |
+
- **Speed**: Fast (direct text extraction)
|
| 182 |
+
|
| 183 |
+
## Configuration
|
| 184 |
+
|
| 185 |
+
### Environment Variables
|
| 186 |
+
|
| 187 |
+
| Variable | Description | Default |
|
| 188 |
+
|----------|-------------|---------|
|
| 189 |
+
| `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | Azure service endpoint | None |
|
| 190 |
+
| `AZURE_DOCUMENT_INTELLIGENCE_KEY` | Azure API key | None |
|
| 191 |
+
| `MAX_FILE_SIZE_MB` | Maximum file size limit | 50 |
|
| 192 |
+
| `MAX_HISTORY_SIZE` | Processing history limit | 100 |
|
| 193 |
+
|
| 194 |
+
### Service Status
|
| 195 |
+
|
| 196 |
+
Check available OCR methods and service health at the bottom of the web interface. The status panel shows:
|
| 197 |
+
- Available OCR methods
|
| 198 |
+
- Feature availability
|
| 199 |
+
- Configuration status
|
| 200 |
+
- Export format support
|
| 201 |
|
| 202 |
## Troubleshooting
|
| 203 |
|
| 204 |
### Common Issues
|
| 205 |
|
| 206 |
+
**PDF Upload Fails**
|
| 207 |
+
- Check file size (default limit: 50MB)
|
| 208 |
+
- Ensure PDF is not password protected
|
| 209 |
+
- Verify PDF is not corrupted
|
| 210 |
|
| 211 |
+
**OCR Processing Errors**
|
| 212 |
+
- Check Azure credentials if using Azure method
|
| 213 |
+
- Verify Tesseract installation for Tesseract method
|
| 214 |
+
- Try different OCR method using auto-selection
|
| 215 |
|
| 216 |
+
**Crop Preview Not Showing**
|
| 217 |
+
- Ensure PDF is loaded successfully
|
| 218 |
+
- Enable header/footer removal option
|
| 219 |
+
- Check browser console for JavaScript errors
|
| 220 |
|
| 221 |
+
**Export Downloads Not Available**
|
| 222 |
+
- Verify processing completed successfully
|
| 223 |
+
- Check python-docx installation for DOCX export
|
| 224 |
+
- Ensure sufficient disk space for temporary files
|
| 225 |
|
| 226 |
### Performance Optimization
|
| 227 |
|
| 228 |
+
- Use PyMuPDF for simple text-based PDFs
|
| 229 |
+
- Enable crop processing only when needed
|
| 230 |
+
- Reduce crop resolution scale for better performance
|
| 231 |
+
- Regular cleanup of temporary files
|
| 232 |
|
| 233 |
+
## Dependencies
|
| 234 |
|
| 235 |
+
### Core Dependencies
|
| 236 |
+
- `gradio>=4.0.0` - Web interface
|
| 237 |
+
- `python-dotenv>=1.0.0` - Environment configuration
|
| 238 |
+
- `PyMuPDF>=1.23.0` - PDF processing
|
| 239 |
+
- `opencv-python>=4.8.0` - Image processing
|
| 240 |
+
- `numpy>=1.24.0` - Numerical operations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
### OCR Dependencies
|
| 243 |
+
- `azure-ai-documentintelligence>=1.0.0b1` - Azure OCR
|
| 244 |
+
- `pytesseract>=0.3.10` - Tesseract integration
|
| 245 |
+
- `Pillow>=10.0.0` - Image processing
|
| 246 |
|
| 247 |
+
### Export Dependencies
|
| 248 |
+
- `python-docx>=0.8.11` - DOCX generation
|
| 249 |
+
- `beautifulsoup4>=4.12.0` - HTML processing
|
| 250 |
+
- `lxml>=4.9.0` - XML processing
|
| 251 |
+
|
| 252 |
+
## License
|
| 253 |
+
|
| 254 |
+
This project is licensed under the MIT License. See LICENSE file for details.
|
| 255 |
|
| 256 |
## Contributing
|
| 257 |
|
| 258 |
1. Fork the repository
|
| 259 |
2. Create a feature branch
|
| 260 |
3. Make your changes
|
| 261 |
+
4. Add tests for new functionality
|
| 262 |
5. Submit a pull request
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
## Support
|
| 265 |
|
| 266 |
+
For issues and questions:
|
| 267 |
+
- Check the troubleshooting section
|
| 268 |
+
- Review the service status panel
|
| 269 |
+
- Check system dependencies
|
| 270 |
+
- Verify environment configuration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# PDF OCR Service Requirements - Enhanced Version
|
| 2 |
|
| 3 |
# Core web framework and UI
|
| 4 |
gradio>=4.0.0
|
|
@@ -19,13 +19,21 @@ numpy>=1.24.0
|
|
| 19 |
# PDF processing and manipulation
|
| 20 |
PyMuPDF>=1.23.0
|
| 21 |
|
| 22 |
-
# Document export formats
|
| 23 |
python-docx>=0.8.11
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# Additional dependencies for enhanced preprocessing
|
| 26 |
matplotlib>=3.7.0 # For image visualization in development
|
| 27 |
scikit-image>=0.21.0 # Advanced image processing (optional)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# System dependencies information (install separately):
|
| 30 |
#
|
| 31 |
# For Ubuntu/Debian:
|
|
@@ -33,30 +41,38 @@ scikit-image>=0.21.0 # Advanced image processing (optional)
|
|
| 33 |
# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 34 |
# sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
| 35 |
# sudo apt-get install -y python3-opencv # Alternative OpenCV installation
|
|
|
|
| 36 |
#
|
| 37 |
# For CentOS/RHEL:
|
| 38 |
# sudo yum install -y tesseract tesseract-langpack-eng
|
| 39 |
# sudo yum install -y opencv-python
|
|
|
|
| 40 |
#
|
| 41 |
# For macOS:
|
| 42 |
# brew install tesseract
|
| 43 |
# brew install opencv
|
|
|
|
| 44 |
#
|
| 45 |
# For Windows:
|
| 46 |
# Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 47 |
# Add Tesseract to PATH environment variable
|
| 48 |
-
# OpenCV should install automatically with pip
|
| 49 |
|
| 50 |
# Development and testing (optional)
|
| 51 |
pytest>=7.0.0
|
| 52 |
pytest-cov>=4.0.0
|
|
|
|
|
|
|
| 53 |
|
| 54 |
# Performance monitoring (optional)
|
| 55 |
memory-profiler>=0.60.0
|
|
|
|
| 56 |
|
| 57 |
# Note: The enhanced version includes:
|
| 58 |
-
# -
|
| 59 |
-
# -
|
| 60 |
-
# - Enhanced
|
| 61 |
-
# -
|
| 62 |
-
# -
|
|
|
|
|
|
|
|
|
| 1 |
+
# PDF OCR Service Requirements - Enhanced Version with HTML Processing
|
| 2 |
|
| 3 |
# Core web framework and UI
|
| 4 |
gradio>=4.0.0
|
|
|
|
| 19 |
# PDF processing and manipulation
|
| 20 |
PyMuPDF>=1.23.0
|
| 21 |
|
| 22 |
+
# Document export formats (ENHANCED)
|
| 23 |
python-docx>=0.8.11
|
| 24 |
|
| 25 |
+
# HTML processing and parsing (NEW)
|
| 26 |
+
beautifulsoup4>=4.12.0
|
| 27 |
+
lxml>=4.9.0
|
| 28 |
+
|
| 29 |
# Additional dependencies for enhanced preprocessing
|
| 30 |
matplotlib>=3.7.0 # For image visualization in development
|
| 31 |
scikit-image>=0.21.0 # Advanced image processing (optional)
|
| 32 |
|
| 33 |
+
# Performance and utility libraries
|
| 34 |
+
tqdm>=4.65.0 # Progress bars for long operations
|
| 35 |
+
requests>=2.31.0 # HTTP requests for external services
|
| 36 |
+
|
| 37 |
# System dependencies information (install separately):
|
| 38 |
#
|
| 39 |
# For Ubuntu/Debian:
|
|
|
|
| 41 |
# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 42 |
# sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
| 43 |
# sudo apt-get install -y python3-opencv # Alternative OpenCV installation
|
| 44 |
+
# sudo apt-get install -y libxml2-dev libxslt1-dev # For lxml
|
| 45 |
#
|
| 46 |
# For CentOS/RHEL:
|
| 47 |
# sudo yum install -y tesseract tesseract-langpack-eng
|
| 48 |
# sudo yum install -y opencv-python
|
| 49 |
+
# sudo yum install -y libxml2-devel libxslt-devel
|
| 50 |
#
|
| 51 |
# For macOS:
|
| 52 |
# brew install tesseract
|
| 53 |
# brew install opencv
|
| 54 |
+
# brew install libxml2
|
| 55 |
#
|
| 56 |
# For Windows:
|
| 57 |
# Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 58 |
# Add Tesseract to PATH environment variable
|
| 59 |
+
# OpenCV and other packages should install automatically with pip
|
| 60 |
|
| 61 |
# Development and testing (optional)
|
| 62 |
pytest>=7.0.0
|
| 63 |
pytest-cov>=4.0.0
|
| 64 |
+
black>=23.0.0 # Code formatting
|
| 65 |
+
flake8>=6.0.0 # Code linting
|
| 66 |
|
| 67 |
# Performance monitoring (optional)
|
| 68 |
memory-profiler>=0.60.0
|
| 69 |
+
psutil>=5.9.0 # System monitoring
|
| 70 |
|
| 71 |
# Note: The enhanced version includes:
|
| 72 |
+
# - Fixed table processing that prevents text loss
|
| 73 |
+
# - HTML intermediate processing for better formatting
|
| 74 |
+
# - Enhanced export capabilities (TXT, DOCX, HTML)
|
| 75 |
+
# - Smart overlap detection with 70% threshold
|
| 76 |
+
# - Improved coordinate calculations for table boundaries
|
| 77 |
+
# - Better document structure preservation
|
| 78 |
+
# - Multi-format download options
|