Spaces:

Betimes-Solution
/

PDFtoDocx-OCR

Running

App Files Community

Chirapath committed on Sep 21, 2025

Commit

cf0c0b1

verified ·

1 Parent(s): 5ba08f1

Upload 11 files

Browse files

Files changed (5) hide show

app.py +171 -56
backend.py +533 -181
enhanced_indentation.py +648 -0
ocr_service.py +580 -346
requirements.txt +275 -12

app.py CHANGED Viewed

@@ -17,6 +17,7 @@ from dotenv import load_dotenv
 load_dotenv()
 from backend import BackendManager
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -25,6 +26,9 @@ logger = logging.getLogger(__name__)
 # Initialize backend manager
 backend_manager = BackendManager()
 # Check if python-docx is available
 try:
     from docx import Document
@@ -303,16 +307,16 @@ def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_
         logger.error(f"Error updating crop preview: {e}")
         return None
-def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer_removal,
-                                    crop_top, crop_bottom, crop_left, crop_right,
-                                    apply_to_all_pages, current_page_selection,
-                                    progress=gr.Progress()):
-    """Process PDF with HTML enhancement and improved table handling - FIXED"""
     if pdf_file is None:
         return "No file uploaded.", "", "", "Error: No file selected"
     try:
-        progress(0.1, desc="Initializing HTML-enhanced processing...")
         # Prepare enhanced preprocessing options
         preprocessing_options = {
@@ -321,19 +325,19 @@ def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer
             'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
         }
-        progress(0.3, desc="Processing with HTML enhancement...")
-        # Process the PDF with enhanced preprocessing
         result = backend_manager.process_pdf_with_enhanced_resolution(
             pdf_file.name, ocr_method, preprocessing_options
         )
-        progress(0.9, desc="Finalizing HTML processing...")
         progress(1.0, desc="Complete!")
         if result['success']:
             metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
-            status = f"Success: Processed using {result['method_used']} with HTML enhancement"
             # Return text, HTML, metadata, and status
             return (result['text'],
@@ -345,11 +349,11 @@ def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer
             return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
     except Exception as e:
-        logger.error(f"HTML-enhanced processing error: {e}")
         return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
 def format_enhanced_metadata(metadata, method_used):
-    """Enhanced metadata formatting with HTML processing info"""
     if not metadata:
         return f"Method used: {method_used}"
@@ -364,6 +368,15 @@ def format_enhanced_metadata(metadata, method_used):
     if metadata.get('html_processing', False):
         info_lines.append("HTML generation: Enabled")
     if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
         info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
@@ -373,6 +386,25 @@ def format_enhanced_metadata(metadata, method_used):
     if 'tables' in metadata:
         info_lines.append(f"Tables detected: {metadata['tables']}")
     if 'processing_time_seconds' in metadata:
         info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
@@ -381,8 +413,8 @@ def format_enhanced_metadata(metadata, method_used):
 def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
                              crop_top, crop_bottom, crop_left, crop_right,
                              apply_to_all_pages, current_page_selection):
-    """Prepare enhanced downloads with HTML processing"""
-    text, html, metadata, status = process_pdf_with_html_enhancement(
         pdf_file, method, enable_header_footer_removal,
         crop_top, crop_bottom, crop_left, crop_right,
         apply_to_all_pages, current_page_selection
@@ -417,59 +449,95 @@ def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
                gr.update(visible=False))
 def get_enhanced_method_info(method):
-    """Get information about selected OCR method with HTML processing"""
     method_descriptions = {
-        "auto": "**Auto Selection**: Automatically chooses the best available method with HTML processing and enhanced table handling.",
-        "azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with HTML generation, layout preservation, and smart table detection.",
-        "tesseract": "**Tesseract OCR**: Open-source OCR with HTML output, enhanced image preprocessing, and resolution scaling.",
-        "pymupdf": "**PyMuPDF**: Fast extraction enhanced with HTML processing and improved formatting preservation."
     }
     return method_descriptions.get(method, "Select a method to see details.")
 def check_enhanced_service_status():
-    """Check and display enhanced service status"""
     available_methods = backend_manager.get_available_methods()
-    status_lines = ["**Available OCR Methods (Enhanced with HTML Processing):**"]
     if "azure" in available_methods:
-        status_lines.append("✓ Azure Document Intelligence - Ready (HTML + Tables)")
     else:
-        status_lines.append("✗ Azure Document Intelligence - Not configured")
     if "tesseract" in available_methods:
-        status_lines.append("✓ Tesseract OCR - Ready (HTML Enhanced)")
     else:
-        status_lines.append("✗ Tesseract OCR - Not available")
     if "pymupdf" in available_methods:
-        status_lines.append("✓ PyMuPDF - Ready (HTML Enhanced)")
     else:
-        status_lines.append("✗ PyMuPDF - Not available")
     # Add enhanced features status
-    status_lines.append("✓ HTML Processing - Available")
-    status_lines.append("✓ Enhanced Table Handling - Available")
-    status_lines.append("✓ Smart Text Preservation - Available")
-    status_lines.append("✓ Multi-Page Crop Preview - Available")
-    status_lines.append("✓ Per-Page Crop Customization - Available")
     if HAS_DOCX_SUPPORT:
-        status_lines.append("✓ Enhanced DOCX Export - Available")
     else:
-        status_lines.append("✗ Enhanced DOCX Export - Install python-docx to enable")
-    status_lines.append("✓ HTML File Export - Available")
-    status_lines.append("✓ Enhanced Text Export - Available")
     return "\n".join(status_lines)
 def create_enhanced_interface():
-    """Create enhanced Gradio interface with improved layout and HTML processing"""
     with gr.Blocks(
-        title="PDF OCR Service - Enhanced with HTML Processing",
         theme=gr.themes.Soft(),
         css="""
         .main-header { text-align: center; margin-bottom: 2rem; }
@@ -484,14 +552,14 @@ def create_enhanced_interface():
         gr.HTML("""
         <div class="main-header">
-            <h1>PDF OCR Service - Enhanced with HTML Processing</h1>
-            <p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, and format preservation</p>
         </div>
         """)
         # Instructions at the top
         with gr.Group(elem_classes=["instructions-panel"]):
-            gr.HTML("<h3>Instructions & Features</h3>")
             gr.HTML("""
             <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
                 <h4>How to Use:</h4>
@@ -499,19 +567,66 @@ def create_enhanced_interface():
                     <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
                     <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
                     <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
-                    <li><strong>Process:</strong> Click the process button to extract text with HTML enhancement</li>
-                    <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format</li>
                 </ol>
-                <h4>Enhanced Features:</h4>
                 <ul>
                     <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
                     <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
-                    <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads</li>
                     <li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
                     <li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
-                    <li><strong>Page Numbers:</strong> Automatic page numbering in extracted content</li>
-                    <li><strong>Proper Indentation:</strong> Preserved spacing and formatting</li>
                 </ul>
             </div>
             """)
@@ -543,7 +658,7 @@ def create_enhanced_interface():
                         choices=["auto", "azure", "tesseract", "pymupdf"],
                         value="auto",
                         label="OCR Method",
-                        info="Choose OCR method (all enhanced with HTML processing)"
                     )
                     # Method information display
@@ -628,7 +743,7 @@ def create_enhanced_interface():
             # Process button
             process_btn = gr.Button(
-                "Process PDF with HTML Enhancement",
                 variant="primary",
                 size="lg"
             )
@@ -666,8 +781,8 @@ def create_enhanced_interface():
                     # Extracted text output
                     text_output = gr.Textbox(
-                        label="Extracted Text (Enhanced with Proper Formatting and Page Numbers)",
-                        placeholder="Processed text with HTML enhancement and preserved formatting will appear here...",
                         lines=20,
                         max_lines=30,
                         interactive=False,
@@ -676,9 +791,9 @@ def create_enhanced_interface():
                     # Metadata information
                     metadata_output = gr.Textbox(
-                        label="Processing Information",
                         interactive=False,
-                        lines=4
                     )
                     # Enhanced download buttons
@@ -689,7 +804,7 @@ def create_enhanced_interface():
                             variant="secondary"
                         )
                         download_docx_btn = gr.DownloadButton(
-                            "Download Enhanced DOCX",
                             visible=False,
                             variant="secondary"
                         )
@@ -701,7 +816,7 @@ def create_enhanced_interface():
         # Service Status at the bottom
         with gr.Group(elem_classes=["status-box"]):
-            gr.HTML("<h4>Service Status</h4>")
             service_status = gr.Markdown(
                 value=check_enhanced_service_status()
             )
@@ -793,7 +908,7 @@ def create_enhanced_interface():
     return interface
 def launch_enhanced_ui():
-    """Launch the enhanced Gradio interface with HTML processing"""
     try:
         interface = create_enhanced_interface()
         interface.launch(

 load_dotenv()
 from backend import BackendManager
+from enhanced_indentation import EnhancedIndentationDetector
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 # Initialize backend manager
 backend_manager = BackendManager()
+# Initialize enhanced indentation detector
+indent_detector = EnhancedIndentationDetector()
 # Check if python-docx is available
 try:
     from docx import Document
         logger.error(f"Error updating crop preview: {e}")
         return None
+def process_pdf_with_enhanced_indentation(pdf_file, ocr_method, enable_header_footer_removal,
+                                        crop_top, crop_bottom, crop_left, crop_right,
+                                        apply_to_all_pages, current_page_selection,
+                                        progress=gr.Progress()):
+    """Process PDF with enhanced indentation detection, text classification, and comprehensive formatting"""
     if pdf_file is None:
         return "No file uploaded.", "", "", "Error: No file selected"
     try:
+        progress(0.1, desc="Initializing enhanced processing with comprehensive indentation detection and intelligent text classification...")
         # Prepare enhanced preprocessing options
         preprocessing_options = {
             'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
         }
+        progress(0.3, desc="Processing with enhanced indentation detection and text classification...")
+        # Process the PDF with enhanced preprocessing, indentation detection, and text classification
         result = backend_manager.process_pdf_with_enhanced_resolution(
             pdf_file.name, ocr_method, preprocessing_options
         )
+        progress(0.9, desc="Finalizing enhanced processing...")
         progress(1.0, desc="Complete!")
         if result['success']:
             metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
+            status = f"Success: Processed using {result['method_used']} with comprehensive indentation detection and intelligent text classification"
             # Return text, HTML, metadata, and status
             return (result['text'],
             return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
     except Exception as e:
+        logger.error(f"Enhanced processing error: {e}")
         return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
 def format_enhanced_metadata(metadata, method_used):
+    """Enhanced metadata formatting with comprehensive indentation processing and text classification info"""
     if not metadata:
         return f"Method used: {method_used}"
     if metadata.get('html_processing', False):
         info_lines.append("HTML generation: Enabled")
+    if metadata.get('comprehensive_indentation', False):
+        info_lines.append("Comprehensive indentation detection: Enabled")
+    if metadata.get('intelligent_text_classification', False):
+        info_lines.append("Intelligent text classification: Enabled")
+    if metadata.get('parenthetical_patterns_supported', False):
+        info_lines.append("Parenthetical patterns: Supported (Arabic, Thai, Letters, Roman)")
     if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
         info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
     if 'tables' in metadata:
         info_lines.append(f"Tables detected: {metadata['tables']}")
+    # Document structure analysis information
+    if 'document_structure_analysis' in metadata:
+        analysis = metadata['document_structure_analysis']
+        if not analysis.get('analysis_failed', False):
+            info_lines.append(f"Patterned lines detected: {analysis.get('patterned_lines', 0)}")
+            info_lines.append(f"Maximum indentation level: {analysis.get('max_level', 0)}")
+            info_lines.append(f"Pattern coverage: {analysis.get('coverage_percentage', 0):.1f}%")
+            # Text classification results
+            if 'text_classification' in analysis:
+                classification = analysis['text_classification']
+                info_lines.append(f"Headers detected: {analysis.get('header_count', 0)}")
+                info_lines.append(f"Paragraphs detected: {analysis.get('paragraph_count', 0)}")
+                info_lines.append(f"List items detected: {analysis.get('list_item_count', 0)}")
+            if analysis.get('dominant_patterns'):
+                dominant = analysis['dominant_patterns'][0][0] if analysis['dominant_patterns'] else 'None'
+                info_lines.append(f"Dominant pattern: {dominant}")
     if 'processing_time_seconds' in metadata:
         info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
 def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
                              crop_top, crop_bottom, crop_left, crop_right,
                              apply_to_all_pages, current_page_selection):
+    """Prepare enhanced downloads with comprehensive indentation processing and text classification"""
+    text, html, metadata, status = process_pdf_with_enhanced_indentation(
         pdf_file, method, enable_header_footer_removal,
         crop_top, crop_bottom, crop_left, crop_right,
         apply_to_all_pages, current_page_selection
                gr.update(visible=False))
 def get_enhanced_method_info(method):
+    """Get information about selected OCR method with comprehensive indentation processing and text classification"""
     method_descriptions = {
+        "auto": "**Auto Selection**: Automatically chooses the best available method with comprehensive indentation detection, intelligent text classification, HTML processing, enhanced pattern recognition for hierarchical numbering (including parenthetical patterns like (1), (๑), (a)), bullets, and multi-language support.",
+        "azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with comprehensive indentation detection, intelligent text classification, HTML generation, layout preservation, smart table detection, and support for complex document structures including hierarchical numbering and parenthetical patterns.",
+        "tesseract": "**Tesseract OCR**: Open-source OCR enhanced with comprehensive indentation detection, intelligent text classification, HTML output, advanced image preprocessing, resolution scaling, and pattern recognition for various numbering styles including parenthetical patterns and bullet points.",
+        "pymupdf": "**PyMuPDF**: Fast extraction enhanced with comprehensive indentation detection, intelligent text classification, HTML processing, improved formatting preservation, and pattern recognition for maintaining document structure and hierarchy including parenthetical numbering."
     }
     return method_descriptions.get(method, "Select a method to see details.")
 def check_enhanced_service_status():
+    """Check and display enhanced service status with indentation detection and text classification capabilities"""
     available_methods = backend_manager.get_available_methods()
+    status_lines = ["**Available OCR Methods (Enhanced with Comprehensive Indentation Detection & Text Classification):**"]
     if "azure" in available_methods:
+        status_lines.append("✅ Azure Document Intelligence - Ready (HTML + Tables + Comprehensive Indentation + Text Classification)")
     else:
+        status_lines.append("❌ Azure Document Intelligence - Not configured")
     if "tesseract" in available_methods:
+        status_lines.append("✅ Tesseract OCR - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification)")
     else:
+        status_lines.append("❌ Tesseract OCR - Not available")
     if "pymupdf" in available_methods:
+        status_lines.append("✅ PyMuPDF - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification)")
     else:
+        status_lines.append("❌ PyMuPDF - Not available")
     # Add enhanced features status
+    status_lines.append("")
+    status_lines.append("**Comprehensive Indentation Detection Features:**")
+    status_lines.append("✅ Hierarchical Decimal Numbering (1.1.1.1.1...)")
+    status_lines.append("✅ Mixed Hierarchical Numbering (1.2.a.i.A...)")
+    status_lines.append("✅ Legal Numbering (1.1.1(a)(i))")
+    status_lines.append("✅ Outline Numbering (I.A.1.a.i.)")
+    status_lines.append("✅ Section Numbering (§1.2.3, Article 1.1.1)")
+    status_lines.append("✅ Parenthetical Arabic Numerals ((1), (2), (3))")
+    status_lines.append("✅ Parenthetical Thai Numerals ((๑), (๒), (๓))")
+    status_lines.append("✅ Parenthetical Letters ((a), (b), (A), (B))")
+    status_lines.append("✅ Parenthetical Roman Numerals ((i), (ii), (I), (II))")
+    status_lines.append("✅ Parenthetical Thai Letters ((ก), (ข), (ค))")
+    status_lines.append("✅ Thai Script Support (มาตรา, ข้อ, ก.ข.ค.)")
+    status_lines.append("✅ Multiple Bullet Styles (•◦▪→ and more)")
+    status_lines.append("✅ Checkbox Items ([x], [ ], [✓])")
+    status_lines.append("✅ Roman Numerals (I.II.III, i.ii.iii)")
+    status_lines.append("✅ Letter Lists (A.B.C, a.b.c)")
+    status_lines.append("✅ Space-based Indentation Detection")
+    status_lines.append("✅ Priority-based Pattern Matching")
+    status_lines.append("")
+    status_lines.append("**Intelligent Text Classification Features:**")
+    status_lines.append("✅ Header Detection (title case, all caps, short lines)")
+    status_lines.append("✅ Paragraph Classification (long text, proper punctuation)")
+    status_lines.append("✅ List Item Recognition (patterned content)")
+    status_lines.append("✅ Context-aware Analysis (position, font size)")
+    status_lines.append("✅ Confidence Scoring")
+    status_lines.append("✅ Document Structure Analysis")
+    status_lines.append("")
+    status_lines.append("**Enhanced Processing Features:**")
+    status_lines.append("✅ HTML Processing - Available")
+    status_lines.append("✅ Enhanced Table Handling - Available")
+    status_lines.append("✅ Smart Text Preservation - Available")
+    status_lines.append("✅ Multi-Page Crop Preview - Available")
+    status_lines.append("✅ Per-Page Crop Customization - Available")
+    status_lines.append("✅ Document Structure Analysis - Available")
     if HAS_DOCX_SUPPORT:
+        status_lines.append("✅ Enhanced DOCX Export - Available (with indentation formatting)")
     else:
+        status_lines.append("❌ Enhanced DOCX Export - Install python-docx to enable")
+    status_lines.append("✅ HTML File Export - Available")
+    status_lines.append("✅ Enhanced Text Export - Available")
+    # Add pattern detection statistics
+    pattern_count = len(indent_detector.patterns)
+    status_lines.append(f"✅ Pattern Detection Engine - {pattern_count} patterns supported")
     return "\n".join(status_lines)
 def create_enhanced_interface():
+    """Create enhanced Gradio interface with comprehensive indentation detection and text classification"""
     with gr.Blocks(
+        title="PDF OCR Service - Enhanced with Comprehensive Indentation Detection & Text Classification",
         theme=gr.themes.Soft(),
         css="""
         .main-header { text-align: center; margin-bottom: 2rem; }
         gr.HTML("""
         <div class="main-header">
+            <h1>PDF OCR Service - Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification</h1>
+            <p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, comprehensive indentation pattern recognition including parenthetical patterns like (1), (๑), (a), and intelligent text classification for headers, paragraphs, and list items</p>
         </div>
         """)
         # Instructions at the top
         with gr.Group(elem_classes=["instructions-panel"]):
+            gr.HTML("<h3>Instructions & Enhanced Features</h3>")
             gr.HTML("""
             <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
                 <h4>How to Use:</h4>
                     <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
                     <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
                     <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
+                    <li><strong>Process:</strong> Click the process button to extract text with comprehensive indentation detection and text classification</li>
+                    <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format with preserved formatting</li>
                 </ol>
+                <h4>Comprehensive Indentation Detection & Text Classification Features:</h4>
+                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
+                    <div>
+                        <strong>Hierarchical Numbering:</strong>
+                        <ul>
+                            <li>Decimal: 1.1.1.1.1...</li>
+                            <li>Mixed: 1.2.a.i.A...</li>
+                            <li>Legal: 1.1.1(a)(i)</li>
+                            <li>Outline: I.A.1.a.i.</li>
+                            <li>Section: §1.2.3, Article 1.1.1</li>
+                        </ul>
+                    </div>
+                    <div>
+                        <strong>Parenthetical Patterns:</strong>
+                        <ul>
+                            <li>Arabic: (1), (2), (3)</li>
+                            <li>Thai Numerals: (๑), (๒), (๓)</li>
+                            <li>Letters: (a), (b), (A), (B)</li>
+                            <li>Roman: (i), (ii), (I), (II)</li>
+                            <li>Thai Letters: (ก), (ข), (ค)</li>
+                        </ul>
+                    </div>
+                </div>
+                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
+                    <div>
+                        <strong>Multi-Language & Symbols:</strong>
+                        <ul>
+                            <li>Thai Script: มาตรา, ข้อ, ก.ข.ค.</li>
+                            <li>Bullets: •◦▪→ and 20+ more</li>
+                            <li>Roman: I.II.III, i.ii.iii</li>
+                            <li>Letters: A.B.C, a.b.c</li>
+                            <li>Checkboxes: [x], [ ], [✓]</li>
+                        </ul>
+                    </div>
+                    <div>
+                        <strong>Intelligent Text Classification:</strong>
+                        <ul>
+                            <li>Header Detection: Title case, all caps, short lines</li>
+                            <li>Paragraph Recognition: Long text, proper punctuation</li>
+                            <li>List Item Identification: Patterned content</li>
+                            <li>Context Analysis: Position, font size, formatting</li>
+                            <li>Confidence Scoring: Reliability assessment</li>
+                        </ul>
+                    </div>
+                </div>
+                <h4>Technical Enhancements:</h4>
                 <ul>
                     <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
                     <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
+                    <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads with preserved indentation</li>
                     <li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
                     <li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
+                    <li><strong>Document Analysis:</strong> Automatic structure detection and statistics</li>
+                    <li><strong>Priority Pattern Matching:</strong> Intelligent pattern detection with priority ranking</li>
+                    <li><strong>Text Classification:</strong> Automated header, paragraph, and list item detection</li>
                 </ul>
             </div>
             """)
                         choices=["auto", "azure", "tesseract", "pymupdf"],
                         value="auto",
                         label="OCR Method",
+                        info="Choose OCR method (all enhanced with comprehensive indentation detection and text classification)"
                     )
                     # Method information display
             # Process button
             process_btn = gr.Button(
+                "Process PDF with Comprehensive Indentation Detection & Text Classification",
                 variant="primary",
                 size="lg"
             )
                     # Extracted text output
                     text_output = gr.Textbox(
+                        label="Extracted Text (Enhanced with Comprehensive Indentation Detection & Text Classification)",
+                        placeholder="Processed text with comprehensive indentation detection, intelligent text classification, HTML enhancement, and preserved formatting will appear here...",
                         lines=20,
                         max_lines=30,
                         interactive=False,
                     # Metadata information
                     metadata_output = gr.Textbox(
+                        label="Processing Information & Document Analysis",
                         interactive=False,
+                        lines=8
                     )
                     # Enhanced download buttons
                             variant="secondary"
                         )
                         download_docx_btn = gr.DownloadButton(
+                            "Download Enhanced DOCX (with Indentation & Classification)",
                             visible=False,
                             variant="secondary"
                         )
         # Service Status at the bottom
         with gr.Group(elem_classes=["status-box"]):
+            gr.HTML("<h4>Service Status & Capabilities</h4>")
             service_status = gr.Markdown(
                 value=check_enhanced_service_status()
             )
     return interface
 def launch_enhanced_ui():
+    """Launch the enhanced Gradio interface with comprehensive indentation detection and text classification"""
     try:
         interface = create_enhanced_interface()
         interface.launch(

backend.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Backend Management Module - FIXED VERSION with Corrected Crop Processing
 Coordinates between UI and OCR services, handles file management and preprocessing
 """
 import re
@@ -14,24 +14,34 @@ from datetime import datetime
 import cv2
 import numpy as np
 import fitz  # PyMuPDF
 # Load environment variables
 from dotenv import load_dotenv
 load_dotenv()
 from ocr_service import OCRService
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-class DocumentExporter:
-    """Advanced document export with HTML-based formatting"""
     @staticmethod
     def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
-        """Create enhanced TXT file with improved formatting"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         temp_file = tempfile.NamedTemporaryFile(
             suffix=f'_extracted_text_{timestamp}.txt',
@@ -42,8 +52,8 @@ class DocumentExporter:
         try:
             # Add header
-            temp_file.write("PDF OCR Extraction Results - Enhanced with HTML Processing\n")
-            temp_file.write("=" * 70 + "\n\n")
             # Add metadata
             if metadata_info:
@@ -53,11 +63,22 @@ class DocumentExporter:
             # Add timestamp
             temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
-            temp_file.write("=" * 70 + "\n\n")
             # Add main content
-            temp_file.write("Extracted Text (Formatted):\n")
-            temp_file.write("-" * 30 + "\n\n")
             temp_file.write(text_content)
             temp_file.close()
@@ -68,67 +89,57 @@ class DocumentExporter:
             temp_file.close()
             raise
-    @staticmethod
-    def create_enhanced_docx_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
-        """Create enhanced DOCX file from HTML content with proper spacing and indentation"""
         try:
-            from docx import Document
-            from docx.shared import Inches, Pt, RGBColor
-            from docx.enum.text import WD_ALIGN_PARAGRAPH
-            from docx.enum.table import WD_TABLE_ALIGNMENT
-            from docx.oxml.shared import OxmlElement, qn
-            from html.parser import HTMLParser
-            # Enhanced HTML to DOCX parser with spacing preservation
             class EnhancedDOCXHTMLParser(HTMLParser):
-                def __init__(self, doc):
                     super().__init__()
                     self.doc = doc
                     self.current_paragraph = None
-                    self.current_run = None
                     self.in_table = False
-                    self.current_table = None
-                    self.current_row = None
-                    self.current_cell = None
                     self.table_data = []
                     self.current_table_row = []
-                    self.current_indent_em = 0
-                    self.is_bold = False
-                    self.is_title = False
-                    self.is_heading = False
-                    self.is_bullet_point = False
                 def handle_starttag(self, tag, attrs):
                     attr_dict = dict(attrs)
                     class_attr = attr_dict.get('class', '')
-                    style_attr = attr_dict.get('style', '')
-                    if tag == 'div' and 'page' in class_attr:
-                        # Add minimal page separation (just paragraph spacing, no page break)
                         if hasattr(self, 'has_content'):
-                            # Add just 2 line breaks worth of spacing
                             self.doc.add_paragraph()
                             self.doc.add_paragraph()
                         self.has_content = True
-                    elif tag == 'div' and 'page-header' in class_attr:
                         self.current_paragraph = self.doc.add_heading(level=1)
                         self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
-                    elif tag == 'div' and 'title' in class_attr:
                         self.current_paragraph = self.doc.add_heading(level=1)
-                        self.is_title = True
-                        self._apply_spacing_from_style(style_attr)
-                    elif tag == 'div' and 'section-heading' in class_attr:
                         self.current_paragraph = self.doc.add_heading(level=2)
-                        self.is_heading = True
-                        self._apply_spacing_from_style(style_attr)
                     elif tag == 'div' and 'paragraph' in class_attr:
                         self.current_paragraph = self.doc.add_paragraph()
-                        self.is_bullet_point = 'bullet-point' in class_attr
-                        self._apply_spacing_from_style(style_attr)
                     elif tag == 'table':
                         self.in_table = True
@@ -137,47 +148,81 @@ class DocumentExporter:
                     elif tag == 'tr':
                         self.current_table_row = []
-                    elif tag == 'th' or tag == 'td':
-                        pass  # Will be handled in handle_data
                     elif tag == 'br':
                         if self.current_paragraph:
                             self.current_paragraph.add_run().add_break()
-                def _apply_spacing_from_style(self, style_attr):
-                    """Apply spacing and indentation from HTML style to DOCX paragraph"""
                     if not self.current_paragraph:
                         return
-                    # Extract margin-left for indentation
-                    import re
-                    margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
-                    if margin_match:
-                        em_value = float(margin_match.group(1))
-                        # Convert em to inches (1em ≈ 12pt, 72pt = 1 inch)
-                        indent_inches = (em_value * 12) / 72
                         self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)
-                        # For bullet points, add hanging indent
-                        if self.is_bullet_point:
-                            self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)
-                    # Set line spacing for better readability
-                    from docx.shared import Length
                     self.current_paragraph.paragraph_format.line_spacing = 1.15
-                    # Add appropriate spacing after paragraphs
-                    self.current_paragraph.paragraph_format.space_after = Pt(6)
                 def handle_endtag(self, tag):
-                    if tag == 'div' and (self.is_title or self.is_heading):
-                        self.is_title = False
-                        self.is_heading = False
-                        self.current_paragraph = None
-                    elif tag == 'div' and self.current_paragraph and not self.in_table:
-                        self.is_bullet_point = False
                         self.current_paragraph = None
                     elif tag == 'table':
                         self.in_table = False
@@ -189,28 +234,123 @@ class DocumentExporter:
                 def handle_data(self, data):
                     if data.strip():
-                        # Convert &nbsp; back to regular spaces
                         data = data.replace('&nbsp;', ' ')
                         if self.in_table:
                             self.current_table_row.append(data.strip())
                         elif self.current_paragraph is not None:
-                            run = self.current_paragraph.add_run(data)
-                            if self.is_title:
                                 run.bold = True
                                 run.font.size = Pt(16)
-                            elif self.is_heading:
                                 run.bold = True
                                 run.font.size = Pt(14)
                             else:
-                                # Regular text formatting
-                                run.font.size = Pt(11)
                 def _create_enhanced_docx_table(self):
                     if not self.table_data:
                         return
-                    # Create table with proper formatting
                     rows = len(self.table_data)
                     cols = max(len(row) for row in self.table_data) if self.table_data else 1
@@ -218,10 +358,7 @@ class DocumentExporter:
                     table.style = 'Table Grid'
                     table.alignment = WD_TABLE_ALIGNMENT.LEFT
-                    # Set table margins
-                    table.autofit = False
-                    # Fill table data with proper formatting
                     for row_idx, row_data in enumerate(self.table_data):
                         table_row = table.rows[row_idx]
                         for col_idx, cell_data in enumerate(row_data):
@@ -235,15 +372,19 @@ class DocumentExporter:
                                         for run in paragraph.runs:
                                             run.bold = True
                                             run.font.size = Pt(10)
                                         paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                                 else:
                                     # Regular data cells
                                     for paragraph in cell.paragraphs:
                                         for run in paragraph.runs:
                                             run.font.size = Pt(10)
-                                # Set cell margins for better spacing
-                                cell.vertical_alignment = WD_ALIGN_PARAGRAPH.LEFT
                     # Add spacing after table
                     self.doc.add_paragraph()
@@ -251,14 +392,14 @@ class DocumentExporter:
             # Create DOCX document
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             temp_file = tempfile.NamedTemporaryFile(
-                suffix=f'_extracted_document_{timestamp}.docx',
                 delete=False
             )
             temp_file.close()
             doc = Document()
-            # Set document margins for better spacing
             sections = doc.sections
             for section in sections:
                 section.top_margin = Inches(1)
@@ -266,84 +407,65 @@ class DocumentExporter:
                 section.left_margin = Inches(1)
                 section.right_margin = Inches(1)
-            # Title with better formatting
             title = doc.add_heading('PDF OCR Extraction Results', 0)
             title.alignment = WD_ALIGN_PARAGRAPH.CENTER
-            # Add subtitle with enhanced styling
             subtitle_para = doc.add_paragraph()
-            subtitle_run = subtitle_para.add_run('Enhanced with HTML Processing and Preserved Formatting')
             subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
             subtitle_run.italic = True
             subtitle_run.font.size = Pt(12)
             subtitle_run.font.color.rgb = RGBColor(102, 102, 102)
-            # Metadata section with better formatting
             if metadata_info:
                 doc.add_heading('Processing Information', level=1)
                 meta_para = doc.add_paragraph()
                 meta_run = meta_para.add_run(metadata_info)
                 meta_run.font.size = Pt(10)
                 meta_para.style = 'Intense Quote'
-                doc.add_paragraph()  # Add spacing
-            # Process HTML content with enhanced spacing
             doc.add_heading('Extracted Content', level=1)
-            if html_content and '<table' in html_content:
-                # Parse HTML and convert to DOCX with spacing preservation
-                parser = EnhancedDOCXHTMLParser(doc)
                 parser.feed(html_content)
             else:
-                # Fallback to text content with enhanced formatting
-                paragraphs = text_content.split('\n\n')
-                for para in paragraphs:
-                    if para.strip():
-                        if para.strip().startswith('==='):
-                            # Page headers with minimal separation
-                            page_header = doc.add_heading(para.strip(), level=1)
-                            page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
-                        elif para.strip().startswith('#'):
-                            # Titles
-                            title_text = para.strip().lstrip('#').strip()
-                            title_para = doc.add_heading(title_text, level=1)
-                        elif para.strip().startswith('##'):
-                            # Section headings
-                            heading_text = para.strip().lstrip('#').strip()
-                            heading_para = doc.add_heading(heading_text, level=2)
-                        else:
-                            # Regular paragraphs with spacing preservation
-                            lines = para.split('\n')
-                            for line in lines:
-                                if line.strip():
-                                    para_element = doc.add_paragraph()
-                                    # Calculate indentation from leading spaces
-                                    leading_spaces = len(line) - len(line.lstrip())
-                                    if leading_spaces > 0:
-                                        indent_level = leading_spaces // 2  # 2 spaces = 1 indent level
-                                        para_element.paragraph_format.left_indent = Inches(0.5 * indent_level)
-                                    # Add the text content
-                                    run = para_element.add_run(line.strip())
-                                    run.font.size = Pt(11)
-                                    # Set line spacing
-                                    para_element.paragraph_format.line_spacing = 1.15
-                                    para_element.paragraph_format.space_after = Pt(3)
-            # Enhanced footer
             footer_section = doc.sections[0]
             footer = footer_section.footer
             footer_para = footer.paragraphs[0]
-            footer_para.text = f"Generated by Enhanced PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
             footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
             footer_run = footer_para.runs[0]
-            footer_run.font.size = Pt(9)
             footer_run.font.color.rgb = RGBColor(128, 128, 128)
             doc.save(temp_file.name)
-            logger.info(f"Enhanced DOCX file with proper spacing created: {temp_file.name}")
             return temp_file.name
         except ImportError:
@@ -356,9 +478,99 @@ class DocumentExporter:
                 pass
             raise
     @staticmethod
     def create_html_file(html_content: str, metadata_info: str = "") -> str:
-        """Create standalone HTML file"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         temp_file = tempfile.NamedTemporaryFile(
             suffix=f'_extracted_document_{timestamp}.html',
@@ -368,26 +580,80 @@ class DocumentExporter:
         )
         try:
-            # Enhanced HTML with better styling
-            enhanced_html = html_content.replace(
-                '<style>',
-                '''<style>
-                    body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; margin: 20px; background-color: #f9f9f9; }
-                    .container { max-width: 1200px; margin: 0 auto; background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
-                    .header { text-align: center; margin-bottom: 30px; border-bottom: 3px solid #2c3e50; padding-bottom: 20px; }
-                    .metadata { background-color: #ecf0f1; padding: 15px; border-radius: 5px; margin-bottom: 25px; border-left: 4px solid #3498db; }
-                '''
-            )
-            # Wrap content in container
-            if '<body>' in enhanced_html:
                 enhanced_html = enhanced_html.replace(
                     '<body>',
                     '''<body>
                     <div class="container">
                     <div class="header">
                         <h1>PDF OCR Extraction Results</h1>
-                        <p>Enhanced with HTML Processing and Format Preservation</p>
                     </div>''' +
                     (f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
                 )
@@ -404,23 +670,24 @@ class DocumentExporter:
 class BackendManager:
-    """Enhanced backend manager with FIXED crop processing and advanced export capabilities"""
     def __init__(self):
         self.ocr_service = OCRService()
         self.processing_history = []
         self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100))
         # Create directories for temporary files and logs
-        self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
         self.temp_dir.mkdir(exist_ok=True)
-        logger.info("Enhanced backend manager with fixed crop processing initialized successfully")
     def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
                                            preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
         """
-        Process PDF with enhanced resolution and HTML generation
         Args:
             pdf_path: Path to the PDF file
@@ -428,7 +695,7 @@ class BackendManager:
             preprocessing_options: Dictionary containing preprocessing settings
         Returns:
-            Dict containing processing results with HTML content
         """
         start_time = datetime.now()
@@ -460,7 +727,7 @@ class BackendManager:
         # Generate file hash for tracking
         file_hash = self._calculate_file_hash(pdf_path)
-        logger.info(f"Processing PDF with enhanced resolution: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
         logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
         # Handle preprocessing if enabled
@@ -478,12 +745,23 @@ class BackendManager:
                 processed_pdf_path = pdf_path
         try:
-            # Process with enhanced OCR
             result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
             # Add processing metadata
             processing_time = (datetime.now() - start_time).total_seconds()
             result['metadata'].update({
                 'file_hash': file_hash,
                 'file_size_mb': round(file_size / (1024*1024), 2),
@@ -491,8 +769,12 @@ class BackendManager:
                 'timestamp': start_time.isoformat(),
                 'enhanced_processing': True,
                 'html_processing': True,
                 'header_footer_removed': preprocessing_applied,
-                'preprocessing_options': preprocessing_options if preprocessing_applied else None
             })
             # Cleanup temporary preprocessed file
@@ -502,7 +784,7 @@ class BackendManager:
                 except:
                     pass
-            # Log results
             if result['success']:
                 text_length = len(result['text'])
                 has_html = bool(result.get('html'))
@@ -512,10 +794,17 @@ class BackendManager:
                 logger.info(f"Method used: {result['method_used']}")
                 logger.info(f"Text extracted: {text_length} characters")
                 logger.info(f"HTML generated: {has_html}")
                 if table_count > 0:
                     logger.info(f"Tables detected: {table_count}")
                 if preprocessing_applied:
                     logger.info("Enhanced preprocessing applied")
                 # Add to processing history
                 self._add_to_history({
@@ -528,7 +817,11 @@ class BackendManager:
                     'processing_time': processing_time,
                     'preprocessing_applied': preprocessing_applied,
                     'html_generated': has_html,
-                    'enhanced_processing': True
                 })
             else:
                 logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
@@ -542,7 +835,10 @@ class BackendManager:
                     'error': result.get('error', 'Unknown error'),
                     'processing_time': processing_time,
                     'preprocessing_applied': preprocessing_applied,
-                    'enhanced_processing': True
                 })
             return result
@@ -566,7 +862,10 @@ class BackendManager:
                 'success': False,
                 'error': str(e),
                 'processing_time': processing_time,
-                'enhanced_processing': True
             })
             return {
@@ -579,12 +878,15 @@ class BackendManager:
                     'file_hash': file_hash,
                     'processing_time_seconds': round(processing_time, 2),
                     'timestamp': start_time.isoformat(),
-                    'enhanced_processing': True
                 }
             }
     def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
-        """Apply enhanced preprocessing with high-resolution crop handling - FIXED"""
         crop_settings = options.get('crop_settings', {})
         per_page_crops = crop_settings.get('per_page_crops', {})
         enhanced_resolution = crop_settings.get('enhanced_resolution', True)
@@ -602,7 +904,7 @@ class BackendManager:
                 page = doc.load_page(page_num)
                 page_rect = page.rect
-                # Get crop settings for this page - FIXED indexing
                 page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
                     'top': 0, 'bottom': 0, 'left': 0, 'right': 0
                 }))
@@ -636,7 +938,6 @@ class BackendManager:
                 # Create new page with enhanced resolution if enabled
                 if enhanced_resolution:
-                    # Use high resolution for better quality
                     new_page = new_doc.new_page(
                         width=new_rect.width,
                         height=new_rect.height
@@ -676,36 +977,36 @@ class BackendManager:
     def create_enhanced_downloads(self, text_content: str, html_content: str,
                                 metadata_info: str = "") -> Dict[str, str]:
-        """Create enhanced download files with HTML processing"""
         download_files = {}
         try:
             # Create enhanced TXT file
-            txt_path = DocumentExporter.create_enhanced_txt_file(
                 text_content, html_content, metadata_info
             )
             download_files['txt'] = txt_path
             logger.info(f"Enhanced TXT file created: {txt_path}")
-            # Create enhanced DOCX file if possible
             try:
-                docx_path = DocumentExporter.create_enhanced_docx_file(
                     text_content, html_content, metadata_info
                 )
                 download_files['docx'] = docx_path
-                logger.info(f"Enhanced DOCX file created: {docx_path}")
             except ImportError:
                 logger.warning("python-docx not available. DOCX creation skipped.")
             except Exception as e:
-                logger.error(f"DOCX creation failed: {e}")
             # Create standalone HTML file
             try:
-                html_path = DocumentExporter.create_html_file(
                     html_content, metadata_info
                 )
                 download_files['html'] = html_path
-                logger.info(f"HTML file created: {html_path}")
             except Exception as e:
                 logger.error(f"HTML file creation failed: {e}")
@@ -744,10 +1045,18 @@ class BackendManager:
             'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
             'enhanced_processing': True,
             'html_processing': True,
             'docx_export_available': docx_available,
             'enhanced_crop_processing': True,
             'multi_resolution_support': True,
-            'crop_processing_fixed': True
         }
         return status
@@ -795,7 +1104,7 @@ class BackendManager:
             logger.error(f"Error during cleanup: {e}")
     def get_enhanced_statistics(self) -> Dict[str, Any]:
-        """Get enhanced processing statistics"""
         if not self.processing_history:
             return {
                 'total_processed': 0,
@@ -806,7 +1115,11 @@ class BackendManager:
                 'total_tables_processed': 0,
                 'preprocessing_usage': 0,
                 'html_generation_rate': 0,
-                'enhanced_processing_usage': 0
             }
         total_processed = len(self.processing_history)
@@ -826,9 +1139,20 @@ class BackendManager:
         preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
         html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
         enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
         html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
         enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
         return {
             'total_processed': total_processed,
@@ -842,7 +1166,14 @@ class BackendManager:
             'preprocessing_usage': preprocessing_usage,
             'html_generation_rate': round(html_generation_rate, 2),
             'enhanced_processing_usage': enhanced_processing,
-            'enhanced_processing_rate': round(enhanced_processing_rate, 2)
         }
@@ -861,8 +1192,29 @@ if __name__ == "__main__":
     # Test the enhanced backend manager
     manager = BackendManager()
-    print("Enhanced Backend Manager with Fixed Crop Processing Test")
-    print("=" * 60)
     print(f"Available methods: {manager.get_available_methods()}")
     print(f"Service status: {manager.get_service_status()}")
-    print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")

 """
+Backend Management Module - ENHANCED VERSION with Comprehensive Indentation Detection and Intelligent Text Classification
 Coordinates between UI and OCR services, handles file management and preprocessing
 """
 import re
 import cv2
 import numpy as np
 import fitz  # PyMuPDF
+from docx import Document
+from docx.shared import Inches, Pt, RGBColor
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.enum.table import WD_TABLE_ALIGNMENT
+from docx.oxml.shared import OxmlElement, qn
+from html.parser import HTMLParser
 # Load environment variables
 from dotenv import load_dotenv
 load_dotenv()
 from ocr_service import OCRService
+from enhanced_indentation import EnhancedIndentationDetector
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+class EnhancedDocumentExporter:
+    """Advanced document export with comprehensive indentation support, parenthetical patterns, and text classification for HTML and DOCX"""
+    def __init__(self):
+        self.indent_detector = EnhancedIndentationDetector()
     @staticmethod
     def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
+        """Create enhanced TXT file with improved formatting and indentation preservation"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         temp_file = tempfile.NamedTemporaryFile(
             suffix=f'_extracted_text_{timestamp}.txt',
         try:
             # Add header
+            temp_file.write("PDF OCR Extraction Results - Enhanced with Comprehensive Indentation Detection & Text Classification\n")
+            temp_file.write("=" * 90 + "\n\n")
             # Add metadata
             if metadata_info:
             # Add timestamp
             temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+            temp_file.write("=" * 90 + "\n\n")
+            # Add feature list
+            temp_file.write("Enhanced Features Applied:\n")
+            temp_file.write("-" * 25 + "\n")
+            temp_file.write("• Comprehensive Indentation Detection (20+ patterns)\n")
+            temp_file.write("• Parenthetical Patterns ((1), (๑), (a), (i), (ก))\n")
+            temp_file.write("• Intelligent Text Classification (headers, paragraphs, lists)\n")
+            temp_file.write("• Multi-language Support (English, Thai)\n")
+            temp_file.write("• HTML Intermediate Processing\n")
+            temp_file.write("• Priority-based Pattern Matching\n")
+            temp_file.write("• Document Structure Analysis\n\n")
             # Add main content
+            temp_file.write("Extracted Text (Enhanced with Comprehensive Pattern Detection):\n")
+            temp_file.write("-" * 60 + "\n\n")
             temp_file.write(text_content)
             temp_file.close()
             temp_file.close()
             raise
+    def create_enhanced_docx_file(self, text_content: str, html_content: str, metadata_info: str = "") -> str:
+        """Create enhanced DOCX file with comprehensive indentation support, parenthetical patterns, and text classification"""
         try:
             class EnhancedDOCXHTMLParser(HTMLParser):
+                def __init__(self, doc, processor):
                     super().__init__()
                     self.doc = doc
+                    self.processor = processor
                     self.current_paragraph = None
                     self.in_table = False
                     self.table_data = []
                     self.current_table_row = []
+                    self.current_indent_level = 0
+                    self.current_formatting_hint = 'normal_text'
+                    self.in_title = False
+                    self.in_section_heading = False
+                    self.in_page_header = False
+                    self.in_content_header = False
+                    self.current_classes = []
                 def handle_starttag(self, tag, attrs):
                     attr_dict = dict(attrs)
                     class_attr = attr_dict.get('class', '')
+                    self.current_classes = class_attr.split()
+                    if 'page' in class_attr and tag == 'div':
                         if hasattr(self, 'has_content'):
                             self.doc.add_paragraph()
                             self.doc.add_paragraph()
                         self.has_content = True
+                    elif 'page-header' in class_attr:
                         self.current_paragraph = self.doc.add_heading(level=1)
                         self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
+                        self.in_page_header = True
+                    elif 'content-header' in class_attr:
+                        self.current_paragraph = self.doc.add_heading(level=2)
+                        self.in_content_header = True
+                    elif 'title' in class_attr:
                         self.current_paragraph = self.doc.add_heading(level=1)
+                        self.in_title = True
+                    elif 'section-heading' in class_attr:
                         self.current_paragraph = self.doc.add_heading(level=2)
+                        self.in_section_heading = True
                     elif tag == 'div' and 'paragraph' in class_attr:
                         self.current_paragraph = self.doc.add_paragraph()
+                        self._apply_enhanced_formatting()
                     elif tag == 'table':
                         self.in_table = True
                     elif tag == 'tr':
                         self.current_table_row = []
                     elif tag == 'br':
                         if self.current_paragraph:
                             self.current_paragraph.add_run().add_break()
+                def _apply_enhanced_formatting(self):
+                    """Apply enhanced formatting based on CSS classes and indentation detection"""
                     if not self.current_paragraph:
                         return
+                    # Extract indent level from classes
+                    for cls in self.current_classes:
+                        if cls.startswith('indent-level-'):
+                            try:
+                                self.current_indent_level = int(cls.split('-')[-1])
+                            except ValueError:
+                                self.current_indent_level = 0
+                            break
+                    # Extract formatting hint from classes
+                    formatting_hints = [
+                        'numbered-primary', 'numbered-secondary', 'numbered-tertiary', 'numbered-quaternary', 'numbered-quinary',
+                        'parenthetical-primary', 'parenthetical-secondary', 'parenthetical-tertiary', 'parenthetical-quaternary',
+                        'bullet-primary', 'bullet-secondary', 'bullet-tertiary', 'bullet-quaternary',
+                        'lettered-primary', 'lettered-secondary',
+                        'roman-primary', 'roman-secondary',
+                        'thai-primary', 'thai-secondary',
+                        'indented_text', 'space-indent'
+                    ]
+                    for hint in formatting_hints:
+                        if hint in self.current_classes:
+                            self.current_formatting_hint = hint
+                            break
+                    else:
+                        self.current_formatting_hint = 'normal_text'
+                    # Apply indentation
+                    if self.current_indent_level > 0:
+                        indent_inches = self.current_indent_level * 0.5
                         self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)
+                    # Apply hanging indent for bullets and parenthetical items
+                    if 'bullet' in self.current_formatting_hint or 'parenthetical' in self.current_formatting_hint:
+                        self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)
+                    # Set line spacing and paragraph spacing
                     self.current_paragraph.paragraph_format.line_spacing = 1.15
+                    # Apply spacing based on formatting hint
+                    if 'primary' in self.current_formatting_hint:
+                        self.current_paragraph.paragraph_format.space_before = Pt(10)
+                        self.current_paragraph.paragraph_format.space_after = Pt(8)
+                    elif 'secondary' in self.current_formatting_hint:
+                        self.current_paragraph.paragraph_format.space_before = Pt(8)
+                        self.current_paragraph.paragraph_format.space_after = Pt(6)
+                    elif 'tertiary' in self.current_formatting_hint:
+                        self.current_paragraph.paragraph_format.space_before = Pt(6)
+                        self.current_paragraph.paragraph_format.space_after = Pt(4)
+                    else:
+                        self.current_paragraph.paragraph_format.space_after = Pt(3)
                 def handle_endtag(self, tag):
                     """Update parser state when a closing tag is seen.

                     A closing ``div`` clears the first active flag among page header,
                     content header, title and section heading (checked in that order)
                     and resets the per-paragraph state. A closing ``table`` leaves
                     table mode.
                     """
                     if tag == 'div':
                         # Only the first flag that is set gets cleared; any others
                         # stay set until a later closing div.
                         for flag_name in ('in_page_header', 'in_content_header',
                                           'in_title', 'in_section_heading'):
                             if getattr(self, flag_name):
                                 setattr(self, flag_name, False)
                                 break
                         # Forget everything tied to the element that just closed.
                         self.current_paragraph = None
                         self.current_indent_level = 0
                         self.current_formatting_hint = 'normal_text'
                         self.current_classes = []
                     elif tag == 'table':
                         self.in_table = False
                 def handle_data(self, data):
                     """Route a chunk of text to the table row or the current paragraph.

                     Whitespace-only chunks are ignored. Inside a table the stripped
                     text is appended to ``self.current_table_row``. Otherwise, when a
                     paragraph is open, the stripped text is added as a run and styled
                     either as a title/header or through ``_apply_pattern_formatting``.
                     """
                     if not data.strip():
                         return

                     data = data.replace('&nbsp;', ' ')

                     if self.in_table:
                         self.current_table_row.append(data.strip())
                         return
                     if self.current_paragraph is None:
                         return

                     # The detector sees the unstripped text; the run gets the
                     # stripped text.
                     detector = self.processor.indent_detector
                     indent_info = detector.detect_indentation(data)
                     text_classification = detector.classify_text_type(data)
                     run = self.current_paragraph.add_run(data.strip())

                     # (font size in points, RGB colour) for bold header-like text;
                     # None means ordinary body text.
                     if self.in_title:
                         header_style = (16, (44, 62, 80))  # Dark blue
                     elif self.in_content_header or text_classification.get('is_header'):
                         header_style = (14, (44, 62, 80))  # Dark blue
                     elif self.in_section_heading:
                         header_style = (14, (52, 73, 94))  # Darker blue
                     elif self.in_page_header:
                         header_style = (14, (44, 62, 80))
                     else:
                         header_style = None

                     if header_style is None:
                         self._apply_pattern_formatting(run, indent_info, text_classification)
                         return

                     point_size, rgb = header_style
                     run.bold = True
                     run.font.size = Pt(point_size)
                     run.font.color.rgb = RGBColor(*rgb)
+                def _apply_pattern_formatting(self, run, indent_info, text_classification):
+                    """Apply formatting based on detected pattern, classification, and current formatting hint"""
+                    pattern_type = indent_info.get('pattern_type', 'normal')
+                    level = indent_info.get('level', 0)
+                    is_numbered = indent_info.get('is_numbered', False)
+                    is_bullet = indent_info.get('is_bullet', False)
+                    is_lettered = indent_info.get('is_lettered', False)
+                    is_roman = indent_info.get('is_roman', False)
+                    is_thai = indent_info.get('is_thai', False)
+                    is_parenthetical = indent_info.get('is_parenthetical', False)
+                    # Base font size
+                    run.font.size = Pt(11)
+                    # Apply formatting based on current formatting hint and detected pattern
+                    if 'numbered' in self.current_formatting_hint or is_numbered:
+                        if 'primary' in self.current_formatting_hint or level == 1:
+                            run.bold = True
+                            run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
+                        elif 'secondary' in self.current_formatting_hint or level == 2:
+                            run.font.color.rgb = RGBColor(52, 73, 94)  # Medium blue
+                        elif 'tertiary' in self.current_formatting_hint or level == 3:
+                            run.font.color.rgb = RGBColor(85, 85, 85)  # Dark gray
+                        else:
+                            run.font.color.rgb = RGBColor(102, 102, 102)  # Gray
+                    elif 'parenthetical' in self.current_formatting_hint or is_parenthetical:
+                        # Special formatting for parenthetical patterns
+                        if 'primary' in self.current_formatting_hint or level == 2:
+                            run.bold = True
+                            run.font.color.rgb = RGBColor(142, 68, 173)  # Purple
+                        elif 'secondary' in self.current_formatting_hint or level == 3:
+                            run.font.color.rgb = RGBColor(155, 89, 182)  # Light purple
+                        elif 'tertiary' in self.current_formatting_hint or level == 4:
+                            run.font.color.rgb = RGBColor(175, 122, 197)  # Lighter purple
+                        else:
+                            run.font.color.rgb = RGBColor(195, 155, 211)  # Very light purple
+                    elif 'bullet' in self.current_formatting_hint or is_bullet:
+                        if 'primary' in self.current_formatting_hint or level == 1:
+                            run.font.color.rgb = RGBColor(52, 152, 219)  # Blue
+                        elif 'secondary' in self.current_formatting_hint or level == 2:
+                            run.font.color.rgb = RGBColor(149, 165, 166)  # Gray
+                        elif 'tertiary' in self.current_formatting_hint or level == 3:
+                            run.font.color.rgb = RGBColor(189, 195, 199)  # Light gray
+                        else:
+                            run.font.color.rgb = RGBColor(189, 195, 199)  # Light gray
+                    elif 'lettered' in self.current_formatting_hint or is_lettered:
+                        run.italic = True
+                        if 'primary' in self.current_formatting_hint:
+                            run.font.color.rgb = RGBColor(142, 68, 173)  # Purple
+                        else:
+                            run.font.color.rgb = RGBColor(155, 89, 182)  # Light purple
+                    elif 'roman' in self.current_formatting_hint or is_roman:
+                        run.font.color.rgb = RGBColor(211, 84, 0)  # Orange
+                        run.font.name = 'Times New Roman'  # Roman style font
+                    elif 'thai' in self.current_formatting_hint or is_thai:
+                        if 'primary' in self.current_formatting_hint:
+                            run.bold = True
+                            run.font.color.rgb = RGBColor(22, 160, 133)  # Teal
+                        else:
+                            run.font.color.rgb = RGBColor(26, 188, 156)  # Light teal
+                    elif 'space-indent' in self.current_formatting_hint:
+                        run.italic = True
+                        run.font.color.rgb = RGBColor(85, 85, 85)  # Dark gray
+                    else:
+                        # Default text formatting based on classification
+                        if text_classification.get('is_header'):
+                            run.bold = True
+                            run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
+                        elif text_classification.get('is_list_item'):
+                            run.font.color.rgb = RGBColor(52, 152, 219)  # Blue
+                        else:
+                            run.font.color.rgb = RGBColor(0, 0, 0)  # Black
                 def _create_enhanced_docx_table(self):
+                    """Create table with enhanced formatting"""
                     if not self.table_data:
                         return
                     rows = len(self.table_data)
                     cols = max(len(row) for row in self.table_data) if self.table_data else 1
                     table.style = 'Table Grid'
                     table.alignment = WD_TABLE_ALIGNMENT.LEFT
+                    # Fill table data with enhanced formatting
                     for row_idx, row_data in enumerate(self.table_data):
                         table_row = table.rows[row_idx]
                         for col_idx, cell_data in enumerate(row_data):
                                         for run in paragraph.runs:
                                             run.bold = True
                                             run.font.size = Pt(10)
+                                            run.font.color.rgb = RGBColor(44, 62, 80)
                                         paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
+                                        # Add background color to header
+                                        shading_elm_1 = OxmlElement('w:shd')
+                                        shading_elm_1.set(qn('w:fill'), 'ECF0F1')
+                                        paragraph._element.get_or_add_pPr().append(shading_elm_1)
                                 else:
                                     # Regular data cells
                                     for paragraph in cell.paragraphs:
                                         for run in paragraph.runs:
                                             run.font.size = Pt(10)
+                                        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
                     # Add spacing after table
                     self.doc.add_paragraph()
             # Create DOCX document
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             temp_file = tempfile.NamedTemporaryFile(
+                suffix=f'_enhanced_document_{timestamp}.docx',
                 delete=False
             )
             temp_file.close()
             doc = Document()
+            # Set document margins for better layout
             sections = doc.sections
             for section in sections:
                 section.top_margin = Inches(1)
                 section.left_margin = Inches(1)
                 section.right_margin = Inches(1)
+            # Add title with enhanced styling
             title = doc.add_heading('PDF OCR Extraction Results', 0)
             title.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            title_run = title.runs[0]
+            title_run.font.color.rgb = RGBColor(44, 62, 80)
+            # Add subtitle
             subtitle_para = doc.add_paragraph()
+            subtitle_run = subtitle_para.add_run('Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification')
             subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
             subtitle_run.italic = True
             subtitle_run.font.size = Pt(12)
             subtitle_run.font.color.rgb = RGBColor(102, 102, 102)
+            # Add feature list
+            features_para = doc.add_paragraph()
+            features_run = features_para.add_run('Features: Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a)) • Bullet Points • Letter & Roman Numerals • Thai Script • Multi-level Indentation • Text Classification')
+            features_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            features_run.font.size = Pt(9)
+            features_run.font.color.rgb = RGBColor(149, 165, 166)
+            # Add metadata section
             if metadata_info:
                 doc.add_heading('Processing Information', level=1)
                 meta_para = doc.add_paragraph()
                 meta_run = meta_para.add_run(metadata_info)
                 meta_run.font.size = Pt(10)
                 meta_para.style = 'Intense Quote'
+                # Add background to metadata
+                shading_elm = OxmlElement('w:shd')
+                shading_elm.set(qn('w:fill'), 'F8F9FA')
+                meta_para._element.get_or_add_pPr().append(shading_elm)
+                doc.add_paragraph()
+            # Process content
             doc.add_heading('Extracted Content', level=1)
+            if html_content and '<div' in html_content:
+                # Parse HTML with enhanced indentation processing and text classification
+                parser = EnhancedDOCXHTMLParser(doc, self)
                 parser.feed(html_content)
             else:
+                # Fallback to text processing with enhanced indentation and classification
+                self._process_text_content_enhanced(doc, text_content)
+            # Add enhanced footer
             footer_section = doc.sections[0]
             footer = footer_section.footer
             footer_para = footer.paragraphs[0]
+            footer_para.text = f"Generated by Enhanced PDF OCR Service with Comprehensive Indentation Detection & Text Classification on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
             footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
             footer_run = footer_para.runs[0]
+            footer_run.font.size = Pt(8)
             footer_run.font.color.rgb = RGBColor(128, 128, 128)
             doc.save(temp_file.name)
+            logger.info(f"Enhanced DOCX file with comprehensive indentation support and text classification created: {temp_file.name}")
             return temp_file.name
         except ImportError:
                 pass
             raise
+    def _process_text_content_enhanced(self, doc, text_content):
+        """Process text content with enhanced indentation detection and text classification"""
+        paragraphs = text_content.split('\n\n')
+        for para_text in paragraphs:
+            if not para_text.strip():
+                continue
+            lines = para_text.split('\n')
+            for line in lines:
+                if not line.strip():
+                    continue
+                # Detect indentation and classify text
+                indent_info = self.indent_detector.detect_indentation(line)
+                text_classification = self.indent_detector.classify_text_type(line)
+                if line.strip().startswith('==='):
+                    # Page headers
+                    page_header = doc.add_heading(line.strip(), level=1)
+                    page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
+                    header_run = page_header.runs[0]
+                    header_run.font.color.rgb = RGBColor(44, 62, 80)
+                elif line.strip().startswith('##'):
+                    # Section headings
+                    heading_text = line.strip().lstrip('#').strip()
+                    heading = doc.add_heading(heading_text, level=2)
+                    heading_run = heading.runs[0]
+                    heading_run.font.color.rgb = RGBColor(52, 73, 94)
+                elif text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.7:
+                    # Detected headers
+                    heading = doc.add_heading(indent_info.get('content', line.strip()), level=2)
+                    heading_run = heading.runs[0]
+                    heading_run.font.color.rgb = RGBColor(52, 73, 94)
+                else:
+                    # Regular content with enhanced indentation and classification
+                    para = doc.add_paragraph()
+                    # Apply indentation based on detected level
+                    level = indent_info.get('level', 0)
+                    if level > 0:
+                        para.paragraph_format.left_indent = Inches(level * 0.5)
+                    # Apply pattern-specific formatting
+                    if indent_info.get('is_bullet', False) or indent_info.get('is_parenthetical', False):
+                        para.paragraph_format.first_line_indent = Inches(-0.25)
+                    # Set proper spacing
+                    para.paragraph_format.line_spacing = 1.15
+                    para.paragraph_format.space_after = Pt(3)
+                    # Add content with enhanced formatting
+                    content = indent_info.get('content', line.strip())
+                    marker = indent_info.get('pattern_marker', '')
+                    # Include marker for non-bullet items
+                    if marker and not indent_info.get('is_bullet', False):
+                        content = f"{marker} {content}"
+                    run = para.add_run(content)
+                    run.font.size = Pt(11)
+                    # Apply color coding based on pattern type and classification
+                    pattern_type = indent_info.get('pattern_type', 'normal')
+                    if 'numbered' in pattern_type or 'decimal' in pattern_type:
+                        if level == 1:
+                            run.bold = True
+                            run.font.color.rgb = RGBColor(44, 62, 80)
+                        elif level == 2:
+                            run.font.color.rgb = RGBColor(52, 73, 94)
+                        else:
+                            run.font.color.rgb = RGBColor(85, 85, 85)
+                    elif 'parenthetical' in pattern_type:
+                        if level <= 2:
+                            run.bold = True
+                            run.font.color.rgb = RGBColor(142, 68, 173)  # Purple
+                        else:
+                            run.font.color.rgb = RGBColor(155, 89, 182)  # Light purple
+                    elif 'bullet' in pattern_type:
+                        run.font.color.rgb = RGBColor(52, 152, 219)
+                    elif 'lettered' in pattern_type:
+                        run.italic = True
+                        run.font.color.rgb = RGBColor(142, 68, 173)
+                    elif 'roman' in pattern_type:
+                        run.font.color.rgb = RGBColor(211, 84, 0)
+                    elif 'thai' in pattern_type:
+                        run.font.color.rgb = RGBColor(22, 160, 133)
+                    elif text_classification.get('is_list_item'):
+                        run.font.color.rgb = RGBColor(52, 152, 219)
     @staticmethod
     def create_html_file(html_content: str, metadata_info: str = "") -> str:
+        """Create standalone HTML file with enhanced styling for comprehensive indentation and text classification"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         temp_file = tempfile.NamedTemporaryFile(
             suffix=f'_extracted_document_{timestamp}.html',
         )
         try:
+            # Enhance HTML with better styling
+            enhanced_html = html_content
+            # Add comprehensive styling if not already present
+            if '<style>' not in enhanced_html:
+                enhanced_html = enhanced_html.replace(
+                    '<head>',
+                    '''<head>
+                    <style>
+                        body {
+                            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+                            line-height: 1.6;
+                            margin: 20px;
+                            background-color: #f9f9f9;
+                        }
+                        .container {
+                            max-width: 1200px;
+                            margin: 0 auto;
+                            background-color: white;
+                            padding: 30px;
+                            border-radius: 8px;
+                            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+                        }
+                        .header {
+                            text-align: center;
+                            margin-bottom: 30px;
+                            border-bottom: 3px solid #2c3e50;
+                            padding-bottom: 20px;
+                        }
+                        .metadata {
+                            background-color: #ecf0f1;
+                            padding: 15px;
+                            border-radius: 5px;
+                            margin-bottom: 25px;
+                            border-left: 4px solid #3498db;
+                        }
+                        .enhanced-features {
+                            background-color: #e8f5e8;
+                            padding: 10px;
+                            border-radius: 5px;
+                            margin-bottom: 20px;
+                            border-left: 4px solid #27ae60;
+                            font-size: 0.9em;
+                        }
+                        .classification-features {
+                            background-color: #fef9e7;
+                            padding: 10px;
+                            border-radius: 5px;
+                            margin-bottom: 20px;
+                            border-left: 4px solid #f39c12;
+                            font-size: 0.9em;
+                        }
+                    </style>'''
+                )
+            # Wrap content in container if not already wrapped
+            if '<body>' in enhanced_html and '.container' not in enhanced_html:
                 enhanced_html = enhanced_html.replace(
                     '<body>',
                     '''<body>
                     <div class="container">
                     <div class="header">
                         <h1>PDF OCR Extraction Results</h1>
+                        <p>Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification</p>
+                    </div>
+                    <div class="enhanced-features">
+                        <strong>Indentation Features:</strong> Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a), (i), (ก)) •
+                        Multi-level Bullets • Letter & Roman Numerals • Thai Script Support •
+                        Space-based Indentation • Pattern Priority Detection
+                    </div>
+                    <div class="classification-features">
+                        <strong>Text Classification:</strong> Header Detection • Paragraph Recognition •
+                        List Item Identification • Context Analysis • Confidence Scoring •
+                        Document Structure Analysis
                     </div>''' +
                     (f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
                 )
 class BackendManager:
+    """Enhanced backend manager with comprehensive indentation detection, parenthetical patterns, text classification, and advanced export capabilities"""
     def __init__(self):
         """Set up the OCR service, the document exporter and working state.

         ``max_history_size`` comes from the ``MAX_HISTORY_SIZE`` environment
         variable (default 100). A value that is not an integer is ignored with
         a warning instead of aborting construction. A working directory is
         created under the system temp directory if it does not exist yet.
         """
         default_history_size = 100

         self.ocr_service = OCRService()
         self.document_exporter = EnhancedDocumentExporter()
         self.processing_history = []

         raw_history_size = os.getenv('MAX_HISTORY_SIZE')
         if raw_history_size is None:
             self.max_history_size = default_history_size
         else:
             try:
                 self.max_history_size = int(raw_history_size)
             except ValueError:
                 # A typo in the environment should not stop the service from starting.
                 logger.warning(
                     "Invalid MAX_HISTORY_SIZE %r; falling back to %d",
                     raw_history_size, default_history_size,
                 )
                 self.max_history_size = default_history_size

         # Working directory for temporary files
         self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service_enhanced_v2'
         self.temp_dir.mkdir(exist_ok=True)
         logger.info("Enhanced backend manager with comprehensive indentation detection and text classification initialized successfully")
     def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
                                            preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
         """
+        Process PDF with enhanced resolution, comprehensive indentation detection, and intelligent text classification
         Args:
             pdf_path: Path to the PDF file
             preprocessing_options: Dictionary containing preprocessing settings
         Returns:
+            Dict containing processing results with enhanced HTML content, indentation, and text classification
         """
         start_time = datetime.now()
         # Generate file hash for tracking
         file_hash = self._calculate_file_hash(pdf_path)
+        logger.info(f"Processing PDF with enhanced indentation detection and text classification: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
         logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
         # Handle preprocessing if enabled
                 processed_pdf_path = pdf_path
         try:
+            # Process with enhanced OCR, indentation detection, and text classification
             result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
             # Add processing metadata
             processing_time = (datetime.now() - start_time).total_seconds()
+            # Analyze document structure with text classification if successful
+            document_analysis = {}
+            if result['success'] and result['text']:
+                try:
+                    text_lines = result['text'].split('\n')
+                    detector = EnhancedIndentationDetector()
+                    document_analysis = detector.analyze_document_structure(text_lines)
+                except Exception as analysis_error:
+                    logger.warning(f"Document structure analysis failed: {analysis_error}")
+                    document_analysis = {'analysis_failed': True}
             result['metadata'].update({
                 'file_hash': file_hash,
                 'file_size_mb': round(file_size / (1024*1024), 2),
                 'timestamp': start_time.isoformat(),
                 'enhanced_processing': True,
                 'html_processing': True,
+                'comprehensive_indentation': True,
+                'parenthetical_patterns_supported': True,
+                'intelligent_text_classification': True,
                 'header_footer_removed': preprocessing_applied,
+                'preprocessing_options': preprocessing_options if preprocessing_applied else None,
+                'document_structure_analysis': document_analysis
             })
             # Cleanup temporary preprocessed file
                 except:
                     pass
+            # Log results with enhanced information
             if result['success']:
                 text_length = len(result['text'])
                 has_html = bool(result.get('html'))
                 logger.info(f"Method used: {result['method_used']}")
                 logger.info(f"Text extracted: {text_length} characters")
                 logger.info(f"HTML generated: {has_html}")
+                logger.info(f"Comprehensive indentation detection: Enabled")
+                logger.info(f"Parenthetical patterns supported: Enabled")
+                logger.info(f"Intelligent text classification: Enabled")
                 if table_count > 0:
                     logger.info(f"Tables detected: {table_count}")
                 if preprocessing_applied:
                     logger.info("Enhanced preprocessing applied")
+                if document_analysis and not document_analysis.get('analysis_failed'):
+                    logger.info(f"Document analysis: {document_analysis.get('patterned_lines', 0)} patterned lines, max level {document_analysis.get('max_level', 0)}")
+                    logger.info(f"Text classification: {document_analysis.get('header_count', 0)} headers, {document_analysis.get('paragraph_count', 0)} paragraphs, {document_analysis.get('list_item_count', 0)} list items")
                 # Add to processing history
                 self._add_to_history({
                     'processing_time': processing_time,
                     'preprocessing_applied': preprocessing_applied,
                     'html_generated': has_html,
+                    'enhanced_processing': True,
+                    'comprehensive_indentation': True,
+                    'parenthetical_patterns_supported': True,
+                    'intelligent_text_classification': True,
+                    'document_analysis': document_analysis
                 })
             else:
                 logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
                     'error': result.get('error', 'Unknown error'),
                     'processing_time': processing_time,
                     'preprocessing_applied': preprocessing_applied,
+                    'enhanced_processing': True,
+                    'comprehensive_indentation': True,
+                    'parenthetical_patterns_supported': True,
+                    'intelligent_text_classification': True
                 })
             return result
                 'success': False,
                 'error': str(e),
                 'processing_time': processing_time,
+                'enhanced_processing': True,
+                'comprehensive_indentation': True,
+                'parenthetical_patterns_supported': True,
+                'intelligent_text_classification': True
             })
             return {
                     'file_hash': file_hash,
                     'processing_time_seconds': round(processing_time, 2),
                     'timestamp': start_time.isoformat(),
+                    'enhanced_processing': True,
+                    'comprehensive_indentation': True,
+                    'parenthetical_patterns_supported': True,
+                    'intelligent_text_classification': True
                 }
             }
     def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
+        """Apply enhanced preprocessing with high-resolution crop handling"""
         crop_settings = options.get('crop_settings', {})
         per_page_crops = crop_settings.get('per_page_crops', {})
         enhanced_resolution = crop_settings.get('enhanced_resolution', True)
                 page = doc.load_page(page_num)
                 page_rect = page.rect
+                # Get crop settings for this page
                 page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
                     'top': 0, 'bottom': 0, 'left': 0, 'right': 0
                 }))
                 # Create new page with enhanced resolution if enabled
                 if enhanced_resolution:
                     new_page = new_doc.new_page(
                         width=new_rect.width,
                         height=new_rect.height
     def create_enhanced_downloads(self, text_content: str, html_content: str,
                                 metadata_info: str = "") -> Dict[str, str]:
+        """Create enhanced download files with comprehensive indentation support, parenthetical patterns, and text classification"""
         download_files = {}
         try:
             # Create enhanced TXT file
+            txt_path = EnhancedDocumentExporter.create_enhanced_txt_file(
                 text_content, html_content, metadata_info
             )
             download_files['txt'] = txt_path
             logger.info(f"Enhanced TXT file created: {txt_path}")
+            # Create enhanced DOCX file with comprehensive indentation support and text classification
             try:
+                docx_path = self.document_exporter.create_enhanced_docx_file(
                     text_content, html_content, metadata_info
                 )
                 download_files['docx'] = docx_path
+                logger.info(f"Enhanced DOCX file with comprehensive indentation and text classification created: {docx_path}")
             except ImportError:
                 logger.warning("python-docx not available. DOCX creation skipped.")
             except Exception as e:
+                logger.error(f"Enhanced DOCX creation failed: {e}")
             # Create standalone HTML file
             try:
+                html_path = EnhancedDocumentExporter.create_html_file(
                     html_content, metadata_info
                 )
                 download_files['html'] = html_path
+                logger.info(f"Enhanced HTML file created: {html_path}")
             except Exception as e:
                 logger.error(f"HTML file creation failed: {e}")
             'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
             'enhanced_processing': True,
             'html_processing': True,
+            'comprehensive_indentation': True,
+            'parenthetical_patterns_supported': True,
+            'intelligent_text_classification': True,
+            'pattern_detection_count': len(EnhancedIndentationDetector().patterns),
             'docx_export_available': docx_available,
             'enhanced_crop_processing': True,
             'multi_resolution_support': True,
+            'crop_processing_fixed': True,
+            'document_structure_analysis': True,
+            'thai_script_support': True,
+            'multi_level_support': True,
+            'text_classification_features': True
         }
         return status
             logger.error(f"Error during cleanup: {e}")
     def get_enhanced_statistics(self) -> Dict[str, Any]:
+        """Get enhanced processing statistics with indentation analysis and text classification"""
         if not self.processing_history:
             return {
                 'total_processed': 0,
                 'total_tables_processed': 0,
                 'preprocessing_usage': 0,
                 'html_generation_rate': 0,
+                'enhanced_processing_usage': 0,
+                'comprehensive_indentation_usage': 0,
+                'parenthetical_patterns_usage': 0,
+                'text_classification_usage': 0,
+                'document_analysis_success_rate': 0
             }
         total_processed = len(self.processing_history)
         preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
         html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
         enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
+        comprehensive_indentation = sum(1 for h in self.processing_history if h.get('comprehensive_indentation', False))
+        parenthetical_patterns = sum(1 for h in self.processing_history if h.get('parenthetical_patterns_supported', False))
+        text_classification = sum(1 for h in self.processing_history if h.get('intelligent_text_classification', False))
+        # Document analysis statistics
+        doc_analysis_success = sum(1 for h in self.processing_history
+                                 if h.get('document_analysis', {}) and not h.get('document_analysis', {}).get('analysis_failed', False))
+        doc_analysis_rate = (doc_analysis_success / total_processed) * 100 if total_processed > 0 else 0
         html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
         enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
+        comprehensive_indentation_rate = (comprehensive_indentation / total_processed) * 100 if total_processed > 0 else 0
+        parenthetical_patterns_rate = (parenthetical_patterns / total_processed) * 100 if total_processed > 0 else 0
+        text_classification_rate = (text_classification / total_processed) * 100 if total_processed > 0 else 0
         return {
             'total_processed': total_processed,
             'preprocessing_usage': preprocessing_usage,
             'html_generation_rate': round(html_generation_rate, 2),
             'enhanced_processing_usage': enhanced_processing,
+            'enhanced_processing_rate': round(enhanced_processing_rate, 2),
+            'comprehensive_indentation_usage': comprehensive_indentation,
+            'comprehensive_indentation_rate': round(comprehensive_indentation_rate, 2),
+            'parenthetical_patterns_usage': parenthetical_patterns,
+            'parenthetical_patterns_rate': round(parenthetical_patterns_rate, 2),
+            'text_classification_usage': text_classification,
+            'text_classification_rate': round(text_classification_rate, 2),
+            'document_analysis_success_rate': round(doc_analysis_rate, 2)
         }
     # Test the enhanced backend manager
     manager = BackendManager()
+    print("Enhanced Backend Manager with Comprehensive Indentation Detection & Text Classification Test")
+    print("=" * 100)
     print(f"Available methods: {manager.get_available_methods()}")
     print(f"Service status: {manager.get_service_status()}")
+    print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
+    # Test indentation detector with parenthetical patterns
+    detector = EnhancedIndentationDetector()
+    test_cases = [
+        "1.2.3. Hierarchical item",
+        "(1) Parenthetical Arabic",
+        "(๑) Parenthetical Thai numeral",
+        "(a) Parenthetical letter",
+        "(i) Parenthetical Roman",
+        "(ก) Parenthetical Thai letter"
+    ]
+    print(f"\nIndentation Detection Test with Parenthetical Patterns:")
+    print("-" * 60)
+    for test_text in test_cases:
+        result = detector.detect_indentation(test_text)
+        classification = detector.classify_text_type(test_text)
+        print(f"Text: {test_text}")
+        print(f"  Pattern: {result['pattern_type']}, Level: {result['level']}")
+        print(f"  Classification: {classification['type']} (confidence: {classification['confidence']:.2f})")
+        print()

enhanced_indentation.py ADDED Viewed

	@@ -0,0 +1,648 @@

+"""
+Enhanced Indentation Detection System
+Comprehensive regex-based system for detecting hierarchical numbering and indentation levels
+For PDF OCR Service with HTML and DOCX output support including parenthetical patterns
+"""
+import re
+import logging
+from typing import Dict, Tuple, Optional, List, Any
+from collections import Counter
+logger = logging.getLogger(__name__)
+class EnhancedIndentationDetector:
+    """Advanced indentation detection with comprehensive pattern matching including parenthetical patterns"""
+    def __init__(self):
+        # Define comprehensive patterns for different numbering styles
+        self.patterns = {
+            # Hierarchical decimal numbering (1.1.1.1.1...)
+            'decimal_hierarchy': {
+                'pattern': r'^\s*(\d+(?:\.\d+)*)\.\s+',
+                'example': '1.2.3.4.5.',
+                'level_func': self._calculate_decimal_level,
+                'priority': 15
+            },
+            # Hierarchical numbering without final dot (1.1.1.1.1)
+            'decimal_hierarchy_no_dot': {
+                'pattern': r'^\s*(\d+(?:\.\d+)+)\s+',
+                'example': '1.2.3.4.5',
+                'level_func': self._calculate_decimal_level,
+                'priority': 14
+            },
+            # Hierarchical numbering with parentheses (1.1.1) or 1.1.1)
+            'decimal_hierarchy_paren': {
+                'pattern': r'^\s*(\d+(?:\.\d+)*)\)\s+',
+                'example': '1.2.3.4)',
+                'level_func': self._calculate_decimal_level,
+                'priority': 13
+            },
+            # Mixed hierarchical (1.1.a.i.A...)
+            'mixed_hierarchy': {
+                'pattern': r'^\s*(\d+(?:\.(?:\d+|[a-z]+|[A-Z]+|[ivxlcdm]+))+)\.\s+',
+                'example': '1.2.a.i.A.',
+                'level_func': self._calculate_mixed_level,
+                'priority': 12
+            },
+            # Legal numbering (1.1.1.1(a)(i))
+            'legal_numbering': {
+                'pattern': r'^\s*(\d+(?:\.\d+)*(?:\([a-z]+\))*(?:\([ivxlcdm]+\))*)\s+',
+                'example': '1.1.1(a)(i)',
+                'level_func': self._calculate_legal_level,
+                'priority': 11
+            },
+            # Outline numbering (I.A.1.a.i.)
+            'outline_numbering': {
+                'pattern': r'^\s*([IVXLCDM]+(?:\.[A-Z]+)*(?:\.\d+)*(?:\.[a-z]+)*(?:\.[ivxlcdm]+)*)\.\s+',
+                'example': 'I.A.1.a.i.',
+                'level_func': self._calculate_outline_level,
+                'priority': 10
+            },
+            # Section numbering (§1.1.1, Article 1.1.1)
+            'section_numbering': {
+                'pattern': r'^\s*(?:§|Section|Article|Chapter|Part)\s*(\d+(?:\.\d+)*)\.\s+',
+                'example': '§1.2.3.',
+                'level_func': self._calculate_decimal_level,
+                'priority': 9
+            },
+            # Thai section numbering (มาตรา, ข้อ, หมวด)
+            'thai_section_numbering': {
+                'pattern': r'^\s*(?:มาตรา|ข้อ|หมวด|ส่วน)\s*(\d+(?:\.\d+)*)\s+',
+                'example': 'มาตรา 1.2.3',
+                'level_func': self._calculate_decimal_level,
+                'priority': 9
+            },
+            # Parenthetical numbering - Arabic numerals (1), (2), (3)
+            'parenthetical_arabic': {
+                'pattern': r'^\s*\((\d+)\)\s+',
+                'example': '(1)',
+                'level_func': lambda x: 2,
+                'priority': 8
+            },
+            # Parenthetical numbering - Thai numerals (๑), (๒), (๓)
+            'parenthetical_thai_numerals': {
+                'pattern': r'^\s*\(([๐-๙]+)\)\s+',
+                'example': '(๑)',
+                'level_func': lambda x: 2,
+                'priority': 8
+            },
+            # Parenthetical letters - lowercase (a), (b), (c)
+            'parenthetical_letters_lower': {
+                'pattern': r'^\s*\(([a-z]+)\)\s+',
+                'example': '(a)',
+                'level_func': lambda x: 3,
+                'priority': 7
+            },
+            # Parenthetical letters - uppercase (A), (B), (C)
+            'parenthetical_letters_upper': {
+                'pattern': r'^\s*\(([A-Z]+)\)\s+',
+                'example': '(A)',
+                'level_func': lambda x: 2,
+                'priority': 7
+            },
+            # Parenthetical Thai letters (ก), (ข), (ค)
+            'parenthetical_thai_letters': {
+                'pattern': r'^\s*\(([ก-ฮ]+)\)\s+',
+                'example': '(ก)',
+                'level_func': lambda x: 3,
+                'priority': 7
+            },
+            # Parenthetical Roman numerals - lowercase (i), (ii), (iii)
+            'parenthetical_roman_lower': {
+                'pattern': r'^\s*\(([ivxlcdm]+)\)\s+',
+                'example': '(i)',
+                'level_func': lambda x: 4,
+                'priority': 6
+            },
+            # Parenthetical Roman numerals - uppercase (I), (II), (III)
+            'parenthetical_roman_upper': {
+                'pattern': r'^\s*\(([IVXLCDM]+)\)\s+',
+                'example': '(I)',
+                'level_func': lambda x: 2,
+                'priority': 6
+            },
+            # Simple numbered lists (1., 2., 3.)
+            'simple_numbered': {
+                'pattern': r'^\s*(\d+)\.\s+',
+                'example': '1.',
+                'level_func': lambda x: 1,
+                'priority': 5
+            },
+            # Simple numbered with parens (1), 2), 3))
+            'simple_numbered_paren': {
+                'pattern': r'^\s*(\d+)\)\s+',
+                'example': '1)',
+                'level_func': lambda x: 1,
+                'priority': 5
+            },
+            # Lettered lists (a., b., c.) and (A., B., C.)
+            'lettered_lower': {
+                'pattern': r'^\s*([a-z]+)\.\s+',
+                'example': 'a.',
+                'level_func': lambda x: 2,
+                'priority': 4
+            },
+            'lettered_upper': {
+                'pattern': r'^\s*([A-Z]+)\.\s+',
+                'example': 'A.',
+                'level_func': lambda x: 1,
+                'priority': 4
+            },
+            # Thai letters (ก., ข., ค.)
+            'thai_lettered': {
+                'pattern': r'^\s*([ก-ฮ]+)\.\s+',
+                'example': 'ก.',
+                'level_func': lambda x: 2,
+                'priority': 4
+            },
+            # Roman numerals (i., ii., iii.) and (I., II., III.)
+            'roman_lower': {
+                'pattern': r'^\s*([ivxlcdm]+)\.\s+',
+                'example': 'i.',
+                'level_func': lambda x: 3,
+                'priority': 3
+            },
+            'roman_upper': {
+                'pattern': r'^\s*([IVXLCDM]+)\.\s+',
+                'example': 'I.',
+                'level_func': lambda x: 1,
+                'priority': 3
+            },
+            # Bullet points with various symbols
+            'bullet_symbols': {
+                'pattern': r'^\s*([•·▪▫◦‣⁃⁌⁍◘◙○●▶▷►▻★☆♦♠♣♥◆◇■□▲△▼▽❖❀❁❂❃❄❅❆❇❈❉❊❋❍❏❐❑❒❖])\s+',
+                'example': '•',
+                'level_func': self._calculate_bullet_level,
+                'priority': 2
+            },
+            # Dash and asterisk bullets
+            'dash_bullets': {
+                'pattern': r'^\s*([\-\*\+~=])\s+',
+                'example': '-',
+                'level_func': self._calculate_bullet_level,
+                'priority': 2
+            },
+            # Arrow bullets
+            'arrow_bullets': {
+                'pattern': r'^\s*([\→\←\↑\↓\↔\↕\↖\↗\↘\↙\⇒\⇐\⇑\⇓\⇔\⇕\➔\➜\➤\➪\➫\➬\➭\➮\➯\➱\➲\➳\➴\➵\➶\➷\➸\➹\➺\➻\➼\➽\➾])\s+',
+                'example': '→',
+                'level_func': self._calculate_bullet_level,
+                'priority': 2
+            },
+            # Checkbox items ([x], [ ], [✓])
+            'checkbox': {
+                'pattern': r'^\s*\[([x✓✗\s])\]\s+',
+                'example': '[x]',
+                'level_func': lambda x: 2,
+                'priority': 1
+            }
+        }
+        # Sort patterns by priority (higher priority first)
+        self.sorted_patterns = sorted(
+            self.patterns.items(),
+            key=lambda x: x[1]['priority'],
+            reverse=True
+        )
+        # Header detection patterns
+        self.header_patterns = {
+            'title_case': r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$',
+            'all_caps': r'^[A-Z\s]+$',
+            'numbered_header': r'^\d+\.\s*[A-Z]',
+            'section_header': r'^(?:SECTION|CHAPTER|PART|ARTICLE)\s+',
+            'thai_header': r'^(?:หมวด|บท|ส่วน|มาตรา)\s+',
+            'short_line': lambda text: len(text.strip()) < 50 and not text.strip().endswith('.'),
+            'ends_without_period': lambda text: not text.strip().endswith('.') and not text.strip().endswith(':'),
+            'capitalized_words': lambda text: sum(1 for word in text.split() if word and word[0].isupper()) / max(len(text.split()), 1) > 0.5
+        }
+    def detect_indentation(self, text: str, base_margin: float = 0) -> Dict[str, Any]:
+        """
+        Detect indentation pattern and level for given text
+        Args:
+            text: Text line to analyze
+            base_margin: Base left margin for relative positioning
+        Returns:
+            Dict with pattern info, level, and formatting details
+        """
+        if not text or not text.strip():
+            return self._create_empty_result(text)
+        text_stripped = text.strip()
+        # Count leading whitespace for additional indentation
+        leading_spaces = len(text) - len(text.lstrip())
+        space_indent_level = leading_spaces // 4  # 4 spaces = 1 level
+        # Try each pattern in priority order
+        for pattern_name, pattern_info in self.sorted_patterns:
+            match = re.match(pattern_info['pattern'], text, re.IGNORECASE)
+            if match:
+                # Extract the numbering/bullet part
+                marker = match.group(1) if match.groups() else match.group(0)
+                # Calculate pattern-specific level
+                if callable(pattern_info['level_func']):
+                    try:
+                        pattern_level = pattern_info['level_func'](marker)
+                    except:
+                        pattern_level = 1
+                else:
+                    pattern_level = pattern_info['level_func']
+                # Combine pattern level with space indentation
+                total_level = max(pattern_level + space_indent_level, 1)
+                # Extract content after the marker
+                content_start = match.end()
+                content = text[content_start:].strip()
+                return {
+                    'has_pattern': True,
+                    'pattern_type': pattern_name,
+                    'pattern_marker': marker,
+                    'level': min(total_level, 10),  # Cap at level 10
+                    'content': content,
+                    'original_text': text,
+                    'leading_spaces': leading_spaces,
+                    'space_indent_level': space_indent_level,
+                    'pattern_level': pattern_level,
+                    'is_bullet': self._is_bullet_pattern(pattern_name),
+                    'is_numbered': self._is_numbered_pattern(pattern_name),
+                    'is_lettered': self._is_lettered_pattern(pattern_name),
+                    'is_roman': self._is_roman_pattern(pattern_name),
+                    'is_thai': self._is_thai_pattern(pattern_name),
+                    'is_parenthetical': self._is_parenthetical_pattern(pattern_name),
+                    'formatting_hint': self._get_formatting_hint(pattern_name, total_level),
+                    'priority': pattern_info['priority']
+                }
+        # No pattern found - check for basic indentation
+        if leading_spaces > 0:
+            return {
+                'has_pattern': False,
+                'pattern_type': 'space_indent',
+                'pattern_marker': '',
+                'level': max(space_indent_level, 1),
+                'content': text_stripped,
+                'original_text': text,
+                'leading_spaces': leading_spaces,
+                'space_indent_level': space_indent_level,
+                'pattern_level': 0,
+                'is_bullet': False,
+                'is_numbered': False,
+                'is_lettered': False,
+                'is_roman': False,
+                'is_thai': False,
+                'is_parenthetical': False,
+                'formatting_hint': 'indented_text',
+                'priority': 0
+            }
+        # No indentation at all
+        return self._create_empty_result(text)
+    def classify_text_type(self, text: str, context: Dict = None) -> Dict[str, Any]:
+        """
+        Classify text as header, paragraph, or list item based on patterns and context
+        Args:
+            text: Text to classify
+            context: Additional context like position, formatting, etc.
+        Returns:
+            Dict with classification results
+        """
+        if not text or not text.strip():
+            return {'type': 'empty', 'confidence': 1.0}
+        text_stripped = text.strip()
+        context = context or {}
+        # Check for indentation patterns first
+        indent_result = self.detect_indentation(text)
+        # Initialize classification scores
+        scores = {
+            'header': 0.0,
+            'paragraph': 0.0,
+            'list_item': 0.0
+        }
+        # List item indicators
+        if indent_result['has_pattern']:
+            scores['list_item'] += 0.8
+            if indent_result['is_numbered'] or indent_result['is_lettered']:
+                scores['list_item'] += 0.1
+            if indent_result['is_bullet']:
+                scores['list_item'] += 0.1
+        # Header indicators
+        if len(text_stripped) < 100:  # Short text more likely to be header
+            scores['header'] += 0.3
+        if len(text_stripped) < 50:  # Very short text even more likely
+            scores['header'] += 0.2
+        # Check header patterns
+        for pattern_name, pattern in self.header_patterns.items():
+            if callable(pattern):
+                if pattern(text_stripped):
+                    scores['header'] += 0.2
+            else:
+                if re.match(pattern, text_stripped):
+                    scores['header'] += 0.2
+        # Position-based scoring from context
+        if context.get('y_position'):
+            # Higher on page = more likely header
+            if context['y_position'] < 100:  # Top of page
+                scores['header'] += 0.3
+        # Font size context
+        if context.get('font_size'):
+            if context['font_size'] > 12:  # Larger font
+                scores['header'] += 0.2
+        # Font weight context
+        if context.get('is_bold'):
+            scores['header'] += 0.2
+        # Paragraph indicators
+        if len(text_stripped) > 100:  # Long text more likely paragraph
+            scores['paragraph'] += 0.4
+        if text_stripped.endswith('.'):  # Ends with period
+            scores['paragraph'] += 0.2
+        if not indent_result['has_pattern'] and len(text_stripped) > 50:
+            scores['paragraph'] += 0.3
+        # Determine final classification
+        max_score = max(scores.values())
+        classification = max(scores.items(), key=lambda x: x[1])
+        return {
+            'type': classification[0],
+            'confidence': classification[1],
+            'scores': scores,
+            'indentation': indent_result,
+            'is_header': classification[0] == 'header',
+            'is_paragraph': classification[0] == 'paragraph',
+            'is_list_item': classification[0] == 'list_item'
+        }
+    def _create_empty_result(self, text: str) -> Dict[str, Any]:
+        """Create result for text with no indentation pattern"""
+        return {
+            'has_pattern': False,
+            'pattern_type': 'normal',
+            'pattern_marker': '',
+            'level': 0,
+            'content': text.strip(),
+            'original_text': text,
+            'leading_spaces': 0,
+            'space_indent_level': 0,
+            'pattern_level': 0,
+            'is_bullet': False,
+            'is_numbered': False,
+            'is_lettered': False,
+            'is_roman': False,
+            'is_thai': False,
+            'is_parenthetical': False,
+            'formatting_hint': 'normal_text',
+            'priority': 0
+        }
+    def _calculate_decimal_level(self, marker: str) -> int:
+        """Calculate level for decimal hierarchies (1.2.3.4 = level 4)"""
+        # Count dots to determine depth
+        dots = marker.count('.')
+        return dots + 1
+    def _calculate_mixed_level(self, marker: str) -> int:
+        """Calculate level for mixed hierarchies (1.2.a.i = level 4)"""
+        parts = marker.split('.')
+        return len([p for p in parts if p.strip()])
+    def _calculate_legal_level(self, marker: str) -> int:
+        """Calculate level for legal numbering (1.1.1(a)(i) = level 5)"""
+        # Count dots and parenthetical parts
+        dots = marker.count('.')
+        parens = marker.count('(')
+        return dots + parens + 1
+    def _calculate_outline_level(self, marker: str) -> int:
+        """Calculate level for outline numbering (I.A.1.a.i = level 5)"""
+        parts = marker.split('.')
+        return len([p for p in parts if p.strip()])
+    def _calculate_bullet_level(self, marker: str) -> int:
+        """Calculate level for bullet points based on symbol complexity"""
+        # More complex symbols typically indicate deeper levels
+        complex_bullets = ['◦', '‣', '⁃', '▪', '▫', '◘', '◙']
+        if marker in complex_bullets:
+            return 2
+        return 1
+    def _is_bullet_pattern(self, pattern_type: str) -> bool:
+        """Check if pattern is a bullet type"""
+        return any(bullet_type in pattern_type for bullet_type in ['bullet', 'dash', 'arrow', 'checkbox'])
+    def _is_numbered_pattern(self, pattern_type: str) -> bool:
+        """Check if pattern is a numbered type"""
+        return any(num_type in pattern_type for num_type in ['numbered', 'decimal', 'legal', 'section'])
+    def _is_lettered_pattern(self, pattern_type: str) -> bool:
+        """Check if pattern is a lettered type"""
+        return 'lettered' in pattern_type
+    def _is_roman_pattern(self, pattern_type: str) -> bool:
+        """Check if pattern is a roman numeral type"""
+        return 'roman' in pattern_type
+    def _is_thai_pattern(self, pattern_type: str) -> bool:
+        """Check if pattern is Thai-specific"""
+        return 'thai' in pattern_type
+    def _is_parenthetical_pattern(self, pattern_type: str) -> bool:
+        """Check if pattern is parenthetical type"""
+        return 'parenthetical' in pattern_type
+    def _get_formatting_hint(self, pattern_type: str, level: int) -> str:
+        """Get formatting hint for rendering"""
+        level_names = ['primary', 'secondary', 'tertiary', 'quaternary', 'quinary']
+        level_name = level_names[min(level-1, len(level_names)-1)] if level > 0 else 'normal'
+        if self._is_bullet_pattern(pattern_type):
+            return f'bullet_{level_name}'
+        elif self._is_numbered_pattern(pattern_type):
+            return f'numbered_{level_name}'
+        elif self._is_lettered_pattern(pattern_type):
+            return f'lettered_{level_name}'
+        elif self._is_roman_pattern(pattern_type):
+            return f'roman_{level_name}'
+        elif self._is_thai_pattern(pattern_type):
+            return f'thai_{level_name}'
+        elif self._is_parenthetical_pattern(pattern_type):
+            return f'parenthetical_{level_name}'
+        else:
+            return f'indent_{level_name}'
+    def analyze_document_structure(self, text_lines: List[str]) -> Dict[str, Any]:
+        """
+        Analyze entire document structure for consistent formatting
+        Args:
+            text_lines: List of text lines to analyze
+        Returns:
+            Dict with document structure analysis
+        """
+        analysis = {
+            'total_lines': len(text_lines),
+            'patterned_lines': 0,
+            'max_level': 0,
+            'pattern_distribution': Counter(),
+            'level_distribution': Counter(),
+            'formatting_hints': Counter(),
+            'text_classification': Counter(),
+            'has_consistent_numbering': False,
+            'dominant_patterns': [],
+            'header_count': 0,
+            'paragraph_count': 0,
+            'list_item_count': 0
+        }
+        indent_results = []
+        for line in text_lines:
+            if line.strip():
+                # Indentation analysis
+                indent_result = self.detect_indentation(line)
+                indent_results.append(indent_result)
+                # Text classification
+                classification = self.classify_text_type(line)
+                analysis['text_classification'][classification['type']] += 1
+                if classification['type'] == 'header':
+                    analysis['header_count'] += 1
+                elif classification['type'] == 'paragraph':
+                    analysis['paragraph_count'] += 1
+                elif classification['type'] == 'list_item':
+                    analysis['list_item_count'] += 1
+                if indent_result['has_pattern']:
+                    analysis['patterned_lines'] += 1
+                    analysis['pattern_distribution'][indent_result['pattern_type']] += 1
+                    analysis['level_distribution'][indent_result['level']] += 1
+                    analysis['formatting_hints'][indent_result['formatting_hint']] += 1
+                    analysis['max_level'] = max(analysis['max_level'], indent_result['level'])
+        # Determine dominant patterns
+        if analysis['pattern_distribution']:
+            analysis['dominant_patterns'] = analysis['pattern_distribution'].most_common(3)
+        # Check for consistent numbering
+        numbered_patterns = [p for p in analysis['pattern_distribution'] if 'numbered' in p or 'decimal' in p]
+        analysis['has_consistent_numbering'] = len(numbered_patterns) > 0
+        analysis['coverage_percentage'] = (analysis['patterned_lines'] / analysis['total_lines'] * 100) if analysis['total_lines'] > 0 else 0
+        return analysis
# Example usage and testing
if __name__ == "__main__":
    detector = EnhancedIndentationDetector()

    # Sample lines covering the marker families, plain indented text,
    # a heading and a regular paragraph.
    test_cases = [
        "1. First level item",
        "  1.1. Second level item",
        "    1.1.1. Third level item",
        "      1.1.1.1. Fourth level item",
        "        1.1.1.1.1. Fifth level item",
        "(1) Parenthetical Arabic",
        "(๑) Parenthetical Thai numeral",
        "(a) Parenthetical lowercase letter",
        "(A) Parenthetical uppercase letter",
        "(ก) Parenthetical Thai letter",
        "(i) Parenthetical lowercase Roman",
        "(I) Parenthetical uppercase Roman",
        "2. Another first level",
        "  a. Letter sub-item",
        "    i. Roman sub-sub-item",
        "• Bullet point",
        "  ◦ Sub bullet",
        "    ▪ Sub-sub bullet",
        "- Dash item",
        "  * Asterisk sub-item",
        "    + Plus sub-sub-item",
        "§1.2.3. Section numbering",
        "Article 1.1.1. Article numbering",
        "มาตรา 1.2.3 Thai section",
        "ก. Thai letter",
        "[x] Checkbox item",
        "→ Arrow item",
        "I. Roman numeral",
        "  A. Capital letter",
        "    1. Number",
        "      a. Lowercase letter",
        "        i. Lowercase roman",
        "            Normal indented text without pattern",
        "CHAPTER 1: INTRODUCTION",
        "This is a regular paragraph with some text that should be classified as paragraph content.",
    ]

    print("Enhanced Indentation Detection Results with Parenthetical Patterns:")
    print("=" * 80)

    # One report per sample line, followed by a blank separator line
    for i, test_text in enumerate(test_cases, 1):
        result = detector.detect_indentation(test_text)
        classification = detector.classify_text_type(test_text)
        report = [
            f"{i:2d}. Text: {test_text!r}",
            f"    Pattern: {result['pattern_type']}",
            f"    Level: {result['level']}",
            f"    Marker: {result['pattern_marker']!r}",
            f"    Content: {result['content']!r}",
            f"    Hint: {result['formatting_hint']}",
            f"    Priority: {result['priority']}",
            f"    Classification: {classification['type']} (confidence: {classification['confidence']:.2f})",
        ]
        print("\n".join(report))
        print()

    # Test document analysis
    print("\nDocument Structure Analysis:")
    print("=" * 40)
    analysis = detector.analyze_document_structure(test_cases)
    for key, value in analysis.items():
        print(f"{key}: {value}")

ocr_service.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
-OCR Service Module - FIXED VERSION with Improved Text Formatting and Page Numbers
-Handles PDF to text conversion with proper indentation, spacing, and page numbering
 """
 import re
 import os
@@ -30,17 +30,25 @@ except ImportError:
 import fitz  # PyMuPDF
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-class HTMLProcessor:
-    """Process OCR results through HTML for better formatting preservation - FIXED VERSION"""
     @staticmethod
     def create_html_from_azure_result(analysis_result) -> str:
-        """Create structured HTML from Azure Document Intelligence result with proper spacing and page numbers"""
         html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
         html_parts.append('<style>')
         html_parts.append('''
@@ -71,12 +79,177 @@ class HTMLProcessor:
                 text-transform: uppercase;
                 letter-spacing: 1px;
             }
             .paragraph {
                 margin-bottom: 0.8em;
                 white-space: pre-wrap;
                 font-family: 'Consolas', 'Courier New', monospace;
                 line-height: 1.4;
             }
             .title {
                 font-size: 1.4em;
                 font-weight: bold;
@@ -124,24 +297,13 @@ class HTMLProcessor:
             .table tr:nth-child(even) {
                 background-color: #f8f9fa;
             }
-            .indented {
-                display: inline-block;
-                white-space: pre-wrap;
             }
-            .bullet-point {
-                position: relative;
-                padding-left: 1.2em;
-                margin-bottom: 0.3em;
-            }
-            .bullet-point:before {
-                content: "•";
-                position: absolute;
-                left: 0;
-                color: #3498db;
-                font-weight: bold;
-            }
-            .spaced {
-                margin-top: 10px;
             }
             .page-number {
                 position: relative;
@@ -164,48 +326,46 @@ class HTMLProcessor:
             html_parts.append(f'<div class="page">')
             html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
-            # Process content with proper ordering and spacing preservation
-            content_items = HTMLProcessor._extract_page_content(page, analysis_result, page_num)
             content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
-            # Generate HTML for each content item with preserved spacing
             for item in content_items:
                 if item['type'] == 'table':
-                    html_parts.append(HTMLProcessor._table_to_html(item['content'], item['table_idx']))
                 else:
-                    html_parts.append(HTMLProcessor._text_to_html(item))
             html_parts.append('</div>')
         html_parts.append('</body></html>')
         return '\n'.join(html_parts)
-    @staticmethod
-    def _extract_page_content(page, analysis_result, page_num):
-        """Extract and organize page content without losing text with proper spacing"""
         content_items = []
-        # First, collect all tables for this page
         page_tables = []
         table_regions = []
         if analysis_result.tables:
             for table_idx, table in enumerate(analysis_result.tables):
-                if HTMLProcessor._is_table_on_page(table, page_num):
                     page_tables.append((table_idx, table))
-                    # Store table regions for overlap detection
                     if table.bounding_regions:
                         table_regions.append({
                             'polygon': table.bounding_regions[0].polygon,
                             'table_idx': table_idx
                         })
-        # Add table items to content
         for table_idx, table in page_tables:
             if table.bounding_regions and table.bounding_regions[0].polygon:
                 polygon = table.bounding_regions[0].polygon
-                y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7])  # Top Y
-                x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])  # Left X
                 content_items.append({
                     'type': 'table',
@@ -215,51 +375,192 @@ class HTMLProcessor:
                     'x_pos': x_pos
                 })
-        # Calculate page margins for proper indentation detection
-        page_left_margin = HTMLProcessor._calculate_page_margins(page, analysis_result, page_num)
-        # Process text content - use paragraphs if available, otherwise lines
         if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
-            # Use paragraphs (better content grouping)
             page_paragraphs = [p for p in analysis_result.paragraphs if
                              p.bounding_regions and
                              p.bounding_regions[0].page_number == page_num]
             for para in page_paragraphs:
                 if para.content.strip():
-                    # Check if this paragraph overlaps significantly with any table
-                    overlap_ratio = HTMLProcessor._calculate_table_overlap(para, table_regions)
-                    # Only exclude if heavily overlapping (>70%) with a table
-                    if overlap_ratio < 0.7:
                         polygon = para.bounding_regions[0].polygon
                         y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
                         x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
-                        # Calculate proper indentation based on page margins
-                        indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, para.content)
                         content_items.append({
                             'type': 'paragraph',
-                            'content': para.content.strip(),
                             'role': getattr(para, 'role', 'paragraph'),
                             'y_pos': y_pos,
                             'x_pos': x_pos,
-                            'indent_level': indent_info['level'],
-                            'indent_pixels': indent_info['pixels'],
-                            'is_bullet': indent_info['is_bullet'],
                             'preserve_spacing': True
                         })
         elif page.lines:
-            # Use lines as fallback with enhanced spacing preservation
-            processed_lines = HTMLProcessor._process_lines_content_with_spacing(page.lines, table_regions, page_left_margin)
             content_items.extend(processed_lines)
         return content_items
-    @staticmethod
-    def _is_table_on_page(table, page_num):
         """Check if table belongs to the specified page"""
         if not table.cells:
             return False
@@ -270,9 +571,8 @@ class HTMLProcessor:
                 return True
         return False
-    @staticmethod
-    def _calculate_table_overlap(content_item, table_regions):
-        """Calculate overlap ratio between content and tables (FIXED)"""
         if not table_regions or not content_item.bounding_regions:
             return 0.0
@@ -316,120 +616,7 @@ class HTMLProcessor:
         return max_overlap_ratio
-    @staticmethod
-    def _calculate_page_margins(page, analysis_result, page_num):
-        """Estimate the baseline left margin of a page from its content.
-
-        Gathers the smallest x coordinate of each paragraph whose first
-        bounding region is on ``page_num``. The lines of ``page`` are used
-        instead only when ``analysis_result`` has no (or empty) ``paragraphs``.
-        The collected values are rounded to the nearest 10 and the most
-        frequent rounded value is returned.
-
-        Args:
-            page: Page object; only ``page.lines`` is read here.
-            analysis_result: OCR result; only ``paragraphs`` is read here.
-            page_num: Value compared with ``bounding_regions[0].page_number``.
-
-        Returns:
-            The most common rounded left x position, or 50 when no position
-            was collected (this includes the case where paragraphs exist in
-            the result but none of them is on this page).
-        """
-        left_positions = []
-        # Prefer paragraphs: keep those whose first bounding region is on this page.
-        if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
-            page_paragraphs = [p for p in analysis_result.paragraphs if
-                             p.bounding_regions and
-                             p.bounding_regions[0].page_number == page_num]
-            for para in page_paragraphs:
-                if para.bounding_regions and para.bounding_regions[0].polygon:
-                    polygon = para.bounding_regions[0].polygon
-                    # Even indices are read as x values -- assumes a flat
-                    # [x0, y0, ..., x3, y3] polygon; TODO confirm against the OCR SDK.
-                    x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
-                    left_positions.append(x_pos)
-        # Lines are consulted only when the result carries no paragraphs at all.
-        elif page.lines:
-            for line in page.lines:
-                if line.polygon:
-                    x_pos = min(line.polygon[0], line.polygon[2], line.polygon[4], line.polygon[6])
-                    left_positions.append(x_pos)
-        # The baseline margin is the most frequent (rounded) left position.
-        if left_positions:
-            # Sorting does not change the counts; it only fixes the order in which
-            # buckets are first seen, so equal counts resolve to the smallest one.
-            left_positions.sort()
-            # Bucket positions so nearly aligned items count as the same margin.
-            from collections import Counter
-            position_counts = Counter([round(pos, -1) for pos in left_positions])  # Round to nearest 10
-            base_margin = position_counts.most_common(1)[0][0]
-            return base_margin
-        return 50  # Default margin if no content found
-    @staticmethod
-    def _calculate_precise_indentation(x_pos, base_margin, content):
-        """Derive an indent level and a list-marker flag for one text item.
-
-        Args:
-            x_pos: Left x coordinate of the item.
-            base_margin: Baseline left margin to measure the offset from.
-            content: Text of the item; leading/trailing whitespace is ignored
-                when looking for a list marker.
-
-        Returns:
-            A dict with ``'level'`` (offset divided by 30, truncated, capped
-            at 6), ``'pixels'`` (the non-negative offset ``x_pos - base_margin``)
-            and ``'is_bullet'`` (True when the text starts with a bullet
-            symbol, dash/asterisk/plus, number, letter or roman-numeral
-            marker followed by whitespace).
-        """
-        # Offset from the baseline; items left of the margin count as 0.
-        indent_distance = max(0, x_pos - base_margin)
-        # One indent level per 30 units of offset; int() truncates toward zero.
-        # NOTE(review): the earlier comment described a level as ~0.5 inch / 36
-        # points, which does not match the value 30 -- confirm the coordinate unit.
-        level_threshold = 30  # Reduced threshold for better sensitivity
-        indent_level = int(indent_distance / level_threshold)
-        # Detect bullet points or numbered lists
-        is_bullet = False
-        # The text is stripped first, so the leading \s* in each pattern below
-        # never has anything to consume.
-        content_stripped = content.strip()
-        # List-marker patterns; every one requires whitespace after the marker.
-        bullet_patterns = [
-            r'^\s*[•·▪▫◦‣⁃]\s+',  # Bullet symbols
-            r'^\s*[\-\*\+]\s+',     # Dash, asterisk, plus
-            r'^\s*\d+[\.\)]\s+',    # Numbered lists (1. or 1))
-            r'^\s*[a-zA-Z][\.\)]\s+', # Lettered lists (a. or a))
-            r'^\s*[ivxlcdm]+[\.\)]\s+', # Roman numerals
-        ]
-        # Matching is case-insensitive, so upper-case roman numerals count too.
-        for pattern in bullet_patterns:
-            if re.match(pattern, content_stripped, re.IGNORECASE):
-                is_bullet = True
-                break
-        return {
-            'level': min(indent_level, 6),  # Cap at level 6
-            'pixels': indent_distance,
-            'is_bullet': is_bullet
-        }
-    @staticmethod
-    def _process_lines_content_with_spacing(lines, table_regions, page_left_margin):
-        """Turn OCR lines into content items, skipping blanks, repeats and table text.
-
-        Args:
-            lines: Iterable of line objects; ``content`` and ``polygon`` are read.
-            table_regions: Table regions passed through to
-                ``HTMLProcessor._calculate_line_table_overlap``.
-            page_left_margin: Baseline margin passed through to
-                ``HTMLProcessor._calculate_precise_indentation``.
-
-        Returns:
-            A list of dicts of type ``'line'`` carrying the stripped text, its
-            top/left position and the computed indentation info.
-        """
-        content_items = []
-        processed_content = set()
-        for line in lines:
-            # Whitespace-only lines produce no item.
-            if not line.content.strip():
-                continue
-            # De-duplicate on the stripped, lower-cased text.
-            # NOTE(review): this also drops text that legitimately repeats on a
-            # page, and the key is recorded before the table check below, so a
-            # line rejected for table overlap still suppresses later identical
-            # lines -- confirm this is intended.
-            content_key = line.content.strip().lower()
-            if content_key in processed_content:
-                continue
-            processed_content.add(content_key)
-            # Check table overlap
-            overlap_ratio = HTMLProcessor._calculate_line_table_overlap(line, table_regions)
-            # Keep the line unless its overlap ratio with a table reaches 0.7.
-            if overlap_ratio < 0.7:
-                polygon = line.polygon
-                # Top-most y and left-most x of the polygon; 0 when there is none.
-                y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
-                x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
-                # Calculate precise indentation for lines
-                indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, line.content)
-                content_items.append({
-                    'type': 'line',
-                    'content': line.content.strip(),
-                    'role': 'text',
-                    'y_pos': y_pos,
-                    'x_pos': x_pos,
-                    'indent_level': indent_info['level'],
-                    'indent_pixels': indent_info['pixels'],
-                    'is_bullet': indent_info['is_bullet'],
-                    'preserve_spacing': True
-                })
-        return content_items
-    @staticmethod
-    def _calculate_line_table_overlap(line, table_regions):
         """Calculate overlap between line and tables"""
         if not table_regions or not line.polygon:
             return 0.0
@@ -474,119 +661,37 @@ class HTMLProcessor:
         return max_overlap
     @staticmethod
-    def _text_to_html(item):
-        """Render one text content item as a single HTML ``<div>``.
-
-        Args:
-            item: Dict with ``'content'`` and the optional keys ``'role'``,
-                ``'indent_level'``, ``'indent_pixels'``, ``'is_bullet'`` and
-                ``'preserve_spacing'``.
-
-        Returns:
-            A ``<div>`` string whose base class is ``title``,
-            ``section-heading`` or ``paragraph`` depending on ``role``.
-
-        NOTE(review): ``content`` is inserted into the markup without HTML
-        escaping, so text containing ``<`` or ``&`` ends up as raw markup.
-        """
-        content = item['content']
-        role = item.get('role', 'paragraph')
-        indent_level = item.get('indent_level', 0)
-        # NOTE(review): indent_pixels is read but never used below.
-        indent_pixels = item.get('indent_pixels', 0)
-        is_bullet = item.get('is_bullet', False)
-        preserve_spacing = item.get('preserve_spacing', False)
-        # Negative levels are clamped to 0.
-        css_indent = max(0, indent_level)
-        # Build CSS classes and inline styles
-        css_classes = []
-        inline_styles = []
-        if css_indent > 0:
-            # 1.5em of left margin per indent level.
-            inline_styles.append(f"margin-left: {css_indent * 1.5}em")
-            css_classes.append("indented")
-        if is_bullet:
-            css_classes.append("bullet-point")
-        # Preserve internal spacing within content
-        if preserve_spacing:
-            # Each run of two or more spaces becomes the same number of &nbsp;.
-            content = re.sub(r'  +', lambda m: '&nbsp;' * len(m.group()), content)
-            # Preserve line breaks within content
-            content = content.replace('\n', '<br>')
-        # NOTE(review): class_str is itself a complete class="..." attribute and
-        # is appended after the hard-coded class="..." in every branch below, so
-        # an indented or bullet item gets two class attributes on one tag.
-        class_str = f' class="{" ".join(css_classes)}"' if css_classes else ''
-        style_str = f' style="{"; ".join(inline_styles)}"' if inline_styles else ''
-        if role == 'title':
-            return f'<div class="title"{class_str}{style_str}>{content}</div>'
-        elif role == 'sectionHeading':
-            return f'<div class="section-heading"{class_str}{style_str}>{content}</div>'
-        else:
-            # Any other role is rendered as a regular paragraph.
-            return f'<div class="paragraph"{class_str}{style_str}>{content}</div>'
-    @staticmethod
-    def _table_to_html(table, table_idx):
-        """Render a table object as an HTML table wrapped in a container div.
-
-        Args:
-            table: Object with a ``cells`` sequence; each cell's ``row_index``,
-                ``column_index`` and ``content`` are read.
-            table_idx: Zero-based table index; shown as ``table_idx + 1`` in
-                the heading.
-
-        Returns:
-            HTML for the table, or a container holding only an "(Empty)"
-            heading when the table has no cells.
-
-        NOTE(review): cell text is inserted without HTML escaping, and only
-        row/column indices are read, so any cell spanning is not reflected.
-        """
-        if not table.cells:
-            return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
-        # Grid dimensions come from the largest row/column index seen.
-        max_row = max(cell.row_index for cell in table.cells) + 1
-        max_col = max(cell.column_index for cell in table.cells) + 1
-        table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
-        # Place each cell's stripped text; a missing (None) content becomes "".
-        for cell in table.cells:
-            content = (cell.content or "").strip()
-            table_matrix[cell.row_index][cell.column_index] = content
-        # Generate HTML
-        html_parts = [f'<div class="table-container">']
-        html_parts.append(f'<h4>Table {table_idx + 1}</h4>')
-        html_parts.append('<table class="table">')
-        for row_idx, row in enumerate(table_matrix):
-            if row_idx == 0 and any(cell.strip() for cell in row):
-                # The first row is rendered as headers when it has any text.
-                html_parts.append('<tr>')
-                for cell in row:
-                    html_parts.append(f'<th>{cell}</th>')
-                html_parts.append('</tr>')
-            else:
-                # Data row; an all-empty first row also lands here and is skipped.
-                if any(cell.strip() for cell in row):  # Skip empty rows
-                    html_parts.append('<tr>')
-                    for cell in row:
-                        html_parts.append(f'<td>{cell}</td>')
-                    html_parts.append('</tr>')
-        html_parts.append('</table></div>')
-        return '\n'.join(html_parts)
-    @staticmethod
-    def html_to_formatted_text(html_content):
-        """Convert HTML back to formatted text preserving structure, spacing, and adding page numbers"""
         from html.parser import HTMLParser
-        class FixedSpacingTextExtractor(HTMLParser):
             def __init__(self):
                 super().__init__()
                 self.text_parts = []
                 self.in_title = False
                 self.in_section_heading = False
                 self.in_table = False
-                self.in_table_header = False
                 self.current_table_row = []
                 self.table_data = []
-                self.current_indent = 0
-                self.preserve_spacing = False
                 self.in_page_header = False
-                self.current_page_num = 0
             def handle_starttag(self, tag, attrs):
                 attr_dict = dict(attrs)
                 class_attr = attr_dict.get('class', '')
-                style_attr = attr_dict.get('style', '')
                 if 'page-header' in class_attr:
                     self.in_page_header = True
-                    # Add proper page separation with page number
                     if len(self.text_parts) > 0:
                         self.text_parts.append('\n\n' + '=' * 80 + '\n')
                 elif 'title' in class_attr:
                     self.in_title = True
                 elif 'section-heading' in class_attr:
@@ -594,32 +699,47 @@ class HTMLProcessor:
                 elif tag == 'table':
                     self.in_table = True
                     self.table_data = []
-                elif tag == 'th':
-                    self.in_table_header = True
                 elif tag == 'tr':
                     self.current_table_row = []
                 elif tag == 'br':
                     self.text_parts.append('\n')
-                # Extract indentation from style
-                if 'margin-left' in style_attr:
-                    import re
-                    margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
-                    if margin_match:
-                        self.current_indent = int(float(margin_match.group(1)))
-                    else:
-                        self.current_indent = 0
                 else:
-                    # Count indented classes as fallback
-                    self.current_indent = class_attr.count('indented')
-                # Check if we should preserve spacing
-                self.preserve_spacing = 'paragraph' in class_attr or 'bullet-point' in class_attr
             def handle_endtag(self, tag):
                 if tag == 'div' and self.in_page_header:
                     self.text_parts.append('\n' + '=' * 80 + '\n\n')
                     self.in_page_header = False
                 elif tag == 'div' and self.in_title:
                     self.text_parts.append('\n\n')
                     self.in_title = False
@@ -629,69 +749,83 @@ class HTMLProcessor:
                 elif tag == 'table':
                     self.in_table = False
                     self._format_table()
-                elif tag == 'th':
-                    self.in_table_header = False
                 elif tag == 'tr' and self.current_table_row:
                     self.table_data.append(self.current_table_row[:])
-                elif tag == 'div' and not self.in_table and not self.in_title and not self.in_section_heading and not self.in_page_header:
-                    if not self.preserve_spacing:
                         self.text_parts.append('\n')
-                # Reset indentation when closing div
                 if tag == 'div':
-                    self.current_indent = 0
-                    self.preserve_spacing = False
             def handle_data(self, data):
                 if data.strip():
-                    # Convert &nbsp; back to spaces for proper spacing
                     data = data.replace('&nbsp;', ' ')
                     if self.in_page_header:
-                        # Extract page number and format properly
                         page_match = re.search(r'Page (\d+)', data)
                         if page_match:
-                            self.current_page_num = int(page_match.group(1))
-                            page_header = f"PAGE {self.current_page_num}"
                             self.text_parts.append(page_header.center(80))
                     elif self.in_title:
-                        indent_str = "  " * self.current_indent
                         self.text_parts.append(f'\n{indent_str}## {data.strip()}')
                     elif self.in_section_heading:
-                        indent_str = "  " * self.current_indent
                         self.text_parts.append(f'\n{indent_str}### {data.strip()}')
                     elif self.in_table:
-                        if self.in_table_header or self.current_table_row is not None:
-                            self.current_table_row.append(data.strip())
                     else:
-                        # Apply indentation and preserve internal spacing
-                        indent_str = "  " * self.current_indent
-                        if self.preserve_spacing:
-                            # Keep the exact spacing from the data
-                            formatted_data = data
-                        else:
-                            # Clean up spacing for non-preserved content
-                            formatted_data = re.sub(r'\s+', ' ', data).strip()
-                        # Handle bullet points specially
-                        if 'bullet-point' in getattr(self, '_last_class', ''):
-                            # Remove the bullet symbol that CSS adds and format properly
-                            self.text_parts.append(f'{indent_str}• {formatted_data}')
                         else:
-                            self.text_parts.append(f'{indent_str}{formatted_data}')
             def _format_table(self):
                 if not self.table_data:
                     return
                 self.text_parts.append('\n\n')
-                # Calculate column widths for better formatting
                 if self.table_data:
                     max_cols = max(len(row) for row in self.table_data)
                     col_widths = [0] * max_cols
                     for row in self.table_data:
                         for i, cell in enumerate(row):
                             if i < max_cols:
@@ -721,7 +855,7 @@ class HTMLProcessor:
                 self.text_parts.append('\n')
-        extractor = FixedSpacingTextExtractor()
         extractor.feed(html_content)
         result = ''.join(extractor.text_parts)
@@ -736,7 +870,7 @@ class HTMLProcessor:
 class OCRService:
-    """Main OCR service with HTML processing and improved table handling"""
     def __init__(self):
         self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
@@ -758,7 +892,7 @@ class OCRService:
     def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
         """
-        Convert PDF to text using specified method with HTML processing
         Args:
             pdf_path: Path to the PDF file
@@ -792,7 +926,7 @@ class OCRService:
         # Try primary method
         try:
             if method == "azure" and self.azure_client:
-                result = self._azure_ocr_with_html(pdf_path)
             elif method == "tesseract":
                 result = self._tesseract_ocr(pdf_path)
             elif method == "pymupdf":
@@ -811,13 +945,13 @@ class OCRService:
         return result
-    def _azure_ocr_with_html(self, pdf_path: str) -> Dict[str, Any]:
-        """Azure Document Intelligence OCR with HTML processing"""
         result = {
             'success': False,
             'text': '',
             'html': '',
-            'method_used': 'azure_document_intelligence',
             'metadata': {},
             'error': None
         }
@@ -848,11 +982,16 @@ class OCRService:
             analysis_result = poller.result()
-            # Generate HTML first
-            html_content = HTMLProcessor.create_html_from_azure_result(analysis_result)
-            # Convert HTML to formatted text with proper page numbers and spacing
-            formatted_text = HTMLProcessor.html_to_formatted_text(html_content)
             result.update({
                 'success': True,
@@ -864,13 +1003,17 @@ class OCRService:
                     'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
                     'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
                     'html_generated': True,
-                    'improved_formatting': True,
                     'page_numbers_added': True,
-                    'azure_analysis': analysis_result
                 }
             })
-            logger.info("Azure OCR with improved HTML processing completed successfully")
         except Exception as e:
             logger.error(f"Azure OCR error: {e}")
@@ -879,12 +1022,12 @@ class OCRService:
         return result
     def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
-        """Tesseract OCR with basic HTML generation and page numbers"""
         result = {
             'success': False,
             'text': '',
             'html': '',
-            'method_used': 'tesseract',
             'metadata': {},
             'error': None
         }
@@ -899,11 +1042,19 @@ class OCRService:
             page_count = len(pdf_document)
             all_text = []
             html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
-            html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
-            html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
-            html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
             html_parts.append('</style></head><body>')
             for page_num in range(page_count):
                 # Add page header to text
                 page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
@@ -929,10 +1080,41 @@ class OCRService:
                     all_text.append(text)
-                    # Add to HTML with page number
                     html_parts.append(f'<div class="page">')
                     html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
-                    html_parts.append(f'<pre>{text}</pre></div>')
                 finally:
                     if temp_img_path and os.path.exists(temp_img_path):
@@ -943,19 +1125,26 @@ class OCRService:
             html_parts.append('</body></html>')
             result.update({
                 'success': True,
-                'text': '\n'.join(all_text),
-                'html': '\n'.join(html_parts),
                 'metadata': {
                     'pages': page_count,
                     'html_generated': True,
                     'page_numbers_added': True,
-                    'improved_formatting': True
                 }
             })
-            logger.info("Tesseract OCR with improved formatting completed successfully")
         except Exception as e:
             logger.error(f"Tesseract OCR error: {e}")
@@ -970,12 +1159,12 @@ class OCRService:
         return result
     def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
-        """PyMuPDF text extraction with HTML generation and page numbers"""
         result = {
             'success': False,
             'text': '',
             'html': '',
-            'method_used': 'pymupdf',
             'metadata': {},
             'error': None
         }
@@ -986,11 +1175,19 @@ class OCRService:
             page_count = len(pdf_document)
             all_text = []
             html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
-            html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
-            html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
-            html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
             html_parts.append('</style></head><body>')
             for page_num in range(page_count):
                 # Add page header to text
                 page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
@@ -1001,27 +1198,64 @@ class OCRService:
                 all_text.append(text)
-                # Add to HTML with better formatting and page numbers
                 html_parts.append(f'<div class="page">')
                 html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
-                formatted_text = text.replace('\n', '<br>')
-                html_parts.append(f'<div>{formatted_text}</div></div>')
             html_parts.append('</body></html>')
             result.update({
                 'success': True,
-                'text': '\n'.join(all_text),
-                'html': '\n'.join(html_parts),
                 'metadata': {
                     'pages': page_count,
                     'html_generated': True,
                     'page_numbers_added': True,
-                    'improved_formatting': True
                 }
             })
-            logger.info("PyMuPDF extraction with improved formatting completed successfully")
         except Exception as e:
             logger.error(f"PyMuPDF error: {e}")
@@ -1058,7 +1292,7 @@ class OCRService:
             logger.info(f"Trying fallback method: {method}")
             try:
                 if method == "azure":
-                    result = self._azure_ocr_with_html(pdf_path)
                 elif method == "tesseract":
                     result = self._tesseract_ocr(pdf_path)
                 elif method == "pymupdf":

 """
+OCR Service Module - ENHANCED VERSION with Comprehensive Indentation Detection and Intelligent Text Classification
+Handles PDF to text conversion with proper indentation, spacing, page numbering, and intelligent text analysis
 """
 import re
 import os
 import fitz  # PyMuPDF
+# Enhanced indentation detection
+from enhanced_indentation import EnhancedIndentationDetector
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+class EnhancedHTMLProcessor:
+    """Process OCR results through HTML with comprehensive indentation detection and intelligent text classification"""
+    def __init__(self):
+        """Create a processor that owns its own EnhancedIndentationDetector instance."""
+        self.indent_detector = EnhancedIndentationDetector()
     @staticmethod
     def create_html_from_azure_result(analysis_result) -> str:
+        """Create structured HTML from Azure Document Intelligence result with enhanced indentation and text classification"""
+        processor = EnhancedHTMLProcessor()
         html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
         html_parts.append('<style>')
         html_parts.append('''
                 text-transform: uppercase;
                 letter-spacing: 1px;
             }
+            /* Enhanced indentation levels */
+            .indent-level-0 { margin-left: 0em; }
+            .indent-level-1 { margin-left: 1.5em; }
+            .indent-level-2 { margin-left: 3.0em; }
+            .indent-level-3 { margin-left: 4.5em; }
+            .indent-level-4 { margin-left: 6.0em; }
+            .indent-level-5 { margin-left: 7.5em; }
+            .indent-level-6 { margin-left: 9.0em; }
+            .indent-level-7 { margin-left: 10.5em; }
+            .indent-level-8 { margin-left: 12.0em; }
+            .indent-level-9 { margin-left: 13.5em; }
+            .indent-level-10 { margin-left: 15.0em; }
+            /* Text classification styles */
+            .content-header {
+                font-weight: bold;
+                color: #2c3e50;
+                font-size: 1.1em;
+                margin: 15px 0 8px 0;
+                border-left: 4px solid #3498db;
+                padding-left: 10px;
+                background-color: #f8f9fa;
+            }
+            .content-paragraph {
+                color: #333;
+                margin-bottom: 1em;
+                line-height: 1.5;
+            }
+            .content-list-item {
+                margin-bottom: 0.5em;
+                line-height: 1.4;
+            }
+            /* Pattern-specific styles */
+            .numbered-primary {
+                font-weight: bold;
+                color: #2c3e50;
+                border-left: 4px solid #3498db;
+                padding-left: 8px;
+                margin-bottom: 0.5em;
+                background-color: #f8f9fa;
+            }
+            .numbered-secondary {
+                font-weight: 600;
+                color: #34495e;
+                border-left: 3px solid #95a5a6;
+                padding-left: 6px;
+                margin-bottom: 0.4em;
+                background-color: #f9f9f9;
+            }
+            .numbered-tertiary {
+                color: #555;
+                border-left: 2px solid #bdc3c7;
+                padding-left: 4px;
+                margin-bottom: 0.3em;
+            }
+            .numbered-quaternary {
+                color: #666;
+                border-left: 1px solid #dee2e6;
+                padding-left: 3px;
+                margin-bottom: 0.2em;
+            }
+            .numbered-quinary {
+                color: #777;
+                padding-left: 2px;
+                margin-bottom: 0.2em;
+            }
+            /* Parenthetical styles */
+            .parenthetical-primary {
+                font-weight: 600;
+                color: #8e44ad;
+                border-left: 3px solid #9b59b6;
+                padding-left: 6px;
+                margin-bottom: 0.4em;
+            }
+            .parenthetical-secondary {
+                color: #9b59b6;
+                border-left: 2px solid #af7ac5;
+                padding-left: 4px;
+                margin-bottom: 0.3em;
+            }
+            .parenthetical-tertiary {
+                color: #af7ac5;
+                padding-left: 3px;
+                margin-bottom: 0.2em;
+            }
+            .parenthetical-quaternary {
+                color: #c39bd3;
+                padding-left: 2px;
+                margin-bottom: 0.2em;
+            }
+            .bullet-primary {
+                position: relative;
+                padding-left: 1.2em;
+            }
+            .bullet-primary::before {
+                content: "•";
+                position: absolute;
+                left: 0;
+                color: #3498db;
+                font-weight: bold;
+            }
+            .bullet-secondary {
+                position: relative;
+                padding-left: 1.2em;
+            }
+            .bullet-secondary::before {
+                content: "◦";
+                position: absolute;
+                left: 0;
+                color: #95a5a6;
+            }
+            .bullet-tertiary {
+                position: relative;
+                padding-left: 1.2em;
+            }
+            .bullet-tertiary::before {
+                content: "▪";
+                position: absolute;
+                left: 0;
+                color: #bdc3c7;
+            }
+            .bullet-quaternary {
+                position: relative;
+                padding-left: 1.2em;
+            }
+            .bullet-quaternary::before {
+                content: "‣";
+                position: absolute;
+                left: 0;
+                color: #dee2e6;
+            }
+            .lettered-primary {
+                font-style: italic;
+                color: #8e44ad;
+                font-weight: 600;
+            }
+            .lettered-secondary {
+                color: #9b59b6;
+                font-style: italic;
+            }
+            .roman-primary {
+                font-variant: small-caps;
+                color: #d35400;
+                font-weight: bold;
+            }
+            .roman-secondary {
+                color: #e67e22;
+                font-variant: small-caps;
+            }
+            .thai-primary {
+                color: #16a085;
+                font-weight: bold;
+            }
+            .thai-secondary {
+                color: #1abc9c;
+            }
             .paragraph {
                 margin-bottom: 0.8em;
                 white-space: pre-wrap;
                 font-family: 'Consolas', 'Courier New', monospace;
                 line-height: 1.4;
             }
             .title {
                 font-size: 1.4em;
                 font-weight: bold;
             .table tr:nth-child(even) {
                 background-color: #f8f9fa;
             }
+            .indented_text {
+                color: #555;
+                font-style: italic;
             }
+            .space-indent {
+                border-left: 1px dotted #ccc;
+                padding-left: 5px;
             }
             .page-number {
                 position: relative;
             html_parts.append(f'<div class="page">')
             html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
+            # Process content with enhanced indentation detection and text classification
+            content_items = processor._extract_page_content_enhanced(page, analysis_result, page_num)
             content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
+            # Generate HTML for each content item with enhanced formatting and classification
             for item in content_items:
                 if item['type'] == 'table':
+                    html_parts.append(processor._table_to_html(item['content'], item['table_idx']))
                 else:
+                    html_parts.append(processor._text_to_html_enhanced(item))
             html_parts.append('</div>')
         html_parts.append('</body></html>')
         return '\n'.join(html_parts)
+    def _extract_page_content_enhanced(self, page, analysis_result, page_num):
+        """Extract page content with enhanced indentation detection and intelligent text classification"""
         content_items = []
+        # Handle tables (existing logic)
         page_tables = []
         table_regions = []
         if analysis_result.tables:
             for table_idx, table in enumerate(analysis_result.tables):
+                if self._is_table_on_page(table, page_num):
                     page_tables.append((table_idx, table))
                     if table.bounding_regions:
                         table_regions.append({
                             'polygon': table.bounding_regions[0].polygon,
                             'table_idx': table_idx
                         })
+        # Add tables to content
         for table_idx, table in page_tables:
             if table.bounding_regions and table.bounding_regions[0].polygon:
                 polygon = table.bounding_regions[0].polygon
+                y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7])
+                x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
                 content_items.append({
                     'type': 'table',
                     'x_pos': x_pos
                 })
+        # Process text content with enhanced indentation detection and text classification
         if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
             page_paragraphs = [p for p in analysis_result.paragraphs if
                              p.bounding_regions and
                              p.bounding_regions[0].page_number == page_num]
             for para in page_paragraphs:
                 if para.content.strip():
+                    # Check table overlap
+                    overlap_ratio = self._calculate_table_overlap(para, table_regions)
+                    if overlap_ratio < 0.7:  # Not heavily overlapping with table
                         polygon = para.bounding_regions[0].polygon
                         y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
                         x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
+                        # Enhanced indentation detection
+                        indent_info = self.indent_detector.detect_indentation(para.content)
+                        # Intelligent text classification with context
+                        context = {
+                            'y_position': y_pos,
+                            'x_position': x_pos,
+                            'font_size': getattr(para, 'font_size', None),
+                            'is_bold': getattr(para, 'is_bold', False),
+                            'page_number': page_num
+                        }
+                        text_classification = self.indent_detector.classify_text_type(para.content, context)
                         content_items.append({
                             'type': 'paragraph',
+                            'content': indent_info['content'],
                             'role': getattr(para, 'role', 'paragraph'),
                             'y_pos': y_pos,
                             'x_pos': x_pos,
+                            'indent_info': indent_info,
+                            'text_classification': text_classification,
                             'preserve_spacing': True
                         })
         elif page.lines:
+            # Process lines with enhanced indentation detection and classification
+            processed_lines = self._process_lines_enhanced(page.lines, table_regions)
             content_items.extend(processed_lines)
         return content_items
+    def _process_lines_enhanced(self, lines, table_regions):
+        """Turn raw OCR lines into 'line' content-item dicts.
+        Blank lines, case-insensitive repeats of earlier text, and lines whose
+        table-overlap ratio is not below 0.7 are left out of the result.
+        Args:
+            lines: Line objects exposing ``content`` and ``polygon``.
+            table_regions: Table regions forwarded to ``_calculate_line_table_overlap``.
+        Returns:
+            A list of dicts carrying the cleaned content, its position,
+            the indentation info and the text classification.
+        """
+        seen_keys = set()
+        items = []
+        for ocr_line in lines:
+            raw_text = ocr_line.content
+            dedup_key = raw_text.strip().lower()
+            # An empty key means the line was blank; a known key means the same
+            # text (ignoring case and outer whitespace) was already handled.
+            if not dedup_key or dedup_key in seen_keys:
+                continue
+            seen_keys.add(dedup_key)
+            # Text that mostly sits inside a table is rendered with the table instead.
+            if not (self._calculate_line_table_overlap(ocr_line, table_regions) < 0.7):
+                continue
+            box = ocr_line.polygon
+            if box:
+                # The polygon is read as x/y pairs; take the smallest y and x.
+                top = min(box[1], box[3], box[5], box[7])
+                left = min(box[0], box[2], box[4], box[6])
+            else:
+                top = left = 0
+            indentation = self.indent_detector.detect_indentation(raw_text)
+            classification = self.indent_detector.classify_text_type(
+                raw_text,
+                {'y_position': top, 'x_position': left},
+            )
+            items.append({
+                'type': 'line',
+                'content': indentation['content'],
+                'role': 'text',
+                'y_pos': top,
+                'x_pos': left,
+                'indent_info': indentation,
+                'text_classification': classification,
+                'preserve_spacing': True,
+            })
+        return items
+    def _text_to_html_enhanced(self, item):
+        """Render one text content item as a single ``<div>`` element.
+        Args:
+            item: Dict with ``content`` plus optional ``role``, ``indent_info``,
+                ``text_classification`` and ``preserve_spacing`` entries.
+        Returns:
+            The HTML string for the item. The element carries exactly one
+            ``class`` attribute; the text and any pattern marker are HTML-escaped.
+        """
+        from html import escape
+        role = item.get('role', 'paragraph')
+        indent_info = item.get('indent_info', {})
+        text_classification = item.get('text_classification', {})
+        # Build CSS classes from the text classification and indentation info
+        css_classes = ['paragraph']
+        if text_classification.get('type'):
+            css_classes.append(f"content-{text_classification['type']}")
+        level = indent_info.get('level', 0)
+        # The stylesheet only defines indent-level-0 .. indent-level-10
+        css_classes.append(f'indent-level-{min(level, 10)}')
+        formatting_hint = indent_info.get('formatting_hint', 'normal_text')
+        if formatting_hint != 'normal_text':
+            css_classes.append(formatting_hint)
+        if indent_info.get('pattern_type') == 'space_indent':
+            css_classes.append('space-indent')
+        # OCR text is untrusted input: escape it so '<' and '&' stay literal text
+        content = escape(item['content'], quote=False)
+        if item.get('preserve_spacing', False):
+            # Runs of two or more spaces would collapse in HTML; pin them with &nbsp;
+            content = re.sub(r'  +', lambda m: '&nbsp;' * len(m.group()), content)
+            content = content.replace('\n', '<br>')
+        # Bullets are drawn by CSS (::before); every other marker is part of the text
+        pattern_marker = indent_info.get('pattern_marker', '')
+        if pattern_marker and not indent_info.get('is_bullet', False):
+            content = f"{escape(pattern_marker, quote=False)} {content}"
+        # Pick the structural class; a confident header classification wins over the role
+        if text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.6:
+            structural_class = 'content-header'
+        elif role == 'title':
+            structural_class = 'title'
+        elif role == 'sectionHeading':
+            structural_class = 'section-heading'
+        else:
+            structural_class = None
+        # Merge into ONE class attribute. Emitting class="title" class="paragraph ..."
+        # made browsers ignore the second attribute and made dict(attrs) in the
+        # HTML-to-text parser keep only the last one, so the structural class was lost.
+        if structural_class and structural_class not in css_classes:
+            css_classes.insert(0, structural_class)
+        class_attr = ' '.join(css_classes)
+        return f'<div class="{class_attr}">{content}</div>'
+    def _table_to_html(self, table, table_idx):
+        """Render a table as an HTML ``<table>`` inside a titled container.
+        Args:
+            table: Object whose ``cells`` expose ``row_index``, ``column_index`` and ``content``.
+            table_idx: Zero-based table index; the heading shows ``table_idx + 1``.
+        Returns:
+            The HTML string. Row 0 is emitted with ``<th>`` cells, later rows with
+            ``<td>`` cells, and rows without any text are omitted. Cell text is
+            HTML-escaped.
+        """
+        from html import escape
+        heading = f'Table {table_idx + 1}'
+        if not table.cells:
+            return f'<div class="table-container"><h4>{heading} (Empty)</h4></div>'
+        # Size the grid from the largest row/column index seen in the cells
+        row_count = max(cell.row_index for cell in table.cells) + 1
+        col_count = max(cell.column_index for cell in table.cells) + 1
+        grid = [[""] * col_count for _ in range(row_count)]
+        for cell in table.cells:
+            # Cell text comes from OCR: escape it so '<' or '&' cannot break the markup
+            grid[cell.row_index][cell.column_index] = escape((cell.content or "").strip(), quote=False)
+        html_parts = [
+            '<div class="table-container">',
+            f'<h4>{heading}</h4>',
+            '<table class="table">',
+        ]
+        for row_idx, row in enumerate(grid):
+            # Cells are already stripped, so a row of empty strings has no content
+            if not any(row):
+                continue
+            cell_tag = 'th' if row_idx == 0 else 'td'
+            html_parts.append('<tr>')
+            html_parts.extend(f'<{cell_tag}>{text}</{cell_tag}>' for text in row)
+            html_parts.append('</tr>')
+        html_parts.append('</table></div>')
+        return '\n'.join(html_parts)
+    def _is_table_on_page(self, table, page_num):
         """Check if table belongs to the specified page"""
         if not table.cells:
             return False
                 return True
         return False
+    def _calculate_table_overlap(self, content_item, table_regions):
+        """Calculate overlap ratio between content and tables"""
         if not table_regions or not content_item.bounding_regions:
             return 0.0
         return max_overlap_ratio
+    def _calculate_line_table_overlap(self, line, table_regions):
         """Calculate overlap between line and tables"""
         if not table_regions or not line.polygon:
             return 0.0
         return max_overlap
     @staticmethod
+    def html_to_formatted_text_enhanced(html_content):
+        """Convert HTML back to formatted text with enhanced indentation preservation and text classification"""
         from html.parser import HTMLParser
+        class EnhancedTextExtractor(HTMLParser):
             def __init__(self):
                 super().__init__()
                 self.text_parts = []
+                self.indent_detector = EnhancedIndentationDetector()
                 self.in_title = False
                 self.in_section_heading = False
                 self.in_table = False
                 self.current_table_row = []
                 self.table_data = []
+                self.current_indent_level = 0
+                self.current_formatting_hint = 'normal_text'
                 self.in_page_header = False
+                self.current_classes = []
+                self.in_content_header = False
             def handle_starttag(self, tag, attrs):
                 attr_dict = dict(attrs)
                 class_attr = attr_dict.get('class', '')
+                self.current_classes = class_attr.split()
                 if 'page-header' in class_attr:
                     self.in_page_header = True
                     if len(self.text_parts) > 0:
                         self.text_parts.append('\n\n' + '=' * 80 + '\n')
+                elif 'content-header' in class_attr:
+                    self.in_content_header = True
                 elif 'title' in class_attr:
                     self.in_title = True
                 elif 'section-heading' in class_attr:
                 elif tag == 'table':
                     self.in_table = True
                     self.table_data = []
                 elif tag == 'tr':
                     self.current_table_row = []
                 elif tag == 'br':
                     self.text_parts.append('\n')
+                # Extract indent level from class
+                for cls in self.current_classes:
+                    if cls.startswith('indent-level-'):
+                        try:
+                            self.current_indent_level = int(cls.split('-')[-1])
+                        except ValueError:
+                            self.current_indent_level = 0
+                        break
                 else:
+                    self.current_indent_level = 0
+                # Extract formatting hint
+                formatting_hints = [
+                    'numbered-primary', 'numbered-secondary', 'numbered-tertiary', 'numbered-quaternary', 'numbered-quinary',
+                    'parenthetical-primary', 'parenthetical-secondary', 'parenthetical-tertiary', 'parenthetical-quaternary',
+                    'bullet-primary', 'bullet-secondary', 'bullet-tertiary', 'bullet-quaternary',
+                    'lettered-primary', 'lettered-secondary',
+                    'roman-primary', 'roman-secondary',
+                    'thai-primary', 'thai-secondary',
+                    'indented_text', 'space-indent'
+                ]
+                for hint in formatting_hints:
+                    if hint in self.current_classes:
+                        self.current_formatting_hint = hint
+                        break
+                else:
+                    self.current_formatting_hint = 'normal_text'
             def handle_endtag(self, tag):
                 if tag == 'div' and self.in_page_header:
                     self.text_parts.append('\n' + '=' * 80 + '\n\n')
                     self.in_page_header = False
+                elif tag == 'div' and self.in_content_header:
+                    self.text_parts.append('\n\n')
+                    self.in_content_header = False
                 elif tag == 'div' and self.in_title:
                     self.text_parts.append('\n\n')
                     self.in_title = False
                 elif tag == 'table':
                     self.in_table = False
                     self._format_table()
                 elif tag == 'tr' and self.current_table_row:
                     self.table_data.append(self.current_table_row[:])
+                elif tag == 'div' and not self.in_table:
+                    if not self.in_title and not self.in_section_heading and not self.in_page_header and not self.in_content_header:
                         self.text_parts.append('\n')
+                # Reset state
                 if tag == 'div':
+                    self.current_indent_level = 0
+                    self.current_formatting_hint = 'normal_text'
+                    self.current_classes = []
             def handle_data(self, data):
                 if data.strip():
                     data = data.replace('&nbsp;', ' ')
                     if self.in_page_header:
                         page_match = re.search(r'Page (\d+)', data)
                         if page_match:
+                            page_num = int(page_match.group(1))
+                            page_header = f"PAGE {page_num}"
                             self.text_parts.append(page_header.center(80))
+                    elif self.in_content_header:
+                        indent_str = "  " * self.current_indent_level
+                        self.text_parts.append(f'\n{indent_str}# {data.strip()}')
                     elif self.in_title:
+                        indent_str = "  " * self.current_indent_level
                         self.text_parts.append(f'\n{indent_str}## {data.strip()}')
                     elif self.in_section_heading:
+                        indent_str = "  " * self.current_indent_level
                         self.text_parts.append(f'\n{indent_str}### {data.strip()}')
                     elif self.in_table:
+                        self.current_table_row.append(data.strip())
                     else:
+                        # Apply enhanced indentation formatting
+                        indent_str = "  " * self.current_indent_level
+                        # Handle different formatting hints including parenthetical
+                        if 'bullet' in self.current_formatting_hint:
+                            # Use appropriate bullet symbol based on level
+                            if 'primary' in self.current_formatting_hint:
+                                bullet = '•'
+                            elif 'secondary' in self.current_formatting_hint:
+                                bullet = '◦'
+                            elif 'tertiary' in self.current_formatting_hint:
+                                bullet = '▪'
+                            elif 'quaternary' in self.current_formatting_hint:
+                                bullet = '‣'
+                            else:
+                                bullet = '•'
+                            self.text_parts.append(f'{indent_str}{bullet} {data.strip()}')
+                        elif any(pattern in self.current_formatting_hint for pattern in ['numbered', 'lettered', 'roman', 'thai', 'parenthetical']):
+                            # For numbered/lettered/parenthetical items, the marker should already be in the text
+                            self.text_parts.append(f'{indent_str}{data.strip()}')
+                        elif 'space-indent' in self.current_formatting_hint:
+                            # Simple indented text
+                            self.text_parts.append(f'{indent_str}{data.strip()}')
                         else:
+                            # Regular text with indentation
+                            self.text_parts.append(f'{indent_str}{data.strip()}')
             def _format_table(self):
+                """Format table with proper alignment"""
                 if not self.table_data:
                     return
                 self.text_parts.append('\n\n')
                 if self.table_data:
                     max_cols = max(len(row) for row in self.table_data)
                     col_widths = [0] * max_cols
+                    # Calculate column widths
                     for row in self.table_data:
                         for i, cell in enumerate(row):
                             if i < max_cols:
                 self.text_parts.append('\n')
+        extractor = EnhancedTextExtractor()
         extractor.feed(html_content)
         result = ''.join(extractor.text_parts)
 class OCRService:
+    """Main OCR service with enhanced HTML processing, comprehensive indentation detection, and intelligent text classification"""
     def __init__(self):
         self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
     def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
         """
+        Convert PDF to text using specified method with enhanced HTML processing and intelligent text classification
         Args:
             pdf_path: Path to the PDF file
         # Try primary method
         try:
             if method == "azure" and self.azure_client:
+                result = self._azure_ocr_with_enhanced_html(pdf_path)
             elif method == "tesseract":
                 result = self._tesseract_ocr(pdf_path)
             elif method == "pymupdf":
         return result
+    def _azure_ocr_with_enhanced_html(self, pdf_path: str) -> Dict[str, Any]:
+        """Azure Document Intelligence OCR with enhanced HTML processing, indentation detection, and intelligent text classification"""
         result = {
             'success': False,
             'text': '',
             'html': '',
+            'method_used': 'azure_document_intelligence_enhanced_v2',
             'metadata': {},
             'error': None
         }
             analysis_result = poller.result()
+            # Generate HTML with enhanced indentation processing and text classification
+            html_content = EnhancedHTMLProcessor.create_html_from_azure_result(analysis_result)
+            # Convert HTML to formatted text with enhanced indentation preservation and classification
+            formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
+            # Analyze document structure with text classification
+            detector = EnhancedIndentationDetector()
+            text_lines = formatted_text.split('\n')
+            document_analysis = detector.analyze_document_structure(text_lines)
             result.update({
                 'success': True,
                     'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
                     'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
                     'html_generated': True,
+                    'enhanced_indentation': True,
+                    'intelligent_text_classification': True,
+                    'parenthetical_patterns_supported': True,
                     'page_numbers_added': True,
+                    'comprehensive_formatting': True,
+                    'azure_analysis': analysis_result,
+                    'document_structure_analysis': document_analysis
                 }
             })
+            logger.info("Azure OCR with enhanced indentation processing and intelligent text classification completed successfully")
         except Exception as e:
             logger.error(f"Azure OCR error: {e}")
         return result
     def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
+        """Tesseract OCR with enhanced HTML generation, indentation detection, and text classification"""
         result = {
             'success': False,
             'text': '',
             'html': '',
+            'method_used': 'tesseract_enhanced_v2',
             'metadata': {},
             'error': None
         }
             page_count = len(pdf_document)
             all_text = []
             html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
+            html_parts.append('''
+                body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }
+                .page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
+                .page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
+                .paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
+                .content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
+                .content-paragraph { margin-bottom: 1em; }
+                .content-list-item { margin-bottom: 0.5em; }
+            ''')
             html_parts.append('</style></head><body>')
+            indent_detector = EnhancedIndentationDetector()
             for page_num in range(page_count):
                 # Add page header to text
                 page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
                     all_text.append(text)
+                    # Add to HTML with enhanced indentation processing and text classification
                     html_parts.append(f'<div class="page">')
                     html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
+                    # Process each line for indentation and classification
+                    lines = text.split('\n')
+                    for line in lines:
+                        if line.strip():
+                            indent_info = indent_detector.detect_indentation(line)
+                            text_classification = indent_detector.classify_text_type(line)
+                            level = indent_info.get('level', 0)
+                            formatting_hint = indent_info.get('formatting_hint', 'normal_text')
+                            css_classes = [f'indent-level-{min(level, 10)}']
+                            if formatting_hint != 'normal_text':
+                                css_classes.append(formatting_hint)
+                            # Add text classification class
+                            if text_classification.get('type'):
+                                css_classes.append(f"content-{text_classification['type']}")
+                            class_str = f' class="paragraph {" ".join(css_classes)}"'
+                            content = indent_info.get('content', line.strip())
+                            # Add marker for non-bullet items
+                            marker = indent_info.get('pattern_marker', '')
+                            if marker and not indent_info.get('is_bullet', False):
+                                content = f"{marker} {content}"
+                            html_parts.append(f'<div{class_str}>{content}</div>')
+                        else:
+                            html_parts.append('<div class="paragraph"><br></div>')
+                    html_parts.append('</div>')
                 finally:
                     if temp_img_path and os.path.exists(temp_img_path):
             html_parts.append('</body></html>')
+            # Convert HTML back to formatted text
+            html_content = '\n'.join(html_parts)
+            formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
             result.update({
                 'success': True,
+                'text': formatted_text,
+                'html': html_content,
                 'metadata': {
                     'pages': page_count,
                     'html_generated': True,
+                    'enhanced_indentation': True,
+                    'intelligent_text_classification': True,
+                    'parenthetical_patterns_supported': True,
                     'page_numbers_added': True,
+                    'comprehensive_formatting': True
                 }
             })
+            logger.info("Tesseract OCR with enhanced indentation processing and text classification completed successfully")
         except Exception as e:
             logger.error(f"Tesseract OCR error: {e}")
         return result
     def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
+        """PyMuPDF text extraction with enhanced HTML generation, indentation detection, and text classification"""
         result = {
             'success': False,
             'text': '',
             'html': '',
+            'method_used': 'pymupdf_enhanced_v2',
             'metadata': {},
             'error': None
         }
             page_count = len(pdf_document)
             all_text = []
             html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
+            html_parts.append('''
+                body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }
+                .page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
+                .page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
+                .paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
+                .content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
+                .content-paragraph { margin-bottom: 1em; }
+                .content-list-item { margin-bottom: 0.5em; }
+            ''')
             html_parts.append('</style></head><body>')
+            indent_detector = EnhancedIndentationDetector()
             for page_num in range(page_count):
                 # Add page header to text
                 page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
                 all_text.append(text)
+                # Add to HTML with enhanced indentation processing and text classification
                 html_parts.append(f'<div class="page">')
                 html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
+                # Process each line for indentation and classification
+                lines = text.split('\n')
+                for line in lines:
+                    if line.strip():
+                        indent_info = indent_detector.detect_indentation(line)
+                        text_classification = indent_detector.classify_text_type(line)
+                        level = indent_info.get('level', 0)
+                        formatting_hint = indent_info.get('formatting_hint', 'normal_text')
+                        css_classes = [f'indent-level-{min(level, 10)}']
+                        if formatting_hint != 'normal_text':
+                            css_classes.append(formatting_hint)
+                        # Add text classification class
+                        if text_classification.get('type'):
+                            css_classes.append(f"content-{text_classification['type']}")
+                        class_str = f' class="paragraph {" ".join(css_classes)}"'
+                        content = indent_info.get('content', line.strip())
+                        # Add marker for non-bullet items
+                        marker = indent_info.get('pattern_marker', '')
+                        if marker and not indent_info.get('is_bullet', False):
+                            content = f"{marker} {content}"
+                        html_parts.append(f'<div{class_str}>{content}</div>')
+                    else:
+                        html_parts.append('<div class="paragraph"><br></div>')
+                html_parts.append('</div>')
             html_parts.append('</body></html>')
+            # Convert HTML back to formatted text
+            html_content = '\n'.join(html_parts)
+            formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
             result.update({
                 'success': True,
+                'text': formatted_text,
+                'html': html_content,
                 'metadata': {
                     'pages': page_count,
                     'html_generated': True,
+                    'enhanced_indentation': True,
+                    'intelligent_text_classification': True,
+                    'parenthetical_patterns_supported': True,
                     'page_numbers_added': True,
+                    'comprehensive_formatting': True
                 }
             })
+            logger.info("PyMuPDF extraction with enhanced indentation processing and text classification completed successfully")
         except Exception as e:
             logger.error(f"PyMuPDF error: {e}")
             logger.info(f"Trying fallback method: {method}")
             try:
                 if method == "azure":
+                    result = self._azure_ocr_with_enhanced_html(pdf_path)
                 elif method == "tesseract":
                     result = self._tesseract_ocr(pdf_path)
                 elif method == "pymupdf":

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-# PDF OCR Service Requirements - Enhanced Version with HTML Processing
 # Core web framework and UI
 gradio>=4.0.0
@@ -22,10 +22,21 @@ PyMuPDF>=1.23.0
 # Document export formats (ENHANCED)
 python-docx>=0.8.11
-# HTML processing and parsing (NEW)
 beautifulsoup4>=4.12.0
 lxml>=4.9.0
 # Additional dependencies for enhanced preprocessing
 matplotlib>=3.7.0  # For image visualization in development
 scikit-image>=0.21.0  # Advanced image processing (optional)
@@ -34,45 +45,297 @@ scikit-image>=0.21.0  # Advanced image processing (optional)
 tqdm>=4.65.0  # Progress bars for long operations
 requests>=2.31.0  # HTTP requests for external services
 # System dependencies information (install separately):
 #
 # For Ubuntu/Debian:
 # sudo apt-get update
-# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
 # sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
 # sudo apt-get install -y python3-opencv  # Alternative OpenCV installation
 # sudo apt-get install -y libxml2-dev libxslt1-dev  # For lxml
 #
 # For CentOS/RHEL:
-# sudo yum install -y tesseract tesseract-langpack-eng
 # sudo yum install -y opencv  # system package; "opencv-python" is the pip name, not a yum package
 # sudo yum install -y libxml2-devel libxslt-devel
 #
 # For macOS:
 # brew install tesseract
 # brew install opencv
 # brew install libxml2
 #
 # For Windows:
 # Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
 # Add Tesseract to PATH environment variable
 # OpenCV and other packages should install automatically with pip
 # Development and testing (optional)
 pytest>=7.0.0
 pytest-cov>=4.0.0
 black>=23.0.0  # Code formatting
 flake8>=6.0.0  # Code linting
 # Performance monitoring (optional)
 memory-profiler>=0.60.0
 psutil>=5.9.0  # System monitoring
-# Note: The enhanced version includes:
-# - Fixed table processing that prevents text loss
-# - HTML intermediate processing for better formatting
-# - Enhanced export capabilities (TXT, DOCX, HTML)
-# - Smart overlap detection with 70% threshold
-# - Improved coordinate calculations for table boundaries
-# - Better document structure preservation
-# - Multi-format download options

+# PDF OCR Service Requirements - Enhanced Version with Comprehensive Indentation Detection & Text Classification
 # Core web framework and UI
 gradio>=4.0.0
 # Document export formats (ENHANCED)
 python-docx>=0.8.11
+# HTML processing and parsing
 beautifulsoup4>=4.12.0
 lxml>=4.9.0
+# Enhanced text processing and pattern detection
+regex>=2023.10.3  # For advanced regex patterns including parenthetical detection
+# Data handling and analysis
+pandas>=2.0.0  # For document structure analysis
+collections-extended>=2.0.2  # For enhanced counter operations
+# Text classification and analysis
+scikit-learn>=1.3.0  # For advanced text classification algorithms (optional)
+nltk>=3.8  # Natural language processing toolkit (optional)
 # Additional dependencies for enhanced preprocessing
 matplotlib>=3.7.0  # For image visualization in development
 scikit-image>=0.21.0  # Advanced image processing (optional)
 tqdm>=4.65.0  # Progress bars for long operations
 requests>=2.31.0  # HTTP requests for external services
+# Logging and monitoring
+colorlog>=6.7.0  # Enhanced logging with colors
+structlog>=23.1.0  # Structured logging for better debugging
+# File handling and temporary file management
+pathlib2>=2.3.7  # Enhanced path operations
+tempfile-plus>=1.2.0  # Advanced temporary file handling
+# Date and time handling
+python-dateutil>=2.8.2  # Enhanced date parsing
+# Enhanced Unicode and text processing
+unicodedata2>=15.0.0  # Enhanced Unicode support for Thai and other scripts
+ftfy>=6.1.1  # Text fixing and encoding repair
+# Configuration and validation
+pydantic>=2.0.0  # Data validation and settings management
+confuse>=2.0.0  # Configuration file handling
 # System dependencies information (install separately):
 #
 # For Ubuntu/Debian:
 # sudo apt-get update
+# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-tha
 # sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
 # sudo apt-get install -y python3-opencv  # Alternative OpenCV installation
 # sudo apt-get install -y libxml2-dev libxslt1-dev  # For lxml
+# sudo apt-get install -y fonts-thai-tlwg fonts-thai-tlwg-otf  # Thai font support
+# sudo apt-get install -y language-pack-th  # Thai language support
+# sudo apt-get install -y fonts-noto fonts-noto-cjk  # Unicode font support
 #
 # For CentOS/RHEL:
+# sudo yum install -y tesseract tesseract-langpack-eng tesseract-langpack-tha
 # sudo yum install -y opencv  # system package; "opencv-python" is the pip name, not a yum package
 # sudo yum install -y libxml2-devel libxslt-devel
+# sudo yum install -y thai-scalable-fonts google-noto-fonts
 #
 # For macOS:
 # brew install tesseract
+# brew install tesseract-lang  # Includes Thai support
 # brew install opencv
 # brew install libxml2
+# brew install font-thai-fonts font-noto
 #
 # For Windows:
 # Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
+# Download Thai language data from: https://github.com/tesseract-ocr/tessdata
+# Download Thai numerals training data if available
 # Add Tesseract to PATH environment variable
 # OpenCV and other packages should install automatically with pip
+# Install Thai fonts from Windows Language Settings
+# Install Unicode fonts (Noto fonts recommended)
 # Development and testing (optional)
 pytest>=7.0.0
 pytest-cov>=4.0.0
+pytest-asyncio>=0.21.0  # For async testing
+pytest-mock>=3.11.0  # For mocking in tests
 black>=23.0.0  # Code formatting
 flake8>=6.0.0  # Code linting
+mypy>=1.5.0  # Type checking
+isort>=5.12.0  # Import sorting
 # Performance monitoring (optional)
 memory-profiler>=0.60.0
 psutil>=5.9.0  # System monitoring
+py-spy>=0.3.14  # Performance profiling
+# Enhanced error handling and debugging
+rich>=13.0.0  # Rich console output for debugging
+icecream>=2.1.3  # Enhanced debugging print statements
+# Enhanced file type detection
+python-magic>=0.4.27  # File type detection
+filetype>=1.2.0  # Alternative file type detection
+# Additional text processing utilities
+Unidecode>=1.3.6  # ASCII transliteration for Unicode text
+langdetect>=1.0.9  # Language detection for multi-language documents
+# Note: The enhanced version includes comprehensive features:
+#
+# COMPREHENSIVE INDENTATION DETECTION FEATURES:
+# ===============================================
+#
+# 1. HIERARCHICAL NUMBERING PATTERNS:
+#    - Decimal hierarchy: 1.1.1.1.1... (unlimited depth)
+#    - Mixed hierarchy: 1.2.a.i.A... (numbers, letters, Roman mixed)
+#    - Legal numbering: 1.1.1(a)(i) (with parenthetical sub-sections)
+#    - Outline numbering: I.A.1.a.i. (formal document structure)
+#    - Section numbering: §1.2.3, Article 1.1.1, Chapter 1.2
+#
+# 2. PARENTHETICAL PATTERNS (NEW):
+#    - Arabic numerals: (1), (2), (3), (10), (25)...
+#    - Thai numerals: (๑), (๒), (๓), (๑๐), (๒๕)...
+#    - Lowercase letters: (a), (b), (c)... (z), (aa), (bb)...
+#    - Uppercase letters: (A), (B), (C)... (Z), (AA), (BB)...
+#    - Thai letters: (ก), (ข), (ค)... (ฮ)
+#    - Lowercase Roman: (i), (ii), (iii), (iv), (v)...
+#    - Uppercase Roman: (I), (II), (III), (IV), (V)...
+#
+# 3. TRADITIONAL PATTERNS:
+#    - Simple numbered lists: 1., 2., 3.
+#    - Simple numbered with parens: 1), 2), 3)
+#    - Letter lists: a., b., c. and A., B., C.
+#    - Thai letters: ก., ข., ค.
+#    - Roman numerals: i., ii., iii. and I., II., III.
+#    - Multiple bullet styles: •◦▪→ and 20+ more symbols
+#    - Checkbox items: [x], [ ], [✓], [✗]
+#    - Arrow bullets: →, ←, ↑, ↓, ⇒, ➔ and more
+#    - Dash bullets: -, *, +, ~, =
+#
+# 4. MULTI-LANGUAGE SUPPORT:
+#    - Thai script: มาตรา, ข้อ, หมวด, ส่วน
+#    - Thai numerals: ๐๑๒๓๔๕๖๗๘๙
+#    - Thai letters: ก-ฮ (44 consonants)
+#    - Unicode symbols: Full range of bullet and arrow characters
+#    - Mixed language documents: English + Thai seamlessly
+#
+# 5. SPACE-BASED INDENTATION:
+#    - Automatic detection of space-based indentation levels
+#    - Standard unit: 4 spaces = 1 indentation level
+#    - Combining space indentation with pattern indentation
+#    - Up to 10 indentation levels supported
+#
+# 6. PRIORITY-BASED PATTERN MATCHING:
+#    - Hierarchical patterns get higher priority
+#    - Parenthetical patterns prioritized appropriately
+#    - Prevents false positives in pattern detection
+#    - Smart disambiguation between similar patterns
+#
+# INTELLIGENT TEXT CLASSIFICATION FEATURES:
+# =========================================
+#
+# 1. HEADER DETECTION:
+#    - Title case detection: "Chapter One Introduction"
+#    - All caps detection: "SECTION A: OVERVIEW"
+#    - Numbered headers: "1. INTRODUCTION"
+#    - Section headers: "SECTION 1.2.3", "CHAPTER IV"
+#    - Thai headers: "หมวด ๑", "บท ก"
+#    - Short line detection: Lines under 50 characters
+#    - Position-based detection: Top of page content
+#    - Font size consideration: Larger fonts = likely headers
+#
+# 2. PARAGRAPH CLASSIFICATION:
+#    - Long text detection: Over 100 characters
+#    - Proper punctuation: Ends with periods
+#    - Context analysis: Position and formatting
+#    - Multi-sentence detection
+#    - Normal text flow patterns
+#
+# 3. LIST ITEM RECOGNITION:
+#    - Pattern-based identification
+#    - Numbered list items
+#    - Bulleted list items
+#    - Lettered list items
+#    - Roman numeral lists
+#    - Parenthetical lists
+#    - Checkbox lists
+#
+# 4. CONFIDENCE SCORING:
+#    - 0.0 to 1.0 confidence levels
+#    - Multiple factors considered
+#    - Context-aware scoring
+#    - Threshold-based classification
+#
+# 5. DOCUMENT STRUCTURE ANALYSIS:
+#    - Overall document statistics
+#    - Pattern distribution analysis
+#    - Coverage percentage calculation
+#    - Dominant pattern identification
+#    - Text type distribution
+#
+# ENHANCED PROCESSING FEATURES:
+# =============================
+#
+# 1. HTML INTERMEDIATE PROCESSING:
+#    - Better structure preservation
+#    - CSS-based indentation levels
+#    - Pattern-specific styling
+#    - Text classification styling
+#    - Responsive design
+#
+# 2. TABLE HANDLING:
+#    - Smart overlap detection (70% threshold)
+#    - Prevents text loss in tables
+#    - Improved coordinate calculations
+#    - Better boundary detection
+#
+# 3. EXPORT CAPABILITIES:
+#    - Enhanced TXT: Preserved indentation and structure
+#    - Enhanced DOCX: Color-coded formatting, proper indentation
+#    - Enhanced HTML: CSS styling, responsive design
+#    - All formats preserve pattern recognition results
+#
+# 4. CROP PROCESSING:
+#    - High-resolution processing (2x scale)
+#    - Per-page customization
+#    - Real-time preview
+#    - Enhanced coordinate handling
+#
+# 5. PERFORMANCE MONITORING:
+#    - Processing time tracking
+#    - Success rate monitoring
+#    - Pattern usage statistics
+#    - Document analysis metrics
+#
+# TECHNICAL IMPROVEMENTS:
+# ======================
+#
+# 1. ADVANCED REGEX PATTERNS:
+#    - Unicode-aware pattern matching
+#    - Thai script support
+#    - Complex parenthetical detection
+#    - Priority-based matching system
+#
+# 2. ERROR HANDLING:
+#    - Comprehensive error catching
+#    - Graceful degradation
+#    - Detailed logging
+#    - Recovery mechanisms
+#
+# 3. TESTING CAPABILITIES:
+#    - Unit tests for pattern detection
+#    - Integration tests for OCR
+#    - Performance benchmarking
+#    - Coverage reporting
+#
+# 4. DEBUGGING SUPPORT:
+#    - Rich console output
+#    - Structured logging
+#    - Pattern detection debugging
+#    - Classification confidence display
+#
+# INSTALLATION NOTES:
+# ==================
+#
+# 1. SYSTEM DEPENDENCIES:
+#    Install system dependencies first (see comments above)
+#    Ensure Thai language support is installed
+#    Install Unicode fonts for proper display
+#
+# 2. PYTHON DEPENDENCIES:
+#    Run: pip install -r requirements.txt
+#    Consider using virtual environment
+#    Update pip before installation: pip install --upgrade pip
+#
+# 3. AZURE CONFIGURATION (OPTIONAL):
+#    Set environment variables:
+#    - AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
+#    - AZURE_DOCUMENT_INTELLIGENCE_KEY
+#
+# 4. TESTING:
+#    Test with sample documents containing:
+#    - Various indentation patterns
+#    - Parenthetical numbering
+#    - Mixed languages (English + Thai)
+#    - Complex document structures
+#    - Tables and lists
+#
+# 5. PERFORMANCE OPTIMIZATION:
+#    For high-volume processing:
+#    - Consider increasing system memory
+#    - Use SSD storage for temporary files
+#    - Monitor CPU usage during processing
+#    - Configure appropriate log levels
+#
+# SUPPORTED LANGUAGES AND SCRIPTS:
+# ================================
+#
+# - English: Full comprehensive support
+# - Thai: Complete support including numerals and letters
+# - Arabic numerals: 0-9 in all contexts
+# - Roman numerals: I, V, X, L, C, D, M and combinations
+# - Unicode symbols: Full range of bullets, arrows, and marks
+# - Mixed documents: Seamless handling of multi-language content
+# - International conventions: Support for various numbering systems
+#
+# VERSION COMPATIBILITY:
+# =====================
+#
+# - Python: 3.8+ required, 3.10+ recommended
+# - Operating Systems: Windows, macOS, Linux
+# - Memory: 4GB+ recommended for large documents
+# - Storage: 1GB+ free space for temporary files
+# - Network: Required for Azure Document Intelligence (optional)
+#
+# This enhanced version provides comprehensive indentation detection and
+# text classification, with particular strength in:
+# - Parenthetical pattern recognition ((1), (๑), (a), (i), (ก))
+# - Thai language and script support
+# - Intelligent document structure analysis
+# - Multi-format export with preserved formatting
+# - Real-time pattern demonstration and analysis