""" Backend Management Module - ENHANCED VERSION with OpenCV Text Block Analysis and Bold Detection Coordinates between UI and OCR services, handles file management and preprocessing with OpenCV integration """ import re import os import logging import tempfile from typing import Dict, Any, List, Optional from pathlib import Path import hashlib import json from datetime import datetime import cv2 import numpy as np import fitz # PyMuPDF from docx import Document from docx.shared import Inches, Pt, RGBColor from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_TABLE_ALIGNMENT from docx.oxml.shared import OxmlElement, qn from html.parser import HTMLParser # Load environment variables from dotenv import load_dotenv load_dotenv() from ocr_service import OCRService from enhanced_indentation import EnhancedIndentationDetector, OpenCVTextAnalyzer # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class EnhancedDocumentExporter: """Advanced document export with OpenCV-enhanced text analysis, bold detection, and comprehensive formatting""" def __init__(self): self.indent_detector = EnhancedIndentationDetector() self.opencv_analyzer = OpenCVTextAnalyzer() @staticmethod def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str: """Create enhanced TXT file with OpenCV-improved formatting and spacing analysis""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") temp_file = tempfile.NamedTemporaryFile( suffix=f'_extracted_text_opencv_{timestamp}.txt', delete=False, mode='w', encoding='utf-8' ) try: # Add header temp_file.write("PDF OCR Extraction Results - Enhanced with OpenCV Text Block Analysis & Bold Detection\n") temp_file.write("=" * 100 + "\n\n") # Add metadata if metadata_info: temp_file.write("Processing Information:\n") temp_file.write("-" * 25 + "\n") temp_file.write(metadata_info + "\n\n") # Add timestamp temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") temp_file.write("=" * 100 + "\n\n") # Add enhanced feature list temp_file.write("OpenCV-Enhanced Features Applied:\n") temp_file.write("-" * 35 + "\n") temp_file.write("• OpenCV Text Block Detection & Analysis\n") temp_file.write("• Bold Text Recognition for Headers\n") temp_file.write("• Automatic Spacing & Paragraph Detection\n") temp_file.write("• Comprehensive Indentation Detection (20+ patterns)\n") temp_file.write("• Parenthetical Patterns ((1), (๑), (a), (i), (ก))\n") temp_file.write("• Intelligent Text Classification (headers, paragraphs, lists)\n") temp_file.write("• Multi-language Support (English, Thai)\n") temp_file.write("• HTML Intermediate Processing\n") temp_file.write("• Priority-based Pattern Matching\n") temp_file.write("• Document Structure Analysis\n") temp_file.write("• Header Indentation Suppression\n\n") # Add main content temp_file.write("Extracted Text (OpenCV-Enhanced with Text Block Analysis):\n") temp_file.write("-" * 70 + "\n\n") temp_file.write(text_content) temp_file.close() return temp_file.name except Exception as e: logger.error(f"Error creating enhanced TXT file: {e}") temp_file.close() raise def create_enhanced_docx_file(self, text_content: str, html_content: str, metadata_info: str = "") -> str: """Create enhanced DOCX file with OpenCV-enhanced formatting, bold detection, and spacing analysis""" try: class OpenCVEnhancedDOCXHTMLParser(HTMLParser): def __init__(self, doc, processor): super().__init__() self.doc = doc self.processor = processor self.current_paragraph = None self.in_table = False self.table_data = [] self.current_table_row = [] self.current_indent_level = 0 self.current_formatting_hint = 'normal_text' self.in_title = False self.in_section_heading = False self.in_page_header = False self.in_content_header = False self.in_opencv_bold_header = False self.current_classes = [] def handle_starttag(self, tag, attrs): attr_dict = dict(attrs) class_attr = attr_dict.get('class', '') self.current_classes = class_attr.split() if 'opencv-bold-header' in class_attr: # OpenCV detected bold header - special styling, no indentation self.current_paragraph = self.doc.add_heading(level=1) self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT self.in_opencv_bold_header = True elif 'page' in class_attr and tag == 'div': if hasattr(self, 'has_content'): self.doc.add_paragraph() self.doc.add_paragraph() self.has_content = True elif 'page-header' in class_attr: self.current_paragraph = self.doc.add_heading(level=1) self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER self.in_page_header = True elif 'content-header' in class_attr: self.current_paragraph = self.doc.add_heading(level=2) self.in_content_header = True elif 'title' in class_attr: self.current_paragraph = self.doc.add_heading(level=1) self.in_title = True elif 'section-heading' in class_attr: self.current_paragraph = self.doc.add_heading(level=2) self.in_section_heading = True elif tag == 'div' and 'paragraph' in class_attr: self.current_paragraph = self.doc.add_paragraph() self._apply_opencv_enhanced_formatting() elif tag == 'table': self.in_table = True self.table_data = [] elif tag == 'tr': self.current_table_row = [] elif tag == 'br': if self.current_paragraph: self.current_paragraph.add_run().add_break() def _apply_opencv_enhanced_formatting(self): """Apply OpenCV-enhanced formatting with bold detection and spacing analysis""" if not self.current_paragraph: return # Check if this is an OpenCV-detected bold header is_opencv_bold_header = 'opencv-bold-header' in self.current_classes if is_opencv_bold_header: # Bold headers get no indentation and special formatting self.current_indent_level = 0 self.current_paragraph.paragraph_format.left_indent = Inches(0) self.current_paragraph.paragraph_format.space_before = Pt(15) self.current_paragraph.paragraph_format.space_after = Pt(12) return # Extract indent level from classes (only for non-bold headers) for cls in self.current_classes: if cls.startswith('indent-level-'): try: self.current_indent_level = int(cls.split('-')[-1]) except ValueError: self.current_indent_level = 0 break # Extract formatting hint from classes formatting_hints = [ 'numbered-primary', 'numbered-secondary', 'numbered-tertiary', 'numbered-quaternary', 'numbered-quinary', 'parenthetical-primary', 'parenthetical-secondary', 'parenthetical-tertiary', 'parenthetical-quaternary', 'bullet-primary', 'bullet-secondary', 'bullet-tertiary', 'bullet-quaternary', 'lettered-primary', 'lettered-secondary', 'roman-primary', 'roman-secondary', 'thai-primary', 'thai-secondary', 'indented_text', 'space-indent' ] for hint in formatting_hints: if hint in self.current_classes: self.current_formatting_hint = hint break else: self.current_formatting_hint = 'normal_text' # Apply indentation (only for non-bold headers) if self.current_indent_level > 0: indent_inches = self.current_indent_level * 0.5 self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches) # Apply hanging indent for bullets and parenthetical items (4 spaces equivalent) if 'bullet' in self.current_formatting_hint or 'parenthetical' in self.current_formatting_hint: self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.125) # Reduced for 4-space system # Set line spacing and paragraph spacing with OpenCV-enhanced spacing self.current_paragraph.paragraph_format.line_spacing = 1.15 # Apply spacing based on formatting hint and OpenCV analysis if 'primary' in self.current_formatting_hint: self.current_paragraph.paragraph_format.space_before = Pt(12) self.current_paragraph.paragraph_format.space_after = Pt(10) elif 'secondary' in self.current_formatting_hint: self.current_paragraph.paragraph_format.space_before = Pt(10) self.current_paragraph.paragraph_format.space_after = Pt(8) elif 'tertiary' in self.current_formatting_hint: self.current_paragraph.paragraph_format.space_before = Pt(8) self.current_paragraph.paragraph_format.space_after = Pt(6) else: self.current_paragraph.paragraph_format.space_after = Pt(4) def handle_endtag(self, tag): if tag == 'div': if self.in_opencv_bold_header: self.in_opencv_bold_header = False elif self.in_page_header: self.in_page_header = False elif self.in_content_header: self.in_content_header = False elif self.in_title: self.in_title = False elif self.in_section_heading: self.in_section_heading = False self.current_paragraph = None self.current_indent_level = 0 self.current_formatting_hint = 'normal_text' self.current_classes = [] elif tag == 'table': self.in_table = False self._create_enhanced_docx_table() elif tag == 'tr' and self.current_table_row: self.table_data.append(self.current_table_row[:]) self.current_table_row = [] def handle_data(self, data): if data.strip(): # Clean OCR artifacts data = data.replace(':unselected:', '') data = data.replace(':selected:', '') data = data.replace(' ', ' ') if self.in_page_header: page_match = re.search(r'Page (\d+)', data) if page_match: page_num = int(page_match.group(1)) page_header = f"PAGE {page_num}" self.text_parts.append(page_header.center(80)) if self.in_table: self.current_table_row.append(data.strip()) elif self.current_paragraph is not None: # Detect patterns in the text for additional formatting indent_info = self.processor.indent_detector.detect_indentation(data) text_classification = self.processor.indent_detector.classify_text_type(data) run = self.current_paragraph.add_run(data.strip()) # Apply formatting based on context and OpenCV detection if self.in_opencv_bold_header: # Special formatting for OpenCV-detected bold headers run.bold = True run.font.size = Pt(16) run.font.color.rgb = RGBColor(231, 76, 60) # Red color for emphasis self.current_paragraph.paragraph_format.left_indent = Inches(0) # Force no indent elif self.in_title: run.bold = True run.font.size = Pt(16) run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue elif self.in_content_header or text_classification.get('is_header'): run.bold = True run.font.size = Pt(14) run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue elif self.in_section_heading: run.bold = True run.font.size = Pt(14) run.font.color.rgb = RGBColor(52, 73, 94) # Darker blue elif self.in_page_header: page_match = re.search(r'Page (\d+)', data) if page_match: page_num = int(page_match.group(1)) page_header = f"PAGE {page_num}" run.bold = True run.font.size = Pt(14) run.font.color.rgb = RGBColor(44, 62, 80) self.text_parts.append(page_header.center(80)) else: # Apply pattern-specific formatting with OpenCV enhancement self._apply_opencv_pattern_formatting(run, indent_info, text_classification) def _apply_opencv_pattern_formatting(self, run, indent_info, text_classification): """Apply formatting based on detected pattern, classification, and OpenCV analysis""" pattern_type = indent_info.get('pattern_type', 'normal') level = indent_info.get('level', 0) is_numbered = indent_info.get('is_numbered', False) is_bullet = indent_info.get('is_bullet', False) is_lettered = indent_info.get('is_lettered', False) is_roman = indent_info.get('is_roman', False) is_thai = indent_info.get('is_thai', False) is_parenthetical = indent_info.get('is_parenthetical', False) # Base font size with OpenCV-enhanced scaling run.font.size = Pt(11) # Apply formatting based on current formatting hint and detected pattern if 'numbered' in self.current_formatting_hint or is_numbered: if 'primary' in self.current_formatting_hint or level == 1: run.bold = True run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue elif 'secondary' in self.current_formatting_hint or level == 2: run.font.color.rgb = RGBColor(52, 73, 94) # Medium blue elif 'tertiary' in self.current_formatting_hint or level == 3: run.font.color.rgb = RGBColor(85, 85, 85) # Dark gray else: run.font.color.rgb = RGBColor(102, 102, 102) # Gray elif 'parenthetical' in self.current_formatting_hint or is_parenthetical: # Special formatting for parenthetical patterns if 'primary' in self.current_formatting_hint or level == 2: run.bold = True run.font.color.rgb = RGBColor(142, 68, 173) # Purple elif 'secondary' in self.current_formatting_hint or level == 3: run.font.color.rgb = RGBColor(155, 89, 182) # Light purple elif 'tertiary' in self.current_formatting_hint or level == 4: run.font.color.rgb = RGBColor(175, 122, 197) # Lighter purple else: run.font.color.rgb = RGBColor(195, 155, 211) # Very light purple elif 'bullet' in self.current_formatting_hint or is_bullet: if 'primary' in self.current_formatting_hint or level == 1: run.font.color.rgb = RGBColor(52, 152, 219) # Blue elif 'secondary' in self.current_formatting_hint or level == 2: run.font.color.rgb = RGBColor(149, 165, 166) # Gray elif 'tertiary' in self.current_formatting_hint or level == 3: run.font.color.rgb = RGBColor(189, 195, 199) # Light gray else: run.font.color.rgb = RGBColor(189, 195, 199) # Light gray elif 'lettered' in self.current_formatting_hint or is_lettered: run.italic = True if 'primary' in self.current_formatting_hint: run.font.color.rgb = RGBColor(142, 68, 173) # Purple else: run.font.color.rgb = RGBColor(155, 89, 182) # Light purple elif 'roman' in self.current_formatting_hint or is_roman: run.font.color.rgb = RGBColor(211, 84, 0) # Orange run.font.name = 'Times New Roman' # Roman style font elif 'thai' in self.current_formatting_hint or is_thai: if 'primary' in self.current_formatting_hint: run.bold = True run.font.color.rgb = RGBColor(22, 160, 133) # Teal else: run.font.color.rgb = RGBColor(26, 188, 156) # Light teal elif 'space-indent' in self.current_formatting_hint: run.italic = True run.font.color.rgb = RGBColor(85, 85, 85) # Dark gray else: # Default text formatting based on classification and OpenCV if text_classification.get('is_header'): run.bold = True run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue elif text_classification.get('is_list_item'): run.font.color.rgb = RGBColor(52, 152, 219) # Blue else: run.font.color.rgb = RGBColor(0, 0, 0) # Black def _create_enhanced_docx_table(self): """Create table with enhanced formatting""" if not self.table_data: return rows = len(self.table_data) cols = max(len(row) for row in self.table_data) if self.table_data else 1 table = self.doc.add_table(rows=rows, cols=cols) table.style = 'Table Grid' table.alignment = WD_TABLE_ALIGNMENT.LEFT # Fill table data with enhanced formatting for row_idx, row_data in enumerate(self.table_data): table_row = table.rows[row_idx] for col_idx, cell_data in enumerate(row_data): if col_idx < len(table_row.cells): cell = table_row.cells[col_idx] cell.text = str(cell_data) # Style header row if row_idx == 0: for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True run.font.size = Pt(10) run.font.color.rgb = RGBColor(44, 62, 80) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER # Add background color to header shading_elm_1 = OxmlElement('w:shd') shading_elm_1.set(qn('w:fill'), 'ECF0F1') paragraph._element.get_or_add_pPr().append(shading_elm_1) else: # Regular data cells for paragraph in cell.paragraphs: for run in paragraph.runs: run.font.size = Pt(10) paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT # Add spacing after table self.doc.add_paragraph() # Create DOCX document timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") temp_file = tempfile.NamedTemporaryFile( suffix=f'_opencv_enhanced_document_{timestamp}.docx', delete=False ) temp_file.close() doc = Document() # Set document margins for better layout sections = doc.sections for section in sections: section.top_margin = Inches(1) section.bottom_margin = Inches(1) section.left_margin = Inches(1) section.right_margin = Inches(1) # Add title with enhanced styling title = doc.add_heading('PDF OCR Extraction Results', 0) title.alignment = WD_ALIGN_PARAGRAPH.CENTER title_run = title.runs[0] title_run.font.color.rgb = RGBColor(44, 62, 80) # Add subtitle subtitle_para = doc.add_paragraph() subtitle_run = subtitle_para.add_run('Enhanced with OpenCV Text Block Analysis & Bold Detection') subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER subtitle_run.italic = True subtitle_run.font.size = Pt(12) subtitle_run.font.color.rgb = RGBColor(102, 102, 102) # Add feature list features_para = doc.add_paragraph() features_run = features_para.add_run('Features: OpenCV Text Block Detection • Bold Text Recognition • Spacing Analysis • Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a)) • Bullet Points • Letter & Roman Numerals • Thai Script • Multi-level Indentation • Text Classification • Header Indentation Suppression') features_para.alignment = WD_ALIGN_PARAGRAPH.CENTER features_run.font.size = Pt(9) features_run.font.color.rgb = RGBColor(149, 165, 166) # Add metadata section if metadata_info: doc.add_heading('Processing Information', level=1) meta_para = doc.add_paragraph() meta_run = meta_para.add_run(metadata_info) meta_run.font.size = Pt(10) meta_para.style = 'Intense Quote' # Add background to metadata shading_elm = OxmlElement('w:shd') shading_elm.set(qn('w:fill'), 'F8F9FA') meta_para._element.get_or_add_pPr().append(shading_elm) doc.add_paragraph() # Process content doc.add_heading('Extracted Content', level=1) if html_content and ' 0.8 and len(line.strip()) < 80 and line.strip().isupper() # Simple heuristic for bold headers ) if line.strip().startswith('==='): # Page headers page_header = doc.add_heading(line.strip(), level=1) page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER header_run = page_header.runs[0] header_run.font.color.rgb = RGBColor(44, 62, 80) elif is_opencv_bold_header: # OpenCV-detected bold headers - no indentation heading = doc.add_heading(line.strip(), level=1) heading.alignment = WD_ALIGN_PARAGRAPH.LEFT heading_run = heading.runs[0] heading_run.font.color.rgb = RGBColor(231, 76, 60) # Red for emphasis heading_run.font.size = Pt(16) elif line.strip().startswith('##'): # Section headings heading_text = line.strip().lstrip('#').strip() heading = doc.add_heading(heading_text, level=2) heading_run = heading.runs[0] heading_run.font.color.rgb = RGBColor(52, 73, 94) elif text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.7: # Regular detected headers heading = doc.add_heading(indent_info.get('content', line.strip()), level=2) heading_run = heading.runs[0] heading_run.font.color.rgb = RGBColor(52, 73, 94) else: # Regular content with OpenCV-enhanced formatting para = doc.add_paragraph() # Apply indentation based on detected level using 4 spaces per level (but not for bold headers) level = indent_info.get('level', 0) if level > 0 and not is_opencv_bold_header: # Use 4 spaces equivalent per level (0.25 inches per level) para.paragraph_format.left_indent = Inches(level * 0.25) # Apply pattern-specific formatting using 4 spaces equivalent if indent_info.get('is_bullet', False) or indent_info.get('is_parenthetical', False): para.paragraph_format.first_line_indent = Inches(-0.125) # 4-space equivalent hanging indent # Set proper spacing with OpenCV enhancement para.paragraph_format.line_spacing = 1.15 para.paragraph_format.space_after = Pt(4) # Add content with enhanced formatting content = indent_info.get('content', line.strip()) marker = indent_info.get('pattern_marker', '') # Include marker for non-bullet items if marker and not indent_info.get('is_bullet', False): content = f"{marker} {content}" run = para.add_run(content) run.font.size = Pt(11) # Apply color coding based on pattern type and classification pattern_type = indent_info.get('pattern_type', 'normal') if 'numbered' in pattern_type or 'decimal' in pattern_type: if level == 1: run.bold = True run.font.color.rgb = RGBColor(44, 62, 80) elif level == 2: run.font.color.rgb = RGBColor(52, 73, 94) else: run.font.color.rgb = RGBColor(85, 85, 85) elif 'parenthetical' in pattern_type: if level <= 2: run.bold = True run.font.color.rgb = RGBColor(142, 68, 173) # Purple else: run.font.color.rgb = RGBColor(155, 89, 182) # Light purple elif 'bullet' in pattern_type: run.font.color.rgb = RGBColor(52, 152, 219) elif 'lettered' in pattern_type: run.italic = True run.font.color.rgb = RGBColor(142, 68, 173) elif 'roman' in pattern_type: run.font.color.rgb = RGBColor(211, 84, 0) elif 'thai' in pattern_type: run.font.color.rgb = RGBColor(22, 160, 133) elif text_classification.get('is_list_item'): run.font.color.rgb = RGBColor(52, 152, 219) @staticmethod def create_html_file(html_content: str, metadata_info: str = "") -> str: """Create standalone HTML file with OpenCV-enhanced styling""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") temp_file = tempfile.NamedTemporaryFile( suffix=f'_opencv_enhanced_document_{timestamp}.html', delete=False, mode='w', encoding='utf-8' ) try: # Enhance HTML with better styling including OpenCV features enhanced_html = html_content # Add comprehensive styling if not already present if '''' ) # Wrap content in container if not already wrapped if '' in enhanced_html and '.container' not in enhanced_html: enhanced_html = enhanced_html.replace( '', '''

PDF OCR Extraction Results

Enhanced with OpenCV Text Block Analysis & Bold Detection

OpenCV Features: Text Block Detection • Bold Text Recognition • Automatic Spacing & Paragraph Analysis • Header Indentation Suppression • Visual Text Element Analysis
Text Analysis: Comprehensive Indentation Detection • Parenthetical Patterns ((1), (๑), (a), (i), (ก)) • Multi-level Bullets • Letter & Roman Numerals • Thai Script Support • Pattern Priority Detection • Intelligent Text Classification
''' + (f'' if metadata_info else '') ) enhanced_html = enhanced_html.replace('', '
') temp_file.write(enhanced_html) temp_file.close() return temp_file.name except Exception as e: logger.error(f"Error creating HTML file: {e}") temp_file.close() raise class BackendManager: """Enhanced backend manager with OpenCV text block analysis, bold detection, and comprehensive formatting""" def __init__(self): self.ocr_service = OCRService() self.document_exporter = EnhancedDocumentExporter() self.opencv_analyzer = OpenCVTextAnalyzer() self.processing_history = [] self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100)) # Create directories for temporary files and logs self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service_opencv_enhanced' self.temp_dir.mkdir(exist_ok=True) logger.info("OpenCV-enhanced backend manager with text block analysis and bold detection initialized successfully") def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto", preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """ Process PDF with OpenCV-enhanced resolution, text block analysis, and bold detection Args: pdf_path: Path to the PDF file method: OCR method to use preprocessing_options: Dictionary containing preprocessing settings Returns: Dict containing processing results with OpenCV-enhanced analysis """ start_time = datetime.now() # Validate input if not os.path.exists(pdf_path): return { 'success': False, 'error': f"File not found: {pdf_path}", 'text': '', 'html': '', 'method_used': '', 'metadata': {} } # Check file size max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024 file_size = os.path.getsize(pdf_path) if file_size > max_file_size: return { 'success': False, 'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB", 'text': '', 'html': '', 'method_used': '', 'metadata': {} } # Generate file hash for tracking file_hash = self._calculate_file_hash(pdf_path) logger.info(f"Processing PDF with OpenCV text block analysis and bold detection: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)") logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}") # Handle preprocessing if enabled processed_pdf_path = pdf_path preprocessing_applied = False if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False): logger.info("Applying enhanced preprocessing with OpenCV analysis...") try: processed_pdf_path = self._apply_enhanced_preprocessing(pdf_path, preprocessing_options) preprocessing_applied = True logger.info("OpenCV-enhanced preprocessing completed successfully") except Exception as e: logger.error(f"Preprocessing failed: {e}") processed_pdf_path = pdf_path try: # Process with OpenCV-enhanced OCR result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method) # Add processing metadata processing_time = (datetime.now() - start_time).total_seconds() # Analyze document structure with OpenCV enhancement if successful document_analysis = {} opencv_global_analysis = {} if result['success'] and result['text']: try: text_lines = result['text'].split('\n') detector = EnhancedIndentationDetector() # Perform global OpenCV analysis on the PDF opencv_global_analysis = self._perform_global_opencv_analysis(pdf_path, text_lines) # Enhanced document structure analysis document_analysis = detector.analyze_document_structure_with_opencv(text_lines) if opencv_global_analysis: document_analysis['opencv_global_analysis'] = opencv_global_analysis except Exception as analysis_error: logger.warning(f"Document structure analysis failed: {analysis_error}") document_analysis = {'analysis_failed': True} result['metadata'].update({ 'file_hash': file_hash, 'file_size_mb': round(file_size / (1024*1024), 2), 'processing_time_seconds': round(processing_time, 2), 'timestamp': start_time.isoformat(), 'opencv_enhanced': True, 'opencv_text_block_analysis': True, 'opencv_bold_detection': True, 'opencv_spacing_analysis': True, 'enhanced_processing': True, 'html_processing': True, 'comprehensive_indentation': True, 'parenthetical_patterns_supported': True, 'intelligent_text_classification': True, 'header_indentation_suppression': True, 'header_footer_removed': preprocessing_applied, 'preprocessing_options': preprocessing_options if preprocessing_applied else None, 'document_structure_analysis': document_analysis, 'opencv_global_analysis': opencv_global_analysis }) # Cleanup temporary preprocessed file if preprocessing_applied and processed_pdf_path != pdf_path: try: os.unlink(processed_pdf_path) except: pass # Log results with OpenCV enhancement information if result['success']: text_length = len(result['text']) has_html = bool(result.get('html')) table_count = result['text'].count('Table ') if 'Table ' in result['text'] else 0 logger.info(f"OpenCV-enhanced processing completed successfully in {processing_time:.2f}s") logger.info(f"Method used: {result['method_used']}") logger.info(f"Text extracted: {text_length} characters") logger.info(f"HTML generated: {has_html}") logger.info(f"OpenCV text block analysis: Enabled") logger.info(f"OpenCV bold detection: Enabled") logger.info(f"OpenCV spacing analysis: Enabled") logger.info(f"Header indentation suppression: Enabled") if table_count > 0: logger.info(f"Tables detected: {table_count}") if preprocessing_applied: logger.info("Enhanced preprocessing applied") if document_analysis and not document_analysis.get('analysis_failed'): logger.info(f"Document analysis: {document_analysis.get('patterned_lines', 0)} patterned lines, max level {document_analysis.get('max_level', 0)}") logger.info(f"Text classification: {document_analysis.get('header_count', 0)} headers, {document_analysis.get('paragraph_count', 0)} paragraphs, {document_analysis.get('list_item_count', 0)} list items") if opencv_global_analysis: logger.info(f"OpenCV global analysis: {opencv_global_analysis.get('block_count', 0)} text blocks, {opencv_global_analysis.get('paragraph_count', 0)} paragraphs") logger.info(f"Bold text detected: {opencv_global_analysis.get('bold_text_detected', False)}") # Add to processing history self._add_to_history({ 'timestamp': start_time.isoformat(), 'file_hash': file_hash, 'method_used': result['method_used'], 'success': True, 'text_length': text_length, 'table_count': table_count, 'processing_time': processing_time, 'preprocessing_applied': preprocessing_applied, 'html_generated': has_html, 'opencv_enhanced': True, 'opencv_text_block_analysis': True, 'opencv_bold_detection': True, 'opencv_spacing_analysis': True, 'enhanced_processing': True, 'comprehensive_indentation': True, 'parenthetical_patterns_supported': True, 'intelligent_text_classification': True, 'header_indentation_suppression': True, 'document_analysis': document_analysis, 'opencv_global_analysis': opencv_global_analysis }) else: logger.error(f"OpenCV-enhanced processing failed: {result.get('error', 'Unknown error')}") # Add to processing history self._add_to_history({ 'timestamp': start_time.isoformat(), 'file_hash': file_hash, 'method_requested': method, 'success': False, 'error': result.get('error', 'Unknown error'), 'processing_time': processing_time, 'preprocessing_applied': preprocessing_applied, 'opencv_enhanced': True, 'opencv_text_block_analysis': True, 'opencv_bold_detection': True, 'opencv_spacing_analysis': True, 'enhanced_processing': True, 'comprehensive_indentation': True, 'parenthetical_patterns_supported': True, 'intelligent_text_classification': True, 'header_indentation_suppression': True }) return result except Exception as e: logger.error(f"Unexpected error during OpenCV-enhanced processing: {e}") # Cleanup if preprocessing_applied and processed_pdf_path != pdf_path: try: os.unlink(processed_pdf_path) except: pass # Add to processing history processing_time = (datetime.now() - start_time).total_seconds() self._add_to_history({ 'timestamp': start_time.isoformat(), 'file_hash': file_hash, 'method_requested': method, 'success': False, 'error': str(e), 'processing_time': processing_time, 'opencv_enhanced': True, 'opencv_text_block_analysis': True, 'opencv_bold_detection': True, 'opencv_spacing_analysis': True, 'enhanced_processing': True, 'comprehensive_indentation': True, 'parenthetical_patterns_supported': True, 'intelligent_text_classification': True, 'header_indentation_suppression': True }) return { 'success': False, 'error': f"OpenCV-enhanced processing error: {str(e)}", 'text': '', 'html': '', 'method_used': '', 'metadata': { 'file_hash': file_hash, 'processing_time_seconds': round(processing_time, 2), 'timestamp': start_time.isoformat(), 'opencv_enhanced': True, 'opencv_text_block_analysis': True, 'opencv_bold_detection': True, 'opencv_spacing_analysis': True, 'enhanced_processing': True, 'comprehensive_indentation': True, 'parenthetical_patterns_supported': True, 'intelligent_text_classification': True, 'header_indentation_suppression': True } } def _perform_global_opencv_analysis(self, pdf_path: str, text_lines: List[str]) -> Dict[str, Any]: """Perform global OpenCV analysis on the entire PDF""" try: # Extract first page for global analysis pdf_document = fitz.open(pdf_path) page = pdf_document.load_page(0) # First page # Render page to image mat = fitz.Matrix(2.0, 2.0) pix = page.get_pixmap(matrix=mat) img_data = pix.tobytes("png") # Convert to OpenCV format import io from PIL import Image pil_image = Image.open(io.BytesIO(img_data)) img_array = np.array(pil_image) img_cv = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR) # Perform OpenCV analysis opencv_analysis = self.opencv_analyzer.analyze_text_blocks(img_cv, text_lines) pdf_document.close() return opencv_analysis except Exception as e: logger.error(f"Global OpenCV analysis failed: {e}") return {} def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str: """Apply enhanced preprocessing with high-resolution crop handling and OpenCV analysis""" crop_settings = options.get('crop_settings', {}) per_page_crops = crop_settings.get('per_page_crops', {}) enhanced_resolution = crop_settings.get('enhanced_resolution', True) resolution_scale = crop_settings.get('resolution_scale', 2.0) # Create temporary file for processed PDF timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") temp_pdf_path = self.temp_dir / f"opencv_enhanced_preprocessed_{timestamp}.pdf" doc = fitz.open(pdf_path) new_doc = fitz.open() try: for page_num in range(len(doc)): page = doc.load_page(page_num) page_rect = page.rect # Get crop settings for this page page_crop = per_page_crops.get(page_num, per_page_crops.get(0, { 'top': 0, 'bottom': 0, 'left': 0, 'right': 0 })) top_percent = page_crop.get('top', 0) bottom_percent = page_crop.get('bottom', 0) left_percent = page_crop.get('left', 0) right_percent = page_crop.get('right', 0) # Calculate crop amounts width = page_rect.width height = page_rect.height crop_left = width * (left_percent / 100) crop_right = width * (right_percent / 100) crop_top = height * (top_percent / 100) crop_bottom = height * (bottom_percent / 100) # Calculate new rectangle new_rect = fitz.Rect( page_rect.x0 + crop_left, page_rect.y0 + crop_top, page_rect.x1 - crop_right, page_rect.y1 - crop_bottom ) # Ensure the rectangle is valid if new_rect.width <= 0 or new_rect.height <= 0: logger.warning(f"Invalid crop rectangle for page {page_num}, using original page") new_rect = page_rect # Create new page with enhanced resolution if enabled if enhanced_resolution: new_page = new_doc.new_page( width=new_rect.width, height=new_rect.height ) # Copy content with proper transformation mat = fitz.Matrix(1, 1).prescale(resolution_scale, resolution_scale) new_page.show_pdf_page( new_page.rect, doc, page_num, clip=new_rect ) else: # Standard resolution new_page = new_doc.new_page(width=new_rect.width, height=new_rect.height) new_page.show_pdf_page( new_page.rect, doc, page_num, clip=new_rect ) logger.debug(f"Page {page_num}: Applied OpenCV-enhanced crop T{top_percent}% B{bottom_percent}% L{left_percent}% R{right_percent}%") new_doc.save(str(temp_pdf_path)) logger.info(f"OpenCV-enhanced preprocessing applied with {resolution_scale}x resolution to {len(doc)} pages") except Exception as e: logger.error(f"Error in OpenCV-enhanced preprocessing: {e}") raise finally: doc.close() new_doc.close() return str(temp_pdf_path) def create_enhanced_downloads(self, text_content: str, html_content: str, metadata_info: str = "") -> Dict[str, str]: """Create OpenCV-enhanced download files with text block analysis and bold detection""" download_files = {} try: # Create OpenCV-enhanced TXT file txt_path = EnhancedDocumentExporter.create_enhanced_txt_file( text_content, html_content, metadata_info ) download_files['txt'] = txt_path logger.info(f"OpenCV-enhanced TXT file created: {txt_path}") # Create OpenCV-enhanced DOCX file with text block analysis and bold detection try: docx_path = self.document_exporter.create_enhanced_docx_file( text_content, html_content, metadata_info ) download_files['docx'] = docx_path logger.info(f"OpenCV-enhanced DOCX file with text block analysis and bold detection created: {docx_path}") except ImportError: logger.warning("python-docx not available. DOCX creation skipped.") except Exception as e: logger.error(f"OpenCV-enhanced DOCX creation failed: {e}") # Create standalone HTML file with OpenCV enhancements try: html_path = EnhancedDocumentExporter.create_html_file( html_content, metadata_info ) download_files['html'] = html_path logger.info(f"OpenCV-enhanced HTML file created: {html_path}") except Exception as e: logger.error(f"HTML file creation failed: {e}") except Exception as e: logger.error(f"Error creating OpenCV-enhanced downloads: {e}") raise return download_files def get_available_methods(self) -> List[str]: """Get list of available OCR methods""" methods = self.ocr_service.get_available_methods() logger.info(f"Available OpenCV-enhanced OCR methods: {methods}") return methods def get_service_status(self) -> Dict[str, Any]: """Get comprehensive service status with OpenCV enhancements""" available_methods = self.get_available_methods() # Check DOCX support try: import docx docx_available = True except ImportError: docx_available = False # Check OpenCV availability opencv_available = True try: import cv2 except ImportError: opencv_available = False status = { 'service_healthy': True, 'available_methods': available_methods, 'azure_configured': 'azure' in available_methods, 'tesseract_available': 'tesseract' in available_methods, 'pymupdf_available': 'pymupdf' in available_methods, 'total_processed': len(self.processing_history), 'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)), 'temp_dir': str(self.temp_dir), 'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)), 'opencv_available': opencv_available, 'opencv_text_block_analysis': opencv_available, 'opencv_bold_detection': opencv_available, 'opencv_spacing_analysis': opencv_available, 'enhanced_processing': True, 'html_processing': True, 'comprehensive_indentation': True, 'parenthetical_patterns_supported': True, 'intelligent_text_classification': True, 'header_indentation_suppression': True, 'pattern_detection_count': len(EnhancedIndentationDetector().patterns), 'docx_export_available': docx_available, 'enhanced_crop_processing': True, 'multi_resolution_support': True, 'crop_processing_fixed': True, 'document_structure_analysis': True, 'thai_script_support': True, 'multi_level_support': True, 'text_classification_features': True } return status def _calculate_file_hash(self, file_path: str) -> str: """Calculate SHA-256 hash of file""" sha256_hash = hashlib.sha256() try: with open(file_path, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): sha256_hash.update(chunk) return sha256_hash.hexdigest() except Exception as e: logger.error(f"Error calculating file hash: {e}") return f"error_{datetime.now().timestamp()}" def _add_to_history(self, entry: Dict[str, Any]): """Add entry to processing history""" self.processing_history.append(entry) # Limit history size if len(self.processing_history) > self.max_history_size: self.processing_history = self.processing_history[-self.max_history_size:] def cleanup_temp_files(self): """Clean up temporary files""" try: temp_files = list(self.temp_dir.glob('*')) cleaned_count = 0 for temp_file in temp_files: try: # Remove files older than 1 hour if temp_file.is_file() and temp_file.stat().st_mtime < (datetime.now().timestamp() - 3600): temp_file.unlink() cleaned_count += 1 except Exception as e: logger.warning(f"Could not remove temp file {temp_file}: {e}") if cleaned_count > 0: logger.info(f"Cleaned up {cleaned_count} temporary files") except Exception as e: logger.error(f"Error during cleanup: {e}") def get_enhanced_statistics(self) -> Dict[str, Any]: """Get enhanced processing statistics with OpenCV analysis""" if not self.processing_history: return { 'total_processed': 0, 'success_rate': 0, 'average_processing_time': 0, 'most_used_method': 'N/A', 'total_text_extracted': 0, 'total_tables_processed': 0, 'preprocessing_usage': 0, 'html_generation_rate': 0, 'opencv_enhanced_usage': 0, 'opencv_text_block_analysis_usage': 0, 'opencv_bold_detection_usage': 0, 'opencv_spacing_analysis_usage': 0, 'enhanced_processing_usage': 0, 'comprehensive_indentation_usage': 0, 'parenthetical_patterns_usage': 0, 'text_classification_usage': 0, 'header_indentation_suppression_usage': 0, 'document_analysis_success_rate': 0 } total_processed = len(self.processing_history) successful = [h for h in self.processing_history if h.get('success', False)] success_rate = (len(successful) / total_processed) * 100 if total_processed > 0 else 0 # Calculate statistics processing_times = [h.get('processing_time', 0) for h in self.processing_history if 'processing_time' in h] avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0 methods = [h.get('method_used', 'unknown') for h in successful] most_used_method = max(set(methods), key=methods.count) if methods else 'N/A' total_text = sum(h.get('text_length', 0) for h in successful) total_tables = sum(h.get('table_count', 0) for h in successful) preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False)) html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False)) opencv_enhanced = sum(1 for h in self.processing_history if h.get('opencv_enhanced', False)) opencv_text_block_analysis = sum(1 for h in self.processing_history if h.get('opencv_text_block_analysis', False)) opencv_bold_detection = sum(1 for h in self.processing_history if h.get('opencv_bold_detection', False)) opencv_spacing_analysis = sum(1 for h in self.processing_history if h.get('opencv_spacing_analysis', False)) enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False)) comprehensive_indentation = sum(1 for h in self.processing_history if h.get('comprehensive_indentation', False)) parenthetical_patterns = sum(1 for h in self.processing_history if h.get('parenthetical_patterns_supported', False)) text_classification = sum(1 for h in self.processing_history if h.get('intelligent_text_classification', False)) header_indentation_suppression = sum(1 for h in self.processing_history if h.get('header_indentation_suppression', False)) # Document analysis statistics doc_analysis_success = sum(1 for h in self.processing_history if h.get('document_analysis', {}) and not h.get('document_analysis', {}).get('analysis_failed', False)) doc_analysis_rate = (doc_analysis_success / total_processed) * 100 if total_processed > 0 else 0 html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0 opencv_enhanced_rate = (opencv_enhanced / total_processed) * 100 if total_processed > 0 else 0 opencv_text_block_analysis_rate = (opencv_text_block_analysis / total_processed) * 100 if total_processed > 0 else 0 opencv_bold_detection_rate = (opencv_bold_detection / total_processed) * 100 if total_processed > 0 else 0 opencv_spacing_analysis_rate = (opencv_spacing_analysis / total_processed) * 100 if total_processed > 0 else 0 enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0 comprehensive_indentation_rate = (comprehensive_indentation / total_processed) * 100 if total_processed > 0 else 0 parenthetical_patterns_rate = (parenthetical_patterns / total_processed) * 100 if total_processed > 0 else 0 text_classification_rate = (text_classification / total_processed) * 100 if total_processed > 0 else 0 header_indentation_suppression_rate = (header_indentation_suppression / total_processed) * 100 if total_processed > 0 else 0 return { 'total_processed': total_processed, 'success_rate': round(success_rate, 2), 'average_processing_time': round(avg_processing_time, 2), 'most_used_method': most_used_method, 'total_text_extracted': total_text, 'total_tables_processed': total_tables, 'successful_processes': len(successful), 'failed_processes': total_processed - len(successful), 'preprocessing_usage': preprocessing_usage, 'html_generation_rate': round(html_generation_rate, 2), 'opencv_enhanced_usage': opencv_enhanced, 'opencv_enhanced_rate': round(opencv_enhanced_rate, 2), 'opencv_text_block_analysis_usage': opencv_text_block_analysis, 'opencv_text_block_analysis_rate': round(opencv_text_block_analysis_rate, 2), 'opencv_bold_detection_usage': opencv_bold_detection, 'opencv_bold_detection_rate': round(opencv_bold_detection_rate, 2), 'opencv_spacing_analysis_usage': opencv_spacing_analysis, 'opencv_spacing_analysis_rate': round(opencv_spacing_analysis_rate, 2), 'enhanced_processing_usage': enhanced_processing, 'enhanced_processing_rate': round(enhanced_processing_rate, 2), 'comprehensive_indentation_usage': comprehensive_indentation, 'comprehensive_indentation_rate': round(comprehensive_indentation_rate, 2), 'parenthetical_patterns_usage': parenthetical_patterns, 'parenthetical_patterns_rate': round(parenthetical_patterns_rate, 2), 'text_classification_usage': text_classification, 'text_classification_rate': round(text_classification_rate, 2), 'header_indentation_suppression_usage': header_indentation_suppression, 'header_indentation_suppression_rate': round(header_indentation_suppression_rate, 2), 'document_analysis_success_rate': round(doc_analysis_rate, 2) } # Global backend manager instance _backend_manager = None def get_backend_manager() -> BackendManager: """Get global OpenCV-enhanced backend manager instance""" global _backend_manager if _backend_manager is None: _backend_manager = BackendManager() return _backend_manager if __name__ == "__main__": # Test the OpenCV-enhanced backend manager manager = BackendManager() print("OpenCV-Enhanced Backend Manager with Text Block Analysis & Bold Detection Test") print("=" * 110) print(f"Available methods: {manager.get_available_methods()}") print(f"Service status: {manager.get_service_status()}") print(f"Enhanced statistics: {manager.get_enhanced_statistics()}") # Test OpenCV analyzer opencv_analyzer = OpenCVTextAnalyzer() test_image_path = "test_page.png" # This would be a real image path in practice test_text_lines = [ "CHAPTER 1: INTRODUCTION", "1.1. Overview of the System", "This document provides comprehensive information...", "1.2. Key Features", "• Feature one with detailed explanation", "• Feature two with additional notes" ] print(f"\nOpenCV Text Analysis Test:") print("-" * 40) # opencv_analysis = opencv_analyzer.analyze_text_blocks(test_image_path, test_text_lines) # print(f"Analysis result: {opencv_analysis}") # Test indentation detector with OpenCV integration detector = EnhancedIndentationDetector() test_cases = [ "INTRODUCTION TO THE SYSTEM", # Should be detected as bold header "1.2.3. Hierarchical item", "(1) Parenthetical Arabic", "(๑) Parenthetical Thai numeral", "(a) Parenthetical letter", "(i) Parenthetical Roman", "(ก) Parenthetical Thai letter" ] print(f"\nOpenCV-Enhanced Indentation Detection Test:") print("-" * 60) for test_text in test_cases: result = detector.detect_indentation(test_text) classification = detector.classify_text_type(test_text) print(f"Text: {test_text}") print(f" Pattern: {result['pattern_type']}, Level: {result['level']}") print(f" Is Header: {result['is_header']}, Suppress Indent: {result['suppress_indentation']}") print(f" Classification: {classification['type']} (confidence: {classification['confidence']:.2f})") print()