Spaces:
Sleeping
Sleeping
| """ | |
| Backend Management Module - ENHANCED VERSION with OpenCV Text Block Analysis and Bold Detection | |
| Coordinates between UI and OCR services, handles file management and preprocessing with OpenCV integration | |
| """ | |
| import re | |
| import os | |
| import logging | |
| import tempfile | |
| from typing import Dict, Any, List, Optional | |
| from pathlib import Path | |
| import hashlib | |
| import json | |
| from datetime import datetime | |
| import cv2 | |
| import numpy as np | |
| import fitz # PyMuPDF | |
| from docx import Document | |
| from docx.shared import Inches, Pt, RGBColor | |
| from docx.enum.text import WD_ALIGN_PARAGRAPH | |
| from docx.enum.table import WD_TABLE_ALIGNMENT | |
| from docx.oxml.shared import OxmlElement, qn | |
| from html.parser import HTMLParser | |
| # Load environment variables | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| from ocr_service import OCRService | |
| from enhanced_indentation import EnhancedIndentationDetector, OpenCVTextAnalyzer | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
class _OpenCVEnhancedDOCXHTMLParser(HTMLParser):
    """Stream the intermediate OCR HTML into a python-docx document.

    The parser reacts to CSS class names on the incoming tags (page,
    page-header, content-header, title, section-heading, paragraph,
    opencv-bold-header, indent-level-N and the formatting-hint classes)
    and creates matching headings, paragraphs and tables on ``doc``.

    Args:
        doc: python-docx document that receives the content.
        processor: object exposing ``indent_detector`` (the exporter).
    """

    # Formatting-hint classes, checked in this order; first match wins.
    _FORMATTING_HINTS = (
        'numbered-primary', 'numbered-secondary', 'numbered-tertiary',
        'numbered-quaternary', 'numbered-quinary',
        'parenthetical-primary', 'parenthetical-secondary',
        'parenthetical-tertiary', 'parenthetical-quaternary',
        'bullet-primary', 'bullet-secondary', 'bullet-tertiary',
        'bullet-quaternary',
        'lettered-primary', 'lettered-secondary',
        'roman-primary', 'roman-secondary',
        'thai-primary', 'thai-secondary',
        'indented_text', 'space-indent',
    )

    def __init__(self, doc, processor):
        super().__init__()
        self.doc = doc
        self.processor = processor
        self.current_paragraph = None
        self.in_table = False
        self.table_data = []
        self.current_table_row = []
        self.current_indent_level = 0
        self.current_formatting_hint = 'normal_text'
        self.in_title = False
        self.in_section_heading = False
        self.in_page_header = False
        self.in_content_header = False
        self.in_opencv_bold_header = False
        self.current_classes = []
        # Becomes True once the first page <div> has been seen, so that
        # only subsequent pages are preceded by spacer paragraphs.
        self.has_content = False

    def handle_starttag(self, tag, attrs):
        """Open the heading/paragraph/table that corresponds to ``tag``."""
        # A valueless attribute (<div class>) is reported as None.
        class_attr = dict(attrs).get('class') or ''
        self.current_classes = class_attr.split()

        if 'opencv-bold-header' in class_attr:
            # OpenCV detected bold header - special styling, no indentation
            self.current_paragraph = self.doc.add_heading(level=1)
            self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
            self.in_opencv_bold_header = True
        elif 'page-header' in class_attr:
            # Must be tested before the generic 'page' check below:
            # 'page' is a substring of 'page-header' and would shadow it.
            self.current_paragraph = self.doc.add_heading(level=1)
            self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
            self.in_page_header = True
        elif 'page' in class_attr and tag == 'div':
            if self.has_content:
                # Two empty paragraphs separate consecutive pages.
                self.doc.add_paragraph()
                self.doc.add_paragraph()
            self.has_content = True
        elif 'content-header' in class_attr:
            self.current_paragraph = self.doc.add_heading(level=2)
            self.in_content_header = True
        elif 'title' in class_attr:
            self.current_paragraph = self.doc.add_heading(level=1)
            self.in_title = True
        elif 'section-heading' in class_attr:
            self.current_paragraph = self.doc.add_heading(level=2)
            self.in_section_heading = True
        elif tag == 'div' and 'paragraph' in class_attr:
            self.current_paragraph = self.doc.add_paragraph()
            self._apply_opencv_enhanced_formatting()
        elif tag == 'table':
            self.in_table = True
            self.table_data = []
        elif tag == 'tr':
            self.current_table_row = []
        elif tag == 'br':
            if self.current_paragraph:
                self.current_paragraph.add_run().add_break()

    def _apply_opencv_enhanced_formatting(self):
        """Set indent and spacing of the current paragraph from its classes."""
        if not self.current_paragraph:
            return
        fmt = self.current_paragraph.paragraph_format

        if 'opencv-bold-header' in self.current_classes:
            # Bold headers get no indentation and special formatting
            self.current_indent_level = 0
            fmt.left_indent = Inches(0)
            fmt.space_before = Pt(15)
            fmt.space_after = Pt(12)
            return

        # Extract indent level from the first 'indent-level-N' class.
        for cls in self.current_classes:
            if cls.startswith('indent-level-'):
                try:
                    self.current_indent_level = int(cls.split('-')[-1])
                except ValueError:
                    self.current_indent_level = 0
                break

        # First known hint present on the tag wins; otherwise plain text.
        self.current_formatting_hint = next(
            (hint for hint in self._FORMATTING_HINTS
             if hint in self.current_classes),
            'normal_text',
        )
        hint = self.current_formatting_hint

        if self.current_indent_level > 0:
            fmt.left_indent = Inches(self.current_indent_level * 0.5)
            # Hanging indent for bullets and parenthetical items
            if 'bullet' in hint or 'parenthetical' in hint:
                fmt.first_line_indent = Inches(-0.125)

        fmt.line_spacing = 1.15
        if 'primary' in hint:
            fmt.space_before = Pt(12)
            fmt.space_after = Pt(10)
        elif 'secondary' in hint:
            fmt.space_before = Pt(10)
            fmt.space_after = Pt(8)
        elif 'tertiary' in hint:
            fmt.space_before = Pt(8)
            fmt.space_after = Pt(6)
        else:
            fmt.space_after = Pt(4)

    def handle_endtag(self, tag):
        """Close the current block; a table end flushes the collected rows."""
        if tag == 'div':
            # Only one header flag is cleared per closing </div>.
            if self.in_opencv_bold_header:
                self.in_opencv_bold_header = False
            elif self.in_page_header:
                self.in_page_header = False
            elif self.in_content_header:
                self.in_content_header = False
            elif self.in_title:
                self.in_title = False
            elif self.in_section_heading:
                self.in_section_heading = False
            self.current_paragraph = None
            self.current_indent_level = 0
            self.current_formatting_hint = 'normal_text'
            self.current_classes = []
        elif tag == 'table':
            self.in_table = False
            self._create_enhanced_docx_table()
        elif tag == 'tr' and self.current_table_row:
            self.table_data.append(self.current_table_row[:])
            self.current_table_row = []

    def handle_data(self, data):
        """Add text to the open table row or paragraph and style the run."""
        if not data.strip():
            return

        # Clean OCR artifacts
        data = data.replace(':unselected:', '')
        data = data.replace(':selected:', '')
        # NOTE(review): the original replace() literal was whitespace-only
        # as seen; normalising non-breaking spaces is the presumed intent.
        data = data.replace('\u00a0', ' ')

        if self.in_table:
            self.current_table_row.append(data.strip())
            return
        if self.current_paragraph is None:
            return

        detector = self.processor.indent_detector
        indent_info = detector.detect_indentation(data)
        text_classification = detector.classify_text_type(data)
        run = self.current_paragraph.add_run(data.strip())

        if self.in_opencv_bold_header:
            run.bold = True
            run.font.size = Pt(16)
            run.font.color.rgb = RGBColor(231, 76, 60)  # Red for emphasis
            # Force no indent on bold headers
            self.current_paragraph.paragraph_format.left_indent = Inches(0)
        elif self.in_title:
            run.bold = True
            run.font.size = Pt(16)
            run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
        elif self.in_content_header or text_classification.get('is_header'):
            run.bold = True
            run.font.size = Pt(14)
            run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
        elif self.in_section_heading:
            run.bold = True
            run.font.size = Pt(14)
            run.font.color.rgb = RGBColor(52, 73, 94)  # Darker blue
        elif self.in_page_header:
            # Only "Page N" style headers get the explicit styling. The
            # previous code also appended to a non-existent
            # ``self.text_parts`` list here, which raised AttributeError.
            if re.search(r'Page (\d+)', data):
                run.bold = True
                run.font.size = Pt(14)
                run.font.color.rgb = RGBColor(44, 62, 80)
        else:
            self._apply_opencv_pattern_formatting(
                run, indent_info, text_classification)

    def _apply_opencv_pattern_formatting(self, run, indent_info,
                                         text_classification):
        """Colour/weight a body run from the formatting hint and pattern."""
        hint = self.current_formatting_hint
        level = indent_info.get('level', 0)
        is_numbered = indent_info.get('is_numbered', False)
        is_bullet = indent_info.get('is_bullet', False)
        is_lettered = indent_info.get('is_lettered', False)
        is_roman = indent_info.get('is_roman', False)
        is_thai = indent_info.get('is_thai', False)
        is_parenthetical = indent_info.get('is_parenthetical', False)

        run.font.size = Pt(11)

        if 'numbered' in hint or is_numbered:
            if 'primary' in hint or level == 1:
                run.bold = True
                run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
            elif 'secondary' in hint or level == 2:
                run.font.color.rgb = RGBColor(52, 73, 94)  # Medium blue
            elif 'tertiary' in hint or level == 3:
                run.font.color.rgb = RGBColor(85, 85, 85)  # Dark gray
            else:
                run.font.color.rgb = RGBColor(102, 102, 102)  # Gray
        elif 'parenthetical' in hint or is_parenthetical:
            # Parenthetical levels start one deeper than numbered ones.
            if 'primary' in hint or level == 2:
                run.bold = True
                run.font.color.rgb = RGBColor(142, 68, 173)  # Purple
            elif 'secondary' in hint or level == 3:
                run.font.color.rgb = RGBColor(155, 89, 182)  # Light purple
            elif 'tertiary' in hint or level == 4:
                run.font.color.rgb = RGBColor(175, 122, 197)  # Lighter purple
            else:
                run.font.color.rgb = RGBColor(195, 155, 211)  # Very light purple
        elif 'bullet' in hint or is_bullet:
            if 'primary' in hint or level == 1:
                run.font.color.rgb = RGBColor(52, 152, 219)  # Blue
            elif 'secondary' in hint or level == 2:
                run.font.color.rgb = RGBColor(149, 165, 166)  # Gray
            else:
                run.font.color.rgb = RGBColor(189, 195, 199)  # Light gray
        elif 'lettered' in hint or is_lettered:
            run.italic = True
            if 'primary' in hint:
                run.font.color.rgb = RGBColor(142, 68, 173)  # Purple
            else:
                run.font.color.rgb = RGBColor(155, 89, 182)  # Light purple
        elif 'roman' in hint or is_roman:
            run.font.color.rgb = RGBColor(211, 84, 0)  # Orange
            run.font.name = 'Times New Roman'
        elif 'thai' in hint or is_thai:
            if 'primary' in hint:
                run.bold = True
                run.font.color.rgb = RGBColor(22, 160, 133)  # Teal
            else:
                run.font.color.rgb = RGBColor(26, 188, 156)  # Light teal
        elif 'space-indent' in hint:
            run.italic = True
            run.font.color.rgb = RGBColor(85, 85, 85)  # Dark gray
        elif text_classification.get('is_header'):
            run.bold = True
            run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
        elif text_classification.get('is_list_item'):
            run.font.color.rgb = RGBColor(52, 152, 219)  # Blue
        else:
            run.font.color.rgb = RGBColor(0, 0, 0)  # Black

    def _create_enhanced_docx_table(self):
        """Emit the collected rows as a 'Table Grid' table with a header row."""
        if not self.table_data:
            return

        rows = len(self.table_data)
        cols = max(len(row) for row in self.table_data)
        table = self.doc.add_table(rows=rows, cols=cols)
        table.style = 'Table Grid'
        table.alignment = WD_TABLE_ALIGNMENT.LEFT

        for row_idx, row_data in enumerate(self.table_data):
            cells = table.rows[row_idx].cells
            is_header_row = row_idx == 0
            for col_idx, cell_data in enumerate(row_data):
                if col_idx >= len(cells):
                    continue
                cell = cells[col_idx]
                cell.text = str(cell_data)
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        run.font.size = Pt(10)
                        if is_header_row:
                            run.bold = True
                            run.font.color.rgb = RGBColor(44, 62, 80)
                    if is_header_row:
                        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                        # Light grey background on header paragraphs
                        shading = OxmlElement('w:shd')
                        shading.set(qn('w:fill'), 'ECF0F1')
                        paragraph._element.get_or_add_pPr().append(shading)
                    else:
                        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

        # Add spacing after table
        self.doc.add_paragraph()


class EnhancedDocumentExporter:
    """Export OCR results as TXT, DOCX or HTML files in the temp directory.

    ``create_enhanced_txt_file`` and ``create_html_file`` use no instance
    state and are static, so they can be called on the class or on an
    instance. ``create_enhanced_docx_file`` needs ``indent_detector``.
    Every ``create_*`` method returns the path of a file created with
    ``delete=False``; the caller owns (and should eventually remove) it.
    """

    # Stylesheet injected in place of '<head>' when the page has no <style>.
    _HTML_STYLE_BLOCK = '''<head>
<style>
body {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    line-height: 1.6;
    margin: 20px;
    background-color: #f9f9f9;
}
.container {
    max-width: 1200px;
    margin: 0 auto;
    background-color: white;
    padding: 30px;
    border-radius: 8px;
    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.header {
    text-align: center;
    margin-bottom: 30px;
    border-bottom: 3px solid #2c3e50;
    padding-bottom: 20px;
}
.metadata {
    background-color: #ecf0f1;
    padding: 15px;
    border-radius: 5px;
    margin-bottom: 25px;
    border-left: 4px solid #3498db;
}
.opencv-features {
    background-color: #e8f5e8;
    padding: 10px;
    border-radius: 5px;
    margin-bottom: 20px;
    border-left: 4px solid #27ae60;
    font-size: 0.9em;
}
.opencv-bold-header {
    font-weight: bold;
    color: #e74c3c;
    font-size: 1.3em;
    margin: 20px 0 15px 0;
    border-left: 4px solid #e74c3c;
    padding-left: 12px;
    background-color: #fdf2f2;
}
.text-analysis-features {
    background-color: #fff9e7;
    padding: 10px;
    border-radius: 5px;
    margin-bottom: 20px;
    border-left: 4px solid #f39c12;
    font-size: 0.9em;
}
</style>'''

    # Opening of the page wrapper injected in place of '<body>'.
    _HTML_WRAPPER_OPEN = '''<body>
<div class="container">
<div class="header">
<h1>PDF OCR Extraction Results</h1>
<p>Enhanced with OpenCV Text Block Analysis & Bold Detection</p>
</div>
<div class="opencv-features">
<strong>OpenCV Features:</strong> Text Block Detection • Bold Text Recognition •
Automatic Spacing & Paragraph Analysis • Header Indentation Suppression •
Visual Text Element Analysis
</div>
<div class="text-analysis-features">
<strong>Text Analysis:</strong> Comprehensive Indentation Detection •
Parenthetical Patterns ((1), (๑), (a), (i), (ก)) • Multi-level Bullets •
Letter & Roman Numerals • Thai Script Support • Pattern Priority Detection •
Intelligent Text Classification
</div>'''

    # Feature list written into the TXT export header.
    _TXT_FEATURES = (
        "• OpenCV Text Block Detection & Analysis\n",
        "• Bold Text Recognition for Headers\n",
        "• Automatic Spacing & Paragraph Detection\n",
        "• Comprehensive Indentation Detection (20+ patterns)\n",
        "• Parenthetical Patterns ((1), (๑), (a), (i), (ก))\n",
        "• Intelligent Text Classification (headers, paragraphs, lists)\n",
        "• Multi-language Support (English, Thai)\n",
        "• HTML Intermediate Processing\n",
        "• Priority-based Pattern Matching\n",
        "• Document Structure Analysis\n",
        "• Header Indentation Suppression\n\n",
    )

    def __init__(self):
        self.indent_detector = EnhancedIndentationDetector()
        self.opencv_analyzer = OpenCVTextAnalyzer()

    @staticmethod
    def _discard_file(path):
        """Best-effort removal of a partially written output file."""
        try:
            os.unlink(path)
        except OSError:
            pass

    @staticmethod
    def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
        """Write the extracted text with a descriptive header to a temp .txt file.

        Args:
            text_content: Extracted text written after the header.
            html_content: Accepted for signature parity with the other
                exporters; not used by the TXT export.
            metadata_info: Optional processing summary placed in the header.

        Returns:
            Path of the created UTF-8 text file.

        Raises:
            Exception: Any write error is logged and re-raised; the partial
                file is removed first.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_file = tempfile.NamedTemporaryFile(
            suffix=f'_extracted_text_opencv_{timestamp}.txt',
            delete=False,
            mode='w',
            encoding='utf-8'
        )
        try:
            with temp_file:
                # Header
                temp_file.write("PDF OCR Extraction Results - Enhanced with OpenCV Text Block Analysis & Bold Detection\n")
                temp_file.write("=" * 100 + "\n\n")
                # Metadata
                if metadata_info:
                    temp_file.write("Processing Information:\n")
                    temp_file.write("-" * 25 + "\n")
                    temp_file.write(metadata_info + "\n\n")
                # Timestamp
                temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                temp_file.write("=" * 100 + "\n\n")
                # Feature list
                temp_file.write("OpenCV-Enhanced Features Applied:\n")
                temp_file.write("-" * 35 + "\n")
                temp_file.writelines(EnhancedDocumentExporter._TXT_FEATURES)
                # Main content
                temp_file.write("Extracted Text (OpenCV-Enhanced with Text Block Analysis):\n")
                temp_file.write("-" * 70 + "\n\n")
                temp_file.write(text_content or "")
            return temp_file.name
        except Exception as e:
            logger.error(f"Error creating enhanced TXT file: {e}")
            EnhancedDocumentExporter._discard_file(temp_file.name)
            raise

    def create_enhanced_docx_file(self, text_content: str, html_content: str, metadata_info: str = "") -> str:
        """Build a styled DOCX from the HTML (preferred) or plain text.

        Args:
            text_content: Plain extracted text; used when ``html_content``
                is empty or contains no ``<div``.
            html_content: Intermediate HTML produced by the OCR pipeline.
            metadata_info: Optional processing summary shown near the top.

        Returns:
            Path of the created .docx file.

        Raises:
            ImportError: Re-raised with a python-docx hint if an
                ImportError occurs while building the document.
            Exception: Any other error is logged and re-raised after the
                partially created file has been removed.
        """
        output_path = None
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            temp_file = tempfile.NamedTemporaryFile(
                suffix=f'_opencv_enhanced_document_{timestamp}.docx',
                delete=False
            )
            # Only the reserved name is needed; python-docx writes the file.
            temp_file.close()
            output_path = temp_file.name

            doc = Document()
            self._add_docx_front_matter(doc, metadata_info)

            doc.add_heading('Extracted Content', level=1)
            if html_content and '<div' in html_content:
                parser = _OpenCVEnhancedDOCXHTMLParser(doc, self)
                parser.feed(html_content)
            else:
                # Fallback to text processing
                self._process_text_content_opencv_enhanced(doc, text_content)

            self._add_docx_footer(doc)
            doc.save(output_path)
            logger.info(f"OpenCV-enhanced DOCX file with text block analysis and bold detection created: {output_path}")
            return output_path
        except ImportError as exc:
            raise ImportError("python-docx not installed. Cannot create DOCX files.") from exc
        except Exception as e:
            logger.error(f"Error creating OpenCV-enhanced DOCX file: {e}")
            if output_path is not None:
                self._discard_file(output_path)
            raise

    @staticmethod
    def _add_docx_front_matter(doc, metadata_info):
        """Set margins and add title, subtitle, feature line and metadata."""
        for section in doc.sections:
            section.top_margin = Inches(1)
            section.bottom_margin = Inches(1)
            section.left_margin = Inches(1)
            section.right_margin = Inches(1)

        title = doc.add_heading('PDF OCR Extraction Results', 0)
        title.alignment = WD_ALIGN_PARAGRAPH.CENTER
        title.runs[0].font.color.rgb = RGBColor(44, 62, 80)

        subtitle_para = doc.add_paragraph()
        subtitle_run = subtitle_para.add_run('Enhanced with OpenCV Text Block Analysis & Bold Detection')
        subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        subtitle_run.italic = True
        subtitle_run.font.size = Pt(12)
        subtitle_run.font.color.rgb = RGBColor(102, 102, 102)

        features_para = doc.add_paragraph()
        features_run = features_para.add_run('Features: OpenCV Text Block Detection • Bold Text Recognition • Spacing Analysis • Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a)) • Bullet Points • Letter & Roman Numerals • Thai Script • Multi-level Indentation • Text Classification • Header Indentation Suppression')
        features_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        features_run.font.size = Pt(9)
        features_run.font.color.rgb = RGBColor(149, 165, 166)

        if metadata_info:
            doc.add_heading('Processing Information', level=1)
            meta_para = doc.add_paragraph()
            meta_run = meta_para.add_run(metadata_info)
            meta_run.font.size = Pt(10)
            meta_para.style = 'Intense Quote'
            # Light background behind the metadata paragraph
            shading_elm = OxmlElement('w:shd')
            shading_elm.set(qn('w:fill'), 'F8F9FA')
            meta_para._element.get_or_add_pPr().append(shading_elm)
            doc.add_paragraph()

    @staticmethod
    def _add_docx_footer(doc):
        """Write the generation notice into the first section's footer."""
        footer_para = doc.sections[0].footer.paragraphs[0]
        footer_para.text = f"Generated by OpenCV-Enhanced PDF OCR Service with Text Block Analysis & Bold Detection on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        footer_run = footer_para.runs[0]
        footer_run.font.size = Pt(8)
        footer_run.font.color.rgb = RGBColor(128, 128, 128)

    @staticmethod
    def _style_heading_runs(heading, rgb, size=None):
        """Colour (and optionally resize) a heading's runs.

        Iterates instead of indexing ``runs[0]`` so that a heading created
        without text (and therefore without runs) does not raise.
        """
        for run in heading.runs:
            run.font.color.rgb = rgb
            if size is not None:
                run.font.size = size

    def _process_text_content_opencv_enhanced(self, doc, text_content):
        """Add plain text to ``doc`` line by line (fallback when no HTML).

        Each non-blank line becomes a page header (``===`` prefix), a bold
        header (upper-case, short, high-confidence header), a section
        heading (``##`` prefix), a detected header, or a body paragraph
        styled from the detected indentation pattern.
        """
        if not text_content:
            return

        # Splitting on '\n' and skipping blank lines visits the same lines,
        # in the same order, as splitting on '\n\n' first and then on '\n'.
        for line in text_content.split('\n'):
            stripped = line.strip()
            if not stripped:
                continue

            indent_info = self.indent_detector.detect_indentation(line)
            text_classification = self.indent_detector.classify_text_type(line)
            is_header = text_classification.get('is_header')
            confidence = text_classification.get('confidence', 0)

            # Simple heuristic standing in for OpenCV bold detection.
            is_opencv_bold_header = (
                is_header and
                confidence > 0.8 and
                len(stripped) < 80 and
                stripped.isupper()
            )

            if stripped.startswith('==='):
                # Page headers
                page_header = doc.add_heading(stripped, level=1)
                page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
                self._style_heading_runs(page_header, RGBColor(44, 62, 80))
            elif is_opencv_bold_header:
                # Bold headers - left aligned, no indentation
                heading = doc.add_heading(stripped, level=1)
                heading.alignment = WD_ALIGN_PARAGRAPH.LEFT
                self._style_heading_runs(heading, RGBColor(231, 76, 60), Pt(16))
            elif stripped.startswith('##'):
                # Section headings; a bare '##' carries no text and is skipped
                heading_text = stripped.lstrip('#').strip()
                if heading_text:
                    heading = doc.add_heading(heading_text, level=2)
                    self._style_heading_runs(heading, RGBColor(52, 73, 94))
            elif is_header and confidence > 0.7:
                # Regular detected headers; fall back to the raw line if the
                # detector returned empty content.
                heading = doc.add_heading(indent_info.get('content') or stripped, level=2)
                self._style_heading_runs(heading, RGBColor(52, 73, 94))
            else:
                self._add_body_paragraph(doc, stripped, indent_info, text_classification)

    @staticmethod
    def _add_body_paragraph(doc, stripped, indent_info, text_classification):
        """Add one regular content paragraph with pattern-based styling."""
        para = doc.add_paragraph()
        level = indent_info.get('level', 0)
        is_bullet = indent_info.get('is_bullet', False)

        if level > 0:
            # 0.25 inch (4 spaces equivalent) per level
            para.paragraph_format.left_indent = Inches(level * 0.25)
            if is_bullet or indent_info.get('is_parenthetical', False):
                para.paragraph_format.first_line_indent = Inches(-0.125)

        para.paragraph_format.line_spacing = 1.15
        para.paragraph_format.space_after = Pt(4)

        content = indent_info.get('content', stripped)
        marker = indent_info.get('pattern_marker', '')
        # Include marker for non-bullet items
        if marker and not is_bullet:
            content = f"{marker} {content}"

        run = para.add_run(content)
        run.font.size = Pt(11)

        pattern_type = indent_info.get('pattern_type', 'normal')
        if 'numbered' in pattern_type or 'decimal' in pattern_type:
            if level == 1:
                run.bold = True
                run.font.color.rgb = RGBColor(44, 62, 80)
            elif level == 2:
                run.font.color.rgb = RGBColor(52, 73, 94)
            else:
                run.font.color.rgb = RGBColor(85, 85, 85)
        elif 'parenthetical' in pattern_type:
            if level <= 2:
                run.bold = True
                run.font.color.rgb = RGBColor(142, 68, 173)  # Purple
            else:
                run.font.color.rgb = RGBColor(155, 89, 182)  # Light purple
        elif 'bullet' in pattern_type:
            run.font.color.rgb = RGBColor(52, 152, 219)
        elif 'lettered' in pattern_type:
            run.italic = True
            run.font.color.rgb = RGBColor(142, 68, 173)
        elif 'roman' in pattern_type:
            run.font.color.rgb = RGBColor(211, 84, 0)
        elif 'thai' in pattern_type:
            run.font.color.rgb = RGBColor(22, 160, 133)
        elif text_classification.get('is_list_item'):
            run.font.color.rgb = RGBColor(52, 152, 219)

    @staticmethod
    def create_html_file(html_content: str, metadata_info: str = "") -> str:
        """Write a standalone, styled HTML page to a temp .html file.

        A stylesheet is injected when the page has no ``<style>``. Unless
        the incoming page already mentions ``.container``, the body is
        wrapped in a container with a banner and (optionally) the metadata.

        Args:
            html_content: HTML document produced by the OCR pipeline.
            metadata_info: Optional plain-text processing summary; it is
                HTML-escaped before being embedded in a ``<pre>`` block.

        Returns:
            Path of the created UTF-8 HTML file.

        Raises:
            Exception: Any error is logged and re-raised; the partial file
                is removed first.
        """
        from html import escape  # function-scope: keeps module imports untouched

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_file = tempfile.NamedTemporaryFile(
            suffix=f'_opencv_enhanced_document_{timestamp}.html',
            delete=False,
            mode='w',
            encoding='utf-8'
        )
        try:
            with temp_file:
                source_html = html_content or ""
                enhanced_html = source_html

                # Add comprehensive styling if not already present
                if '<style>' not in enhanced_html:
                    enhanced_html = enhanced_html.replace(
                        '<head>', EnhancedDocumentExporter._HTML_STYLE_BLOCK)

                # Decide on the INCOMING page: the stylesheet injected above
                # itself contains '.container', which previously suppressed
                # the wrapper whenever the stylesheet had been added.
                if '<body>' in enhanced_html and '.container' not in source_html:
                    metadata_block = (
                        f'<div class="metadata"><h3>Processing Information</h3><pre>{escape(str(metadata_info))}</pre></div>'
                        if metadata_info else ''
                    )
                    enhanced_html = enhanced_html.replace(
                        '<body>',
                        EnhancedDocumentExporter._HTML_WRAPPER_OPEN + metadata_block
                    )
                    # Close the container opened by the wrapper.
                    enhanced_html = enhanced_html.replace('</body>', '</div></body>')

                temp_file.write(enhanced_html)
            return temp_file.name
        except Exception as e:
            logger.error(f"Error creating HTML file: {e}")
            EnhancedDocumentExporter._discard_file(temp_file.name)
            raise
| class BackendManager: | |
| """Enhanced backend manager with OpenCV text block analysis, bold detection, and comprehensive formatting""" | |
def __init__(self):
    """Create the collaborating services and prepare the scratch directory.

    Reads ``MAX_HISTORY_SIZE`` from the environment (default 100) and
    ensures the ``pdf_ocr_service_opencv_enhanced`` folder exists under the
    system temp directory.
    """
    # Collaborators are instantiated in their original order.
    self.ocr_service = OCRService()
    self.document_exporter = EnhancedDocumentExporter()
    self.opencv_analyzer = OpenCVTextAnalyzer()

    # History starts empty; its size limit comes from the environment.
    self.processing_history = list()
    history_limit = os.getenv('MAX_HISTORY_SIZE', 100)
    self.max_history_size = int(history_limit)

    # Create directories for temporary files and logs
    scratch_root = Path(tempfile.gettempdir())
    self.temp_dir = scratch_root / 'pdf_ocr_service_opencv_enhanced'
    self.temp_dir.mkdir(exist_ok=True)

    logger.info("OpenCV-enhanced backend manager with text block analysis and bold detection initialized successfully")
| def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto", | |
| preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: | |
| """ | |
| Process PDF with OpenCV-enhanced resolution, text block analysis, and bold detection | |
| Args: | |
| pdf_path: Path to the PDF file | |
| method: OCR method to use | |
| preprocessing_options: Dictionary containing preprocessing settings | |
| Returns: | |
| Dict containing processing results with OpenCV-enhanced analysis | |
| """ | |
| start_time = datetime.now() | |
| # Validate input | |
| if not os.path.exists(pdf_path): | |
| return { | |
| 'success': False, | |
| 'error': f"File not found: {pdf_path}", | |
| 'text': '', | |
| 'html': '', | |
| 'method_used': '', | |
| 'metadata': {} | |
| } | |
| # Check file size | |
| max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024 | |
| file_size = os.path.getsize(pdf_path) | |
| if file_size > max_file_size: | |
| return { | |
| 'success': False, | |
| 'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB", | |
| 'text': '', | |
| 'html': '', | |
| 'method_used': '', | |
| 'metadata': {} | |
| } | |
| # Generate file hash for tracking | |
| file_hash = self._calculate_file_hash(pdf_path) | |
| logger.info(f"Processing PDF with OpenCV text block analysis and bold detection: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)") | |
| logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}") | |
| # Handle preprocessing if enabled | |
| processed_pdf_path = pdf_path | |
| preprocessing_applied = False | |
| if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False): | |
| logger.info("Applying enhanced preprocessing with OpenCV analysis...") | |
| try: | |
| processed_pdf_path = self._apply_enhanced_preprocessing(pdf_path, preprocessing_options) | |
| preprocessing_applied = True | |
| logger.info("OpenCV-enhanced preprocessing completed successfully") | |
| except Exception as e: | |
| logger.error(f"Preprocessing failed: {e}") | |
| processed_pdf_path = pdf_path | |
| try: | |
| # Process with OpenCV-enhanced OCR | |
| result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method) | |
| # Add processing metadata | |
| processing_time = (datetime.now() - start_time).total_seconds() | |
| # Analyze document structure with OpenCV enhancement if successful | |
| document_analysis = {} | |
| opencv_global_analysis = {} | |
| if result['success'] and result['text']: | |
| try: | |
| text_lines = result['text'].split('\n') | |
| detector = EnhancedIndentationDetector() | |
| # Perform global OpenCV analysis on the PDF | |
| opencv_global_analysis = self._perform_global_opencv_analysis(pdf_path, text_lines) | |
| # Enhanced document structure analysis | |
| document_analysis = detector.analyze_document_structure_with_opencv(text_lines) | |
| if opencv_global_analysis: | |
| document_analysis['opencv_global_analysis'] = opencv_global_analysis | |
| except Exception as analysis_error: | |
| logger.warning(f"Document structure analysis failed: {analysis_error}") | |
| document_analysis = {'analysis_failed': True} | |
| result['metadata'].update({ | |
| 'file_hash': file_hash, | |
| 'file_size_mb': round(file_size / (1024*1024), 2), | |
| 'processing_time_seconds': round(processing_time, 2), | |
| 'timestamp': start_time.isoformat(), | |
| 'opencv_enhanced': True, | |
| 'opencv_text_block_analysis': True, | |
| 'opencv_bold_detection': True, | |
| 'opencv_spacing_analysis': True, | |
| 'enhanced_processing': True, | |
| 'html_processing': True, | |
| 'comprehensive_indentation': True, | |
| 'parenthetical_patterns_supported': True, | |
| 'intelligent_text_classification': True, | |
| 'header_indentation_suppression': True, | |
| 'header_footer_removed': preprocessing_applied, | |
| 'preprocessing_options': preprocessing_options if preprocessing_applied else None, | |
| 'document_structure_analysis': document_analysis, | |
| 'opencv_global_analysis': opencv_global_analysis | |
| }) | |
| # Cleanup temporary preprocessed file | |
| if preprocessing_applied and processed_pdf_path != pdf_path: | |
| try: | |
| os.unlink(processed_pdf_path) | |
| except: | |
| pass | |
| # Log results with OpenCV enhancement information | |
| if result['success']: | |
| text_length = len(result['text']) | |
| has_html = bool(result.get('html')) | |
| table_count = result['text'].count('Table ') if 'Table ' in result['text'] else 0 | |
| logger.info(f"OpenCV-enhanced processing completed successfully in {processing_time:.2f}s") | |
| logger.info(f"Method used: {result['method_used']}") | |
| logger.info(f"Text extracted: {text_length} characters") | |
| logger.info(f"HTML generated: {has_html}") | |
| logger.info(f"OpenCV text block analysis: Enabled") | |
| logger.info(f"OpenCV bold detection: Enabled") | |
| logger.info(f"OpenCV spacing analysis: Enabled") | |
| logger.info(f"Header indentation suppression: Enabled") | |
| if table_count > 0: | |
| logger.info(f"Tables detected: {table_count}") | |
| if preprocessing_applied: | |
| logger.info("Enhanced preprocessing applied") | |
| if document_analysis and not document_analysis.get('analysis_failed'): | |
| logger.info(f"Document analysis: {document_analysis.get('patterned_lines', 0)} patterned lines, max level {document_analysis.get('max_level', 0)}") | |
| logger.info(f"Text classification: {document_analysis.get('header_count', 0)} headers, {document_analysis.get('paragraph_count', 0)} paragraphs, {document_analysis.get('list_item_count', 0)} list items") | |
| if opencv_global_analysis: | |
| logger.info(f"OpenCV global analysis: {opencv_global_analysis.get('block_count', 0)} text blocks, {opencv_global_analysis.get('paragraph_count', 0)} paragraphs") | |
| logger.info(f"Bold text detected: {opencv_global_analysis.get('bold_text_detected', False)}") | |
| # Add to processing history | |
| self._add_to_history({ | |
| 'timestamp': start_time.isoformat(), | |
| 'file_hash': file_hash, | |
| 'method_used': result['method_used'], | |
| 'success': True, | |
| 'text_length': text_length, | |
| 'table_count': table_count, | |
| 'processing_time': processing_time, | |
| 'preprocessing_applied': preprocessing_applied, | |
| 'html_generated': has_html, | |
| 'opencv_enhanced': True, | |
| 'opencv_text_block_analysis': True, | |
| 'opencv_bold_detection': True, | |
| 'opencv_spacing_analysis': True, | |
| 'enhanced_processing': True, | |
| 'comprehensive_indentation': True, | |
| 'parenthetical_patterns_supported': True, | |
| 'intelligent_text_classification': True, | |
| 'header_indentation_suppression': True, | |
| 'document_analysis': document_analysis, | |
| 'opencv_global_analysis': opencv_global_analysis | |
| }) | |
| else: | |
| logger.error(f"OpenCV-enhanced processing failed: {result.get('error', 'Unknown error')}") | |
| # Add to processing history | |
| self._add_to_history({ | |
| 'timestamp': start_time.isoformat(), | |
| 'file_hash': file_hash, | |
| 'method_requested': method, | |
| 'success': False, | |
| 'error': result.get('error', 'Unknown error'), | |
| 'processing_time': processing_time, | |
| 'preprocessing_applied': preprocessing_applied, | |
| 'opencv_enhanced': True, | |
| 'opencv_text_block_analysis': True, | |
| 'opencv_bold_detection': True, | |
| 'opencv_spacing_analysis': True, | |
| 'enhanced_processing': True, | |
| 'comprehensive_indentation': True, | |
| 'parenthetical_patterns_supported': True, | |
| 'intelligent_text_classification': True, | |
| 'header_indentation_suppression': True | |
| }) | |
| return result | |
| except Exception as e: | |
| logger.error(f"Unexpected error during OpenCV-enhanced processing: {e}") | |
| # Cleanup | |
| if preprocessing_applied and processed_pdf_path != pdf_path: | |
| try: | |
| os.unlink(processed_pdf_path) | |
| except: | |
| pass | |
| # Add to processing history | |
| processing_time = (datetime.now() - start_time).total_seconds() | |
| self._add_to_history({ | |
| 'timestamp': start_time.isoformat(), | |
| 'file_hash': file_hash, | |
| 'method_requested': method, | |
| 'success': False, | |
| 'error': str(e), | |
| 'processing_time': processing_time, | |
| 'opencv_enhanced': True, | |
| 'opencv_text_block_analysis': True, | |
| 'opencv_bold_detection': True, | |
| 'opencv_spacing_analysis': True, | |
| 'enhanced_processing': True, | |
| 'comprehensive_indentation': True, | |
| 'parenthetical_patterns_supported': True, | |
| 'intelligent_text_classification': True, | |
| 'header_indentation_suppression': True | |
| }) | |
| return { | |
| 'success': False, | |
| 'error': f"OpenCV-enhanced processing error: {str(e)}", | |
| 'text': '', | |
| 'html': '', | |
| 'method_used': '', | |
| 'metadata': { | |
| 'file_hash': file_hash, | |
| 'processing_time_seconds': round(processing_time, 2), | |
| 'timestamp': start_time.isoformat(), | |
| 'opencv_enhanced': True, | |
| 'opencv_text_block_analysis': True, | |
| 'opencv_bold_detection': True, | |
| 'opencv_spacing_analysis': True, | |
| 'enhanced_processing': True, | |
| 'comprehensive_indentation': True, | |
| 'parenthetical_patterns_supported': True, | |
| 'intelligent_text_classification': True, | |
| 'header_indentation_suppression': True | |
| } | |
| } | |
def _perform_global_opencv_analysis(self, pdf_path: str, text_lines: List[str]) -> Dict[str, Any]:
    """Run the OpenCV text-block analysis on the first page of a PDF.

    The first page is rendered through a ``fitz.Matrix(2.0, 2.0)`` scale
    transform, encoded as PNG, decoded into an array, converted from RGB
    to BGR channel order and handed to
    ``self.opencv_analyzer.analyze_text_blocks`` together with
    ``text_lines``.

    Args:
        pdf_path: Path of the PDF to open.
        text_lines: Extracted text lines forwarded to the analyzer.

    Returns:
        The analyzer's result, or an empty dict when any step raises
        (the failure is logged, not propagated).
    """
    pdf_document = None
    try:
        # Only the first page is inspected for the global analysis.
        pdf_document = fitz.open(pdf_path)
        page = pdf_document.load_page(0)

        # Render the page to PNG bytes.
        mat = fitz.Matrix(2.0, 2.0)
        pix = page.get_pixmap(matrix=mat)
        img_data = pix.tobytes("png")

        # Decode the PNG and switch to the BGR channel order used by OpenCV.
        import io
        from PIL import Image
        pil_image = Image.open(io.BytesIO(img_data))
        img_array = np.array(pil_image)
        img_cv = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        return self.opencv_analyzer.analyze_text_blocks(img_cv, text_lines)
    except Exception as e:
        logger.error(f"Global OpenCV analysis failed: {e}")
        return {}
    finally:
        # Close the document on every path; previously it stayed open
        # whenever rendering or analysis raised.
        if pdf_document is not None:
            pdf_document.close()
def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
    """Write a cropped copy of ``pdf_path`` and return the copy's path.

    Each page is copied into a new document with ``show_pdf_page``,
    clipped to the page rectangle minus the configured margins. Margins
    are percentages of the page width/height read from
    ``options['crop_settings']['per_page_crops']``; a page without its own
    entry uses the entry for page 0, and without that, no crop.

    The ``resolution_scale`` setting only appears in the final log
    message. The ``enhanced_resolution`` setting is not read: both of its
    original branches copied the page in exactly the same way.

    Args:
        pdf_path: Path of the source PDF.
        options: Preprocessing options; only ``crop_settings`` is read.

    Returns:
        Path (as ``str``) of the new PDF inside ``self.temp_dir``.

    Raises:
        Exception: Any error raised while copying or saving is logged and
            re-raised.
    """
    crop_settings = options.get('crop_settings', {})
    per_page_crops = crop_settings.get('per_page_crops', {})
    resolution_scale = crop_settings.get('resolution_scale', 2.0)
    no_crop = {'top': 0, 'bottom': 0, 'left': 0, 'right': 0}

    def crop_for(page_num):
        # Integer keys win (original behavior). String keys are accepted
        # as well so settings that went through JSON still match a page.
        for key in (page_num, str(page_num), 0, '0'):
            if key in per_page_crops:
                return per_page_crops[key]
        return no_crop

    # Create temporary file name for the processed PDF
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_pdf_path = self.temp_dir / f"opencv_enhanced_preprocessed_{timestamp}.pdf"

    doc = fitz.open(pdf_path)
    new_doc = fitz.open()
    try:
        for page_num in range(len(doc)):
            page_rect = doc.load_page(page_num).rect

            page_crop = crop_for(page_num)
            top_percent = page_crop.get('top', 0)
            bottom_percent = page_crop.get('bottom', 0)
            left_percent = page_crop.get('left', 0)
            right_percent = page_crop.get('right', 0)

            # Convert percentage margins to page units.
            width = page_rect.width
            height = page_rect.height
            crop_left = width * (left_percent / 100)
            crop_right = width * (right_percent / 100)
            crop_top = height * (top_percent / 100)
            crop_bottom = height * (bottom_percent / 100)

            new_rect = fitz.Rect(
                page_rect.x0 + crop_left,
                page_rect.y0 + crop_top,
                page_rect.x1 - crop_right,
                page_rect.y1 - crop_bottom,
            )

            # Margins that consume the whole page would give an empty clip.
            if new_rect.width <= 0 or new_rect.height <= 0:
                logger.warning(f"Invalid crop rectangle for page {page_num}, using original page")
                new_rect = page_rect

            # The new page has the size of the clip, so content is copied 1:1.
            new_page = new_doc.new_page(width=new_rect.width, height=new_rect.height)
            new_page.show_pdf_page(
                new_page.rect,
                doc,
                page_num,
                clip=new_rect,
            )
            logger.debug(f"Page {page_num}: Applied OpenCV-enhanced crop T{top_percent}% B{bottom_percent}% L{left_percent}% R{right_percent}%")

        new_doc.save(str(temp_pdf_path))
        logger.info(f"OpenCV-enhanced preprocessing applied with {resolution_scale}x resolution to {len(doc)} pages")
    except Exception as e:
        logger.error(f"Error in OpenCV-enhanced preprocessing: {e}")
        raise
    finally:
        doc.close()
        new_doc.close()
    return str(temp_pdf_path)
def create_enhanced_downloads(self, text_content: str, html_content: str,
                              metadata_info: str = "") -> Dict[str, str]:
    """Build the downloadable exports and return their paths by format.

    The TXT export is mandatory: if it fails, the error is logged and
    re-raised. DOCX and HTML exports are best-effort; a failure there is
    logged and that format is simply missing from the result.

    Args:
        text_content: Extracted plain text.
        html_content: Extracted HTML.
        metadata_info: Optional metadata text passed to each exporter.

    Returns:
        Mapping with the keys ``'txt'``, and, when they could be created,
        ``'docx'`` and ``'html'``, each pointing to the generated file.
    """
    generated: Dict[str, str] = {}
    try:
        # TXT: a failure here aborts the whole call via the outer handler.
        txt_file = EnhancedDocumentExporter.create_enhanced_txt_file(
            text_content, html_content, metadata_info
        )
        generated['txt'] = txt_file
        logger.info(f"OpenCV-enhanced TXT file created: {txt_file}")

        # DOCX: optional, goes through the exporter instance.
        try:
            docx_file = self.document_exporter.create_enhanced_docx_file(
                text_content, html_content, metadata_info
            )
            generated['docx'] = docx_file
            logger.info(f"OpenCV-enhanced DOCX file with text block analysis and bold detection created: {docx_file}")
        except ImportError:
            logger.warning("python-docx not available. DOCX creation skipped.")
        except Exception as docx_error:
            logger.error(f"OpenCV-enhanced DOCX creation failed: {docx_error}")

        # HTML: optional standalone page.
        try:
            html_file = EnhancedDocumentExporter.create_html_file(
                html_content, metadata_info
            )
            generated['html'] = html_file
            logger.info(f"OpenCV-enhanced HTML file created: {html_file}")
        except Exception as html_error:
            logger.error(f"HTML file creation failed: {html_error}")
    except Exception as fatal_error:
        logger.error(f"Error creating OpenCV-enhanced downloads: {fatal_error}")
        raise
    return generated
def get_available_methods(self) -> List[str]:
    """Return the OCR methods reported by ``self.ocr_service``, logging them."""
    available = self.ocr_service.get_available_methods()
    logger.info(f"Available OpenCV-enhanced OCR methods: {available}")
    return available
def get_service_status(self) -> Dict[str, Any]:
    """Assemble a status report for the backend.

    The report combines the OCR methods currently available, whether the
    ``docx`` and ``cv2`` modules can be imported, counters derived from
    ``self.processing_history``, the temp directory, the configured
    ``MAX_FILE_SIZE_MB`` limit (default 50) and a set of fixed feature
    flags.

    Returns:
        A flat dict of status values.
    """
    methods = self.get_available_methods()

    # Optional DOCX export support
    try:
        import docx
    except ImportError:
        has_docx = False
    else:
        has_docx = True

    # OpenCV-backed features all depend on cv2 being importable
    try:
        import cv2
    except ImportError:
        has_opencv = False
    else:
        has_opencv = True

    history = self.processing_history
    processed_total = len(history)
    processed_ok = sum(1 for record in history if record.get('success', False))
    temp_location = str(self.temp_dir)
    size_limit_mb = int(os.getenv('MAX_FILE_SIZE_MB', 50))
    pattern_total = len(EnhancedIndentationDetector().patterns)

    return {
        'service_healthy': True,
        'available_methods': methods,
        'azure_configured': 'azure' in methods,
        'tesseract_available': 'tesseract' in methods,
        'pymupdf_available': 'pymupdf' in methods,
        'total_processed': processed_total,
        'successful_processes': processed_ok,
        'temp_dir': temp_location,
        'max_file_size_mb': size_limit_mb,
        'opencv_available': has_opencv,
        'opencv_text_block_analysis': has_opencv,
        'opencv_bold_detection': has_opencv,
        'opencv_spacing_analysis': has_opencv,
        'enhanced_processing': True,
        'html_processing': True,
        'comprehensive_indentation': True,
        'parenthetical_patterns_supported': True,
        'intelligent_text_classification': True,
        'header_indentation_suppression': True,
        'pattern_detection_count': pattern_total,
        'docx_export_available': has_docx,
        'enhanced_crop_processing': True,
        'multi_resolution_support': True,
        'crop_processing_fixed': True,
        'document_structure_analysis': True,
        'thai_script_support': True,
        'multi_level_support': True,
        'text_classification_features': True,
    }
| def _calculate_file_hash(self, file_path: str) -> str: | |
| """Calculate SHA-256 hash of file""" | |
| sha256_hash = hashlib.sha256() | |
| try: | |
| with open(file_path, "rb") as f: | |
| for chunk in iter(lambda: f.read(4096), b""): | |
| sha256_hash.update(chunk) | |
| return sha256_hash.hexdigest() | |
| except Exception as e: | |
| logger.error(f"Error calculating file hash: {e}") | |
| return f"error_{datetime.now().timestamp()}" | |
| def _add_to_history(self, entry: Dict[str, Any]): | |
| """Add entry to processing history""" | |
| self.processing_history.append(entry) | |
| # Limit history size | |
| if len(self.processing_history) > self.max_history_size: | |
| self.processing_history = self.processing_history[-self.max_history_size:] | |
def cleanup_temp_files(self):
    """Delete regular files in ``self.temp_dir`` last modified over an hour ago.

    Only direct children of the directory are considered. A file that
    cannot be removed is logged as a warning and skipped; any other
    failure is logged as an error. Nothing is raised and nothing is
    returned.
    """
    try:
        removed = 0
        # Snapshot the listing first because entries are deleted while looping.
        for candidate in list(self.temp_dir.glob('*')):
            try:
                if not candidate.is_file():
                    continue
                # Keep anything modified within the last 3600 seconds.
                if candidate.stat().st_mtime >= (datetime.now().timestamp() - 3600):
                    continue
                candidate.unlink()
                removed += 1
            except Exception as e:
                logger.warning(f"Could not remove temp file {candidate}: {e}")
        if removed > 0:
            logger.info(f"Cleaned up {removed} temporary files")
    except Exception as e:
        logger.error(f"Error during cleanup: {e}")
def get_enhanced_statistics(self) -> Dict[str, Any]:
    """Summarize ``self.processing_history`` into counters and percentages.

    The result always has the same keys, whether or not anything has been
    processed yet. With an empty history every number is 0 and
    ``most_used_method`` is ``'N/A'``.

    Notes:
        * Percentages are relative to the total number of history entries
          and rounded to two decimals.
        * ``average_processing_time`` only considers entries that carry a
          ``processing_time`` key.
        * ``most_used_method`` is taken from successful entries; when
          several methods are tied, the one seen first wins.
        * An entry counts towards ``document_analysis_success_rate`` when
          its ``document_analysis`` value is non-empty and not marked
          ``analysis_failed``.

    Returns:
        Flat dict of statistics.
    """
    from collections import Counter

    # (flag stored in a history entry, prefix of the reported statistic)
    feature_flags = (
        ('opencv_enhanced', 'opencv_enhanced'),
        ('opencv_text_block_analysis', 'opencv_text_block_analysis'),
        ('opencv_bold_detection', 'opencv_bold_detection'),
        ('opencv_spacing_analysis', 'opencv_spacing_analysis'),
        ('enhanced_processing', 'enhanced_processing'),
        ('comprehensive_indentation', 'comprehensive_indentation'),
        ('parenthetical_patterns_supported', 'parenthetical_patterns'),
        ('intelligent_text_classification', 'text_classification'),
        ('header_indentation_suppression', 'header_indentation_suppression'),
    )

    history = self.processing_history or []
    total_processed = len(history)

    def count_flag(flag):
        # Number of entries where ``flag`` is present and truthy.
        return sum(1 for entry in history if entry.get(flag, False))

    def as_rate(count):
        # Share of all entries, as a percentage; 0 when nothing was processed.
        if total_processed == 0:
            return 0
        return round((count / total_processed) * 100, 2)

    successful = [entry for entry in history if entry.get('success', False)]

    processing_times = [
        entry.get('processing_time', 0)
        for entry in history
        if 'processing_time' in entry
    ]
    avg_processing_time = (
        sum(processing_times) / len(processing_times) if processing_times else 0
    )

    # Counter gives a linear-time tally with a stable tie-break.
    method_counts = Counter(entry.get('method_used', 'unknown') for entry in successful)
    most_used_method = method_counts.most_common(1)[0][0] if method_counts else 'N/A'

    doc_analysis_success = sum(
        1 for entry in history
        if entry.get('document_analysis', {})
        and not entry.get('document_analysis', {}).get('analysis_failed', False)
    )

    stats = {
        'total_processed': total_processed,
        'success_rate': as_rate(len(successful)),
        'average_processing_time': round(avg_processing_time, 2),
        'most_used_method': most_used_method,
        'total_text_extracted': sum(entry.get('text_length', 0) for entry in successful),
        'total_tables_processed': sum(entry.get('table_count', 0) for entry in successful),
        'successful_processes': len(successful),
        'failed_processes': total_processed - len(successful),
        'preprocessing_usage': count_flag('preprocessing_applied'),
        'html_generation_rate': as_rate(count_flag('html_generated')),
    }
    for flag, stat_name in feature_flags:
        used = count_flag(flag)
        stats[f'{stat_name}_usage'] = used
        stats[f'{stat_name}_rate'] = as_rate(used)
    stats['document_analysis_success_rate'] = as_rate(doc_analysis_success)
    return stats
# Process-wide BackendManager, created on first request
_backend_manager = None


def get_backend_manager() -> BackendManager:
    """Return the shared BackendManager, constructing it on the first call."""
    global _backend_manager
    if _backend_manager is not None:
        return _backend_manager
    _backend_manager = BackendManager()
    return _backend_manager
if __name__ == "__main__":
    # Manual smoke test: print what the backend manager reports.
    backend = BackendManager()
    print("OpenCV-Enhanced Backend Manager with Text Block Analysis & Bold Detection Test")
    print("=" * 110)
    print(f"Available methods: {backend.get_available_methods()}")
    print(f"Service status: {backend.get_service_status()}")
    print(f"Enhanced statistics: {backend.get_enhanced_statistics()}")

    # The analyzer is only constructed here. Running it needs a real page
    # image, so the actual call stays disabled below.
    opencv_analyzer = OpenCVTextAnalyzer()
    test_image_path = "test_page.png"  # placeholder; point at a real image to enable
    test_text_lines = [
        "CHAPTER 1: INTRODUCTION",
        "1.1. Overview of the System",
        "This document provides comprehensive information...",
        "1.2. Key Features",
        "• Feature one with detailed explanation",
        "• Feature two with additional notes",
    ]
    print("\nOpenCV Text Analysis Test:")
    print("-" * 40)
    # opencv_analysis = opencv_analyzer.analyze_text_blocks(test_image_path, test_text_lines)
    # print(f"Analysis result: {opencv_analysis}")

    # Run the indentation detector over one sample per supported pattern.
    indentation_detector = EnhancedIndentationDetector()
    sample_lines = (
        "INTRODUCTION TO THE SYSTEM",  # Should be detected as bold header
        "1.2.3. Hierarchical item",
        "(1) Parenthetical Arabic",
        "(๑) Parenthetical Thai numeral",
        "(a) Parenthetical letter",
        "(i) Parenthetical Roman",
        "(ก) Parenthetical Thai letter",
    )
    print("\nOpenCV-Enhanced Indentation Detection Test:")
    print("-" * 60)
    for sample in sample_lines:
        detection = indentation_detector.detect_indentation(sample)
        text_type = indentation_detector.classify_text_type(sample)
        print(f"Text: {sample}")
        print(f" Pattern: {detection['pattern_type']}, Level: {detection['level']}")
        print(f" Is Header: {detection['is_header']}, Suppress Indent: {detection['suppress_indentation']}")
        print(f" Classification: {text_type['type']} (confidence: {text_type['confidence']:.2f})")
        print()