""" OCR Service Module - ENHANCED VERSION with OpenCV Text Block Analysis and Bold Detection Handles PDF to text conversion with OpenCV-based spacing analysis, bold text detection, and improved formatting """ import re import os import logging from typing import Optional, Dict, Any, Tuple, List import tempfile from pathlib import Path import cv2 import numpy as np # Load environment variables from dotenv import load_dotenv load_dotenv() # Azure Document Intelligence from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.core.exceptions import AzureError # Fallback OCR libraries try: import pytesseract from PIL import Image TESSERACT_AVAILABLE = True except ImportError: TESSERACT_AVAILABLE = False import fitz # PyMuPDF # Enhanced indentation detection with OpenCV from enhanced_indentation import EnhancedIndentationDetector, OpenCVTextAnalyzer # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class EnhancedHTMLProcessor: """Process OCR results through HTML with OpenCV-enhanced text block analysis and bold detection""" def __init__(self): self.indent_detector = EnhancedIndentationDetector() self.opencv_analyzer = OpenCVTextAnalyzer() @staticmethod def create_html_from_azure_result(analysis_result, page_images=None) -> str: """Create structured HTML from Azure Document Intelligence result with OpenCV enhancement""" processor = EnhancedHTMLProcessor() html_parts = ['
'] html_parts.append('') if not analysis_result.pages: html_parts.append('No content found
') return '\n'.join(html_parts) for page_num, page in enumerate(analysis_result.pages, 1): html_parts.append(f'