""" Mock OCR Service for Testing This is a simplified version for testing when PaddleOCR models cannot be downloaded """ import logging from typing import Dict, List, Any import numpy as np import cv2 from PIL import Image logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class MockOCRService: """ Mock OCR Service for testing purposes. Returns simulated OCR results with proper structure. """ def __init__(self, use_gpu: bool = False, lang: str = 'en'): """Initialize Mock OCR Service""" self.use_gpu = use_gpu self.lang = lang logger.info(f"Initializing Mock OCR Service (GPU: {use_gpu}, Language: {lang})") logger.warning("Using MOCK OCR SERVICE - not real OCR! For testing structure only.") def process_image(self, image_path: str) -> Dict[str, Any]: """ Process an image and return mock structured OCR results Args: image_path: Path to the image file Returns: Dictionary containing mock structured OCR results """ # Load image to get dimensions image = cv2.imread(image_path) if image is None: raise ValueError(f"Cannot read image from {image_path}") height, width = image.shape[:2] logger.info(f"Processing image: {width}x{height}") # Return mock structured data return { "image_width": width, "image_height": height, "blocks": [ { "block_id": "block_0", "block_type": "header", "bounding_box": [ [int(width * 0.1), int(height * 0.05)], [int(width * 0.9), int(height * 0.05)], [int(width * 0.9), int(height * 0.15)], [int(width * 0.1), int(height * 0.15)] ], "lines": [ { "line_id": "line_0", "text": "Sample Document Title (Mock OCR)", "bounding_box": [ [int(width * 0.1), int(height * 0.05)], [int(width * 0.9), int(height * 0.05)], [int(width * 0.9), int(height * 0.15)], [int(width * 0.1), int(height * 0.15)] ], "font_size_estimate": int((height * 0.1) * 0.75), "words": [ { "word": "Sample", "bounding_box": [ [int(width * 0.1), int(height * 0.05)], [int(width * 0.25), int(height * 0.05)], [int(width * 0.25), int(height * 0.15)], [int(width * 0.1), int(height * 0.15)] ], "confidence": 0.95, "characters": [ { "char": c, "bounding_box": [ [int(width * (0.1 + i * 0.025)), int(height * 0.05)], [int(width * (0.1 + (i + 1) * 0.025)), int(height * 0.05)], [int(width * (0.1 + (i + 1) * 0.025)), int(height * 0.15)], [int(width * (0.1 + i * 0.025)), int(height * 0.15)] ], "confidence": 0.95 } for i, c in enumerate("Sample") ] }, { "word": "Document", "bounding_box": [ [int(width * 0.27), int(height * 0.05)], [int(width * 0.50), int(height * 0.05)], [int(width * 0.50), int(height * 0.15)], [int(width * 0.27), int(height * 0.15)] ], "confidence": 0.93, "characters": [] }, { "word": "Title", "bounding_box": [ [int(width * 0.52), int(height * 0.05)], [int(width * 0.68), int(height * 0.05)], [int(width * 0.68), int(height * 0.15)], [int(width * 0.52), int(height * 0.15)] ], "confidence": 0.96, "characters": [] } ] } ] }, { "block_id": "block_1", "block_type": "paragraph", "bounding_box": [ [int(width * 0.1), int(height * 0.2)], [int(width * 0.9), int(height * 0.2)], [int(width * 0.9), int(height * 0.6)], [int(width * 0.1), int(height * 0.6)] ], "lines": [ { "line_id": f"line_{i + 1}", "text": f"This is line {i + 1} of the mock paragraph content.", "bounding_box": [ [int(width * 0.1), int(height * (0.2 + i * 0.08))], [int(width * 0.9), int(height * (0.2 + i * 0.08))], [int(width * 0.9), int(height * (0.2 + (i + 1) * 0.08))], [int(width * 0.1), int(height * (0.2 + (i + 1) * 0.08))] ], "font_size_estimate": 12, "words": [] } for i in range(5) ] } ] }