Spaces:
Running
Running
# Standard library
import base64
import binascii
import io
import logging
from typing import List, Optional

# Third-party
import numpy as np
import requests
from PIL import Image, ImageEnhance, UnidentifiedImageError

# PaddleOCR is an optional dependency: when it cannot be imported the name is
# bound to None so that code in this module can test for its presence at
# runtime instead of failing at import time.
try:
    from paddleocr import PaddleOCR
except ImportError:
    PaddleOCR = None

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class OCRProcessor:
    """
    Handles OCR text extraction using PaddleOCR and image preprocessing.

    The PaddleOCR engine is created lazily, on the first call to ``engine()``,
    and then cached on the instance in ``ocr_engine``.

    Attributes:
        max_size: Maximum number of bytes accepted by
            ``download_image_as_base64``.
        ocr_engine: Cached PaddleOCR instance, or None until first use.
    """

    # Singleton slot for the OCR engine. Not read or written by any method of
    # this class; kept because code outside this class may reference it.
    _os_instance = None

    def __init__(self, max_size_bytes: int = 5 * 1024 * 1024):  # 5MB limit
        """
        Args:
            max_size_bytes: Upper bound, in bytes, for downloaded images.
        """
        self.max_size = max_size_bytes
        self.ocr_engine = None

    @staticmethod
    def _strip_data_uri(b64_string: str) -> str:
        """Return the payload after the first ';base64,' marker, or the input unchanged."""
        # partition() never raises, unlike two-target unpacking of split(),
        # which fails when the marker occurs more than once.
        _, marker, payload = b64_string.partition(";base64,")
        return payload if marker else b64_string

    def engine(self):
        """
        Lazy load PaddleOCR engine.

        Returns:
            The cached PaddleOCR instance, or None when PaddleOCR is not
            installed.
        """
        if self.ocr_engine is None:
            if not PaddleOCR:
                logger.error("PaddleOCR not installed.")
                return None
            logger.info("Initializing PaddleOCR engine...")
            self.ocr_engine = PaddleOCR(use_angle_cls=True, lang='en')
        return self.ocr_engine

    def extract_text(self, headers_b64: Optional[str] = None, image_data: Optional[str] = None) -> str:
        """
        Extract text from base64 image data.

        Arg 'headers_b64' is for backward compat/legacy signature matching if any,
        but we expect 'image_data' (base64 string). When both are given,
        'image_data' wins.

        Args:
            headers_b64: Legacy positional slot; used only if 'image_data' is empty.
            image_data: Base64 string of the image, optionally prefixed with a
                data-URI header ending in ';base64,'.

        Returns:
            The recognised lines joined with newlines. An empty string when no
            input is given, "OCR Engine Unavailable" when PaddleOCR is missing,
            or a string starting with "Error" when decoding or OCR fails.
        """
        # Handle positional args if someone calls extract_text(b64)
        target_b64 = image_data or headers_b64
        if not target_b64:
            return ""

        try:
            # engine is a method: it must be *called*. Testing or using the
            # bound method object itself is always truthy and has no .ocr().
            # The call sits inside the try so an engine start-up failure is
            # reported as a string like every other failure of this method.
            ocr = self.engine()
            if ocr is None:
                return "OCR Engine Unavailable"

            # 1. Decode Base64 to Array
            target_b64 = self._strip_data_uri(target_b64)
            try:
                img_bytes = base64.b64decode(target_b64)
                img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
                img_arr = np.array(img)
            except (binascii.Error, UnidentifiedImageError, ValueError) as e:
                logger.error("Image decoding failed: %s", e)
                return f"Error: Invalid image data ({str(e)})"

            # 2. Run OCR
            result = ocr.ocr(img_arr, cls=True)

            # 3. Parse Results
            # Each entry of result[0] is indexed as entry[1] == (text, confidence);
            # presumably entry[0] is the bounding box - confirm against the
            # installed PaddleOCR version.
            extracted_lines = []
            if result and result[0]:
                extracted_lines = [
                    entry[1][0]
                    for entry in result[0]
                    if entry[1][1] > 0.5  # Confidence threshold
                ]

            full_text = "\n".join(extracted_lines)
            logger.info("OCR extracted %d chars.", len(full_text))
            return full_text
        except Exception as e:
            logger.error("OCR Failed: %s", e)
            return f"Error reading image: {e}"

    def optimize_base64(self, b64_string: str) -> str:
        """
        Optimize base64 image: resize to max 1024px and convert to JPEG.

        Args:
            b64_string: Base64 image data, optionally with a data-URI header.

        Returns:
            Optimized base64 string (JPEG, quality 85, no data-URI header).
            On any failure the input string is returned unchanged.
        """
        try:
            img_data = base64.b64decode(self._strip_data_uri(b64_string))
            img = Image.open(io.BytesIO(img_data))

            max_dim = 1024
            if max(img.size) > max_dim:
                img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)

            # JPEG cannot store alpha, palette, or high-bit-depth modes.
            # Convert everything except the modes JPEG is known to hold so
            # that e.g. 'LA' or 'I;16' images are optimised instead of
            # failing on save and being returned untouched.
            if img.mode not in ('RGB', 'L', 'CMYK'):
                img = img.convert('RGB')

            buffer = io.BytesIO()
            img.save(buffer, format="JPEG", quality=85)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')
        except Exception as e:
            logger.warning("Image optimization failed: %s", e)
            return b64_string

    def download_image_as_base64(self, url: str) -> Optional[str]:
        """
        Download image from URL and return as base64 string.

        The body is streamed and the transfer is abandoned as soon as more
        than ``self.max_size`` bytes have arrived, so an oversized response
        is never fully buffered in memory.

        Args:
            url: Location of the image.

        Returns:
            The downloaded image passed through ``optimize_base64``, or None
            when the request fails or the body exceeds ``self.max_size``.
        """
        try:
            # The context manager releases the connection even on early return.
            with requests.get(url, timeout=10, stream=True) as response:
                response.raise_for_status()

                chunks = []
                received = 0
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    received += len(chunk)
                    if received > self.max_size:
                        logger.warning(
                            "Image download aborted: exceeds %d bytes.", self.max_size
                        )
                        return None
                    chunks.append(chunk)

            b64 = base64.b64encode(b"".join(chunks)).decode('utf-8')
            return self.optimize_base64(b64)
        except Exception as e:
            logger.error("Image download failed: %s", e)
            return None