File size: 4,666 Bytes
1df75cb
565a379
 
 
 
1df75cb
 
ed6c1cd
 
1df75cb
 
 
 
565a379
 
 
 
 
1df75cb
565a379
 
1df75cb
 
565a379
 
1df75cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed6c1cd
 
 
 
 
 
 
1df75cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565a379
 
 
 
 
 
 
 
1df75cb
565a379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1df75cb
565a379
 
 
1df75cb
565a379
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

import base64
import requests
import io
import logging
import numpy as np
from typing import Optional, List
from PIL import Image, ImageEnhance, UnidentifiedImageError
import binascii
# PaddleOCR is treated as an optional dependency: if the import fails the name
# is bound to None so this module still imports, and OCRProcessor.engine checks
# for that None before trying to build an engine.
try:
    from paddleocr import PaddleOCR
except ImportError:
    PaddleOCR = None

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class OCRProcessor:
    """
    Handles OCR text extraction using PaddleOCR and image preprocessing.

    The PaddleOCR engine is built lazily on first access to ``engine`` and is
    then kept on the instance in ``ocr_engine``. Decoding, OCR, optimization
    and download errors are caught inside the respective methods and reported
    through the return value (an error string, the unmodified input, or
    ``None``).
    """

    # Placeholder for a shared OCR engine; no method of this class reads or
    # writes it (the engine is cached per instance in ``ocr_engine``).
    _os_instance = None

    # Tunables, kept as class attributes so subclasses can override them.
    MIN_CONFIDENCE = 0.5             # OCR lines scoring at or below this are dropped
    MAX_DIMENSION = 1024             # longest side (px) allowed by optimize_base64
    JPEG_QUALITY = 85                # quality passed to the JPEG encoder
    DOWNLOAD_TIMEOUT = 10            # seconds, passed to requests.get
    DOWNLOAD_CHUNK_SIZE = 64 * 1024  # bytes read per iteration while downloading

    # Separator between the header of a data URI and its base64 payload.
    _DATA_URI_MARKER = ";base64,"

    # Image modes saved to JPEG without conversion; every other mode is
    # converted to RGB first because the JPEG writer would reject it.
    _JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

    def __init__(self, max_size_bytes: int = 5 * 1024 * 1024):  # 5MB limit
        """
        Args:
            max_size_bytes: Largest download, in bytes, that
                ``download_image_as_base64`` will accept.
        """
        self.max_size = max_size_bytes
        self.ocr_engine = None

    @property
    def engine(self):
        """
        Lazily create and return the PaddleOCR engine.

        Returns:
            The cached engine, or ``None`` when PaddleOCR is not installed
            (an error is logged in that case). Exceptions raised by the
            PaddleOCR constructor are not caught here.
        """
        if self.ocr_engine is None:
            if not PaddleOCR:
                logger.error("PaddleOCR not installed.")
                return None
            logger.info("Initializing PaddleOCR engine...")
            # Angle classification is enabled; recognition language is English.
            self.ocr_engine = PaddleOCR(use_angle_cls=True, lang='en')
        return self.ocr_engine

    @staticmethod
    def _strip_data_uri_prefix(b64_string: str) -> str:
        """Return the payload after the first ';base64,' marker, or the input unchanged."""
        _header, marker, payload = b64_string.partition(OCRProcessor._DATA_URI_MARKER)
        return payload if marker else b64_string

    @staticmethod
    def _confident_lines(result, threshold: float) -> list:
        """Return the text of every entry in ``result[0]`` scoring above ``threshold``."""
        if not result or not result[0]:
            return []
        # Each entry is indexed as entry[1] == (text, confidence, ...).
        return [entry[1][0] for entry in result[0] if entry[1][1] > threshold]

    def extract_text(self, headers_b64: Optional[str] = None, image_data: Optional[str] = None) -> str:
        """
        Extract text from base64 image data.

        Args:
            headers_b64: Kept so that a single positional argument still
                works; used only when ``image_data`` is empty.
            image_data: Base64 string of the image, optionally prefixed with a
                data-URI header ending in ``;base64,``.

        Returns:
            The recognized lines joined with newlines. Returns ``""`` when no
            input is given, ``"OCR Engine Unavailable"`` when no engine can be
            obtained, and an ``"Error..."`` string when decoding or OCR fails.
            Exceptions raised while the engine is being constructed are not
            caught by this method.
        """
        # Handle positional args if someone calls extract_text(b64)
        target_b64 = image_data or headers_b64
        if not target_b64:
            return ""

        if not self.engine:
            return "OCR Engine Unavailable"

        try:
            # 1. Decode base64 into an RGB pixel array
            payload = self._strip_data_uri_prefix(target_b64)
            try:
                img_bytes = base64.b64decode(payload)
                img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
                img_arr = np.array(img)
            except (binascii.Error, UnidentifiedImageError, ValueError) as e:
                logger.error("Image decoding failed: %s", e)
                return f"Error: Invalid image data ({str(e)})"

            # 2. Run OCR
            result = self.engine.ocr(img_arr, cls=True)

            # 3. Keep only lines above the confidence threshold
            full_text = "\n".join(self._confident_lines(result, self.MIN_CONFIDENCE))
            logger.info("OCR extracted %d chars.", len(full_text))
            return full_text

        except Exception as e:
            # Deliberate best-effort boundary: callers get a string, never a raise.
            logger.error("OCR Failed: %s", e)
            return f"Error reading image: {e}"

    def optimize_base64(self, b64_string: str) -> str:
        """
        Optimize a base64 image: shrink it so its longest side is at most
        ``MAX_DIMENSION`` pixels and re-encode it as JPEG.

        Args:
            b64_string: Base64 image data, with or without a data-URI header.

        Returns:
            The optimized image as a base64 string without a data-URI header,
            or ``b64_string`` unchanged if any step fails (a warning is logged).
        """
        try:
            raw = base64.b64decode(self._strip_data_uri_prefix(b64_string))
            img = Image.open(io.BytesIO(raw))

            limit = self.MAX_DIMENSION
            if max(img.size) > limit:
                # thumbnail() resizes in place and keeps the aspect ratio.
                img.thumbnail((limit, limit), Image.Resampling.LANCZOS)

            # Convert anything the JPEG writer cannot take directly (RGBA, P,
            # LA, ...) instead of letting save() fail and losing the optimization.
            if img.mode not in self._JPEG_WRITABLE_MODES:
                img = img.convert('RGB')

            buffer = io.BytesIO()
            img.save(buffer, format="JPEG", quality=self.JPEG_QUALITY)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')
        except Exception as e:
            logger.warning("Image optimization failed: %s", e)
            return b64_string

    def download_image_as_base64(self, url: str) -> Optional[str]:
        """
        Download image from URL and return it as an optimized base64 string.

        The body is read in chunks and the transfer is abandoned as soon as
        more than ``max_size`` bytes have arrived, so an oversized response is
        never held in memory in full.

        Args:
            url: Address of the image to fetch.

        Returns:
            The result of ``optimize_base64`` on the downloaded bytes, or
            ``None`` if the request fails, returns an error status, or the
            body exceeds ``max_size``.
        """
        try:
            payload = bytearray()
            # The context manager releases the streamed connection even when
            # the download is abandoned early.
            with requests.get(url, timeout=self.DOWNLOAD_TIMEOUT, stream=True) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=self.DOWNLOAD_CHUNK_SIZE):
                    payload.extend(chunk)
                    if len(payload) > self.max_size:
                        logger.warning("Image download exceeds %d bytes; skipping.", self.max_size)
                        return None

            b64 = base64.b64encode(bytes(payload)).decode('utf-8')
            return self.optimize_base64(b64)
        except Exception as e:
            logger.error("Image download failed: %s", e)
            return None