File size: 11,721 Bytes
52a0fe9
 
 
 
 
 
 
 
 
 
 
a2aa7c3
 
 
 
 
 
52a0fe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2aa7c3
 
 
 
 
52a0fe9
f4a6b1e
a2aa7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52a0fe9
a2aa7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
52a0fe9
 
 
 
a2aa7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
483f7ec
a2aa7c3
 
 
52a0fe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
"""
Image OCR extraction using EasyOCR (primary) and Tesseract (fallback).
Includes advanced image preprocessing for maximum accuracy.
"""
import time
import os
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
from models.schemas import ExtractionResult, DocumentMetadata
import config

try:
    import google.generativeai as genai
    GEMINI_AVAILABLE = True
except ImportError:
    GEMINI_AVAILABLE = False

# --- OCR Engine Detection ---

try:
    import easyocr
    EASYOCR_AVAILABLE = True
except ImportError:
    EASYOCR_AVAILABLE = False

try:
    import pytesseract
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False


# Module-wide EasyOCR reader; stays None until the first successful build.
_EASY_READER = None


def get_easyocr_reader():
    """Return the shared EasyOCR reader, constructing it on first use.

    The reader is built from ``config.EASYOCR_LANGS`` and
    ``config.EASYOCR_GPU`` and stored in the module-level ``_EASY_READER``
    so later calls reuse it.

    Returns:
        The cached reader, or ``None`` when EasyOCR is not importable or
        its construction raised (the error is printed, and the next call
        tries again because nothing was cached).
    """
    global _EASY_READER

    # Nothing to build: either we already have a reader or the library is absent.
    if _EASY_READER is not None or not EASYOCR_AVAILABLE:
        return _EASY_READER

    try:
        _EASY_READER = easyocr.Reader(config.EASYOCR_LANGS, gpu=config.EASYOCR_GPU)
    except Exception as exc:
        print(f"Error initializing EasyOCR: {exc}")
        return None

    return _EASY_READER


def _configure_tesseract():
    """Point pytesseract at the Tesseract binary and report whether it is usable.

    Returns:
        ``True`` when pytesseract is importable and either an explicit
        ``config.TESSERACT_CMD`` was applied (the path itself is not
        verified here) or ``pytesseract.get_tesseract_version()`` succeeded
        with the default command. ``False`` otherwise.
    """
    explicit_cmd = config.TESSERACT_CMD

    if not TESSERACT_AVAILABLE:
        return False

    if explicit_cmd:
        pytesseract.pytesseract.tesseract_cmd = config.TESSERACT_CMD
        return True

    # No explicit path: probe whatever binary pytesseract finds by default.
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        return False
    return True


def _preprocess_image(image: Image.Image) -> Image.Image:
    """Return a cleaned-up grayscale copy of ``image`` for OCR.

    Steps, in order: grayscale conversion, autocontrast, upscaling of small
    images, sharpening plus a contrast boost, and a 3x3 median filter.

    Args:
        image: The PIL image to process.

    Returns:
        The processed PIL image in mode ``"L"``.
    """
    # Step 1: work on a single luminance channel.
    gray = image if image.mode == "L" else image.convert("L")

    # Step 2: stretch the histogram to even out lighting.
    gray = ImageOps.autocontrast(gray)

    # Step 3: enlarge images with either side under 1500px; the factor is at
    # least 2x and large enough that both sides reach 1800px.
    src_w, src_h = gray.size
    if src_w < 1500 or src_h < 1500:
        factor = max(1800 / src_w, 1800 / src_h, 2.0)
        target = (int(src_w * factor), int(src_h * factor))
        gray = gray.resize(target, Image.Resampling.LANCZOS)

    # Step 4: sharpen edges, then boost contrast by a factor of 1.8.
    sharpened = gray.filter(ImageFilter.SHARPEN)
    boosted = ImageEnhance.Contrast(sharpened).enhance(1.8)

    # Step 5: median filter to suppress speckle noise.
    return boosted.filter(ImageFilter.MedianFilter(size=3))


def _reconstruct_from_boxes(results: list) -> str:
    """ Reconstruct text layout from bounding boxes.
        Sort by top, then group by 'lines' based on y-coordinate.
    """
    if not results:
        return ""

    # Sort results by top y-coordinate
    results.sort(key=lambda x: x[0][0][1])

    lines = []
    if results:
        current_line = [results[0]]
        for i in range(1, len(results)):
            # If the current block's mid-y is within the previous block's height range
            prev_box = results[i-1][0]
            curr_box = results[i][0]
            
            prev_y_center = (prev_box[0][1] + prev_box[2][1]) / 2
            curr_y_center = (curr_box[0][1] + curr_box[2][1]) / 2
            
            # Threshold for 'same line' is approx 1/3 of the box height
            height = prev_box[2][1] - prev_box[0][1]
            if abs(curr_y_center - prev_y_center) < (height * 0.5):
                current_line.append(results[i])
            else:
                lines.append(current_line)
                current_line = [results[i]]
        lines.append(current_line)

    final_text = []
    for line in lines:
        # Sort each line by left x-coordinate
        line.sort(key=lambda x: x[0][0][0])
        line_text = []
        for i, res in enumerate(line):
            # Add relative spacing based on horizontal gap
            if i > 0:
                prev_right = line[i-1][0][1][0]
                curr_left = res[0][0][0]
                gap = curr_left - prev_right
                # If gap is significant, add spaces
                char_width = (res[0][1][0] - res[0][0][0]) / (len(res[1]) or 1)
                num_spaces = int(gap / (char_width * 1.5))
                line_text.append(" " * max(1, num_spaces))
            
            line_text.append(res[1])
        final_text.append(" ".join(line_text))

    return "\n".join(final_text)


def extract_image_gemini(file_path: str) -> ExtractionResult:
    """Extract text from an image by prompting a Gemini vision model.

    The model named by ``config.GEMINI_MODEL_NAME`` is asked to transcribe
    the image while keeping its layout. Any exception raised along the way
    is printed and turned into a failed result rather than propagated.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A successful ``ExtractionResult`` carrying the stripped response
        text and image metadata, or a failed one when the API key is
        missing, the call raised, or the response text was empty.
    """
    if not config.GEMINI_API_KEY:
        return ExtractionResult(success=False, error_message="Gemini API Key missing", raw_text="", metadata=DocumentMetadata())

    start_time = time.time()
    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)

        # Prompt asking for a verbatim, layout-preserving transcription
        prompt = (
            "Perform OCR on this image. Extract EVERY bit of text correctly. "
            "Maintain the original layout, columns, and spacing exactly as they appear. "
            "Do not add any explanations, markdown, or commentary. Output only the extracted text."
        )

        # Keep the file open only for the duration of the request so the
        # handle is released even when the call raises.
        with Image.open(file_path) as image:
            image_width, image_height = image.size
            response = model.generate_content([prompt, image])

        text = response.text.strip()

        if text:
            elapsed = (time.time() - start_time) * 1000
            metadata = DocumentMetadata(
                title=os.path.basename(file_path),
                page_count=1,
                word_count=len(text.split()),
                character_count=len(text),
                file_type="Image (Gemini AI)",
                extra={
                    "image_width": image_width,
                    "image_height": image_height,
                    # NOTE(review): fixed label; the model actually used is
                    # config.GEMINI_MODEL_NAME, which may differ - confirm.
                    "ocr_engine": "Gemini 1.5 Flash",
                    "accuracy": "Perfect (Vision-Language Model)"
                }
            )
            return ExtractionResult(
                raw_text=text,
                metadata=metadata,
                success=True,
                extraction_time_ms=elapsed
            )
    except Exception as e:
        print(f"Gemini OCR failed: {e}")

    # Reached on an exception or an empty response.
    return ExtractionResult(success=False, error_message="Gemini failed", raw_text="", metadata=DocumentMetadata())


def extract_image(file_path: str) -> ExtractionResult:
    """Extract text from an image using the best available OCR engine.

    Engines are tried in order: Gemini (when the SDK is importable and
    ``config.is_gemini_available()`` is true), then EasyOCR, then Tesseract.
    The first engine that yields non-blank text wins; an engine that raises
    has its error printed and the next one is tried.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A successful ``ExtractionResult`` from the first engine that produced
        text, or a failed one whose ``error_message`` describes which local
        engines were unavailable or unable to extract text.
    """
    start_time = time.time()

    # 0. Gemini first; any unsuccessful result falls through to local engines
    if GEMINI_AVAILABLE and config.is_gemini_available():
        result = extract_image_gemini(file_path)
        if result.success:
            return result

    # 1. EasyOCR (preferred local engine)
    if EASYOCR_AVAILABLE:
        try:
            reader = get_easyocr_reader()
            if reader:
                # Get original dimensions for metadata
                with Image.open(file_path) as img:
                    original_size = img.size

                # EasyOCR reads the original file directly (no preprocessing).
                # Thresholds are adjusted for better numeric and tabular capture.
                results = reader.readtext(
                    file_path,
                    detail=1,
                    paragraph=False,  # We want individual boxes for layout reconstruction
                    canvas_size=1200,  # Shrunk to detect huge fonts (like certificate names) that CRAFT misses
                    contrast_ths=0.1  # Reset to 0.1 so colored/light text isn't dropped
                )

                # Reconstruct full layout from bounding boxes
                text = _reconstruct_from_boxes(results)

                if text.strip():
                    elapsed = (time.time() - start_time) * 1000
                    metadata = DocumentMetadata(
                        title=os.path.basename(file_path),
                        page_count=1,
                        word_count=len(text.split()),
                        character_count=len(text),
                        file_type="Image (EasyOCR)",
                        extra={
                            "image_width": original_size[0],
                            "image_height": original_size[1],
                            "ocr_engine": "EasyOCR",
                            "accuracy": "High (Deep Learning)"
                        }
                    )
                    return ExtractionResult(
                        raw_text=text.strip(),
                        metadata=metadata,
                        success=True,
                        extraction_time_ms=elapsed
                    )
        except Exception as e:
            print(f"EasyOCR extraction failed, falling back to Tesseract: {e}")

    # 2. Fallback to Tesseract
    if TESSERACT_AVAILABLE and _configure_tesseract():
        try:
            custom_config = f"--oem 3 --psm 6 -l {config.TESSERACT_LANG}"

            # The context manager releases the file handle even if OCR raises.
            with Image.open(file_path) as image:
                original_size = image.size
                processed_image = _preprocess_image(image)

                text = pytesseract.image_to_string(processed_image, config=custom_config)

                # Average confidence over entries with a positive score.
                # Best-effort: any failure here just reports 0.
                try:
                    data = pytesseract.image_to_data(processed_image, config=custom_config, output_type=pytesseract.Output.DICT)
                    # float() accepts both integer and decimal confidence
                    # values, so a decimal string does not discard the average.
                    scores = [float(c) for c in data["conf"]]
                    confidences = [score for score in scores if score > 0]
                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
                except Exception:
                    avg_confidence = 0

            elapsed = (time.time() - start_time) * 1000
            if text.strip():
                metadata = DocumentMetadata(
                    title=os.path.basename(file_path),
                    page_count=1,
                    word_count=len(text.split()),
                    character_count=len(text),
                    file_type="Image (Tesseract)",
                    extra={
                        "image_width": original_size[0],
                        "image_height": original_size[1],
                        "ocr_confidence": round(avg_confidence, 2),
                        "ocr_engine": "Tesseract"
                    }
                )
                return ExtractionResult(
                    raw_text=text.strip(),
                    metadata=metadata,
                    success=True,
                    extraction_time_ms=elapsed
                )
        except Exception as e:
            print(f"Tesseract extraction failed: {e}")

    # 3. Failure cases: pick the message from which local engines are importable
    elapsed = (time.time() - start_time) * 1000

    if not EASYOCR_AVAILABLE and not TESSERACT_AVAILABLE:
        error_msg = "No OCR libraries installed. Please run 'pip install easyocr'."
    elif not EASYOCR_AVAILABLE and TESSERACT_AVAILABLE:
        error_msg = "EasyOCR is not installed, and Tesseract binary was not found or failed. Please run 'pip install easyocr' for best results."
    elif EASYOCR_AVAILABLE and not TESSERACT_AVAILABLE:
        error_msg = "EasyOCR failed to extract text, and Tesseract is not installed."
    else:
        error_msg = "OCR extraction failed. Both EasyOCR and Tesseract engines were unable to extract text from this image."

    return ExtractionResult(
        raw_text="",
        metadata=DocumentMetadata(file_type="Image (OCR)"),
        success=False,
        error_message=error_msg,
        extraction_time_ms=elapsed,
    )