File size: 1,004 Bytes
2a72045
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pytesseract
from PIL import Image
import io
import os

class OCREngine:
    """Thin wrapper around pytesseract that extracts text from image bytes."""

    # Path the original code assumed for a Tesseract install on Windows.
    WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # Images wider or taller than this (in pixels) are shrunk before OCR.
    MAX_DIMENSION = 2000

    def __init__(self, tesseract_cmd=None):
        """Configure where pytesseract looks for the Tesseract executable.

        Args:
            tesseract_cmd: Optional explicit path to the Tesseract executable.
                When omitted, the Windows default path is used on Windows
                (only if a file actually exists there); on other platforms
                pytesseract's own setting is left untouched.
        """
        # On Render, tesseract is usually in /usr/bin/tesseract, so nothing
        # needs to be configured there.
        # On Windows, fall back to the conventional install path, but only
        # when it exists: pointing pytesseract at a missing file would break
        # installs where Tesseract is reachable some other way.
        if (
            tesseract_cmd is None
            and os.name == 'nt'
            and os.path.isfile(self.WINDOWS_TESSERACT_CMD)
        ):
            tesseract_cmd = self.WINDOWS_TESSERACT_CMD

        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    def extract_text(self, image_content: bytes) -> str:
        """Run OCR on an encoded image and return the recognised text.

        Args:
            image_content: Raw bytes of an image file.

        Returns:
            The recognised text with leading/trailing whitespace removed.
            This method does not raise on failure: any exception is printed
            and reported as a string beginning with
            ``"Error extracting text: "``.
        """
        try:
            # The context manager releases the source image's resources as
            # soon as OCR is done.
            with Image.open(io.BytesIO(image_content)) as image:
                # Basic preprocessing: resize in place if too large.
                limit = self.MAX_DIMENSION
                if image.width > limit or image.height > limit:
                    image.thumbnail((limit, limit))

                # Convert to grayscale for better OCR.
                grayscale = image.convert('L')

                text = pytesseract.image_to_string(grayscale)
            return text.strip()
        except Exception as e:
            # Best-effort boundary: callers receive an error string rather
            # than an exception.
            print(f"OCR Error: {e}")
            return f"Error extracting text: {str(e)}"

# Module-level engine instance, constructed once when this module is imported.
ocr_engine = OCREngine()