File size: 968 Bytes
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""
PhilVerify — OCR Module (Tesseract)
Extracts text from images using pytesseract.
Falls back gracefully (returns an empty string) if Tesseract or its Python bindings are not installed.
"""
import io
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Language spec passed to pytesseract as the `lang` argument:
# Filipino (fil) + English (eng).
_TESSERACT_LANG = "fil+eng"


def _run_tesseract(image_bytes: bytes) -> str:
    """Decode *image_bytes* and return the stripped Tesseract output (blocking)."""
    # Imported lazily so this module can still be imported when the optional
    # OCR dependencies are missing; the caller maps ImportError to "".
    import pytesseract
    from PIL import Image

    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    return pytesseract.image_to_string(image, lang=_TESSERACT_LANG).strip()


async def extract_text_from_image(image_bytes: bytes) -> str:
    """
    Run Tesseract OCR on image bytes. Returns extracted text string.

    The image decode and the OCR call are blocking, so they are executed in
    the event loop's default executor instead of directly in the coroutine;
    other tasks keep running while the OCR is in progress.

    Args:
        image_bytes: Raw encoded image data.

    Returns:
        The recognised text with surrounding whitespace stripped, or an
        empty string when the input is empty, when pytesseract / Pillow
        cannot be imported, or when decoding / OCR raises any other error.
    """
    import asyncio

    if not image_bytes:
        # Nothing to decode; skip the executor round-trip entirely.
        return ""

    try:
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(None, _run_tesseract, image_bytes)
    except ImportError:
        logger.warning("pytesseract / Pillow not installed — OCR unavailable")
        return ""
    except asyncio.CancelledError:
        # Never swallow task cancellation (it subclasses Exception on some
        # Python versions, so it must be re-raised before the broad handler).
        raise
    except Exception as e:
        # Best-effort contract: report the failure with its traceback and
        # fall back to "no text" rather than propagating to the caller.
        logger.exception("OCR failed: %s", e)
        return ""
    else:
        logger.info("OCR extracted %d chars from image", len(text))
        return text