# Scraped page header — Spaces: Running; file size: 968 bytes; revision 6c9b8f1
"""
PhilVerify — OCR Module (Tesseract)
Extracts text from images using pytesseract.
Falls back gracefully (returns an empty string) if pytesseract / Pillow is not installed or OCR fails.
"""
import io
import logging
logger = logging.getLogger(__name__)
# Supported languages: Filipino (fil) + English (eng)
_TESSERACT_LANG = "fil+eng"
async def extract_text_from_image(image_bytes: bytes) -> str:
    """
    Run Tesseract OCR on image bytes and return the extracted text.

    The bytes are decoded with Pillow, converted to RGB and handed to
    ``pytesseract.image_to_string`` with ``_TESSERACT_LANG``. Decoding and
    OCR are blocking calls, so they are executed in the event loop's default
    executor instead of directly inside the coroutine.

    Args:
        image_bytes: Raw encoded image data.

    Returns:
        The recognised text with leading/trailing whitespace stripped, or
        ``""`` when pytesseract / Pillow cannot be imported or when decoding
        or OCR raises any ``Exception``.
    """
    import asyncio

    # Optional dependencies are imported lazily so this module can still be
    # imported (and degrade to "no OCR") when they are missing.
    try:
        import pytesseract
        from PIL import Image
    except ImportError:
        logger.warning("pytesseract / Pillow not installed — OCR unavailable")
        return ""

    def _run_ocr() -> str:
        # Synchronous worker: decode, normalise to RGB, then OCR.
        # The ``with`` block closes the decoded image once OCR has finished.
        with Image.open(io.BytesIO(image_bytes)) as source:
            rgb = source.convert("RGB")
            raw = pytesseract.image_to_string(rgb, lang=_TESSERACT_LANG)
        return raw.strip()

    try:
        # Offload the blocking work so other tasks on the loop keep running.
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(None, _run_ocr)
    except Exception as e:
        # Best-effort contract: never propagate OCR errors to the caller,
        # but keep the traceback in the log for diagnosis.
        logger.exception("OCR failed: %s", e)
        return ""

    logger.info("OCR extracted %d chars from image", len(text))
    return text
|