ai-agent-app / backend /scripts /ingest /ocr_extractor.py
MinhTai's picture
deploy: ead37d0 wiki overhaul + animation v2 + Oracle fixes
c036214
"""OCR text extraction using Tesseract (Vietnamese language model required)."""
from __future__ import annotations
import sys
def ocr_image(path: str) -> str:
    """Extract text from a single image file with Tesseract.

    The OCR dependencies are imported lazily so that this module can be
    imported even when they are not installed. This is a best-effort helper:
    every failure is reported on stderr and turned into an empty result
    instead of an exception.

    Args:
        path: Location of the image file to read.

    Returns:
        The text recognised by Tesseract using the ``"vie"`` language, or
        ``""`` when the dependencies are missing or OCR fails.
    """
    try:
        import pytesseract
        from PIL import Image
    except ImportError:
        print("[WARN] pytesseract/Pillow not installed", file=sys.stderr)
        return ""
    try:
        # Use the image as a context manager so the underlying file handle is
        # released as soon as OCR finishes, rather than whenever the object
        # happens to be garbage-collected.
        with Image.open(path) as img:
            return pytesseract.image_to_string(img, lang="vie")
    except Exception as e:
        # Deliberately broad: an unreadable file, a corrupt image or a missing
        # Tesseract binary should not abort the caller.
        print(f"[WARN] OCR failed for {path}: {e}", file=sys.stderr)
        return ""
def ocr_pdf(path: str) -> list[str]:
    """Run Tesseract OCR over the pages of a PDF file.

    The PDF is rendered to images with ``pdf2image`` and each image is passed
    to Tesseract using the ``"vie"`` language. Dependencies are imported
    lazily; any failure is reported on stderr and yields an empty list.

    Args:
        path: Location of the PDF file to read.

    Returns:
        One recognised-text string per rendered image, in the order
        ``convert_from_path`` returned them, or ``[]`` when the dependencies
        are missing or any step fails.
    """
    try:
        import pytesseract
        from pdf2image import convert_from_path
    except ImportError:
        print("[WARN] pytesseract/pdf2image not installed", file=sys.stderr)
        return []

    page_texts: list[str] = []
    try:
        for page in convert_from_path(path):
            page_texts.append(pytesseract.image_to_string(page, lang="vie"))
    except Exception as err:
        # All-or-nothing: a failure on any page discards the partial result.
        print(f"[WARN] OCR PDF failed for {path}: {err}", file=sys.stderr)
        return []
    return page_texts