Spaces:
Running
Running
| """OCR text extraction using Tesseract (Vietnamese language model required).""" | |
| from __future__ import annotations | |
| import sys | |
def ocr_image(path: str, *, lang: str = "vie") -> str:
    """Extract text from a single image file using Tesseract.

    This is a best-effort helper: missing dependencies or any OCR failure
    are reported on stderr and an empty string is returned instead of an
    exception being raised.

    Args:
        path: Location of the image file to read.
        lang: Tesseract language code forwarded to
            ``pytesseract.image_to_string``. Defaults to ``"vie"``.

    Returns:
        The text recognized in the image, or ``""`` when the OCR
        dependencies cannot be imported or recognition fails.
    """
    # The OCR dependencies are imported inside the function so that their
    # absence is reported as a warning here rather than at module import.
    try:
        import pytesseract
        from PIL import Image
    except ImportError:
        print("[WARN] pytesseract/Pillow not installed", file=sys.stderr)
        return ""
    try:
        # The context manager closes the underlying file handle as soon as
        # OCR is finished, instead of leaving it open until garbage collection.
        with Image.open(path) as img:
            return pytesseract.image_to_string(img, lang=lang)
    except Exception as e:
        # Deliberately broad: unreadable files, a missing Tesseract binary or
        # missing language data all degrade to "no text".
        print(f"[WARN] OCR failed for {path}: {e}", file=sys.stderr)
        return ""
def ocr_pdf(path: str, *, lang: str = "vie", dpi: int = 200) -> list[str]:
    """Extract text from every page of a PDF using Tesseract.

    The PDF is rasterized to images with ``pdf2image`` and each page image
    is passed through OCR. This is a best-effort helper: missing
    dependencies or any failure are reported on stderr and an empty list is
    returned instead of an exception being raised.

    Args:
        path: Location of the PDF file to read.
        lang: Tesseract language code forwarded to
            ``pytesseract.image_to_string``. Defaults to ``"vie"``.
        dpi: Rasterization resolution forwarded to
            ``pdf2image.convert_from_path``. Defaults to 200, which is
            pdf2image's own default.

    Returns:
        One recognized-text string per page, in page order, or ``[]`` when
        the dependencies cannot be imported or any step fails. A failure on
        a single page discards the whole result.
    """
    # The OCR dependencies are imported inside the function so that their
    # absence is reported as a warning here rather than at module import.
    try:
        import pytesseract
        from pdf2image import convert_from_path
    except ImportError:
        print("[WARN] pytesseract/pdf2image not installed", file=sys.stderr)
        return []
    try:
        pages = convert_from_path(path, dpi=dpi)
        return [pytesseract.image_to_string(page, lang=lang) for page in pages]
    except Exception as e:
        # Deliberately broad: conversion errors and OCR errors alike degrade
        # to "no pages".
        print(f"[WARN] OCR PDF failed for {path}: {e}", file=sys.stderr)
        return []