"""
OCR processor using Tesseract via pytesseract.

Why we switched from EasyOCR:
- EasyOCR: ~500MB RAM, ~15-20s load time (downloads detection + recognition models)
- Tesseract: 0MB model load (binary + language packs are already installed in the image),
  load time ~0.1s, ~50MB RAM overhead while processing
- Accuracy on standard documents/text: comparable
- The Tesseract binary + tesseract-ocr-ind are already in the Dockerfile

Trade-off: EasyOCR is more accurate on skewed/deformed text.
For the RAG use case (extracting text from documents and screenshots), Tesseract is sufficient.
"""
|
|
| from __future__ import annotations |
|
|
| from typing import List |
| from dataclasses import dataclass, field |
| import subprocess |
|
|
| import numpy as np |
| import cv2 |
| from loguru import logger |
|
|
| from ..config import get_cv_settings |
| from ..processors.image_preprocessor import ImageInput |
|
|
|
|
@dataclass
class OCRBox:
    """A single recognised text fragment with its score and outline."""

    # Recognised text of this fragment.
    text: str
    # Recognition score; rounded to 4 decimal places when serialised.
    confidence: float
    # Outline of the fragment; stored and serialised exactly as given.
    bbox: list

    def to_dict(self) -> dict:
        """Return the box as a plain dict with keys text, confidence, bbox.

        The confidence is rounded to 4 decimals; the bbox object is passed
        through by reference (not copied).
        """
        rounded_confidence = round(self.confidence, 4)
        return dict(
            text=self.text,
            confidence=rounded_confidence,
            bbox=self.bbox,
        )
|
|
|
|
@dataclass
class OCRResult:
    """Outcome of one OCR run: the full text plus optional per-fragment boxes."""

    # Complete recognised text.
    full_text: str
    # Individual fragments; a fresh empty list per instance by default.
    boxes: List[OCRBox] = field(default_factory=list)
    # Language string the run was performed with (empty when not set).
    language: str = ""
    # Name of the engine that produced the result (empty when not set).
    engine: str = ""

    @property
    def word_count(self) -> int:
        """Number of whitespace-separated tokens in ``full_text``."""
        return sum(1 for _ in self.full_text.split())
|
|
|
|
class OCRProcessor:
    """
    OCR via Tesseract (pytesseract): lightweight, near-instant load.

    No model download and no torch dependency.

    Preprocessing: CLAHE + sharpening to improve accuracy on dark/blurry images.
    """

    # Images whose longest side is below this many pixels are upscaled before OCR.
    MIN_OCR_DIM = 1000

    # Short codes accepted in the settings -> Tesseract language-pack names.
    _TESS_LANG_MAP = {"en": "eng", "id": "ind", "eng": "eng", "ind": "ind"}

    # Options passed to every primary Tesseract call.
    _TESS_CONFIG = "--psm 3 --oem 3"

    # Words whose Tesseract confidence (0-100 scale) is below this are dropped.
    _MIN_WORD_CONF = 10

    def __init__(self):
        """Resolve the configured languages and check that Tesseract is usable.

        Reads ``ocr_languages`` (comma-separated codes) from the CV settings,
        keeps only the languages that are actually installed, and falls back
        to ``eng`` when none match.

        Raises:
            Exception: any error raised while importing pytesseract or
                querying the Tesseract version is logged and re-raised.
        """
        settings = get_cv_settings()
        self.engine = "tesseract"

        requested = self._normalize_languages(settings.ocr_languages)

        # Only keep language packs that are installed on this system.
        available = self._get_available_langs()
        self.languages = [lang for lang in requested if lang in available]
        if not self.languages:
            logger.warning("Tidak ada tesseract lang yang cocok, fallback ke 'eng'")
            self.languages = ["eng"]

        # Tesseract expects multiple languages joined with '+'.
        self.lang_str = "+".join(self.languages)
        logger.info(f"Loading OCR (tesseract) for languages: {self.languages}")

        # Imported lazily so a missing pytesseract surfaces here, with a log line.
        try:
            import pytesseract
            self.pytesseract = pytesseract
            ver = pytesseract.get_tesseract_version()
            logger.info(f"OCR processor ready. Tesseract {ver}")
        except Exception as e:
            logger.error(f"Gagal init Tesseract: {e}")
            raise

    @classmethod
    def _normalize_languages(cls, raw) -> List[str]:
        """Turn a comma-separated language setting into Tesseract pack names.

        Blank entries are dropped, known short codes are mapped
        case-insensitively (``en`` -> ``eng``, ``id`` -> ``ind``), unknown
        codes are passed through unchanged, and duplicates are removed while
        preserving first-seen order (so ``"en,eng"`` does not become
        ``eng+eng``).
        """
        names: List[str] = []
        for code in (raw or "").split(","):
            code = code.strip()
            if not code:
                continue
            name = cls._TESS_LANG_MAP.get(code.lower(), code)
            if name not in names:
                names.append(name)
        return names

    @staticmethod
    def _get_available_langs() -> set:
        """Return the set of language packs installed on the system.

        Runs ``tesseract --list-langs`` and collects every non-empty output
        line that is not a header. Both stdout and stderr are scanned.
        If the command cannot be run, a warning is logged and ``{"eng"}`` is
        returned as a best-effort default.
        """
        try:
            result = subprocess.run(
                ["tesseract", "--list-langs"],
                capture_output=True, text=True, timeout=5,
            )
        except Exception as exc:
            # Best effort: a missing/slow binary must not prevent construction here.
            logger.warning(f"Could not list tesseract languages, assuming 'eng': {exc}")
            return {"eng"}

        langs = set()
        for line in result.stdout.splitlines() + result.stderr.splitlines():
            line = line.strip()
            # Skip blank lines and header/banner lines starting with "List"/"Tess".
            if line and not line.startswith(("List", "Tess")):
                langs.add(line)
        return langs

    def _preprocess_for_ocr(self, img: np.ndarray) -> np.ndarray:
        """
        Prepare an image for Tesseract.

        Steps, in order:
        - Upscale (cubic) so the longest side reaches MIN_OCR_DIM if it is smaller
        - Convert to grayscale (input with a channel axis is treated as RGB)
        - CLAHE contrast enhancement
        - Sharpen with a 3x3 kernel

        No thresholding is applied. If any step fails, a warning is logged and
        a plain grayscale conversion of the image is returned instead.
        """
        try:
            h, w = img.shape[:2]

            # Small images are enlarged before recognition.
            if max(h, w) < self.MIN_OCR_DIM:
                scale = self.MIN_OCR_DIM / max(h, w)
                img = cv2.resize(img, (int(w * scale), int(h * scale)),
                                 interpolation=cv2.INTER_CUBIC)

            if len(img.shape) == 3:
                gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            else:
                gray = img.copy()

            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)

            kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32)
            sharpened = cv2.filter2D(enhanced, -1, kernel)

            return sharpened

        except Exception as e:
            logger.warning(f"OCR preprocessing fallback: {e}")
            if len(img.shape) == 3:
                return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            return img

    @staticmethod
    def _scaled_bbox(x, y, w, h, sx: float = 1.0, sy: float = 1.0) -> list:
        """Return the four corners of a box, scaled by (sx, sy).

        Corners are ordered top-left, top-right, bottom-right, bottom-left,
        each as ``[x, y]`` floats.
        """
        left, top = float(x * sx), float(y * sy)
        right, bottom = float((x + w) * sx), float((y + h) * sy)
        return [[left, top], [right, top], [right, bottom], [left, bottom]]

    def _boxes_from_data(self, data: dict, sx: float, sy: float) -> list:
        """Convert pytesseract ``image_to_data`` output into OCRBox objects.

        Entries with empty text or confidence below ``_MIN_WORD_CONF`` are
        skipped. Coordinates are multiplied by (sx, sy) and confidences are
        converted from the 0-100 scale to 0-1.
        """
        boxes = []
        rows = zip(data["text"], data["conf"], data["left"],
                   data["top"], data["width"], data["height"])
        for raw_text, raw_conf, x, y, w, h in rows:
            text = str(raw_text).strip()
            conf = float(raw_conf)
            if not text or conf < self._MIN_WORD_CONF:
                continue
            boxes.append(OCRBox(
                text=text,
                confidence=conf / 100.0,
                bbox=self._scaled_bbox(x, y, w, h, sx, sy),
            ))
        return boxes

    def extract_text(
        self,
        image: ImageInput,
        detail: bool = True,
        paragraph: bool = False,
    ) -> OCRResult:
        """Extract text from an image using Tesseract.

        Args:
            image: Input image; its ``numpy``, ``width`` and ``height``
                attributes are read.
            detail: When True (default) per-word boxes are returned. When
                False the box pass is skipped and ``boxes`` is empty, which
                avoids a second Tesseract run.
            paragraph: Accepted but currently ignored.
                NOTE(review): presumably kept from the earlier EasyOCR-based
                API; confirm before relying on it.

        Returns:
            OCRResult with the full text and, if requested, word boxes whose
            coordinates are expressed in pixels of the *input* image (the
            preprocessing upscale is undone). If the primary path fails, a
            plain English-only pass on the raw image is attempted; if that
            fails too, an empty result is returned. No exception is raised
            for OCR failures.
        """
        logger.debug(f"Running Tesseract OCR on {image.width}x{image.height} image")

        try:
            source = image.numpy
            processed = self._preprocess_for_ocr(source.copy())

            boxes = []
            if detail:
                # Preprocessing may have upscaled the image; map the boxes
                # Tesseract reports back to the caller's coordinate space.
                src_h, src_w = source.shape[:2]
                proc_h, proc_w = processed.shape[:2]
                sx = src_w / proc_w if proc_w else 1.0
                sy = src_h / proc_h if proc_h else 1.0

                data = self.pytesseract.image_to_data(
                    processed,
                    lang=self.lang_str,
                    config=self._TESS_CONFIG,
                    output_type=self.pytesseract.Output.DICT,
                )
                boxes = self._boxes_from_data(data, sx, sy)

            # Full text comes from a dedicated call rather than joined words.
            full_text = self.pytesseract.image_to_string(
                processed,
                lang=self.lang_str,
                config=self._TESS_CONFIG,
            ).strip()

            return OCRResult(
                full_text=full_text,
                boxes=boxes,
                language=self.lang_str,
                engine="tesseract",
            )

        except Exception as e:
            logger.error(f"OCR error: {e}")
            # Last resort: raw image, English only, no boxes.
            try:
                text = self.pytesseract.image_to_string(image.numpy, lang="eng")
                return OCRResult(full_text=text.strip(), boxes=[], language="eng", engine="tesseract")
            except Exception as e2:
                logger.error(f"OCR fallback juga gagal: {e2}")
                return OCRResult(full_text="", boxes=[], language=self.lang_str, engine="tesseract")

    def extract_text_simple(self, image: ImageInput) -> str:
        """Return only the recognised text of ``image`` (no word boxes)."""
        # Boxes are not needed here, so skip the box pass entirely.
        return self.extract_text(image, detail=False).full_text
|
|