""" OCR processor menggunakan Tesseract via pytesseract. Kenapa ganti dari EasyOCR: - EasyOCR: ~500MB RAM, ~15-20s load time (download detection + recognition models) - Tesseract: 0MB model load (binary + lang packs sudah di-install di image), load time ~0.1s, RAM overhead ~50MB saat proses - Accuracy untuk dokumen/teks standard: comparable - Tesseract binary + tesseract-ocr-ind sudah ada di Dockerfile Trade-off: EasyOCR lebih akurat untuk teks miring/deformed. Untuk use case RAG (extract teks dari dokumen, screenshot), Tesseract cukup. """ from __future__ import annotations from typing import List from dataclasses import dataclass, field import subprocess import numpy as np import cv2 from loguru import logger from ..config import get_cv_settings from ..processors.image_preprocessor import ImageInput @dataclass class OCRBox: text: str confidence: float bbox: list def to_dict(self) -> dict: return { "text": self.text, "confidence": round(self.confidence, 4), "bbox": self.bbox, } @dataclass class OCRResult: full_text: str boxes: List[OCRBox] = field(default_factory=list) language: str = "" engine: str = "" @property def word_count(self) -> int: return len(self.full_text.split()) class OCRProcessor: """ OCR via Tesseract (pytesseract) — ringan, instant load. Tidak ada model download, tidak ada torch dependency. Preprocessing: CLAHE + sharpen untuk improve akurasi pada gambar gelap/buram. """ MIN_OCR_DIM = 1000 # Upscale gambar kecil def __init__(self): settings = get_cv_settings() self.engine = "tesseract" # Parse languages: "en,id" -> "eng+ind" (tesseract format) raw_langs = [l.strip() for l in settings.ocr_languages.split(",")] tess_map = {"en": "eng", "id": "ind", "eng": "eng", "ind": "ind"} tess_langs = [tess_map.get(l, l) for l in raw_langs] # Filter ke lang yang benar-benar ada di sistem available = self._get_available_langs() self.languages = [l for l in tess_langs if l in available] if not self.languages: logger.warning("Tidak ada tesseract lang yang cocok, fallback ke 'eng'") self.languages = ["eng"] self.lang_str = "+".join(self.languages) logger.info(f"Loading OCR (tesseract) for languages: {self.languages}") # Verify tesseract binary works try: import pytesseract self.pytesseract = pytesseract ver = pytesseract.get_tesseract_version() logger.info(f"OCR processor ready. Tesseract {ver}") except Exception as e: logger.error(f"Gagal init Tesseract: {e}") raise @staticmethod def _get_available_langs() -> set: """Ambil daftar lang pack yang ter-install di sistem.""" try: result = subprocess.run( ["tesseract", "--list-langs"], capture_output=True, text=True, timeout=5 ) langs = set() for line in result.stdout.splitlines() + result.stderr.splitlines(): line = line.strip() if line and not line.startswith("List") and not line.startswith("Tess"): langs.add(line) return langs except Exception: return {"eng"} def _preprocess_for_ocr(self, img: np.ndarray) -> np.ndarray: """ Preprocessing untuk improve Tesseract accuracy: - Upscale jika terlalu kecil - Grayscale - CLAHE contrast enhancement - Sharpen - Threshold adaptif (optional — skip kalau gambar sudah clear) """ try: h, w = img.shape[:2] # Upscale if max(h, w) < self.MIN_OCR_DIM: scale = self.MIN_OCR_DIM / max(h, w) img = cv2.resize(img, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_CUBIC) # Grayscale if len(img.shape) == 3: gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) else: gray = img.copy() # CLAHE clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) # Sharpen kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32) sharpened = cv2.filter2D(enhanced, -1, kernel) return sharpened # grayscale single-channel — Tesseract handles this fine except Exception as e: logger.warning(f"OCR preprocessing fallback: {e}") if len(img.shape) == 3: return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) return img def extract_text( self, image: ImageInput, detail: bool = True, paragraph: bool = False, ) -> OCRResult: """Extract teks dari gambar menggunakan Tesseract.""" logger.debug(f"Running Tesseract OCR on {image.width}x{image.height} image") try: processed = self._preprocess_for_ocr(image.numpy.copy()) # Get detailed output with bounding boxes data = self.pytesseract.image_to_data( processed, lang=self.lang_str, config="--psm 3 --oem 3", output_type=self.pytesseract.Output.DICT, ) boxes = [] for i in range(len(data["text"])): text = str(data["text"][i]).strip() conf = float(data["conf"][i]) if not text or conf < 10: # Tesseract conf is 0-100 continue x = data["left"][i] y = data["top"][i] w = data["width"][i] h = data["height"][i] # Convert to EasyOCR-compatible bbox format [[x1,y1],[x2,y1],[x2,y2],[x1,y2]] bbox = [ [float(x), float(y)], [float(x + w), float(y)], [float(x + w), float(y + h)], [float(x), float(y + h)], ] boxes.append(OCRBox( text=text, confidence=conf / 100.0, # normalize ke 0-1 bbox=bbox, )) # Build full text (preserve layout via pytesseract string output) full_text = self.pytesseract.image_to_string( processed, lang=self.lang_str, config="--psm 3 --oem 3", ).strip() return OCRResult( full_text=full_text, boxes=boxes, language=self.lang_str, engine="tesseract", ) except Exception as e: logger.error(f"OCR error: {e}") # Last resort fallback try: text = self.pytesseract.image_to_string(image.numpy, lang="eng") return OCRResult(full_text=text.strip(), boxes=[], language="eng", engine="tesseract") except Exception as e2: logger.error(f"OCR fallback juga gagal: {e2}") return OCRResult(full_text="", boxes=[], language=self.lang_str, engine="tesseract") def extract_text_simple(self, image: ImageInput) -> str: result = self.extract_text(image) return result.full_text