# ai-rag/cv_module/src/processors/ocr_processor.py
# Commit 7d07e42 — "Clean rebuild: all features fixed" (author: robrtt)
"""
OCR processor menggunakan Tesseract via pytesseract.
Kenapa ganti dari EasyOCR:
- EasyOCR: ~500MB RAM, ~15-20s load time (download detection + recognition models)
- Tesseract: 0MB model load (binary + lang packs sudah di-install di image),
load time ~0.1s, RAM overhead ~50MB saat proses
- Accuracy untuk dokumen/teks standard: comparable
- Tesseract binary + tesseract-ocr-ind sudah ada di Dockerfile
Trade-off: EasyOCR lebih akurat untuk teks miring/deformed.
Untuk use case RAG (extract teks dari dokumen, screenshot), Tesseract cukup.
"""
from __future__ import annotations
from typing import List
from dataclasses import dataclass, field
import subprocess
import numpy as np
import cv2
from loguru import logger
from ..config import get_cv_settings
from ..processors.image_preprocessor import ImageInput
@dataclass
class OCRBox:
    """A single OCR-detected word: its text, confidence (0-1) and quad bbox."""
    text: str
    confidence: float
    bbox: list

    def to_dict(self) -> dict:
        """Serialize to a plain dict; confidence is rounded to 4 decimals."""
        payload = dict(
            text=self.text,
            confidence=round(self.confidence, 4),
            bbox=self.bbox,
        )
        return payload
@dataclass
class OCRResult:
    """Aggregate OCR output: full extracted text plus per-word boxes."""
    full_text: str
    boxes: List[OCRBox] = field(default_factory=list)
    language: str = ""
    engine: str = ""

    @property
    def word_count(self) -> int:
        """Number of whitespace-separated tokens in the full text."""
        tokens = self.full_text.split()
        return len(tokens)
class OCRProcessor:
    """
    OCR via Tesseract (pytesseract) — lightweight, near-instant load.

    No model download and no torch dependency (unlike EasyOCR).
    Preprocessing (upscale + CLAHE + sharpen) improves accuracy on
    dark or blurry images.
    """

    MIN_OCR_DIM = 1000       # upscale images whose longest side is below this
    MIN_WORD_CONF = 10       # per-word confidence floor (Tesseract scale 0-100)
    # Shared CLI config for both OCR passes: --psm 3 = fully automatic page
    # segmentation, --oem 3 = default engine mode.
    TESS_CONFIG = "--psm 3 --oem 3"

    def __init__(self):
        """Resolve language packs from settings and verify the tesseract binary.

        Raises:
            Exception: propagated if pytesseract / the tesseract binary
                cannot be initialized.
        """
        settings = get_cv_settings()
        self.engine = "tesseract"
        # Parse configured languages: "en,id" -> ["eng", "ind"] (tesseract codes).
        raw_langs = [lang.strip() for lang in settings.ocr_languages.split(",")]
        tess_map = {"en": "eng", "id": "ind", "eng": "eng", "ind": "ind"}
        tess_langs = [tess_map.get(lang, lang) for lang in raw_langs]
        # Keep only languages actually installed, deduplicated with order
        # preserved — a config like "en,eng" would otherwise produce "eng+eng".
        available = self._get_available_langs()
        self.languages = list(dict.fromkeys(
            lang for lang in tess_langs if lang in available
        ))
        if not self.languages:
            logger.warning("Tidak ada tesseract lang yang cocok, fallback ke 'eng'")
            self.languages = ["eng"]
        self.lang_str = "+".join(self.languages)
        logger.info(f"Loading OCR (tesseract) for languages: {self.languages}")
        # Verify the tesseract binary is callable before accepting any work.
        try:
            import pytesseract
            self.pytesseract = pytesseract
            ver = pytesseract.get_tesseract_version()
            logger.info(f"OCR processor ready. Tesseract {ver}")
        except Exception as e:
            logger.error(f"Gagal init Tesseract: {e}")
            raise

    @staticmethod
    def _get_available_langs() -> set:
        """Return the set of tesseract language packs installed on the system.

        Falls back to {"eng"} if the binary is missing, fails, or times out.
        """
        try:
            result = subprocess.run(
                ["tesseract", "--list-langs"],
                capture_output=True, text=True, timeout=5
            )
            langs = set()
            # Output lands on stdout or stderr depending on tesseract version;
            # skip header lines ("List of available languages...", "Tesseract ...").
            for line in result.stdout.splitlines() + result.stderr.splitlines():
                line = line.strip()
                if line and not line.startswith("List") and not line.startswith("Tess"):
                    langs.add(line)
            return langs
        except Exception:
            return {"eng"}

    def _preprocess_for_ocr(self, img: np.ndarray) -> np.ndarray:
        """Enhance an image to improve Tesseract accuracy.

        Steps: upscale small images, grayscale, CLAHE contrast enhancement,
        3x3 sharpen. Returns a single-channel grayscale array (Tesseract
        handles that fine). On any failure, falls back to plain grayscale
        conversion of the original image.
        """
        try:
            h, w = img.shape[:2]
            # Small images OCR poorly; cubic interpolation keeps glyph edges smooth.
            if max(h, w) < self.MIN_OCR_DIM:
                scale = self.MIN_OCR_DIM / max(h, w)
                img = cv2.resize(img, (int(w * scale), int(h * scale)),
                                 interpolation=cv2.INTER_CUBIC)
            # Grayscale (3-channel input is assumed RGB — ImageInput convention).
            if len(img.shape) == 3:
                gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            else:
                gray = img.copy()
            # CLAHE: local contrast enhancement for dark / unevenly lit images.
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)
            # Standard sharpening kernel to crisp up glyph edges.
            kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32)
            return cv2.filter2D(enhanced, -1, kernel)
        except Exception as e:
            logger.warning(f"OCR preprocessing fallback: {e}")
            if len(img.shape) == 3:
                return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            return img

    def extract_text(
        self,
        image: ImageInput,
        detail: bool = True,
        paragraph: bool = False,
    ) -> OCRResult:
        """Extract text from an image using Tesseract.

        Args:
            image: wrapped input image (pixel data via ``image.numpy``).
            detail: kept for interface compatibility; boxes are always built.
            paragraph: kept for interface compatibility; unused here.

        Returns:
            OCRResult with layout-preserving full text, per-word boxes in
            EasyOCR-compatible quad format with confidence normalized to 0-1,
            plus language and engine tags. Never raises: falls back to a raw
            English-only pass, then to an empty result.
        """
        logger.debug(f"Running Tesseract OCR on {image.width}x{image.height} image")
        try:
            processed = self._preprocess_for_ocr(image.numpy.copy())
            # Pass 1: word-level text + confidences + pixel bounding boxes.
            data = self.pytesseract.image_to_data(
                processed,
                lang=self.lang_str,
                config=self.TESS_CONFIG,
                output_type=self.pytesseract.Output.DICT,
            )
            boxes = []
            for i in range(len(data["text"])):
                text = str(data["text"][i]).strip()
                conf = float(data["conf"][i])
                # Skip empties and low-confidence noise; structural rows
                # (block/line markers) carry conf == -1.
                if not text or conf < self.MIN_WORD_CONF:
                    continue
                x, y = data["left"][i], data["top"][i]
                w, h = data["width"][i], data["height"][i]
                # EasyOCR-compatible quad: [[x1,y1],[x2,y1],[x2,y2],[x1,y2]]
                bbox = [
                    [float(x), float(y)],
                    [float(x + w), float(y)],
                    [float(x + w), float(y + h)],
                    [float(x), float(y + h)],
                ]
                boxes.append(OCRBox(
                    text=text,
                    confidence=conf / 100.0,  # normalize to 0-1
                    bbox=bbox,
                ))
            # Pass 2: image_to_string preserves layout (line breaks) better
            # than re-joining the word list from pass 1.
            full_text = self.pytesseract.image_to_string(
                processed,
                lang=self.lang_str,
                config=self.TESS_CONFIG,
            ).strip()
            return OCRResult(
                full_text=full_text,
                boxes=boxes,
                language=self.lang_str,
                engine="tesseract",
            )
        except Exception as e:
            logger.error(f"OCR error: {e}")
            # Last-resort fallback: raw image, English only, no boxes.
            try:
                text = self.pytesseract.image_to_string(image.numpy, lang="eng")
                return OCRResult(full_text=text.strip(), boxes=[], language="eng", engine="tesseract")
            except Exception as e2:
                logger.error(f"OCR fallback juga gagal: {e2}")
                return OCRResult(full_text="", boxes=[], language=self.lang_str, engine="tesseract")

    def extract_text_simple(self, image: ImageInput) -> str:
        """Convenience wrapper: return only the extracted full text."""
        return self.extract_text(image).full_text