"""
OCR processor built on Tesseract via pytesseract.

Why we switched from EasyOCR:
- EasyOCR: ~500MB RAM, ~15-20s load time (downloads detection + recognition models)
- Tesseract: 0MB model load (the binary and language packs are already installed in the image),
  load time ~0.1s, ~50MB RAM overhead while processing
- Accuracy on standard documents/text: comparable
- The Tesseract binary and tesseract-ocr-ind are already in the Dockerfile

Trade-off: EasyOCR is more accurate on slanted/deformed text.
For the RAG use case (extracting text from documents and screenshots), Tesseract is sufficient.
"""
from __future__ import annotations
from typing import List
from dataclasses import dataclass, field
import subprocess
import numpy as np
import cv2
from loguru import logger
from ..config import get_cv_settings
from ..processors.image_preprocessor import ImageInput
@dataclass
class OCRBox:
    """A single recognized text fragment together with its score and location."""

    text: str  # recognized string
    confidence: float  # recognition score; rounded to 4 decimals when serialized
    bbox: list  # bounding box, stored and serialized exactly as given

    def to_dict(self) -> dict:
        """Return the box as a plain dict with keys text, confidence, bbox."""
        rounded_confidence = round(self.confidence, 4)
        return dict(
            text=self.text,
            confidence=rounded_confidence,
            bbox=self.bbox,
        )
@dataclass
class OCRResult:
    """Outcome of one OCR run: the full text plus optional per-fragment boxes."""

    full_text: str  # complete recognized text
    boxes: List["OCRBox"] = field(default_factory=list)  # fresh list per instance
    language: str = ""  # language string the run used
    engine: str = ""  # name of the OCR engine that produced the result

    @property
    def word_count(self) -> int:
        """Number of whitespace-separated tokens in ``full_text``."""
        tokens = self.full_text.split()
        return len(tokens)
class OCRProcessor:
    """
    OCR via Tesseract (pytesseract) — lightweight, near-instant load.

    No model download and no torch dependency.
    Preprocessing: upscale + CLAHE + sharpen to improve accuracy on dark or
    blurry images.
    """

    MIN_OCR_DIM = 1000  # images whose longest side is below this get upscaled

    # Short / alias language codes -> Tesseract traineddata names.
    _TESS_LANG_MAP = {"en": "eng", "id": "ind", "eng": "eng", "ind": "ind"}

    # Flags passed to every primary recognition call, so that the word boxes
    # and the full text come from the same Tesseract configuration.
    _TESS_CONFIG = "--psm 3 --oem 3"

    # Tesseract confidences are on a 0-100 scale; entries below this are dropped.
    _MIN_CONFIDENCE = 10

    def __init__(self):
        """
        Resolve the configured languages and verify that Tesseract is usable.

        Reads ``ocr_languages`` (comma-separated codes such as ``"en,id"``)
        from the CV settings, keeps only languages that are installed, and
        falls back to ``eng`` when none match.

        Raises:
            Exception: whatever importing pytesseract or querying the
                Tesseract version raises; it is logged and re-raised.
        """
        settings = get_cv_settings()
        self.engine = "tesseract"

        # "en,id" -> ["eng", "ind"], restricted to what is actually installed.
        available = self._get_available_langs()
        self.languages = self._resolve_languages(settings.ocr_languages, available)
        if not self.languages:
            logger.warning("Tidak ada tesseract lang yang cocok, fallback ke 'eng'")
            self.languages = ["eng"]

        # Tesseract expects multiple languages joined with "+".
        self.lang_str = "+".join(self.languages)
        logger.info(f"Loading OCR (tesseract) for languages: {self.languages}")

        # Import lazily and verify that the tesseract binary responds.
        try:
            import pytesseract
            self.pytesseract = pytesseract
            ver = pytesseract.get_tesseract_version()
            logger.info(f"OCR processor ready. Tesseract {ver}")
        except Exception as e:
            logger.error(f"Gagal init Tesseract: {e}")
            raise

    @classmethod
    def _resolve_languages(cls, raw: str, available: set) -> List[str]:
        """
        Turn a comma-separated language setting into Tesseract language names.

        Known aliases are mapped case-insensitively (``en`` -> ``eng``,
        ``id`` -> ``ind``); unknown codes are passed through unchanged.
        Blank entries, codes not present in ``available`` and duplicates are
        dropped; first-seen order is preserved.
        """
        resolved: List[str] = []
        for code in raw.split(","):
            code = code.strip()
            if not code:
                continue
            tess_code = cls._TESS_LANG_MAP.get(code.lower(), code)
            # De-duplicate so "en,eng" does not become "eng+eng".
            if tess_code in available and tess_code not in resolved:
                resolved.append(tess_code)
        return resolved

    @staticmethod
    def _parse_lang_listing(listing: str) -> set:
        """Extract language names from ``tesseract --list-langs`` output."""
        langs = set()
        for line in listing.splitlines():
            line = line.strip()
            # Skip blanks plus the "List of available languages" header and
            # any line starting with "Tess".
            if line and not line.startswith("List") and not line.startswith("Tess"):
                langs.add(line)
        return langs

    @staticmethod
    def _get_available_langs() -> set:
        """
        Return the set of language packs installed on the system.

        Runs ``tesseract --list-langs`` and parses both stdout and stderr.
        If the command cannot be run (missing binary, timeout, ...), the
        failure is logged and ``{"eng"}`` is returned as a best-effort default.
        """
        try:
            result = subprocess.run(
                ["tesseract", "--list-langs"],
                capture_output=True, text=True, timeout=5
            )
        except Exception as e:
            logger.warning(f"Could not list tesseract languages, assuming 'eng': {e}")
            return {"eng"}
        return OCRProcessor._parse_lang_listing(result.stdout + "\n" + result.stderr)

    def _preprocess_for_ocr(self, img: np.ndarray) -> np.ndarray:
        """
        Prepare an image for Tesseract.

        Steps:
        - upscale (bicubic) when the longest side is below ``MIN_OCR_DIM``
        - convert 3-dimensional input to grayscale
        - CLAHE contrast enhancement
        - sharpen with a 3x3 kernel

        If any step raises, a warning is logged and the image is returned
        with at most a grayscale conversion applied.
        """
        try:
            h, w = img.shape[:2]

            # Upscale small images so that the longest side reaches MIN_OCR_DIM.
            if max(h, w) < self.MIN_OCR_DIM:
                scale = self.MIN_OCR_DIM / max(h, w)
                img = cv2.resize(img, (int(w * scale), int(h * scale)),
                                 interpolation=cv2.INTER_CUBIC)

            # Grayscale. NOTE(review): assumes channel order is RGB — confirm
            # against whatever produces ImageInput.numpy.
            if len(img.shape) == 3:
                gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            else:
                gray = img.copy()

            # CLAHE (local contrast enhancement).
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)

            # Sharpen.
            kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32)
            sharpened = cv2.filter2D(enhanced, -1, kernel)

            return sharpened  # single-channel grayscale
        except Exception as e:
            logger.warning(f"OCR preprocessing fallback: {e}")
            if len(img.shape) == 3:
                return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            return img

    @staticmethod
    def _scaled_bbox(x, y, w, h, scale_x: float = 1.0, scale_y: float = 1.0) -> list:
        """
        Build a 4-corner box ``[[x1,y1],[x2,y1],[x2,y2],[x1,y2]]`` from a
        left/top/width/height rectangle, multiplying x by ``scale_x`` and
        y by ``scale_y``.
        """
        x1 = float(x) * scale_x
        y1 = float(y) * scale_y
        x2 = float(x + w) * scale_x
        y2 = float(y + h) * scale_y
        return [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]

    @classmethod
    def _boxes_from_data(cls, data: dict, scale_x: float, scale_y: float) -> List["OCRBox"]:
        """
        Convert pytesseract ``image_to_data`` DICT output into OCRBox objects.

        Entries with empty text or confidence below ``_MIN_CONFIDENCE`` are
        skipped; confidences are normalized from 0-100 to 0-1.
        """
        boxes = []
        rows = zip(data["text"], data["conf"], data["left"],
                   data["top"], data["width"], data["height"])
        for raw_text, raw_conf, x, y, w, h in rows:
            text = str(raw_text).strip()
            conf = float(raw_conf)
            if not text or conf < cls._MIN_CONFIDENCE:
                continue
            boxes.append(OCRBox(
                text=text,
                confidence=conf / 100.0,  # normalize to 0-1
                bbox=cls._scaled_bbox(x, y, w, h, scale_x, scale_y),
            ))
        return boxes

    def extract_text(
        self,
        image: "ImageInput",
        detail: bool = True,
        paragraph: bool = False,
    ) -> "OCRResult":
        """
        Extract text from an image with Tesseract.

        Args:
            image: input image; ``image.numpy``, ``image.width`` and
                ``image.height`` are read.
            detail: accepted for API compatibility; currently ignored.
            paragraph: accepted for API compatibility; currently ignored.

        Returns:
            OCRResult with the full text and per-fragment boxes. Box
            coordinates are expressed in the coordinate space of the original
            ``image.numpy`` array, even when preprocessing upscaled the image.
            On failure, a plain ``eng`` text-only pass on the raw image is
            attempted; if that also fails an empty result is returned.
            This method does not raise for OCR errors.
        """
        logger.debug(f"Running Tesseract OCR on {image.width}x{image.height} image")

        try:
            original = image.numpy
            processed = self._preprocess_for_ocr(original.copy())

            # Tesseract reports coordinates relative to the preprocessed
            # image, which may have been upscaled. Derive per-axis factors
            # that map them back onto the original image (exactly 1.0 when
            # no resize happened).
            src_h, src_w = original.shape[:2]
            proc_h, proc_w = processed.shape[:2]
            scale_x = src_w / proc_w if proc_w else 1.0
            scale_y = src_h / proc_h if proc_h else 1.0

            # Detailed output with bounding boxes.
            data = self.pytesseract.image_to_data(
                processed,
                lang=self.lang_str,
                config=self._TESS_CONFIG,
                output_type=self.pytesseract.Output.DICT,
            )
            boxes = self._boxes_from_data(data, scale_x, scale_y)

            # Full text comes from the string output, which preserves layout.
            full_text = self.pytesseract.image_to_string(
                processed,
                lang=self.lang_str,
                config=self._TESS_CONFIG,
            ).strip()

            return OCRResult(
                full_text=full_text,
                boxes=boxes,
                language=self.lang_str,
                engine="tesseract",
            )

        except Exception as e:
            logger.error(f"OCR error: {e}")
            # Last resort: raw image, English only, no boxes.
            try:
                text = self.pytesseract.image_to_string(image.numpy, lang="eng")
                return OCRResult(full_text=text.strip(), boxes=[], language="eng", engine="tesseract")
            except Exception as e2:
                logger.error(f"OCR fallback juga gagal: {e2}")
                return OCRResult(full_text="", boxes=[], language=self.lang_str, engine="tesseract")

    def extract_text_simple(self, image: "ImageInput") -> str:
        """Run ``extract_text`` and return only the full text."""
        return self.extract_text(image).full_text
|