# ai-rag/cv_module/src/processors/ocr_processor.py
# Commit 7d07e42 — "Clean rebuild: all features fixed" (author: robrtt)
"""
OCR processor menggunakan Tesseract via pytesseract.
Kenapa ganti dari EasyOCR:
- EasyOCR: ~500MB RAM, ~15-20s load time (download detection + recognition models)
- Tesseract: 0MB model load (binary + lang packs sudah di-install di image),
load time ~0.1s, RAM overhead ~50MB saat proses
- Accuracy untuk dokumen/teks standard: comparable
- Tesseract binary + tesseract-ocr-ind sudah ada di Dockerfile
Trade-off: EasyOCR lebih akurat untuk teks miring/deformed.
Untuk use case RAG (extract teks dari dokumen, screenshot), Tesseract cukup.
"""
from __future__ import annotations
from typing import List
from dataclasses import dataclass, field
import subprocess
import numpy as np
import cv2
from loguru import logger
from ..config import get_cv_settings
from ..processors.image_preprocessor import ImageInput
@dataclass
class OCRBox:
    """A single OCR-detected word: its text, confidence (0-1) and quad bbox."""
    text: str
    confidence: float
    bbox: list

    def to_dict(self) -> dict:
        """Serialize to a plain dict; confidence is rounded to 4 decimals."""
        payload = dict(
            text=self.text,
            confidence=round(self.confidence, 4),
            bbox=self.bbox,
        )
        return payload
@dataclass
class OCRResult:
    """Aggregate OCR output: full extracted text plus per-word boxes."""
    full_text: str
    boxes: List[OCRBox] = field(default_factory=list)
    language: str = ""
    engine: str = ""

    @property
    def word_count(self) -> int:
        """Number of whitespace-separated tokens in the full text."""
        tokens = self.full_text.split()
        return len(tokens)
class OCRProcessor:
    """
    OCR via Tesseract (pytesseract) — lightweight, near-instant load.

    No model download and no torch dependency (unlike EasyOCR).
    Preprocessing (upscale + CLAHE + sharpen) improves accuracy on
    dark or blurry images.
    """

    MIN_OCR_DIM = 1000       # upscale images whose longest side is below this
    MIN_WORD_CONF = 10       # per-word confidence floor (Tesseract scale 0-100)
    # Shared CLI config for both OCR passes: --psm 3 = fully automatic page
    # segmentation, --oem 3 = default engine mode.
    TESS_CONFIG = "--psm 3 --oem 3"

    def __init__(self):
        """Resolve language packs from settings and verify the tesseract binary.

        Raises:
            Exception: propagated if pytesseract / the tesseract binary
                cannot be initialized.
        """
        settings = get_cv_settings()
        self.engine = "tesseract"
        # Parse configured languages: "en,id" -> ["eng", "ind"] (tesseract codes).
        raw_langs = [lang.strip() for lang in settings.ocr_languages.split(",")]
        tess_map = {"en": "eng", "id": "ind", "eng": "eng", "ind": "ind"}
        tess_langs = [tess_map.get(lang, lang) for lang in raw_langs]
        # Keep only languages actually installed, deduplicated with order
        # preserved — a config like "en,eng" would otherwise produce "eng+eng".
        available = self._get_available_langs()
        self.languages = list(dict.fromkeys(
            lang for lang in tess_langs if lang in available
        ))
        if not self.languages:
            logger.warning("Tidak ada tesseract lang yang cocok, fallback ke 'eng'")
            self.languages = ["eng"]
        self.lang_str = "+".join(self.languages)
        logger.info(f"Loading OCR (tesseract) for languages: {self.languages}")
        # Verify the tesseract binary is callable before accepting any work.
        try:
            import pytesseract
            self.pytesseract = pytesseract
            ver = pytesseract.get_tesseract_version()
            logger.info(f"OCR processor ready. Tesseract {ver}")
        except Exception as e:
            logger.error(f"Gagal init Tesseract: {e}")
            raise

    @staticmethod
    def _get_available_langs() -> set:
        """Return the set of tesseract language packs installed on the system.

        Falls back to {"eng"} if the binary is missing, fails, or times out.
        """
        try:
            result = subprocess.run(
                ["tesseract", "--list-langs"],
                capture_output=True, text=True, timeout=5
            )
            langs = set()
            # Output lands on stdout or stderr depending on tesseract version;
            # skip header lines ("List of available languages...", "Tesseract ...").
            for line in result.stdout.splitlines() + result.stderr.splitlines():
                line = line.strip()
                if line and not line.startswith("List") and not line.startswith("Tess"):
                    langs.add(line)
            return langs
        except Exception:
            return {"eng"}

    def _preprocess_for_ocr(self, img: np.ndarray) -> np.ndarray:
        """Enhance an image to improve Tesseract accuracy.

        Steps: upscale small images, grayscale, CLAHE contrast enhancement,
        3x3 sharpen. Returns a single-channel grayscale array (Tesseract
        handles that fine). On any failure, falls back to plain grayscale
        conversion of the original image.
        """
        try:
            h, w = img.shape[:2]
            # Small images OCR poorly; cubic interpolation keeps glyph edges smooth.
            if max(h, w) < self.MIN_OCR_DIM:
                scale = self.MIN_OCR_DIM / max(h, w)
                img = cv2.resize(img, (int(w * scale), int(h * scale)),
                                 interpolation=cv2.INTER_CUBIC)
            # Grayscale (3-channel input is assumed RGB — ImageInput convention).
            if len(img.shape) == 3:
                gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            else:
                gray = img.copy()
            # CLAHE: local contrast enhancement for dark / unevenly lit images.
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)
            # Standard sharpening kernel to crisp up glyph edges.
            kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32)
            return cv2.filter2D(enhanced, -1, kernel)
        except Exception as e:
            logger.warning(f"OCR preprocessing fallback: {e}")
            if len(img.shape) == 3:
                return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            return img

    def extract_text(
        self,
        image: ImageInput,
        detail: bool = True,
        paragraph: bool = False,
    ) -> OCRResult:
        """Extract text from an image using Tesseract.

        Args:
            image: wrapped input image (pixel data via ``image.numpy``).
            detail: kept for interface compatibility; boxes are always built.
            paragraph: kept for interface compatibility; unused here.

        Returns:
            OCRResult with layout-preserving full text, per-word boxes in
            EasyOCR-compatible quad format with confidence normalized to 0-1,
            plus language and engine tags. Never raises: falls back to a raw
            English-only pass, then to an empty result.
        """
        logger.debug(f"Running Tesseract OCR on {image.width}x{image.height} image")
        try:
            processed = self._preprocess_for_ocr(image.numpy.copy())
            # Pass 1: word-level text + confidences + pixel bounding boxes.
            data = self.pytesseract.image_to_data(
                processed,
                lang=self.lang_str,
                config=self.TESS_CONFIG,
                output_type=self.pytesseract.Output.DICT,
            )
            boxes = []
            for i in range(len(data["text"])):
                text = str(data["text"][i]).strip()
                conf = float(data["conf"][i])
                # Skip empties and low-confidence noise; structural rows
                # (block/line markers) carry conf == -1.
                if not text or conf < self.MIN_WORD_CONF:
                    continue
                x, y = data["left"][i], data["top"][i]
                w, h = data["width"][i], data["height"][i]
                # EasyOCR-compatible quad: [[x1,y1],[x2,y1],[x2,y2],[x1,y2]]
                bbox = [
                    [float(x), float(y)],
                    [float(x + w), float(y)],
                    [float(x + w), float(y + h)],
                    [float(x), float(y + h)],
                ]
                boxes.append(OCRBox(
                    text=text,
                    confidence=conf / 100.0,  # normalize to 0-1
                    bbox=bbox,
                ))
            # Pass 2: image_to_string preserves layout (line breaks) better
            # than re-joining the word list from pass 1.
            full_text = self.pytesseract.image_to_string(
                processed,
                lang=self.lang_str,
                config=self.TESS_CONFIG,
            ).strip()
            return OCRResult(
                full_text=full_text,
                boxes=boxes,
                language=self.lang_str,
                engine="tesseract",
            )
        except Exception as e:
            logger.error(f"OCR error: {e}")
            # Last-resort fallback: raw image, English only, no boxes.
            try:
                text = self.pytesseract.image_to_string(image.numpy, lang="eng")
                return OCRResult(full_text=text.strip(), boxes=[], language="eng", engine="tesseract")
            except Exception as e2:
                logger.error(f"OCR fallback juga gagal: {e2}")
                return OCRResult(full_text="", boxes=[], language=self.lang_str, engine="tesseract")

    def extract_text_simple(self, image: ImageInput) -> str:
        """Convenience wrapper: return only the extracted full text."""
        return self.extract_text(image).full_text