Spaces:

robrtt
/

ai-rag

Running

File size: 7,514 Bytes

7d07e42

"""
OCR processor menggunakan Tesseract via pytesseract.

Kenapa ganti dari EasyOCR:
- EasyOCR: ~500MB RAM, ~15-20s load time (download detection + recognition models)
- Tesseract: 0MB model load (binary + lang packs sudah di-install di image),
  load time ~0.1s, RAM overhead ~50MB saat proses
- Accuracy untuk dokumen/teks standard: comparable
- Tesseract binary + tesseract-ocr-ind sudah ada di Dockerfile

Trade-off: EasyOCR lebih akurat untuk teks miring/deformed.
Untuk use case RAG (extract teks dari dokumen, screenshot), Tesseract cukup.
"""

from __future__ import annotations

from typing import List
from dataclasses import dataclass, field
import subprocess

import numpy as np
import cv2
from loguru import logger

from ..config import get_cv_settings
from ..processors.image_preprocessor import ImageInput


@dataclass
class OCRBox:
    """One recognized text fragment with its confidence and bounding box."""

    # Recognized text of this fragment.
    text: str
    # Recognition confidence as stored by the producer of this box.
    confidence: float
    # Bounding-box geometry; stored and returned as-is.
    bbox: list

    def to_dict(self) -> dict:
        """Return a plain-dict view, with the confidence rounded to 4 decimals."""
        rounded_confidence = round(self.confidence, 4)
        return dict(
            text=self.text,
            confidence=rounded_confidence,
            bbox=self.bbox,
        )


@dataclass
class OCRResult:
    """Outcome of one OCR run: the full text plus optional per-fragment boxes."""

    # Complete recognized text.
    full_text: str
    # Per-fragment boxes; each instance gets its own fresh list by default.
    boxes: List[OCRBox] = field(default_factory=list)
    # Language identifier used for the run (empty when not provided).
    language: str = ""
    # Name of the OCR engine that produced the result (empty when not provided).
    engine: str = ""

    @property
    def word_count(self) -> int:
        """Number of whitespace-separated tokens in ``full_text``."""
        tokens = self.full_text.split()
        return len(tokens)


class OCRProcessor:
    """
    OCR via Tesseract (pytesseract): lightweight, near-instant load.

    No model download and no torch dependency.

    Preprocessing (upscale of small images, grayscale, CLAHE, sharpen) is
    applied before recognition to improve accuracy on dark or blurry images.
    Bounding boxes are reported in the coordinate space of the image passed
    in, not of the upscaled image.
    """

    MIN_OCR_DIM = 1000  # Images whose longest side is below this are upscaled

    # Words with a Tesseract confidence (0-100 scale) below this are dropped.
    _MIN_CONFIDENCE = 10
    # Page segmentation / engine mode passed to every Tesseract call.
    _TESS_CONFIG = "--psm 3 --oem 3"
    # Short config codes -> Tesseract language pack names.
    _TESS_LANG_MAP = {"en": "eng", "id": "ind", "eng": "eng", "ind": "ind"}

    def __init__(self):
        """
        Resolve the configured OCR languages and verify Tesseract is usable.

        Reads the comma-separated ``ocr_languages`` setting (e.g. "en,id"),
        keeps only the languages reported as installed, and falls back to
        "eng" when none match.

        Raises:
            Exception: re-raised unchanged when pytesseract cannot be
                imported or the Tesseract version query fails.
        """
        settings = get_cv_settings()
        self.engine = "tesseract"

        # Keep only the language packs that are actually installed.
        available = self._get_available_langs()
        self.languages = self._resolve_languages(settings.ocr_languages, available)
        if not self.languages:
            logger.warning("Tidak ada tesseract lang yang cocok, fallback ke 'eng'")
            self.languages = ["eng"]

        # Tesseract expects multiple languages joined with '+', e.g. "eng+ind".
        self.lang_str = "+".join(self.languages)
        logger.info(f"Loading OCR (tesseract) for languages: {self.languages}")

        # Verify the tesseract binary works; import lazily so the module can
        # be imported even when pytesseract is missing.
        try:
            import pytesseract
            self.pytesseract = pytesseract
            ver = pytesseract.get_tesseract_version()
            logger.info(f"OCR processor ready. Tesseract {ver}")
        except Exception as e:
            logger.error(f"Gagal init Tesseract: {e}")
            raise

    @classmethod
    def _resolve_languages(cls, raw: str, available: set) -> list:
        """
        Turn a comma-separated language setting into Tesseract language names.

        Short codes are mapped case-insensitively ("en" -> "eng",
        "id" -> "ind"); unknown tokens are passed through unchanged. Empty
        tokens, duplicates and languages not in ``available`` are dropped.
        Order of first appearance is preserved.

        Returns:
            The list of usable language names (possibly empty).
        """
        resolved = []
        for token in raw.split(","):
            token = token.strip()
            if not token:
                continue
            lang = cls._TESS_LANG_MAP.get(token.lower(), token)
            if lang in available and lang not in resolved:
                resolved.append(lang)
        return resolved

    @staticmethod
    def _get_available_langs() -> set:
        """
        Return the language packs installed on the system.

        Runs ``tesseract --list-langs`` and collects every non-empty output
        line (stdout and stderr) that does not start with "List" or "Tess".
        Best effort: on any failure a warning is logged and ``{"eng"}`` is
        returned.
        """
        try:
            result = subprocess.run(
                ["tesseract", "--list-langs"],
                capture_output=True, text=True, timeout=5
            )
            langs = set()
            for line in result.stdout.splitlines() + result.stderr.splitlines():
                line = line.strip()
                if line and not line.startswith(("List", "Tess")):
                    langs.add(line)
            return langs
        except Exception as e:
            logger.warning(f"Could not list tesseract languages, assuming 'eng': {e}")
            return {"eng"}

    @staticmethod
    def _to_gray(img: np.ndarray) -> np.ndarray:
        """
        Convert an image array to a single-channel grayscale array.

        Arrays that are not 3-dimensional are returned as a copy. For
        3-dimensional arrays the channel count selects the conversion.
        """
        if img.ndim != 3:
            return img.copy()
        channels = img.shape[2]
        if channels == 1:
            return img[:, :, 0].copy()
        if channels == 4:
            # Assumes RGBA channel order, consistent with the RGB order used
            # for 3-channel input here; confirm against the image loader.
            return cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
        return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    @staticmethod
    def _build_bbox(left, top, width, height, scale_x: float = 1.0,
                    scale_y: float = 1.0) -> list:
        """
        Build a 4-corner box [[x1,y1],[x2,y1],[x2,y2],[x1,y2]] (EasyOCR-style).

        ``scale_x`` / ``scale_y`` map the coordinates from the processed
        image back to the source image.
        """
        x1 = float(left) * scale_x
        y1 = float(top) * scale_y
        x2 = float(left + width) * scale_x
        y2 = float(top + height) * scale_y
        return [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]

    def _preprocess_for_ocr(self, img: np.ndarray) -> np.ndarray:
        """
        Prepare an image for Tesseract.

        Steps: upscale when the longest side is below ``MIN_OCR_DIM``,
        convert to grayscale, apply CLAHE contrast enhancement, sharpen.

        Returns:
            The sharpened single-channel image. If any step fails, a warning
            is logged and a plain grayscale conversion is returned; if that
            fails too, the image is returned unprocessed.
        """
        try:
            h, w = img.shape[:2]
            longest = max(h, w)

            # Upscale small images; cubic interpolation keeps glyph edges smooth.
            if longest < self.MIN_OCR_DIM:
                scale = self.MIN_OCR_DIM / longest
                img = cv2.resize(img, (int(w * scale), int(h * scale)),
                                 interpolation=cv2.INTER_CUBIC)

            gray = self._to_gray(img)

            # CLAHE contrast enhancement
            clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)

            # Sharpen
            kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32)
            sharpened = cv2.filter2D(enhanced, -1, kernel)

            return sharpened  # grayscale single-channel

        except Exception as e:
            logger.warning(f"OCR preprocessing fallback: {e}")
            # The grayscale conversion itself may be what failed, so guard it
            # instead of raising out of the fallback path.
            try:
                return self._to_gray(img)
            except Exception:
                return img

    def extract_text(
        self,
        image: ImageInput,
        detail: bool = True,
        paragraph: bool = False,
    ) -> OCRResult:
        """
        Extract text from an image using Tesseract.

        Args:
            image: Input image; its ``numpy`` array, ``width`` and ``height``
                are read.
            detail: Accepted for interface compatibility; currently unused.
            paragraph: Accepted for interface compatibility; currently unused.

        Returns:
            An ``OCRResult`` with the full text and one ``OCRBox`` per word
            whose confidence is at least 10 (on Tesseract's 0-100 scale).
            Box confidences are normalized to 0-1 and box coordinates are in
            the coordinate space of ``image.numpy``. On failure the method
            retries once with the unprocessed image and "eng" only, and as a
            last resort returns an empty result; it does not raise.
        """
        logger.debug(f"Running Tesseract OCR on {image.width}x{image.height} image")

        try:
            source = image.numpy
            processed = self._preprocess_for_ocr(source.copy())

            # Preprocessing may upscale the image; Tesseract then reports
            # coordinates in the upscaled space. These ratios map them back.
            src_h, src_w = source.shape[:2]
            proc_h, proc_w = processed.shape[:2]
            scale_x = src_w / proc_w if proc_w else 1.0
            scale_y = src_h / proc_h if proc_h else 1.0

            # Detailed output with per-word bounding boxes
            data = self.pytesseract.image_to_data(
                processed,
                lang=self.lang_str,
                config=self._TESS_CONFIG,
                output_type=self.pytesseract.Output.DICT,
            )

            boxes = []
            rows = zip(
                data["text"], data["conf"],
                data["left"], data["top"], data["width"], data["height"],
            )
            for raw_text, raw_conf, left, top, width, height in rows:
                text = str(raw_text).strip()
                conf = float(raw_conf)

                if not text or conf < self._MIN_CONFIDENCE:
                    continue

                boxes.append(OCRBox(
                    text=text,
                    confidence=conf / 100.0,  # normalize to 0-1
                    bbox=self._build_bbox(left, top, width, height, scale_x, scale_y),
                ))

            # Full text comes from Tesseract's string output so that its
            # line/paragraph layout is preserved.
            full_text = self.pytesseract.image_to_string(
                processed,
                lang=self.lang_str,
                config=self._TESS_CONFIG,
            ).strip()

            return OCRResult(
                full_text=full_text,
                boxes=boxes,
                language=self.lang_str,
                engine="tesseract",
            )

        except Exception as e:
            logger.error(f"OCR error: {e}")
            # Last-resort fallback: raw image, English only, no boxes.
            try:
                text = self.pytesseract.image_to_string(image.numpy, lang="eng")
                return OCRResult(full_text=text.strip(), boxes=[], language="eng", engine="tesseract")
            except Exception as e2:
                logger.error(f"OCR fallback juga gagal: {e2}")
                return OCRResult(full_text="", boxes=[], language=self.lang_str, engine="tesseract")

    def extract_text_simple(self, image: ImageInput) -> str:
        """Return only the full recognized text for ``image``."""
        result = self.extract_text(image)
        return result.full_text