File size: 1,880 Bytes
0588e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
import logging
import easyocr
from PIL import Image
import numpy as np

# Module-level logger named after this module. NOTE(review): nothing in the
# visible part of this file calls it; EasyOCRModel reports progress with print().
logger = logging.getLogger(__name__)


class EasyOCRModel:
    """
    OCR wrapper built on EasyOCR.

    According to the original author it is the better fit for screenshots
    taken from the Gojek/Grab/Shopee apps.

    The instance keeps a single ``easyocr.Reader`` in ``self.reader`` and
    offers raw extraction, extraction with confidence scores, and a light
    whitespace clean-up of the recognised text.
    """

    # Languages loaded when the caller does not pass ``lang``.
    DEFAULT_LANGS = ("en", "id")
    # Detections whose confidence is not strictly above this value are dropped.
    DEFAULT_MIN_CONF = 0.3
    # PIL modes whose raw array is not a plain grayscale/colour pixel grid
    # (palette indices, 1-bit booleans, two-channel or non-RGB colour spaces).
    _MODES_NEEDING_RGB = frozenset({"P", "PA", "1", "LA", "CMYK", "YCbCr"})

    def __init__(self, lang: "list | None" = None):
        """
        Create the EasyOCR reader.

        Args:
            lang: Language codes passed to ``easyocr.Reader``. ``None``
                (the default) selects ``["en", "id"]``.
        """
        # Build a fresh list per instance instead of sharing one mutable
        # default object between every call.
        if lang is None:
            lang = list(self.DEFAULT_LANGS)
        print(f"[EasyOCR] Loading model | lang={lang}")
        # gpu=False so the model also runs on machines without a GPU.
        self.reader = easyocr.Reader(lang, gpu=False)
        print("[EasyOCR] Model loaded ✅")

    def _read(self, image, min_conf: float) -> list:
        """Run the reader on *image*; return ``(text, confidence)`` pairs above *min_conf*."""
        # Objects without a ``mode`` attribute (e.g. an array) are passed on as-is.
        mode = getattr(image, "mode", None)
        if mode in self._MODES_NEEDING_RGB:
            image = image.convert("RGB")
        results = self.reader.readtext(np.array(image))
        # results = [(bbox, text, confidence), ...]
        return [(text, conf) for (_, text, conf) in results if conf > min_conf]

    def extract(self, image: "Image.Image", min_conf: float = DEFAULT_MIN_CONF) -> str:
        """
        Extract the raw text from an image.

        Args:
            image: Image to run OCR on.
            min_conf: Only detections with confidence strictly greater than
                this value are kept (default 0.3).

        Returns:
            The kept text fragments joined with newlines, in the order the
            reader returned them. Empty string when nothing is kept.
        """
        raw = "\n".join(text for text, _ in self._read(image, min_conf))

        print(f"[EasyOCR] RAW TEXT:\n{'='*40}\n{raw}\n{'='*40}")
        return raw

    def extract_with_confidence(self, image: "Image.Image", min_conf: float = 0.3) -> list:
        """
        Extract text together with its confidence score.

        Args:
            image: Image to run OCR on.
            min_conf: Only detections with confidence strictly greater than
                this value are kept.

        Returns:
            A list of ``{"text": ..., "confidence": ...}`` dicts, with the
            confidence rounded to two decimals.
        """
        filtered = [
            {"text": text, "confidence": round(conf, 2)}
            for text, conf in self._read(image, min_conf)
        ]

        print(f"[EasyOCR] Extracted {len(filtered)} text blocks")
        return filtered

    def extract_clean(self, image: "Image.Image", receipt_type: str = "unknown") -> str:
        """
        Extract and clean in one step.

        Args:
            image: Image to run OCR on.
            receipt_type: Currently unused; kept so existing callers that
                pass it keep working.

        Returns:
            The output of :meth:`extract` passed through :meth:`clean`.
        """
        raw = self.extract(image)
        cleaned = self.clean(raw)
        print(f"[EasyOCR] CLEANED:\n{'='*40}\n{cleaned}\n{'='*40}")
        return cleaned

    def clean(self, text: str) -> str:
        """
        Apply basic clean-up to OCR output.

        Runs of three or more newlines become a single blank line, runs of
        two or more spaces/tabs become one space, and leading/trailing
        whitespace is stripped.
        """
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'[ \t]{2,}', ' ', text)
        return text.strip()