receipt-ocr / models /easyocr_ocr.py
Khaw100
deploy receipt ocr api
0588e55
import re
import logging
import easyocr
from PIL import Image
import numpy as np
# Module-level logger named after this module.
# NOTE(review): the code visible in this file reports progress with print()
# and never references this logger -- confirm whether it is used elsewhere.
logger = logging.getLogger(__name__)
class EasyOCRModel:
    """
    OCR backend built on EasyOCR.

    Better suited to screenshots taken from the Gojek/Grab/Shopee apps.

    The instance holds a single ``easyocr.Reader`` in ``self.reader``; every
    extraction method runs that reader on the supplied image and keeps only
    the detections whose confidence is strictly greater than a threshold.
    """

    def __init__(self, lang: "list[str] | None" = None):
        """
        Create the underlying EasyOCR reader.

        Args:
            lang: Language codes handed to ``easyocr.Reader``. ``None``
                (the default) selects ``["en", "id"]``.
        """
        # Build the default inside the call: a list literal in the signature
        # would be one shared object reused by every instance.
        languages = ["en", "id"] if lang is None else lang
        print(f"[EasyOCR] Loading model | lang={languages}")
        # gpu=False for machines that do not have a GPU
        self.reader = easyocr.Reader(languages, gpu=False)
        print("[EasyOCR] Model loaded ✅")

    def _read_blocks(self, image, min_conf: float) -> list:
        """Run the reader on *image* and return ``(text, confidence)`` pairs above *min_conf*."""
        pixels = np.array(image)
        detections = self.reader.readtext(pixels)
        # Each detection is unpacked as (bbox, text, confidence); the bbox is
        # not needed by any caller in this class.
        return [(text, conf) for (_, text, conf) in detections if conf > min_conf]

    def extract(self, image: "Image.Image", min_conf: float = 0.3) -> str:
        """
        Extract raw text from an image.

        Args:
            image: Image to read; it is converted with ``np.array`` before
                being passed to the reader.
            min_conf: Detections must score strictly above this value to be
                kept. Defaults to 0.3.

        Returns:
            The kept text fragments joined with newlines, in the order the
            reader returned them. The text is also printed.
        """
        raw = "\n".join(text for text, _ in self._read_blocks(image, min_conf))
        print(f"[EasyOCR] RAW TEXT:\n{'='*40}\n{raw}\n{'='*40}")
        return raw

    def extract_with_confidence(self, image: "Image.Image", min_conf: float = 0.3) -> list:
        """
        Extract text together with its confidence score.

        Args:
            image: Image to read; it is converted with ``np.array`` before
                being passed to the reader.
            min_conf: Detections must score strictly above this value to be
                kept. Defaults to 0.3.

        Returns:
            A list of ``{"text": str, "confidence": float}`` dicts, with the
            confidence rounded to two decimals.
        """
        filtered = [
            # float() keeps the value a built-in float whatever numeric type
            # the reader produced.
            {"text": text, "confidence": round(float(conf), 2)}
            for text, conf in self._read_blocks(image, min_conf)
        ]
        print(f"[EasyOCR] Extracted {len(filtered)} text blocks")
        return filtered

    def extract_clean(self, image: "Image.Image", receipt_type: str = "unknown") -> str:
        """
        Extract and clean in one step.

        Args:
            image: Image to read.
            receipt_type: Accepted for interface compatibility; this method
                does not use it.

        Returns:
            The output of :meth:`extract` passed through :meth:`clean`. The
            cleaned text is also printed.
        """
        raw = self.extract(image)
        cleaned = self.clean(raw)
        print(f"[EasyOCR] CLEANED:\n{'='*40}\n{cleaned}\n{'='*40}")
        return cleaned

    def clean(self, text: str) -> str:
        """
        Apply basic cleaning to OCR output.

        Runs of three or more newlines collapse to two, runs of two or more
        spaces/tabs collapse to a single space, and leading/trailing
        whitespace is stripped.
        """
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'[ \t]{2,}', ' ', text)
        return text.strip()