Spaces:
Sleeping
Sleeping
| import re | |
| import logging | |
| import easyocr | |
| from PIL import Image | |
| import numpy as np | |
| logger = logging.getLogger(__name__) | |
class EasyOCRModel:
    """OCR wrapper around an EasyOCR ``Reader``.

    Better suited to screenshots from the Gojek/Grab/Shopee apps.
    """

    # Languages loaded when the caller does not pass ``lang``.
    DEFAULT_LANGS = ("en", "id")
    # Detections with a confidence at or below this value are dropped.
    DEFAULT_MIN_CONF = 0.3
    # PIL modes whose ``np.array`` form is already a plain RGB / grayscale
    # pixel array; every other mode is converted to RGB before OCR.
    _ARRAY_READY_MODES = ("RGB", "L")

    _BLANK_LINES_RE = re.compile(r'\n{3,}')
    _SPACE_RUN_RE = re.compile(r'[ \t]{2,}')

    def __init__(self, lang: "list | None" = None):
        """Create the EasyOCR reader.

        Args:
            lang: Language codes handed to ``easyocr.Reader``. ``None``
                (the default) selects ``["en", "id"]``.
        """
        # A fresh list per call: a list literal in the signature would be
        # one object shared by every instance.
        if lang is None:
            lang = list(self.DEFAULT_LANGS)
        print(f"[EasyOCR] Loading model | lang={lang}")
        # gpu=False: run on CPU so no GPU is required.
        self.reader = easyocr.Reader(lang, gpu=False)
        print("[EasyOCR] Model loaded ✅")

    def _read(self, image: "Image.Image", min_conf: float) -> list:
        """Run the reader and return ``(text, confidence)`` pairs above *min_conf*."""
        # Palette ("P"), 1-bit, CMYK, LA, ... images do not become a plain
        # RGB/grayscale pixel array through np.array (e.g. "P" yields palette
        # indices), so normalise them first. RGB and L pass through untouched.
        if image.mode not in self._ARRAY_READY_MODES:
            image = image.convert("RGB")
        # Each detection is a (bbox, text, confidence) tuple.
        detections = self.reader.readtext(np.array(image))
        return [(text, conf) for (_, text, conf) in detections if conf > min_conf]

    def extract(self, image: "Image.Image", min_conf: float = DEFAULT_MIN_CONF) -> str:
        """Extract the raw text from an image.

        Args:
            image: Image to run OCR on.
            min_conf: Only detections with confidence strictly greater than
                this value are kept.

        Returns:
            The kept text fragments joined with newlines, in the order the
            reader returned them.
        """
        raw = "\n".join(text for text, _ in self._read(image, min_conf))
        print(f"[EasyOCR] RAW TEXT:\n{'='*40}\n{raw}\n{'='*40}")
        return raw

    def extract_with_confidence(self, image: "Image.Image", min_conf: float = 0.3) -> list:
        """Extract text together with its confidence score.

        Args:
            image: Image to run OCR on.
            min_conf: Only detections with confidence strictly greater than
                this value are kept.

        Returns:
            A list of ``{"text": str, "confidence": float}`` dicts, with the
            confidence rounded to two decimals.
        """
        filtered = [
            {"text": text, "confidence": round(conf, 2)}
            for text, conf in self._read(image, min_conf)
        ]
        print(f"[EasyOCR] Extracted {len(filtered)} text blocks")
        return filtered

    def extract_clean(
        self,
        image: "Image.Image",
        receipt_type: str = "unknown",
        min_conf: float = DEFAULT_MIN_CONF,
    ) -> str:
        """Extract and clean in a single call.

        Args:
            image: Image to run OCR on.
            receipt_type: Currently unused by this method; kept so existing
                callers that pass it keep working.
            min_conf: Confidence threshold forwarded to :meth:`extract`.

        Returns:
            The output of :meth:`extract` passed through :meth:`clean`.
        """
        cleaned = self.clean(self.extract(image, min_conf))
        print(f"[EasyOCR] CLEANED:\n{'='*40}\n{cleaned}\n{'='*40}")
        return cleaned

    def clean(self, text: str) -> str:
        """Apply basic cleanup to OCR output.

        Collapses runs of three or more newlines into one blank line,
        collapses runs of spaces/tabs into a single space, and strips
        leading and trailing whitespace.
        """
        text = self._BLANK_LINES_RE.sub('\n\n', text)
        text = self._SPACE_RUN_RE.sub(' ', text)
        return text.strip()