import os

# NOTE(review): presumably turns off Paddle's oneDNN backend; it is set here,
# before the remaining imports, so it is already in the environment when
# paddleocr is imported later in PaddleOCRModel.__init__ -- confirm the exact
# effect against the Paddle documentation.
os.environ["PADDLE_DISABLE_ONEDNN"] = "1"

import re
import logging

from PIL import Image
import numpy as np

logger = logging.getLogger(__name__)


class PaddleOCRModel:
    """OCR wrapper built on PaddleOCR v2.7.3.

    Well suited for:
    - App screenshots (Gojek, Grab, Shopee)
    - Minimarket receipts
    - Multiple languages, including Indonesian
    - Faster than EasyOCR
    """

    def __init__(self, lang: str = "en"):
        """Load the PaddleOCR model for *lang* (CPU only, angle classifier on)."""
        # Imported lazily so that merely importing this module does not
        # require paddleocr to be importable.
        from paddleocr import PaddleOCR

        print(f"[PaddleOCR] Loading model | lang={lang}")
        self.ocr = PaddleOCR(
            use_angle_cls=True,
            lang=lang,
            use_gpu=False,
            show_log=False,
        )
        print("[PaddleOCR] Model loaded ✅")

    def _recognize(self, image: Image.Image) -> list:
        """Run OCR on *image* and return every detection as ``(text, score)``.

        Shared by ``extract`` and ``extract_with_confidence`` so the image
        conversion, the OCR call and the empty-result guard live in one place.
        Returns an empty list when the engine reports nothing for the image.
        """
        img_array = np.array(image.convert("RGB"))
        results = self.ocr.ocr(img_array, cls=True)

        # Only the first entry of the result is used; it is falsy when no
        # text was detected.
        if not results or not results[0]:
            return []

        # Each detection is indexed as line[1] == (text, confidence).
        return [(line[1][0], line[1][1]) for line in results[0]]

    def extract(self, image: Image.Image, min_conf: float = 0.3) -> str:
        """Extract raw text from *image*.

        Args:
            image: Image to run OCR on.
            min_conf: Detections whose confidence is not strictly greater
                than this value are dropped. Defaults to 0.3.

        Returns:
            The kept text fragments joined with newlines, or ``""`` when
            nothing was detected.
        """
        detections = self._recognize(image)
        if not detections:
            print("[PaddleOCR] No text detected")
            return ""

        raw = "\n".join(text for text, score in detections if score > min_conf)
        print(f"[PaddleOCR] RAW TEXT:\n{'='*40}\n{raw}\n{'='*40}")
        return raw

    def extract_with_confidence(self, image: Image.Image, min_conf: float = 0.3) -> list:
        """Extract text together with its confidence score.

        Args:
            image: Image to run OCR on.
            min_conf: Detections whose confidence is not strictly greater
                than this value are dropped. Defaults to 0.3.

        Returns:
            A list of ``{"text": str, "confidence": float}`` dicts with the
            confidence rounded to two decimals; empty when nothing was
            detected.
        """
        detections = self._recognize(image)
        if not detections:
            return []

        filtered = [
            # float() keeps the value a builtin float even if the engine
            # hands back a NumPy scalar.
            {"text": text, "confidence": round(float(score), 2)}
            for text, score in detections
            if score > min_conf
        ]

        print(f"[PaddleOCR] Extracted {len(filtered)} text blocks")
        for item in filtered:
            print(f"  conf={item['confidence']:.2f} | {item['text']}")
        return filtered

    def extract_clean(self, image: Image.Image, receipt_type: str = "unknown") -> str:
        """Extract text from *image* and clean it in one step.

        Args:
            image: Image to run OCR on.
            receipt_type: Accepted for interface compatibility; it is not
                used by the current implementation.

        Returns:
            The cleaned OCR text (see ``clean``).
        """
        raw = self.extract(image)
        cleaned = self.clean(raw)
        print(f"[PaddleOCR] CLEANED:\n{'='*40}\n{cleaned}\n{'='*40}")
        return cleaned

    def clean(self, text: str) -> str:
        """Apply basic cleanup to OCR output.

        Collapses runs of three or more newlines into one blank line,
        collapses runs of spaces/tabs into a single space, and strips
        leading/trailing whitespace. Empty input yields ``""``.
        """
        if not text:
            return ""
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'[ \t]{2,}', ' ', text)
        return text.strip()