Spaces:
Sleeping
Sleeping
| import os | |
| os.environ["PADDLE_DISABLE_ONEDNN"] = "1" | |
| import re | |
| import logging | |
| from PIL import Image | |
| import numpy as np | |
| logger = logging.getLogger(__name__) | |
class PaddleOCRModel:
    """OCR wrapper around PaddleOCR (written against PaddleOCR v2.7.3).

    Well suited for:
    - App screenshots (Gojek, Grab, Shopee)
    - Minimarket receipts
    - Multiple languages, including Indonesian
    - Faster than EasyOCR
    """

    # Detections must score strictly above this confidence to be kept.
    DEFAULT_MIN_CONF = 0.3

    # Patterns used by ``clean``; compiled once instead of on every call.
    _BLANK_LINE_RUN = re.compile(r'\n{3,}')
    _SPACE_RUN = re.compile(r'[ \t]{2,}')

    def __init__(self, lang: str = "en"):
        """Build the PaddleOCR pipeline.

        The pipeline is created with angle classification enabled,
        ``use_gpu=False`` and PaddleOCR's own logging switched off.

        Args:
            lang: Language code passed straight through to ``PaddleOCR``.
        """
        # Imported here rather than at module level, so importing this module
        # does not import paddleocr until a model is actually constructed.
        from paddleocr import PaddleOCR

        print(f"[PaddleOCR] Loading model | lang={lang}")
        self.ocr = PaddleOCR(
            use_angle_cls=True,
            lang=lang,
            use_gpu=False,
            show_log=False
        )
        print("[PaddleOCR] Model loaded ✅")

    def _detect(self, image: "Image.Image") -> list:
        """Run OCR on ``image`` and return the detections for that image.

        The image is converted to RGB and handed to the engine as a NumPy
        array. Only the first element of the engine's result list is used.

        Returns:
            The list of detections, or ``[]`` when the engine returned
            nothing (an empty/``None`` result or an empty first element).
        """
        img_array = np.array(image.convert("RGB"))
        results = self.ocr.ocr(img_array, cls=True)
        if not results or not results[0]:
            return []
        return results[0]

    @staticmethod
    def _confident(detections, min_conf: float):
        """Yield ``(text, confidence)`` for detections scoring above ``min_conf``.

        Each detection is read as ``detection[1] == (text, confidence)``;
        ``detection[0]`` is not used here (presumably the bounding box --
        confirm against the PaddleOCR result format).
        """
        for detection in detections:
            text, confidence = detection[1][0], detection[1][1]
            if confidence > min_conf:
                yield text, confidence

    def extract(self, image: "Image.Image", min_conf: float = DEFAULT_MIN_CONF) -> str:
        """Extract raw text from an image.

        Args:
            image: Image to run OCR on.
            min_conf: Detections must score strictly above this value to be
                included. Defaults to 0.3.

        Returns:
            The kept text lines joined with newlines, or ``""`` when no text
            was detected.
        """
        detections = self._detect(image)
        if not detections:
            print("[PaddleOCR] No text detected")
            return ""

        raw = "\n".join(
            text for text, _ in self._confident(detections, min_conf)
        )
        print(f"[PaddleOCR] RAW TEXT:\n{'='*40}\n{raw}\n{'='*40}")
        return raw

    def extract_with_confidence(self, image: "Image.Image", min_conf: float = DEFAULT_MIN_CONF) -> list:
        """Extract text together with its confidence score.

        Args:
            image: Image to run OCR on.
            min_conf: Detections must score strictly above this value to be
                included. Defaults to 0.3.

        Returns:
            A list of ``{"text": ..., "confidence": ...}`` dicts with the
            confidence rounded to two decimals; ``[]`` when nothing was
            detected.
        """
        filtered = [
            {"text": text, "confidence": round(confidence, 2)}
            for text, confidence in self._confident(self._detect(image), min_conf)
        ]
        # Preserve the original behaviour: nothing is printed when the engine
        # detected no text at all.
        if not filtered and not self._last_had_detections(filtered):
            return filtered if filtered else self._report(filtered)
        return self._report(filtered)

    @staticmethod
    def _last_had_detections(filtered: list) -> bool:
        """Return True when there is at least one kept block to report."""
        return bool(filtered)

    @staticmethod
    def _report(filtered: list) -> list:
        """Print the kept text blocks with their confidence and return them."""
        print(f"[PaddleOCR] Extracted {len(filtered)} text blocks")
        for item in filtered:
            print(f" conf={item['confidence']:.2f} | {item['text']}")
        return filtered

    def extract_clean(self, image: "Image.Image", receipt_type: str = "unknown",
                      min_conf: float = DEFAULT_MIN_CONF) -> str:
        """Extract text and clean it in one step.

        Args:
            image: Image to run OCR on.
            receipt_type: Accepted for interface compatibility; currently
                not used by this method.
            min_conf: Confidence threshold forwarded to ``extract``.

        Returns:
            The output of ``extract`` passed through ``clean``.
        """
        raw = self.extract(image, min_conf=min_conf)
        cleaned = self.clean(raw)
        print(f"[PaddleOCR] CLEANED:\n{'='*40}\n{cleaned}\n{'='*40}")
        return cleaned

    def clean(self, text: str) -> str:
        """Apply basic cleaning to OCR output.

        Collapses runs of three or more newlines into one blank line,
        collapses runs of spaces/tabs into a single space, and strips leading
        and trailing whitespace.

        Args:
            text: Raw OCR text. ``None`` or ``""`` yields ``""``.

        Returns:
            The cleaned text.
        """
        if not text:
            return ""
        text = self._BLANK_LINE_RUN.sub('\n\n', text)
        text = self._SPACE_RUN.sub(' ', text)
        return text.strip()