# receipt-ocr / models / paddleocr_ocr.py
# Khaw100
# deploy receipt ocr api
# 0588e55
import os
os.environ["PADDLE_DISABLE_ONEDNN"] = "1"
import re
import logging
from PIL import Image
import numpy as np
logger = logging.getLogger(__name__)
class PaddleOCRModel:
    """
    OCR backend built on PaddleOCR v2.7.3.

    Well suited for:
    - App screenshots (Gojek, Grab, Shopee)
    - Minimarket receipts
    - Multi-language text, including Indonesian
    - Faster than EasyOCR
    """

    def __init__(self, lang: str = "en"):
        """Create the PaddleOCR engine (angle classifier on, GPU off).

        Args:
            lang: Language code forwarded unchanged to ``PaddleOCR``.
        """
        # Imported here rather than at module level, so importing this module
        # does not by itself import the paddleocr package.
        from paddleocr import PaddleOCR

        print(f"[PaddleOCR] Loading model | lang={lang}")
        self.ocr = PaddleOCR(
            use_angle_cls=True,
            lang=lang,
            use_gpu=False,
            show_log=False,
        )
        print("[PaddleOCR] Model loaded ✅")

    def _detect(self, image: "Image.Image") -> list:
        """Run the OCR engine on *image* and return its first result group.

        The image is converted to RGB and handed to the engine as a NumPy
        array. Returns an empty list when the engine yields nothing (an empty
        or falsy result, or a falsy first group).
        """
        pixels = np.array(image.convert("RGB"))
        results = self.ocr.ocr(pixels, cls=True)
        if not results or not results[0]:
            return []
        return results[0]

    def extract(self, image: "Image.Image", min_conf: float = 0.3) -> str:
        """Extract the raw text from an image.

        Args:
            image: Image to read; it is converted to RGB before recognition.
            min_conf: Detections whose confidence is not strictly greater
                than this value are dropped. Defaults to 0.3.

        Returns:
            The kept text fragments joined with newlines, or ``""`` when the
            engine detects nothing.
        """
        detections = self._detect(image)
        if not detections:
            print("[PaddleOCR] No text detected")
            return ""

        # Each detection is read as: element 1 -> (text, confidence).
        kept = [
            detection[1][0]
            for detection in detections
            if detection[1][1] > min_conf
        ]
        raw = "\n".join(kept)
        print(f"[PaddleOCR] RAW TEXT:\n{'='*40}\n{raw}\n{'='*40}")
        return raw

    def extract_with_confidence(
        self, image: "Image.Image", min_conf: float = 0.3
    ) -> list:
        """Extract text fragments together with their confidence scores.

        Args:
            image: Image to read; it is converted to RGB before recognition.
            min_conf: Detections whose confidence is not strictly greater
                than this value are dropped. Defaults to 0.3.

        Returns:
            A list of ``{"text": str, "confidence": float}`` dicts, with the
            confidence rounded to two decimals. Empty when the engine detects
            nothing.
        """
        detections = self._detect(image)
        if not detections:
            return []

        filtered = [
            {
                "text": detection[1][0],
                # Coerce to a built-in float so the value stays a plain
                # Python number even if the engine hands back a NumPy scalar.
                "confidence": round(float(detection[1][1]), 2),
            }
            for detection in detections
            if detection[1][1] > min_conf
        ]
        print(f"[PaddleOCR] Extracted {len(filtered)} text blocks")
        for item in filtered:
            print(f" conf={item['confidence']:.2f} | {item['text']}")
        return filtered

    def extract_clean(
        self,
        image: "Image.Image",
        receipt_type: str = "unknown",
        min_conf: float = 0.3,
    ) -> str:
        """Extract text from an image and clean it in one step.

        Args:
            image: Image to read.
            receipt_type: Currently unused; kept so existing callers that
                pass it continue to work.
            min_conf: Confidence cut-off forwarded to :meth:`extract`.

        Returns:
            The output of :meth:`extract` passed through :meth:`clean`.
        """
        raw = self.extract(image, min_conf=min_conf)
        cleaned = self.clean(raw)
        print(f"[PaddleOCR] CLEANED:\n{'='*40}\n{cleaned}\n{'='*40}")
        return cleaned

    def clean(self, text: str) -> str:
        """Apply basic whitespace normalisation to OCR output.

        Runs of three or more newlines collapse to exactly two, runs of two
        or more spaces/tabs collapse to a single space, and leading/trailing
        whitespace is stripped.
        """
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'[ \t]{2,}', ' ', text)
        return text.strip()