Receipt_OCR / utils.py
RickyGM15's picture
Upload folder using huggingface_hub
d1e4f85 verified
from paddleocr import PaddleOCR
import cv2
import json
import numpy as np
from PIL import Image
from torchvision import transforms
# Initialize PaddleOCR once at module level so every call shares the same engine
ocr_engine = PaddleOCR(
    use_angle_cls=False,  # angle classifier off; detect_text_boxes also calls ocr() with cls=False
    lang='en',
    rec=False,  # NOTE(review): presumably meant to skip text recognition - confirm the installed PaddleOCR honors this flag
    use_gpu=True,
    gpu_mem=500,  # GPU memory limit (MB)
    det_limit_side_len=1280  # presumably bounds the image side length used by the detector - TODO confirm
)
def detect_text_boxes(image: np.ndarray):
    """
    Detect text bounding boxes in a numpy image array (OpenCV format).

    Args:
        image: Image as a numpy array; it is passed unchanged to the
            module-level ``ocr_engine``.

    Returns:
        A list of dicts, one per detected region, in the order the engine
        reported them. Each dict has the keys ``"id"`` (1-based index),
        ``"text"`` (always an empty string) and ``"box"`` (the first
        element of the corresponding OCR result entry). The list is empty
        when the engine reports no detections.

    Raises:
        ValueError: If ``image`` is not a numpy array.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError("Input must be numpy array (OpenCV format)")

    result = ocr_engine.ocr(image, cls=False)

    # The engine can return an empty result, or a ``None`` page, when no text
    # is found; iterating that directly would raise TypeError.
    page = result[0] if result else None
    if page is None:
        return []

    return [
        {
            "id": idx,
            "text": "",  # left empty: this helper only reports box locations
            "box": item[0],
        }
        for idx, item in enumerate(page, start=1)
    ]
def preprocess_image(image: Image.Image):
    """Prepare an input image for the model.

    The image is converted with ``transforms.ToTensor`` and then normalized
    per channel with a fixed three-value mean and standard deviation.

    Args:
        image: The image to convert.

    Returns:
        The result of applying the composed transform to ``image``.
    """
    # NOTE(review): these look like the usual ImageNet statistics - confirm
    # they match what the model was trained with.
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize(mean=channel_mean, std=channel_std)
    pipeline = transforms.Compose([to_tensor, normalize])
    return pipeline(image)
def decode_predictions(predictions, tokenizer):
    """Convert model output token ids into decoded text.

    Args:
        predictions: Iterable of token-id sequences, one per sample. The ids
            may be scalar objects exposing ``.item()`` (e.g. tensor or numpy
            scalars) or plain Python ints.
        tokenizer: Object providing ``special_tokens['eos']`` and a
            ``decode(list_of_ids)`` method.

    Returns:
        A list with one decoded string per input sequence. Each sequence is
        cut at its first end-of-sequence token, which is not included.
    """
    # The EOS id does not change between iterations, so look it up once.
    eos_id = tokenizer.special_tokens['eos']

    texts = []
    for pred in predictions:
        tokens = []
        for token_id in pred:
            if token_id == eos_id:
                break
            # Unwrap scalar wrappers to plain Python numbers; plain ints
            # (e.g. from ``tensor.tolist()``) have no ``.item()`` and are
            # used as they are.
            tokens.append(token_id.item() if hasattr(token_id, "item") else token_id)
        texts.append(tokenizer.decode(tokens))
    return texts
def crop_and_resize_line(image_rgb, box, target_height=48):
    """Crop the bounding rectangle of ``box`` and scale it to a fixed height.

    Args:
        image_rgb: Image array whose first two axes are rows and columns.
        box: Sequence of ``(x, y)`` points outlining the region to crop.
        target_height: Height in pixels of the returned crop.

    Returns:
        The part of the image covered by the axis-aligned bounding rectangle
        of ``box`` (clipped to the image), resized to ``target_height`` rows.
        The width is scaled by the same factor and is at least 1 pixel.

    Raises:
        ValueError: If the bounding rectangle does not overlap the image.
    """
    pts = np.array(box, dtype=np.float32)
    x, y, w, h = cv2.boundingRect(pts)

    # Clamp the rectangle to the image. A negative start would otherwise be
    # read by numpy as a from-the-end index and yield a wrong or empty crop.
    img_h, img_w = image_rgb.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        raise ValueError("Box does not overlap the image")

    roi = image_rgb[top:bottom, left:right]

    # Scale from the size of the actual crop so the aspect ratio is kept even
    # when the rectangle had to be clipped.
    crop_h, crop_w = bottom - top, right - left
    ratio = target_height / float(crop_h)
    new_w = max(1, int(crop_w * ratio))
    return cv2.resize(roi, (new_w, target_height))
def sort_annotations_by_top(annotations):
    """Return the annotations as a new list ordered by their top edge.

    The sort key of an annotation is the smallest index-1 coordinate among
    the points in its ``'box'`` entry. Entries with equal keys keep their
    input order, and the input itself is not modified.
    """
    def top_edge(annotation):
        # Smallest second coordinate across all points of the box.
        return min(pt[1] for pt in annotation['box'])

    ordered = list(annotations)
    ordered.sort(key=top_edge)
    return ordered