from paddleocr import PaddleOCR
import cv2
import json
import numpy as np
from PIL import Image
from torchvision import transforms

# Initialize PaddleOCR once at import time (detection only — recognition
# is disabled via rec=False).
ocr_engine = PaddleOCR(
    use_angle_cls=False,
    lang='en',
    rec=False,
    use_gpu=True,
    gpu_mem=500,  # cap GPU memory usage (MB)
    det_limit_side_len=1280
)


def detect_text_boxes(image: np.ndarray):
    """
    Detect text bounding boxes from numpy image array (OpenCV format).

    Args:
        image: image as a numpy array (OpenCV/BGR layout assumed — TODO confirm
            against callers; PaddleOCR accepts BGR ndarrays directly).

    Returns:
        List of dicts, one per detected line:
        {"id": 1-based index, "text": "" (recognition disabled),
         "box": list of 4 corner points}.
        Empty list when no text is detected.

    Raises:
        ValueError: if `image` is not a numpy array.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError("Input must be numpy array (OpenCV format)")

    result = ocr_engine.ocr(image, cls=False)

    # BUGFIX: PaddleOCR returns [None] (not [[]]) when nothing is detected,
    # so iterating result[0] unguarded raised TypeError on blank images.
    if not result or result[0] is None:
        return []

    boxes = []
    for idx, item in enumerate(result[0]):
        points = item[0]
        boxes.append({
            "id": idx + 1,
            "text": "",  # empty because recognition is disabled
            "box": points
        })
    return boxes


def preprocess_image(image: Image.Image):
    """Prepare an input PIL image for the model: convert to a CHW float
    tensor and normalize with ImageNet mean/std."""
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    return transform(image)


def decode_predictions(predictions, tokenizer):
    """
    Convert model output token-id sequences into text strings.

    Decoding of each sequence stops at the tokenizer's EOS token
    (``tokenizer.special_tokens['eos']``); token ids are assumed to be
    0-d tensors (hence ``.item()``) — TODO confirm against the model output.
    """
    texts = []
    for pred in predictions:
        tokens = []
        for token_id in pred:
            if token_id == tokenizer.special_tokens['eos']:
                break
            tokens.append(token_id.item())
        texts.append(tokenizer.decode(tokens))
    return texts


def crop_and_resize_line(image_rgb, box, target_height=48):
    """
    Crop the axis-aligned bounding rectangle of `box` from `image_rgb`
    and resize it to `target_height` pixels, preserving aspect ratio
    (width is at least 1 after scaling).

    Raises:
        ValueError: if the box degenerates to zero width or height
        (previously this crashed with ZeroDivisionError / a cv2 error).
    """
    pts = np.array(box, dtype=np.float32)
    x, y, w, h = cv2.boundingRect(pts)
    # BUGFIX: guard degenerate boxes before dividing by h / resizing.
    if w <= 0 or h <= 0:
        raise ValueError(f"Degenerate text box: width={w}, height={h}")
    roi = image_rgb[y:y + h, x:x + w]
    ratio = target_height / float(h)
    new_w = max(1, int(w * ratio))
    resized = cv2.resize(roi, (new_w, target_height))
    return resized


def sort_annotations_by_top(annotations):
    """Sort annotations top-to-bottom by the smallest y coordinate of
    each annotation's 'box' points."""
    return sorted(annotations, key=lambda a: min(p[1] for p in a['box']))