File size: 2,144 Bytes
e141a7d
 
 
 
 
 
 
 
 
 
 
 
d1e4f85
 
e141a7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from paddleocr import PaddleOCR
import cv2
import json
import numpy as np
from PIL import Image
from torchvision import transforms

# Initialise PaddleOCR once at import time; detect_text_boxes() below reuses
# this single module-level engine on every call.
ocr_engine = PaddleOCR(
    use_angle_cls=False,
    lang='en',
    # NOTE(review): `rec=False` is given to the constructor here, while the
    # `.ocr(...)` call in detect_text_boxes() does not pass `rec` -- confirm
    # that recognition is really skipped at call time.
    rec=False,
    use_gpu=True, 
    gpu_mem=500,   # GPU memory limit (MB)
    det_limit_side_len=1280
)

def detect_text_boxes(image: np.ndarray):
    """Detect text bounding boxes in a numpy image array (OpenCV format).

    Args:
        image: The image to analyse, as a numpy array.

    Returns:
        A list of dicts, one per detection and in the order the engine
        reported them, with the keys:

        * ``"id"``   -- 1-based index of the detection.
        * ``"text"`` -- always an empty string; this function only reports
          box geometry.
        * ``"box"``  -- the first element of the engine's detection entry.

        An empty list is returned when the engine reports no detections.

    Raises:
        ValueError: If ``image`` is not a numpy array.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError("Input must be numpy array (OpenCV format)")

    result = ocr_engine.ocr(image, cls=False)

    # "Nothing found" can surface as an empty result or as a None entry for
    # the (single) page. Treat both as zero detections rather than failing
    # with a TypeError/IndexError while iterating.
    if not result:
        return []
    detections = result[0]
    if detections is None or len(detections) == 0:
        return []

    # NOTE(review): item[0] is taken to be the box polygon, which presumes
    # each entry has the shape [box, ...] -- confirm against the installed
    # PaddleOCR version and the det/rec flags actually in effect.
    return [
        {
            "id": idx,
            "text": "",  # never filled in here; only geometry is reported
            "box": item[0],
        }
        for idx, item in enumerate(detections, start=1)
    ]

def preprocess_image(image: Image.Image):
    """Prepare an input image for the model.

    Applies ``transforms.ToTensor`` followed by ``transforms.Normalize``
    with fixed per-channel mean/std values and returns the result.

    Args:
        image: The PIL image to convert.

    Returns:
        The output of the composed transform applied to ``image``.
    """
    # NOTE(review): these constants look like the usual ImageNet channel
    # statistics -- confirm they match what the model was trained with.
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    pipeline = transforms.Compose([transforms.ToTensor(), normalize])
    return pipeline(image)

def decode_predictions(predictions, tokenizer):
    """Convert model output sequences into text.

    Each sequence is cut at the first token equal to
    ``tokenizer.special_tokens['eos']`` (that token is excluded); the
    remaining ids are converted with ``.item()`` and passed to
    ``tokenizer.decode``.

    Args:
        predictions: Iterable of token-id sequences whose elements support
            ``==`` comparison and ``.item()``.
        tokenizer: Object exposing ``special_tokens['eos']`` and
            ``decode(list_of_ids)``.

    Returns:
        A list with one decoded entry per sequence, in input order.
    """
    def _ids_before_eos(sequence):
        # Gather plain ids until the end-of-sequence marker appears.
        kept = []
        for tok in sequence:
            if tok == tokenizer.special_tokens['eos']:
                return kept
            kept.append(tok.item())
        return kept

    return [tokenizer.decode(_ids_before_eos(seq)) for seq in predictions]

def crop_and_resize_line(image_rgb, box, target_height=48):
    """Crop the bounding rectangle of ``box`` and resize it to a fixed height.

    The rectangle returned by ``cv2.boundingRect`` for the box points is
    clamped to the image bounds before cropping, and the crop is then scaled
    to ``target_height`` with its width scaled by the same ratio.

    Args:
        image_rgb: Image as a numpy array indexed as ``[row, column, ...]``.
        box: Sequence of ``(x, y)`` points describing the text region.
        target_height: Height in pixels of the returned crop.

    Returns:
        The resized crop, ``target_height`` pixels tall and at least one
        pixel wide.

    Raises:
        cv2.error: Raised by ``cv2.resize`` when the box lies entirely
            outside the image, so the clamped crop is empty.
    """
    pts = np.array(box, dtype=np.float32)
    x, y, w, h = cv2.boundingRect(pts)

    # Clamp to the image. Without this, a box that pokes past the top/left
    # edge yields negative slice bounds, which NumPy interprets as offsets
    # from the far end of the axis (wrong or empty crop).
    img_h, img_w = image_rgb.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    roi = image_rgb[top:bottom, left:right]

    # Scale from the size of the crop actually taken so that a clamped box
    # keeps its aspect ratio. max(..., 1) only avoids a division by zero for
    # an empty crop; cv2.resize still rejects that case, as it did before.
    roi_h, roi_w = roi.shape[:2]
    ratio = target_height / float(max(roi_h, 1))
    new_w = max(1, int(roi_w * ratio))
    return cv2.resize(roi, (new_w, target_height))

def sort_annotations_by_top(annotations):
    """Return the annotations ordered by their topmost point.

    The sort key is the smallest second coordinate among the points in each
    annotation's ``'box'``. The input is not modified, and annotations with
    equal keys keep their relative order.

    Args:
        annotations: Iterable of dicts, each with a ``'box'`` entry holding
            a sequence of points.

    Returns:
        A new list of the same annotation objects, sorted ascending.
    """
    def _top_edge(annotation):
        return min(corner[1] for corner in annotation['box'])

    ordered = list(annotations)
    ordered.sort(key=_top_edge)
    return ordered