File size: 3,662 Bytes
c6905fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196a03f
 
 
 
c6905fb
 
 
 
 
 
 
 
 
 
 
 
 
 
196a03f
c6905fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196a03f
c6905fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196a03f
 
 
 
 
 
 
c6905fb
 
 
 
 
 
 
 
 
196a03f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from pathlib import Path

import cv2
import numpy as np
from openvino import Core
from pydantic import BaseModel


class KanaData(BaseModel):
    """Kana reference data: phonetic category groupings plus per-kana entries.

    Loaded from a JSON document whose top-level keys mirror the fields below.
    """

    class Category(BaseModel):
        # Kana grouped by phonetic category (voiceless, voiced, semi-voiced,
        # and contracted sounds).
        seion: list[str]
        dakuon: list[str]
        handakuon: list[str]
        youon: list[str]

    category: Category
    hiragana: dict[str, list[str]]
    katakana: dict[str, list[str]]
    spell: dict[str, list[str]]

    @classmethod
    def load(cls, path):
        """Read the JSON file at *path* and validate it into a KanaData."""
        raw = Path(path).read_text()
        return cls.model_validate_json(raw)


class Recognizer:
    """CTC-based character recognizer backed by an OpenVINO model.

    The model is expected to take a single grayscale image tensor shaped
    (batch, channel, height, width) and emit per-timestep class logits
    shaped (time, batch, num_classes) — TODO confirm against the model.
    """

    class Result(BaseModel):
        # One n-best candidate: a character and its softmax probability.
        char: str
        prob: float

    def __init__(self, model_path, char_list_path, device="CPU", blank="[blank]"):
        """Load and compile the model and read the character list.

        Args:
            model_path: path to the OpenVINO model file.
            char_list_path: UTF-8 file with one character per line.
            device: OpenVINO device name (e.g. "CPU").
            blank: label used for the CTC blank class (index 0).
        """
        core = Core()
        self.model = core.read_model(model_path)
        self.compiled_model = core.compile_model(self.model, device)
        self.infer_request = self.compiled_model.create_infer_request()

        # Input layout is (batch_size, channel, height, width).
        # (The original comment claimed "width, height", contradicting the
        # unpack order below.)
        _, _, self.input_height, self.input_width = self.model.inputs[0].shape
        self.input_tensor_name = self.model.inputs[0].get_any_name()
        self.output_tensor_name = self.model.outputs[0].get_any_name()

        with open(char_list_path, "rt", encoding="UTF-8") as fp:
            # Index 0 is reserved for the CTC blank.
            # NOTE(review): a trailing newline in the file produces an empty
            # string as the final "character"; kept for compatibility.
            self.chars = [blank] + fp.read().split("\n")

    def recognize(self, image, top_k=10):
        """Run inference on a single RGBA image and CTC-decode the result.

        Args:
            image: RGBA image array as accepted by preprocess().
            top_k: number of n-best candidates per emitted character.

        Returns:
            (texts, nbest) as produced by ctc_decode().
        """
        image = self.preprocess(image, self.input_height, self.input_width)[None, :, :, :]

        # Fix: the original ran the identical inference twice in a row
        # (`for _ in range(2)`); a single pass yields the same predictions.
        self.infer_request.infer(inputs={self.input_tensor_name: image})
        preds = self.infer_request.get_tensor(self.output_tensor_name).data[:]

        return self.ctc_decode(preds, top_k)

    def preprocess(self, image, height, width, invert=False):
        """Grayscale, resize to the model height, and pad to the model width.

        Args:
            image: RGBA image array (as expected by cv2.COLOR_RGBA2GRAY).
            height: target model input height.
            width: target model input width.
            invert: if True, invert intensities (white-on-black input).

        Returns:
            float32 array shaped (1, height, width).
        """
        src: np.ndarray = cv2.cvtColor(image, cv2.COLOR_RGBA2GRAY)
        src = (255 - src) if invert else src

        # Preserve aspect ratio; cv2.resize takes dsize as (width, height).
        ratio = float(src.shape[1]) / float(src.shape[0])
        # Fix: clamp the scaled width into [1, width] — the original produced
        # a negative pad amount (np.pad raises) for images wider than the
        # model input, and a zero-width resize for extremely tall ones.
        dsize = (max(1, min(int(height * ratio), width)), height)
        rsz = cv2.resize(src, dsize, interpolation=cv2.INTER_AREA).astype(np.float32)

        img = rsz[None, :, :]  # [h, w] -> [c, h, w]
        _, h, w = img.shape

        # Right-edge padding repeats the border pixel so the tensor matches
        # the fixed model input size (height padding is normally zero).
        return np.pad(img, ((0, 0), (0, height - h), (0, width - w)), mode="edge")

    def ctc_decode(self, preds, top_k) -> tuple[list, list[list[Result]]]:
        """Greedy CTC decoding with per-character n-best candidates.

        Args:
            preds: raw logits shaped (time, batch, num_classes) —
                assumed layout, TODO confirm against the model output.
            top_k: number of candidates kept per emitted character.

        Returns:
            texts: one decoded string per batch item.
            nbest: for every emitted character (across all batch items, in
                order), the top_k Result(char, prob) candidates.
        """
        texts, nbest = [], []

        # Best class per timestep: (time, batch) -> (batch, time).
        preds_index: np.ndarray = np.argmax(preds, 2).transpose(1, 0)

        for sample, t in enumerate(preds_index):
            if t.shape[0] == 0:
                continue

            char_list = []
            for i in range(t.shape[0]):
                # Skip CTC blanks (index 0) and collapse repeated labels.
                if t[i] == 0 or (i > 0 and t[i - 1] == t[i]):
                    continue

                char_list.append(self.chars[t[i]])

                # n-best for this timestep.
                # Fix: the original indexed preds[i][0], i.e. always batch
                # item 0; use the logits of the current batch sample.
                probs = self.softmax(preds[i][sample])
                k_indices = np.argsort(-probs)[:top_k]
                nbest.append(
                    [
                        Recognizer.Result(char=self.chars[j], prob=prob)
                        for j, prob in zip(k_indices, probs[k_indices])
                    ]
                )

            texts.append("".join(char_list))

        return texts, nbest

    def softmax(self, x):
        """Numerically stable softmax over the first axis of *x*."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x, axis=0)