wd929 committed on
Commit
d4d0529
·
1 Parent(s): ba543a7

Upload model_repo for zipformer offline

Browse files
model_repo_offline/decoder/1/.gitkeep ADDED
File without changes
model_repo_offline/decoder/1/decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d66e66ba71bc62f558aa2b246fb1c7d5f2e7b6b423e2c5ada327c3a2d40fd7ee
3
+ size 1041576
model_repo_offline/decoder/config.pbtxt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "decoder"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "decoder.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "y"
23
+ data_type: TYPE_INT64
24
+ dims: [2]
25
+ }
26
+ ]
27
+
28
+ output [
29
+ {
30
+ name: "decoder_out"
31
+ data_type: TYPE_FP32
32
+ dims: [ -1, -1 ]
33
+ }
34
+ ]
35
+
36
+ dynamic_batching {
37
+ }
38
+
39
+ instance_group [
40
+ {
41
+ count: 1
42
+ kind: KIND_GPU
43
+ }
44
+ ]
model_repo_offline/encoder/1/.gitkeep ADDED
File without changes
model_repo_offline/encoder/1/encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f586e039864febaf6a65a5bb1fbf9e1ade230fafa3b82fc9c53e03d7dc06c40
3
+ size 353036839
model_repo_offline/encoder/config.pbtxt ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "encoder"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "encoder.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "x"
23
+ data_type: TYPE_FP32
24
+ dims: [-1, 80]
25
+ },
26
+ {
27
+ name: "x_lens"
28
+ data_type: TYPE_INT64
29
+ dims: [1]
30
+ reshape: { shape: [ ] }
31
+ }
32
+ ]
33
+ output [
34
+ {
35
+ name: "encoder_out"
36
+ data_type: TYPE_FP32
37
+ dims: [-1, -1 ]
38
+ },
39
+ {
40
+ name: "encoder_out_lens"
41
+ data_type: TYPE_INT64
42
+ dims: [1]
43
+ reshape: { shape: [ ] }
44
+ }
45
+ ]
46
+
47
+ dynamic_batching {
48
+ preferred_batch_size: [ 16, 32 ]
49
+ }
50
+
51
+ instance_group [
52
+ {
53
+ count: 1
54
+ kind: KIND_GPU
55
+ }
56
+ ]
model_repo_offline/feature_extractor/1/__pycache__/model.cpython-38.pyc ADDED
Binary file (4.77 kB). View file
 
model_repo_offline/feature_extractor/1/model.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton_python_backend_utils as pb_utils
16
+ from torch.utils.dlpack import to_dlpack
17
+ import torch
18
+ import numpy as np
19
+ import kaldifeat
20
+ import _kaldifeat
21
+ from typing import List
22
+ import json
23
+
24
class Fbank(torch.nn.Module):
    """Thin ``torch.nn.Module`` wrapper around a ``kaldifeat.Fbank`` computer."""

    def __init__(self, opts):
        super().__init__()
        self.fbank = kaldifeat.Fbank(opts)

    def forward(self, waves: List[torch.Tensor]):
        # kaldifeat takes a list of 1-D waveforms and returns a list of
        # per-utterance feature matrices (one tensor per input wave).
        return self.fbank(waves)
31
+
32
+
33
class TritonPythonModel:
    """Triton Python-backend feature extractor.

    Takes padded waveform batches ("wav", "wav_lens") and produces padded
    kaldifeat fbank features ("speech") plus per-utterance frame counts
    ("speech_lengths").
    """

    def initialize(self, args):
        """Parse the model config and build the kaldifeat fbank extractor.

        Called exactly once when the model is loaded.

        Parameters
        ----------
        args : dict
            Standard Triton Python-backend arguments; only
            ``args['model_config']`` (a JSON string) is used here.
        """
        self.model_config = model_config = json.loads(args['model_config'])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Extract features on GPU iff the instance group requests a GPU kind.
        kind = model_config["instance_group"][0]["kind"]
        self.device = "cuda" if "GPU" in kind else "cpu"

        # Mirror the configured "speech" output dtype as a torch dtype
        # (anything other than FP32 is treated as FP16).
        output0_config = pb_utils.get_output_config_by_name(
            model_config, "speech")
        np_dtype = pb_utils.triton_string_to_numpy(output0_config['data_type'])
        self.output0_dtype = (
            torch.float32 if np_dtype == np.float32 else torch.float16)

        # The "speech_lengths" dtype is kept as a numpy dtype.
        output1_config = pb_utils.get_output_config_by_name(
            model_config, "speech_lengths")
        self.output1_dtype = pb_utils.triton_string_to_numpy(
            output1_config['data_type'])

        # Build fbank options from the config's string parameters.
        opts = kaldifeat.FbankOptions()
        opts.frame_opts.dither = 0
        opts.frame_opts.snip_edges = False
        for key, value in self.model_config['parameters'].items():
            text = value["string_value"]
            if key == "num_mel_bins":
                opts.mel_opts.num_bins = int(text)
            elif key == "frame_shift_in_ms":
                opts.frame_opts.frame_shift_ms = float(text)
            elif key == "frame_length_in_ms":
                opts.frame_opts.frame_length_ms = float(text)
            elif key == "sample_rate":
                opts.frame_opts.samp_freq = int(text)
        opts.device = torch.device(self.device)
        self.opts = opts
        self.feature_extractor = Fbank(self.opts)
        self.feature_size = opts.mel_opts.num_bins

    def execute(self, requests):
        """Compute fbank features for every pending request.

        Parameters
        ----------
        requests : list
            A list of ``pb_utils.InferenceRequest``.

        Returns
        -------
        list
            One ``pb_utils.InferenceResponse`` per request, in order.
        """
        request_batches = []   # number of utterances in each request
        request_pad_lens = []  # padded waveform length of each request
        waves = []             # un-padded per-utterance waveforms (all requests)
        responses = []
        for request in requests:
            wav_tensor = pb_utils.get_input_tensor_by_name(request, "wav")
            len_tensor = pb_utils.get_input_tensor_by_name(request, "wav_lens")

            wav_batch = wav_tensor.as_numpy()
            wav_lens = len_tensor.as_numpy()  # b x 1
            request_batches.append(wav_batch.shape[0])
            request_pad_lens.append(wav_batch.shape[1])
            for wav, n in zip(wav_batch, wav_lens):
                # Strip the right-padding before feature extraction.
                waves.append(torch.tensor(wav[0:n[0]], dtype=torch.float32,
                                          device=self.device))

        # One batched kaldifeat call over every utterance of every request.
        features = self.feature_extractor(waves)

        for b, pad_len in zip(request_batches, request_pad_lens):
            # Size the padded output by the frame count of the padded input,
            # so all requests in this batch share a consistent layout.
            expect_feat_len = _kaldifeat.num_frames(pad_len,
                                                    self.opts.frame_opts)
            speech = torch.zeros((b, expect_feat_len, self.feature_size),
                                 dtype=self.output0_dtype, device=self.device)
            speech_lengths = torch.zeros((b, 1), dtype=torch.int64,
                                         device=self.device)
            for i in range(b):
                feat = features.pop(0)
                n_frames = feat.shape[0]
                speech[i, 0:n_frames, :] = feat.to(self.output0_dtype)
                speech_lengths[i][0] = n_frames
            # Hand the outputs back on CPU.
            speech = speech.cpu()
            speech_lengths = speech_lengths.cpu()
            out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
            out1 = pb_utils.Tensor.from_dlpack("speech_lengths",
                                               to_dlpack(speech_lengths))
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[out0, out1]))
        return responses
model_repo_offline/feature_extractor/config.pbtxt ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "feature_extractor"
16
+ backend: "python"
17
+ max_batch_size: 512
18
+
19
+ parameters [
20
+ {
21
+ key: "num_mel_bins",
22
+ value: { string_value: "80"}
23
+ },
24
+ {
25
+ key: "frame_shift_in_ms"
26
+ value: { string_value: "10"}
27
+ },
28
+ {
29
+ key: "frame_length_in_ms"
30
+ value: { string_value: "25"}
31
+ },
32
+ {
33
+ key: "sample_rate"
34
+ value: { string_value: "16000"}
35
+ }
36
+
37
+ ]
38
+
39
+ input [
40
+ {
41
+ name: "wav"
42
+ data_type: TYPE_FP32
43
+ dims: [-1]
44
+ },
45
+ {
46
+ name: "wav_lens"
47
+ data_type: TYPE_INT32
48
+ dims: [1]
49
+ }
50
+ ]
51
+
52
+ output [
53
+ {
54
+ name: "speech"
55
+ data_type: TYPE_FP32
56
+ dims: [-1, 80]
57
+ },
58
+ {
59
+ name: "speech_lengths"
60
+ data_type: TYPE_INT64
61
+ dims: [1]
62
+ }
63
+ ]
64
+
65
+ dynamic_batching {
66
+ }
67
+ instance_group [
68
+ {
69
+ count: 1
70
+ kind: KIND_GPU
71
+ }
72
+ ]
model_repo_offline/joiner/1/.gitkeep ADDED
File without changes
model_repo_offline/joiner/1/joiner.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:761d2fd80adf49d75cf13e408468e8c735126d8baec527c56c797afffb71250a
3
+ size 1026490
model_repo_offline/joiner/config.pbtxt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "joiner"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "joiner.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "projected_encoder_out"
23
+ data_type: TYPE_FP32
24
+ dims: [ 512 ]
25
+ },
26
+ {
27
+ name: "projected_decoder_out"
28
+ data_type: TYPE_FP32
29
+ dims: [ 512 ]
30
+ }
31
+ ]
32
+
33
+ output [
34
+ {
35
+ name: "logit"
36
+ data_type: TYPE_FP32
37
+ dims: [ 500 ]
38
+ }
39
+ ]
40
+
41
+ dynamic_batching {
42
+ }
43
+
44
+ instance_group [
45
+ {
46
+ count: 1
47
+ kind: KIND_GPU
48
+ }
49
+ ]
model_repo_offline/joiner_decoder_proj/1/joiner_decoder_proj.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:230fa585dd50f78e28ecf80285c46dabc974f929e3872eb4a4bc73e8ab8e5832
3
+ size 1050893
model_repo_offline/joiner_decoder_proj/config.pbtxt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "joiner_decoder_proj"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "joiner_decoder_proj.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "decoder_out"
23
+ data_type: TYPE_FP32
24
+ dims: [ 512 ]
25
+ }
26
+ ]
27
+ output [
28
+ {
29
+ name: "projected_decoder_out"
30
+ data_type: TYPE_FP32
31
+ dims: [ 512 ]
32
+ }
33
+ ]
34
+
35
+ dynamic_batching {
36
+ }
37
+
38
+ instance_group [
39
+ {
40
+ count: 1
41
+ kind: KIND_GPU
42
+ }
43
+ ]
model_repo_offline/joiner_encoder_proj/1/joiner_encoder_proj.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:754cab31c25f9f2d48ce34291c8564a593cea5cb6114be82b28cbecaa3b3e9e5
3
+ size 788749
model_repo_offline/joiner_encoder_proj/config.pbtxt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "joiner_encoder_proj"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "joiner_encoder_proj.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "encoder_out"
23
+ data_type: TYPE_FP32
24
+ dims: [ 384 ]
25
+ }
26
+ ]
27
+ output [
28
+ {
29
+ name: "projected_encoder_out"
30
+ data_type: TYPE_FP32
31
+ dims: [ 512 ]
32
+ }
33
+ ]
34
+
35
+ dynamic_batching {
36
+ }
37
+
38
+ instance_group [
39
+ {
40
+ count: 2
41
+ kind: KIND_GPU
42
+ }
43
+ ]
model_repo_offline/scorer/1/__pycache__/model.cpython-38.pyc ADDED
Binary file (5.33 kB). View file
 
model_repo_offline/scorer/1/__pycache__/search.cpython-38.pyc ADDED
Binary file (3.06 kB). View file
 
model_repo_offline/scorer/1/model.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import triton_python_backend_utils as pb_utils
17
+ import numpy as np
18
+
19
+ import json
20
+
21
+ import torch
22
+ from torch.utils.dlpack import from_dlpack, to_dlpack
23
+ import sentencepiece as spm
24
+ from icefall.lexicon import Lexicon
25
+
26
+ from search import greedy_search
27
+
28
class TritonPythonModel:
    """Greedy-search scorer for the transducer ensemble.

    Gathers the encoder outputs of all pending requests into one padded
    batch, runs greedy search (via BLS calls to the decoder/joiner models),
    and returns the decoded text per request.
    """

    def initialize(self, args):
        """Load the model config, select the device and build the tokenizer.

        Called exactly once when the model is loaded.

        Parameters
        ----------
        args : dict
            Standard Triton Python-backend arguments: ``model_config``
            (JSON string), ``model_instance_kind``,
            ``model_instance_device_id``, etc.
        """
        self.model_config = model_config = json.loads(args['model_config'])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Numpy dtype of the "OUTPUT0" (transcript) output.
        output0_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT0")
        self.out0_dtype = pb_utils.triton_string_to_numpy(
            output0_config['data_type'])

        kind = args['model_instance_kind']
        device_id = args['model_instance_device_id']
        self.device = f'cuda:{device_id}' if kind == 'GPU' else 'cpu'

        # Mirror the "encoder_out" input dtype as a torch dtype.
        encoder_config = pb_utils.get_input_config_by_name(
            model_config, "encoder_out")
        self.data_type = pb_utils.triton_string_to_numpy(
            encoder_config['data_type'])
        if self.data_type == np.float32:
            self.torch_dtype = torch.float32
        else:
            assert self.data_type == np.float16
            self.torch_dtype = torch.float16

        # Last dim of the encoder output; sizes the fused batch buffer.
        self.encoder_dim = encoder_config['dims'][-1]

        self.init_parameters(self.model_config['parameters'])

    def init_parameters(self, parameters):
        """Flatten config parameters to plain strings and build the tokenizer.

        A BPE tokenizer file selects SentencePiece; otherwise a char-based
        icefall ``Lexicon`` is expected.
        """
        for key, value in parameters.items():
            parameters[key] = value["string_value"]
        self.context_size = int(parameters['context_size'])
        self.decoding_method = parameters['decoding_method']
        if 'bpe' in parameters['tokenizer_file']:
            sp = spm.SentencePieceProcessor()
            sp.load(parameters['tokenizer_file'])
            self.blank_id = sp.piece_to_id("<blk>")
            self.unk_id = sp.piece_to_id("<unk>")
            self.vocab_size = sp.get_piece_size()
            self.tokenizer = sp
        else:
            assert 'char' in parameters['tokenizer_file']
            lexicon = Lexicon(parameters['tokenizer_file'])
            self.unk_id = lexicon.token_table["<unk>"]
            self.blank_id = lexicon.token_table["<blk>"]
            self.vocab_size = max(lexicon.tokens) + 1
            self.tokenizer = lexicon

    def execute(self, requests):
        """Decode every pending request in one fused greedy-search pass.

        Parameters
        ----------
        requests : list
            A list of ``pb_utils.InferenceRequest``.

        Returns
        -------
        list
            One ``pb_utils.InferenceResponse`` per request, in order.
        """
        encoder_chunks = []   # per-request encoder outputs (GPU tensors)
        length_chunks = []    # per-request valid-frame counts
        request_sizes = []    # utterances contributed by each request
        total_seqs = 0
        max_frames = 0

        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out")
            in_1 = pb_utils.get_input_tensor_by_name(request,
                                                     "encoder_out_lens")
            # The encoder output must arrive on the GPU here.
            assert not in_0.is_cpu()
            chunk = from_dlpack(in_0.to_dlpack())
            encoder_chunks.append(chunk)
            max_frames = max(max_frames, chunk.shape[1])
            lens = from_dlpack(in_1.to_dlpack())
            length_chunks.append(lens)
            request_sizes.append(lens.shape[0])
            total_seqs += lens.shape[0]

        # Pad every request's encoder output into one big batch.
        encoder_out = torch.zeros((total_seqs, max_frames, self.encoder_dim),
                                  dtype=self.torch_dtype, device=self.device)
        encoder_out_lens = torch.zeros(total_seqs, dtype=torch.int64)
        st = 0
        for b in request_sizes:
            chunk = encoder_chunks.pop(0)
            encoder_out[st:st + b, 0:chunk.shape[1]] = chunk
            encoder_out_lens[st:st + b] = length_chunks.pop(0)
            st += b

        if self.decoding_method == 'greedy_search':
            ans = greedy_search(encoder_out, encoder_out_lens,
                                self.context_size, self.unk_id, self.blank_id)
        else:
            raise NotImplementedError

        # Map token ids back to text.
        results = []
        if hasattr(self.tokenizer, 'token_table'):
            for hyp in ans:
                results.append(
                    [self.tokenizer.token_table[idx] for idx in hyp])
        else:
            for hyp in self.tokenizer.decode(ans):
                results.append(hyp.split())

        # Slice the flat result list back into per-request responses.
        responses = []
        st = 0
        for b in request_sizes:
            sents = np.array(results[st:st + b])
            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[out0]))
            st += b
        return responses

    def finalize(self):
        """Called exactly once when the model is unloaded."""
        print('Cleaning up...')
model_repo_offline/scorer/1/search.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton_python_backend_utils as pb_utils
16
+ import numpy as np
17
+
18
+ import torch
19
+ from torch.utils.dlpack import from_dlpack, to_dlpack
20
+
21
def forward_joiner(cur_encoder_out, decoder_out):
    """Run the two projection models and the joiner via BLS calls.

    ``cur_encoder_out`` holds one encoder frame per active sequence;
    ``decoder_out`` is the matching decoder output (its length-1 middle
    dim is squeezed away before the call).

    Returns a 2-D CPU tensor of per-token logits.

    Raises ``pb_utils.TritonModelException`` if any BLS call errors.
    """
    def _infer(model_name, output_name, inputs):
        # One blocking BLS call; returns the single requested output tensor.
        request = pb_utils.InferenceRequest(
            model_name=model_name,
            requested_output_names=[output_name],
            inputs=inputs)
        response = request.exec()
        if response.has_error():
            raise pb_utils.TritonModelException(response.error().message())
        return pb_utils.get_output_tensor_by_name(response, output_name)

    enc_in = pb_utils.Tensor.from_dlpack("encoder_out",
                                         to_dlpack(cur_encoder_out))
    dec_in = pb_utils.Tensor.from_dlpack("decoder_out",
                                         to_dlpack(decoder_out.squeeze(1)))

    proj_encoder_out = _infer('joiner_encoder_proj',
                              'projected_encoder_out', [enc_in])
    proj_decoder_out = _infer('joiner_decoder_proj',
                              'projected_decoder_out', [dec_in])
    logits = _infer('joiner', 'logit',
                    [proj_encoder_out, proj_decoder_out])

    logits = torch.utils.dlpack.from_dlpack(logits.to_dlpack()).cpu()
    assert len(logits.shape) == 2, logits.shape
    return logits
64
+
65
def forward_decoder(hyps, context_size):
    """Run the "decoder" model via BLS on each hypothesis's last tokens.

    Feeds the trailing ``context_size`` token ids of every hypothesis in
    ``hyps`` as the int64 input "y" and returns the "decoder_out" tensor
    (converted from DLPack).

    Raises ``pb_utils.TritonModelException`` if the BLS call errors.
    """
    context = np.asarray([h[-context_size:] for h in hyps], dtype=np.int64)
    y_tensor = pb_utils.Tensor("y", context)

    request = pb_utils.InferenceRequest(
        model_name='decoder',
        requested_output_names=['decoder_out'],
        inputs=[y_tensor])
    response = request.exec()
    if response.has_error():
        raise pb_utils.TritonModelException(response.error().message())

    decoder_out = pb_utils.get_output_tensor_by_name(response, 'decoder_out')
    return from_dlpack(decoder_out.to_dlpack())
86
+
87
+
88
def greedy_search(encoder_out, encoder_out_lens, context_size, unk_id, blank_id):
    """Batched greedy decoding for a stateless transducer.

    ``encoder_out`` is the padded (batch, frames, dim) encoder output and
    ``encoder_out_lens`` the valid frame count per sequence.  Packing lets
    each time step process only the sequences still active at that frame.

    Returns one token-id list per input sequence, in the original order,
    with the ``context_size`` seed blanks stripped.
    """
    packed = torch.nn.utils.rnn.pack_padded_sequence(
        input=encoder_out,
        lengths=encoder_out_lens.cpu(),
        batch_first=True,
        enforce_sorted=False,
    )
    # Active-sequence count for each successive frame (non-increasing).
    frame_batch_sizes = packed.batch_sizes.tolist()

    num_seqs = encoder_out.shape[0]
    # Seed every hypothesis with `context_size` blanks.
    hyps = [[blank_id] * context_size for _ in range(num_seqs)]
    decoder_out = forward_decoder(hyps, context_size)

    offset = 0
    for batch_size in frame_batch_sizes:
        # One encoder frame for each of the still-active sequences.
        current_encoder_out = packed.data[offset:offset + batch_size]
        offset += batch_size

        # Shrink the decoder output as shorter sequences drop out.
        decoder_out = decoder_out[:batch_size]

        logits = forward_joiner(current_encoder_out, decoder_out)
        assert logits.ndim == 2, logits.shape

        emitted = False
        for i, token in enumerate(logits.argmax(dim=1).tolist()):
            if token not in (blank_id, unk_id):
                hyps[i].append(token)
                emitted = True
        # Only recompute the decoder output when some hypothesis grew.
        if emitted:
            decoder_out = forward_decoder(hyps[:batch_size], context_size)

    # Drop the blank seed, then undo pack_padded_sequence's length sort.
    sorted_ans = [h[context_size:] for h in hyps]
    unsorted_indices = packed.unsorted_indices.tolist()
    return [sorted_ans[unsorted_indices[i]] for i in range(num_seqs)]
model_repo_offline/scorer/config.pbtxt ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "scorer"
16
+ backend: "python"
17
+ max_batch_size: 512
18
+
19
+ parameters [
20
+ {
21
+ key: "context_size",
22
+ value: { string_value: "2"}
23
+ },
24
+ {
25
+ key: "tokenizer_file",
26
+ value: { string_value: "/workspace/bpe.model"}
27
+ },
28
+ {
29
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS",
30
+ value: {string_value:"no"}
31
+ },
32
+ {
33
+ key: "decoding_method",
34
+ value: { string_value: "greedy_search"}
35
+ }
36
+ ]
37
+
38
+
39
+ input [
40
+ {
41
+ name: "encoder_out"
42
+ data_type: TYPE_FP32
43
+ dims: [-1, 384]
44
+ },
45
+ {
46
+ name: "encoder_out_lens"
47
+ data_type: TYPE_INT64
48
+ dims: [1]
49
+ reshape: { shape: [ ] }
50
+ }
51
+ ]
52
+
53
+ output [
54
+ {
55
+ name: "OUTPUT0"
56
+ data_type: TYPE_STRING
57
+ dims: [1]
58
+ }
59
+ ]
60
+
61
+ dynamic_batching {
62
+ }
63
+ instance_group [
64
+ {
65
+ count: 1
66
+ kind: KIND_CPU
67
+ }
68
+ ]
model_repo_offline/transducer/1/.gitkeep ADDED
File without changes
model_repo_offline/transducer/config.pbtxt ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "transducer"
16
+ platform: "ensemble"
17
+ max_batch_size: 512
18
+
19
+ input [
20
+ {
21
+ name: "WAV"
22
+ data_type: TYPE_FP32
23
+ dims: [-1]
24
+ },
25
+ {
26
+ name: "WAV_LENS"
27
+ data_type: TYPE_INT32
28
+ dims: [1]
29
+ }
30
+ ]
31
+
32
+ output [
33
+ {
34
+ name: "TRANSCRIPTS"
35
+ data_type: TYPE_STRING
36
+ dims: [1]
37
+ }
38
+ ]
39
+
40
+ ensemble_scheduling {
41
+ step [
42
+ {
43
+ model_name: "feature_extractor"
44
+ model_version: -1
45
+ input_map {
46
+ key: "wav"
47
+ value: "WAV"
48
+ }
49
+ input_map {
50
+ key: "wav_lens"
51
+ value: "WAV_LENS"
52
+ }
53
+ output_map {
54
+ key: "speech"
55
+ value: "SPEECH"
56
+ }
57
+ output_map {
58
+ key: "speech_lengths"
59
+ value: "SPEECH_LENGTHS"
60
+ }
61
+ },
62
+ {
63
+ model_name: "encoder"
64
+ model_version: -1
65
+ input_map {
66
+ key: "x"
67
+ value: "SPEECH"
68
+ }
69
+ input_map {
70
+ key: "x_lens"
71
+ value: "SPEECH_LENGTHS"
72
+ }
73
+ output_map {
74
+ key: "encoder_out"
75
+ value: "encoder_out"
76
+ }
77
+ output_map {
78
+ key: "encoder_out_lens"
79
+ value: "encoder_out_lens"
80
+ }
81
+ },
82
+ {
83
+ model_name: "scorer"
84
+ model_version: -1
85
+ input_map {
86
+ key: "encoder_out"
87
+ value: "encoder_out"
88
+ }
89
+ input_map {
90
+ key: "encoder_out_lens"
91
+ value: "encoder_out_lens"
92
+ }
93
+ output_map {
94
+ key: "OUTPUT0"
95
+ value: "TRANSCRIPTS"
96
+ }
97
+ }
98
+ ]
99
+ }