wd929 committed on
Commit
d4d0529
·
1 Parent(s): ba543a7

Upload model_repo for zipformer offline

Browse files
model_repo_offline/decoder/1/.gitkeep ADDED
File without changes
model_repo_offline/decoder/1/decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d66e66ba71bc62f558aa2b246fb1c7d5f2e7b6b423e2c5ada327c3a2d40fd7ee
3
+ size 1041576
model_repo_offline/decoder/config.pbtxt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "decoder"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "decoder.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "y"
23
+ data_type: TYPE_INT64
24
+ dims: [2]
25
+ }
26
+ ]
27
+
28
+ output [
29
+ {
30
+ name: "decoder_out"
31
+ data_type: TYPE_FP32
32
+ dims: [ -1, -1 ]
33
+ }
34
+ ]
35
+
36
+ dynamic_batching {
37
+ }
38
+
39
+ instance_group [
40
+ {
41
+ count: 1
42
+ kind: KIND_GPU
43
+ }
44
+ ]
model_repo_offline/encoder/1/.gitkeep ADDED
File without changes
model_repo_offline/encoder/1/encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f586e039864febaf6a65a5bb1fbf9e1ade230fafa3b82fc9c53e03d7dc06c40
3
+ size 353036839
model_repo_offline/encoder/config.pbtxt ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "encoder"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "encoder.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "x"
23
+ data_type: TYPE_FP32
24
+ dims: [-1, 80]
25
+ },
26
+ {
27
+ name: "x_lens"
28
+ data_type: TYPE_INT64
29
+ dims: [1]
30
+ reshape: { shape: [ ] }
31
+ }
32
+ ]
33
+ output [
34
+ {
35
+ name: "encoder_out"
36
+ data_type: TYPE_FP32
37
+ dims: [-1, -1 ]
38
+ },
39
+ {
40
+ name: "encoder_out_lens"
41
+ data_type: TYPE_INT64
42
+ dims: [1]
43
+ reshape: { shape: [ ] }
44
+ }
45
+ ]
46
+
47
+ dynamic_batching {
48
+ preferred_batch_size: [ 16, 32 ]
49
+ }
50
+
51
+ instance_group [
52
+ {
53
+ count: 1
54
+ kind: KIND_GPU
55
+ }
56
+ ]
model_repo_offline/feature_extractor/1/__pycache__/model.cpython-38.pyc ADDED
Binary file (4.77 kB). View file
 
model_repo_offline/feature_extractor/1/model.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton_python_backend_utils as pb_utils
16
+ from torch.utils.dlpack import to_dlpack
17
+ import torch
18
+ import numpy as np
19
+ import kaldifeat
20
+ import _kaldifeat
21
+ from typing import List
22
+ import json
23
+
24
class Fbank(torch.nn.Module):
    """Thin ``torch.nn.Module`` wrapper around a ``kaldifeat.Fbank`` computer."""

    def __init__(self, opts):
        super().__init__()
        self.fbank = kaldifeat.Fbank(opts)

    def forward(self, waves: List[torch.Tensor]):
        # kaldifeat takes a list of 1-D waveforms and returns a list of
        # per-utterance feature matrices (one tensor per input wave).
        return self.fbank(waves)
31
+
32
+
33
class TritonPythonModel:
    """Triton Python-backend feature extractor.

    Takes padded waveform batches ("wav", "wav_lens") and produces padded
    kaldifeat fbank features ("speech") plus per-utterance frame counts
    ("speech_lengths").
    """

    def initialize(self, args):
        """Parse the model config and build the kaldifeat fbank extractor.

        Called exactly once when the model is loaded.

        Parameters
        ----------
        args : dict
            Standard Triton Python-backend arguments; only
            ``args['model_config']`` (a JSON string) is used here.
        """
        self.model_config = model_config = json.loads(args['model_config'])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Extract features on GPU iff the instance group requests a GPU kind.
        kind = model_config["instance_group"][0]["kind"]
        self.device = "cuda" if "GPU" in kind else "cpu"

        # Mirror the configured "speech" output dtype as a torch dtype
        # (anything other than FP32 is treated as FP16).
        output0_config = pb_utils.get_output_config_by_name(
            model_config, "speech")
        np_dtype = pb_utils.triton_string_to_numpy(output0_config['data_type'])
        self.output0_dtype = (
            torch.float32 if np_dtype == np.float32 else torch.float16)

        # The "speech_lengths" dtype is kept as a numpy dtype.
        output1_config = pb_utils.get_output_config_by_name(
            model_config, "speech_lengths")
        self.output1_dtype = pb_utils.triton_string_to_numpy(
            output1_config['data_type'])

        # Build fbank options from the config's string parameters.
        opts = kaldifeat.FbankOptions()
        opts.frame_opts.dither = 0
        opts.frame_opts.snip_edges = False
        for key, value in self.model_config['parameters'].items():
            text = value["string_value"]
            if key == "num_mel_bins":
                opts.mel_opts.num_bins = int(text)
            elif key == "frame_shift_in_ms":
                opts.frame_opts.frame_shift_ms = float(text)
            elif key == "frame_length_in_ms":
                opts.frame_opts.frame_length_ms = float(text)
            elif key == "sample_rate":
                opts.frame_opts.samp_freq = int(text)
        opts.device = torch.device(self.device)
        self.opts = opts
        self.feature_extractor = Fbank(self.opts)
        self.feature_size = opts.mel_opts.num_bins

    def execute(self, requests):
        """Compute fbank features for every pending request.

        Parameters
        ----------
        requests : list
            A list of ``pb_utils.InferenceRequest``.

        Returns
        -------
        list
            One ``pb_utils.InferenceResponse`` per request, in order.
        """
        request_batches = []   # number of utterances in each request
        request_pad_lens = []  # padded waveform length of each request
        waves = []             # un-padded per-utterance waveforms (all requests)
        responses = []
        for request in requests:
            wav_tensor = pb_utils.get_input_tensor_by_name(request, "wav")
            len_tensor = pb_utils.get_input_tensor_by_name(request, "wav_lens")

            wav_batch = wav_tensor.as_numpy()
            wav_lens = len_tensor.as_numpy()  # b x 1
            request_batches.append(wav_batch.shape[0])
            request_pad_lens.append(wav_batch.shape[1])
            for wav, n in zip(wav_batch, wav_lens):
                # Strip the right-padding before feature extraction.
                waves.append(torch.tensor(wav[0:n[0]], dtype=torch.float32,
                                          device=self.device))

        # One batched kaldifeat call over every utterance of every request.
        features = self.feature_extractor(waves)

        for b, pad_len in zip(request_batches, request_pad_lens):
            # Size the padded output by the frame count of the padded input,
            # so all requests in this batch share a consistent layout.
            expect_feat_len = _kaldifeat.num_frames(pad_len,
                                                    self.opts.frame_opts)
            speech = torch.zeros((b, expect_feat_len, self.feature_size),
                                 dtype=self.output0_dtype, device=self.device)
            speech_lengths = torch.zeros((b, 1), dtype=torch.int64,
                                         device=self.device)
            for i in range(b):
                feat = features.pop(0)
                n_frames = feat.shape[0]
                speech[i, 0:n_frames, :] = feat.to(self.output0_dtype)
                speech_lengths[i][0] = n_frames
            # Hand the outputs back on CPU.
            speech = speech.cpu()
            speech_lengths = speech_lengths.cpu()
            out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
            out1 = pb_utils.Tensor.from_dlpack("speech_lengths",
                                               to_dlpack(speech_lengths))
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[out0, out1]))
        return responses
model_repo_offline/feature_extractor/config.pbtxt ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "feature_extractor"
16
+ backend: "python"
17
+ max_batch_size: 512
18
+
19
+ parameters [
20
+ {
21
+ key: "num_mel_bins",
22
+ value: { string_value: "80"}
23
+ },
24
+ {
25
+ key: "frame_shift_in_ms"
26
+ value: { string_value: "10"}
27
+ },
28
+ {
29
+ key: "frame_length_in_ms"
30
+ value: { string_value: "25"}
31
+ },
32
+ {
33
+ key: "sample_rate"
34
+ value: { string_value: "16000"}
35
+ }
36
+
37
+ ]
38
+
39
+ input [
40
+ {
41
+ name: "wav"
42
+ data_type: TYPE_FP32
43
+ dims: [-1]
44
+ },
45
+ {
46
+ name: "wav_lens"
47
+ data_type: TYPE_INT32
48
+ dims: [1]
49
+ }
50
+ ]
51
+
52
+ output [
53
+ {
54
+ name: "speech"
55
+ data_type: TYPE_FP32
56
+ dims: [-1, 80]
57
+ },
58
+ {
59
+ name: "speech_lengths"
60
+ data_type: TYPE_INT64
61
+ dims: [1]
62
+ }
63
+ ]
64
+
65
+ dynamic_batching {
66
+ }
67
+ instance_group [
68
+ {
69
+ count: 1
70
+ kind: KIND_GPU
71
+ }
72
+ ]
model_repo_offline/joiner/1/.gitkeep ADDED
File without changes
model_repo_offline/joiner/1/joiner.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:761d2fd80adf49d75cf13e408468e8c735126d8baec527c56c797afffb71250a
3
+ size 1026490
model_repo_offline/joiner/config.pbtxt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "joiner"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "joiner.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "projected_encoder_out"
23
+ data_type: TYPE_FP32
24
+ dims: [ 512 ]
25
+ },
26
+ {
27
+ name: "projected_decoder_out"
28
+ data_type: TYPE_FP32
29
+ dims: [ 512 ]
30
+ }
31
+ ]
32
+
33
+ output [
34
+ {
35
+ name: "logit"
36
+ data_type: TYPE_FP32
37
+ dims: [ 500 ]
38
+ }
39
+ ]
40
+
41
+ dynamic_batching {
42
+ }
43
+
44
+ instance_group [
45
+ {
46
+ count: 1
47
+ kind: KIND_GPU
48
+ }
49
+ ]
model_repo_offline/joiner_decoder_proj/1/joiner_decoder_proj.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:230fa585dd50f78e28ecf80285c46dabc974f929e3872eb4a4bc73e8ab8e5832
3
+ size 1050893
model_repo_offline/joiner_decoder_proj/config.pbtxt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "joiner_decoder_proj"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "joiner_decoder_proj.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "decoder_out"
23
+ data_type: TYPE_FP32
24
+ dims: [ 512 ]
25
+ }
26
+ ]
27
+ output [
28
+ {
29
+ name: "projected_decoder_out"
30
+ data_type: TYPE_FP32
31
+ dims: [ 512 ]
32
+ }
33
+ ]
34
+
35
+ dynamic_batching {
36
+ }
37
+
38
+ instance_group [
39
+ {
40
+ count: 1
41
+ kind: KIND_GPU
42
+ }
43
+ ]
model_repo_offline/joiner_encoder_proj/1/joiner_encoder_proj.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:754cab31c25f9f2d48ce34291c8564a593cea5cb6114be82b28cbecaa3b3e9e5
3
+ size 788749
model_repo_offline/joiner_encoder_proj/config.pbtxt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "joiner_encoder_proj"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "joiner_encoder_proj.onnx"
18
+
19
+ max_batch_size: 512
20
+ input [
21
+ {
22
+ name: "encoder_out"
23
+ data_type: TYPE_FP32
24
+ dims: [ 384 ]
25
+ }
26
+ ]
27
+ output [
28
+ {
29
+ name: "projected_encoder_out"
30
+ data_type: TYPE_FP32
31
+ dims: [ 512 ]
32
+ }
33
+ ]
34
+
35
+ dynamic_batching {
36
+ }
37
+
38
+ instance_group [
39
+ {
40
+ count: 2
41
+ kind: KIND_GPU
42
+ }
43
+ ]
model_repo_offline/scorer/1/__pycache__/model.cpython-38.pyc ADDED
Binary file (5.33 kB). View file
 
model_repo_offline/scorer/1/__pycache__/search.cpython-38.pyc ADDED
Binary file (3.06 kB). View file
 
model_repo_offline/scorer/1/model.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import triton_python_backend_utils as pb_utils
17
+ import numpy as np
18
+
19
+ import json
20
+
21
+ import torch
22
+ from torch.utils.dlpack import from_dlpack, to_dlpack
23
+ import sentencepiece as spm
24
+ from icefall.lexicon import Lexicon
25
+
26
+ from search import greedy_search
27
+
28
class TritonPythonModel:
    """Greedy-search scorer for the transducer ensemble.

    Gathers the encoder outputs of all pending requests into one padded
    batch, runs greedy search (via BLS calls to the decoder/joiner models),
    and returns the decoded text per request.
    """

    def initialize(self, args):
        """Load the model config, select the device and build the tokenizer.

        Called exactly once when the model is loaded.

        Parameters
        ----------
        args : dict
            Standard Triton Python-backend arguments: ``model_config``
            (JSON string), ``model_instance_kind``,
            ``model_instance_device_id``, etc.
        """
        self.model_config = model_config = json.loads(args['model_config'])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Numpy dtype of the "OUTPUT0" (transcript) output.
        output0_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT0")
        self.out0_dtype = pb_utils.triton_string_to_numpy(
            output0_config['data_type'])

        kind = args['model_instance_kind']
        device_id = args['model_instance_device_id']
        self.device = f'cuda:{device_id}' if kind == 'GPU' else 'cpu'

        # Mirror the "encoder_out" input dtype as a torch dtype.
        encoder_config = pb_utils.get_input_config_by_name(
            model_config, "encoder_out")
        self.data_type = pb_utils.triton_string_to_numpy(
            encoder_config['data_type'])
        if self.data_type == np.float32:
            self.torch_dtype = torch.float32
        else:
            assert self.data_type == np.float16
            self.torch_dtype = torch.float16

        # Last dim of the encoder output; sizes the fused batch buffer.
        self.encoder_dim = encoder_config['dims'][-1]

        self.init_parameters(self.model_config['parameters'])

    def init_parameters(self, parameters):
        """Flatten config parameters to plain strings and build the tokenizer.

        A BPE tokenizer file selects SentencePiece; otherwise a char-based
        icefall ``Lexicon`` is expected.
        """
        for key, value in parameters.items():
            parameters[key] = value["string_value"]
        self.context_size = int(parameters['context_size'])
        self.decoding_method = parameters['decoding_method']
        if 'bpe' in parameters['tokenizer_file']:
            sp = spm.SentencePieceProcessor()
            sp.load(parameters['tokenizer_file'])
            self.blank_id = sp.piece_to_id("<blk>")
            self.unk_id = sp.piece_to_id("<unk>")
            self.vocab_size = sp.get_piece_size()
            self.tokenizer = sp
        else:
            assert 'char' in parameters['tokenizer_file']
            lexicon = Lexicon(parameters['tokenizer_file'])
            self.unk_id = lexicon.token_table["<unk>"]
            self.blank_id = lexicon.token_table["<blk>"]
            self.vocab_size = max(lexicon.tokens) + 1
            self.tokenizer = lexicon

    def execute(self, requests):
        """Decode every pending request in one fused greedy-search pass.

        Parameters
        ----------
        requests : list
            A list of ``pb_utils.InferenceRequest``.

        Returns
        -------
        list
            One ``pb_utils.InferenceResponse`` per request, in order.
        """
        encoder_chunks = []   # per-request encoder outputs (GPU tensors)
        length_chunks = []    # per-request valid-frame counts
        request_sizes = []    # utterances contributed by each request
        total_seqs = 0
        max_frames = 0

        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out")
            in_1 = pb_utils.get_input_tensor_by_name(request,
                                                     "encoder_out_lens")
            # The encoder output must arrive on the GPU here.
            assert not in_0.is_cpu()
            chunk = from_dlpack(in_0.to_dlpack())
            encoder_chunks.append(chunk)
            max_frames = max(max_frames, chunk.shape[1])
            lens = from_dlpack(in_1.to_dlpack())
            length_chunks.append(lens)
            request_sizes.append(lens.shape[0])
            total_seqs += lens.shape[0]

        # Pad every request's encoder output into one big batch.
        encoder_out = torch.zeros((total_seqs, max_frames, self.encoder_dim),
                                  dtype=self.torch_dtype, device=self.device)
        encoder_out_lens = torch.zeros(total_seqs, dtype=torch.int64)
        st = 0
        for b in request_sizes:
            chunk = encoder_chunks.pop(0)
            encoder_out[st:st + b, 0:chunk.shape[1]] = chunk
            encoder_out_lens[st:st + b] = length_chunks.pop(0)
            st += b

        if self.decoding_method == 'greedy_search':
            ans = greedy_search(encoder_out, encoder_out_lens,
                                self.context_size, self.unk_id, self.blank_id)
        else:
            raise NotImplementedError

        # Map token ids back to text.
        results = []
        if hasattr(self.tokenizer, 'token_table'):
            for hyp in ans:
                results.append(
                    [self.tokenizer.token_table[idx] for idx in hyp])
        else:
            for hyp in self.tokenizer.decode(ans):
                results.append(hyp.split())

        # Slice the flat result list back into per-request responses.
        responses = []
        st = 0
        for b in request_sizes:
            sents = np.array(results[st:st + b])
            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[out0]))
            st += b
        return responses

    def finalize(self):
        """Called exactly once when the model is unloaded."""
        print('Cleaning up...')
model_repo_offline/scorer/1/search.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton_python_backend_utils as pb_utils
16
+ import numpy as np
17
+
18
+ import torch
19
+ from torch.utils.dlpack import from_dlpack, to_dlpack
20
+
21
def forward_joiner(cur_encoder_out, decoder_out):
    """Run the two projection models and the joiner via BLS calls.

    ``cur_encoder_out`` holds one encoder frame per active sequence;
    ``decoder_out`` is the matching decoder output (its length-1 middle
    dim is squeezed away before the call).

    Returns a 2-D CPU tensor of per-token logits.

    Raises ``pb_utils.TritonModelException`` if any BLS call errors.
    """
    def _infer(model_name, output_name, inputs):
        # One blocking BLS call; returns the single requested output tensor.
        request = pb_utils.InferenceRequest(
            model_name=model_name,
            requested_output_names=[output_name],
            inputs=inputs)
        response = request.exec()
        if response.has_error():
            raise pb_utils.TritonModelException(response.error().message())
        return pb_utils.get_output_tensor_by_name(response, output_name)

    enc_in = pb_utils.Tensor.from_dlpack("encoder_out",
                                         to_dlpack(cur_encoder_out))
    dec_in = pb_utils.Tensor.from_dlpack("decoder_out",
                                         to_dlpack(decoder_out.squeeze(1)))

    proj_encoder_out = _infer('joiner_encoder_proj',
                              'projected_encoder_out', [enc_in])
    proj_decoder_out = _infer('joiner_decoder_proj',
                              'projected_decoder_out', [dec_in])
    logits = _infer('joiner', 'logit',
                    [proj_encoder_out, proj_decoder_out])

    logits = torch.utils.dlpack.from_dlpack(logits.to_dlpack()).cpu()
    assert len(logits.shape) == 2, logits.shape
    return logits
64
+
65
def forward_decoder(hyps, context_size):
    """Run the "decoder" model via BLS on each hypothesis's last tokens.

    Feeds the trailing ``context_size`` token ids of every hypothesis in
    ``hyps`` as the int64 input "y" and returns the "decoder_out" tensor
    (converted from DLPack).

    Raises ``pb_utils.TritonModelException`` if the BLS call errors.
    """
    context = np.asarray([h[-context_size:] for h in hyps], dtype=np.int64)
    y_tensor = pb_utils.Tensor("y", context)

    request = pb_utils.InferenceRequest(
        model_name='decoder',
        requested_output_names=['decoder_out'],
        inputs=[y_tensor])
    response = request.exec()
    if response.has_error():
        raise pb_utils.TritonModelException(response.error().message())

    decoder_out = pb_utils.get_output_tensor_by_name(response, 'decoder_out')
    return from_dlpack(decoder_out.to_dlpack())
86
+
87
+
88
def greedy_search(encoder_out, encoder_out_lens, context_size, unk_id, blank_id):
    """Batched greedy decoding for a stateless transducer.

    ``encoder_out`` is the padded (batch, frames, dim) encoder output and
    ``encoder_out_lens`` the valid frame count per sequence.  Packing lets
    each time step process only the sequences still active at that frame.

    Returns one token-id list per input sequence, in the original order,
    with the ``context_size`` seed blanks stripped.
    """
    packed = torch.nn.utils.rnn.pack_padded_sequence(
        input=encoder_out,
        lengths=encoder_out_lens.cpu(),
        batch_first=True,
        enforce_sorted=False,
    )
    # Active-sequence count for each successive frame (non-increasing).
    frame_batch_sizes = packed.batch_sizes.tolist()

    num_seqs = encoder_out.shape[0]
    # Seed every hypothesis with `context_size` blanks.
    hyps = [[blank_id] * context_size for _ in range(num_seqs)]
    decoder_out = forward_decoder(hyps, context_size)

    offset = 0
    for batch_size in frame_batch_sizes:
        # One encoder frame for each of the still-active sequences.
        current_encoder_out = packed.data[offset:offset + batch_size]
        offset += batch_size

        # Shrink the decoder output as shorter sequences drop out.
        decoder_out = decoder_out[:batch_size]

        logits = forward_joiner(current_encoder_out, decoder_out)
        assert logits.ndim == 2, logits.shape

        emitted = False
        for i, token in enumerate(logits.argmax(dim=1).tolist()):
            if token not in (blank_id, unk_id):
                hyps[i].append(token)
                emitted = True
        # Only recompute the decoder output when some hypothesis grew.
        if emitted:
            decoder_out = forward_decoder(hyps[:batch_size], context_size)

    # Drop the blank seed, then undo pack_padded_sequence's length sort.
    sorted_ans = [h[context_size:] for h in hyps]
    unsorted_indices = packed.unsorted_indices.tolist()
    return [sorted_ans[unsorted_indices[i]] for i in range(num_seqs)]
model_repo_offline/scorer/config.pbtxt ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "scorer"
16
+ backend: "python"
17
+ max_batch_size: 512
18
+
19
+ parameters [
20
+ {
21
+ key: "context_size",
22
+ value: { string_value: "2"}
23
+ },
24
+ {
25
+ key: "tokenizer_file",
26
+ value: { string_value: "/workspace/bpe.model"}
27
+ },
28
+ {
29
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS",
30
+ value: {string_value:"no"}
31
+ },
32
+ {
33
+ key: "decoding_method",
34
+ value: { string_value: "greedy_search"}
35
+ }
36
+ ]
37
+
38
+
39
+ input [
40
+ {
41
+ name: "encoder_out"
42
+ data_type: TYPE_FP32
43
+ dims: [-1, 384]
44
+ },
45
+ {
46
+ name: "encoder_out_lens"
47
+ data_type: TYPE_INT64
48
+ dims: [1]
49
+ reshape: { shape: [ ] }
50
+ }
51
+ ]
52
+
53
+ output [
54
+ {
55
+ name: "OUTPUT0"
56
+ data_type: TYPE_STRING
57
+ dims: [1]
58
+ }
59
+ ]
60
+
61
+ dynamic_batching {
62
+ }
63
+ instance_group [
64
+ {
65
+ count: 1
66
+ kind: KIND_CPU
67
+ }
68
+ ]
model_repo_offline/transducer/1/.gitkeep ADDED
File without changes
model_repo_offline/transducer/config.pbtxt ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "transducer"
16
+ platform: "ensemble"
17
+ max_batch_size: 512
18
+
19
+ input [
20
+ {
21
+ name: "WAV"
22
+ data_type: TYPE_FP32
23
+ dims: [-1]
24
+ },
25
+ {
26
+ name: "WAV_LENS"
27
+ data_type: TYPE_INT32
28
+ dims: [1]
29
+ }
30
+ ]
31
+
32
+ output [
33
+ {
34
+ name: "TRANSCRIPTS"
35
+ data_type: TYPE_STRING
36
+ dims: [1]
37
+ }
38
+ ]
39
+
40
+ ensemble_scheduling {
41
+ step [
42
+ {
43
+ model_name: "feature_extractor"
44
+ model_version: -1
45
+ input_map {
46
+ key: "wav"
47
+ value: "WAV"
48
+ }
49
+ input_map {
50
+ key: "wav_lens"
51
+ value: "WAV_LENS"
52
+ }
53
+ output_map {
54
+ key: "speech"
55
+ value: "SPEECH"
56
+ }
57
+ output_map {
58
+ key: "speech_lengths"
59
+ value: "SPEECH_LENGTHS"
60
+ }
61
+ },
62
+ {
63
+ model_name: "encoder"
64
+ model_version: -1
65
+ input_map {
66
+ key: "x"
67
+ value: "SPEECH"
68
+ }
69
+ input_map {
70
+ key: "x_lens"
71
+ value: "SPEECH_LENGTHS"
72
+ }
73
+ output_map {
74
+ key: "encoder_out"
75
+ value: "encoder_out"
76
+ }
77
+ output_map {
78
+ key: "encoder_out_lens"
79
+ value: "encoder_out_lens"
80
+ }
81
+ },
82
+ {
83
+ model_name: "scorer"
84
+ model_version: -1
85
+ input_map {
86
+ key: "encoder_out"
87
+ value: "encoder_out"
88
+ }
89
+ input_map {
90
+ key: "encoder_out_lens"
91
+ value: "encoder_out_lens"
92
+ }
93
+ output_map {
94
+ key: "OUTPUT0"
95
+ value: "TRANSCRIPTS"
96
+ }
97
+ }
98
+ ]
99
+ }