Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- whisper/1/__pycache__/fbank.cpython-310.pyc +0 -0
- whisper/1/__pycache__/model.cpython-310.pyc +0 -0
- whisper/1/__pycache__/tokenizer.cpython-310.pyc +0 -0
- whisper/1/__pycache__/whisper_trtllm.cpython-310.pyc +0 -0
- whisper/1/distil_large_v2/decoder_config.json +52 -0
- whisper/1/distil_large_v2/encoder_config.json +45 -0
- whisper/1/distil_large_v2/whisper_decoder_float16_tp1_rank0.engine +3 -0
- whisper/1/distil_large_v2/whisper_encoder_float16_tp1_rank0.engine +3 -0
- whisper/1/fbank.py +91 -0
- whisper/1/gpt2.tiktoken +0 -0
- whisper/1/mel_filters.npz +3 -0
- whisper/1/model.py +127 -0
- whisper/1/multilingual.tiktoken +0 -0
- whisper/1/tokenizer.py +184 -0
- whisper/1/whisper_trtllm.py +219 -0
- whisper/config.pbtxt +63 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
whisper/1/distil_large_v2/whisper_decoder_float16_tp1_rank0.engine filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
whisper/1/distil_large_v2/whisper_encoder_float16_tp1_rank0.engine filter=lfs diff=lfs merge=lfs -text
|
whisper/1/__pycache__/fbank.cpython-310.pyc
ADDED
|
Binary file (3.03 kB). View file
|
|
|
whisper/1/__pycache__/model.cpython-310.pyc
ADDED
|
Binary file (4.75 kB). View file
|
|
|
whisper/1/__pycache__/tokenizer.cpython-310.pyc
ADDED
|
Binary file (4.6 kB). View file
|
|
|
whisper/1/__pycache__/whisper_trtllm.cpython-310.pyc
ADDED
|
Binary file (6.26 kB). View file
|
|
|
whisper/1/distil_large_v2/decoder_config.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_config": {
|
| 3 |
+
"apply_query_key_layer_scaling": false,
|
| 4 |
+
"cross_attention": true,
|
| 5 |
+
"has_position_embedding": true,
|
| 6 |
+
"has_token_type_embedding": false,
|
| 7 |
+
"hidden_act": "gelu",
|
| 8 |
+
"hidden_size": 1280,
|
| 9 |
+
"int8": false,
|
| 10 |
+
"max_batch_size": 8,
|
| 11 |
+
"max_beam_width": 4,
|
| 12 |
+
"max_input_len": 14,
|
| 13 |
+
"max_output_len": 100,
|
| 14 |
+
"max_position_embeddings": 448,
|
| 15 |
+
"name": "whisper_decoder",
|
| 16 |
+
"num_heads": 20,
|
| 17 |
+
"num_layers": 2,
|
| 18 |
+
"precision": "float16",
|
| 19 |
+
"strongly_typed": false,
|
| 20 |
+
"tensor_parallel": 1,
|
| 21 |
+
"use_refit": false,
|
| 22 |
+
"vocab_size": 51865
|
| 23 |
+
},
|
| 24 |
+
"plugin_config": {
|
| 25 |
+
"attention_qk_half_accumulation": false,
|
| 26 |
+
"bert_attention_plugin": null,
|
| 27 |
+
"context_fmha": true,
|
| 28 |
+
"context_fmha_fp32_acc": false,
|
| 29 |
+
"enable_xqa": false,
|
| 30 |
+
"gemm_plugin": "float16",
|
| 31 |
+
"gpt_attention_plugin": "float16",
|
| 32 |
+
"identity_plugin": null,
|
| 33 |
+
"layernorm_quantization_plugin": null,
|
| 34 |
+
"lookup_plugin": null,
|
| 35 |
+
"lora_plugin": null,
|
| 36 |
+
"moe_plugin": null,
|
| 37 |
+
"multi_block_mode": false,
|
| 38 |
+
"nccl_plugin": null,
|
| 39 |
+
"paged_kv_cache": false,
|
| 40 |
+
"quantize_per_token_plugin": false,
|
| 41 |
+
"quantize_tensor_plugin": false,
|
| 42 |
+
"remove_input_padding": false,
|
| 43 |
+
"rmsnorm_quantization_plugin": null,
|
| 44 |
+
"smooth_quant_gemm_plugin": null,
|
| 45 |
+
"tokens_per_block": 128,
|
| 46 |
+
"use_context_fmha_for_generation": false,
|
| 47 |
+
"use_custom_all_reduce": false,
|
| 48 |
+
"use_paged_context_fmha": false,
|
| 49 |
+
"weight_only_groupwise_quant_matmul_plugin": null,
|
| 50 |
+
"weight_only_quant_matmul_plugin": null
|
| 51 |
+
}
|
| 52 |
+
}
|
whisper/1/distil_large_v2/encoder_config.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_config": {
|
| 3 |
+
"hidden_size": 1280,
|
| 4 |
+
"int8": false,
|
| 5 |
+
"max_batch_size": 8,
|
| 6 |
+
"max_beam_width": 4,
|
| 7 |
+
"n_mels": 80,
|
| 8 |
+
"name": "whisper_encoder",
|
| 9 |
+
"num_heads": 20,
|
| 10 |
+
"num_languages": 99,
|
| 11 |
+
"num_layers": 32,
|
| 12 |
+
"precision": "float16",
|
| 13 |
+
"strongly_typed": false,
|
| 14 |
+
"tensor_parallel": 1,
|
| 15 |
+
"use_refit": false
|
| 16 |
+
},
|
| 17 |
+
"plugin_config": {
|
| 18 |
+
"attention_qk_half_accumulation": false,
|
| 19 |
+
"bert_attention_plugin": "float16",
|
| 20 |
+
"context_fmha": true,
|
| 21 |
+
"context_fmha_fp32_acc": false,
|
| 22 |
+
"enable_xqa": false,
|
| 23 |
+
"gemm_plugin": "float16",
|
| 24 |
+
"gpt_attention_plugin": null,
|
| 25 |
+
"identity_plugin": null,
|
| 26 |
+
"layernorm_quantization_plugin": null,
|
| 27 |
+
"lookup_plugin": null,
|
| 28 |
+
"lora_plugin": null,
|
| 29 |
+
"moe_plugin": null,
|
| 30 |
+
"multi_block_mode": false,
|
| 31 |
+
"nccl_plugin": null,
|
| 32 |
+
"paged_kv_cache": false,
|
| 33 |
+
"quantize_per_token_plugin": false,
|
| 34 |
+
"quantize_tensor_plugin": false,
|
| 35 |
+
"remove_input_padding": false,
|
| 36 |
+
"rmsnorm_quantization_plugin": null,
|
| 37 |
+
"smooth_quant_gemm_plugin": null,
|
| 38 |
+
"tokens_per_block": 128,
|
| 39 |
+
"use_context_fmha_for_generation": false,
|
| 40 |
+
"use_custom_all_reduce": false,
|
| 41 |
+
"use_paged_context_fmha": false,
|
| 42 |
+
"weight_only_groupwise_quant_matmul_plugin": null,
|
| 43 |
+
"weight_only_quant_matmul_plugin": null
|
| 44 |
+
}
|
| 45 |
+
}
|
whisper/1/distil_large_v2/whisper_decoder_float16_tp1_rank0.engine
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ef608047a26a2126a0705dbf42ddc614a246e7e59696e777a4f3ccc408acd80
|
| 3 |
+
size 372366004
|
whisper/1/distil_large_v2/whisper_encoder_float16_tp1_rank0.engine
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04520a320ea43e46cb83734eece8d966e29a651f97b7073a59dcd9bebe053be4
|
| 3 |
+
size 1278617212
|
whisper/1/fbank.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
# Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
import torch.nn.functional as F
|
| 18 |
+
from typing import Union
|
| 19 |
+
import os
|
| 20 |
+
|
| 21 |
+
def mel_filters(device, n_mels: int =128) -> torch.Tensor:
|
| 22 |
+
"""
|
| 23 |
+
load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
|
| 24 |
+
Allows decoupling librosa dependency; saved using:
|
| 25 |
+
|
| 26 |
+
np.savez_compressed(
|
| 27 |
+
"mel_filters.npz",
|
| 28 |
+
mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
|
| 29 |
+
)
|
| 30 |
+
"""
|
| 31 |
+
assert n_mels == 80 or n_mels == 128 , f"Unsupported n_mels: {n_mels}"
|
| 32 |
+
with np.load(
|
| 33 |
+
os.path.join(os.path.dirname(__file__), "mel_filters.npz")
|
| 34 |
+
) as f:
|
| 35 |
+
return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def log_mel_spectrogram(
|
| 39 |
+
audio: Union[torch.Tensor],
|
| 40 |
+
filters: torch.Tensor,
|
| 41 |
+
n_mels: int = 128,
|
| 42 |
+
n_fft: int = 400,
|
| 43 |
+
hop_length: int = 160,
|
| 44 |
+
):
|
| 45 |
+
"""
|
| 46 |
+
Compute the log-Mel spectrogram of
|
| 47 |
+
|
| 48 |
+
Parameters
|
| 49 |
+
----------
|
| 50 |
+
audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
|
| 51 |
+
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
|
| 52 |
+
|
| 53 |
+
n_mels: int
|
| 54 |
+
The number of Mel-frequency filters, only 80 or 128 is supported
|
| 55 |
+
|
| 56 |
+
filters: torch.Tensor
|
| 57 |
+
|
| 58 |
+
Returns
|
| 59 |
+
-------
|
| 60 |
+
torch.Tensor, shape = (128, n_frames)
|
| 61 |
+
A Tensor that contains the Mel spectrogram
|
| 62 |
+
"""
|
| 63 |
+
window = torch.hann_window(n_fft).to(audio.device)
|
| 64 |
+
stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
|
| 65 |
+
magnitudes = stft[..., :-1].abs() ** 2
|
| 66 |
+
|
| 67 |
+
mel_spec = filters @ magnitudes
|
| 68 |
+
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
|
| 69 |
+
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
|
| 70 |
+
log_spec = (log_spec + 4.0) / 4.0
|
| 71 |
+
# cast to float 16
|
| 72 |
+
log_spec = log_spec.half()
|
| 73 |
+
return log_spec
|
| 74 |
+
|
| 75 |
+
class FeatureExtractor(torch.nn.Module):
|
| 76 |
+
"""Your Python model must use the same class name. Every Python model
|
| 77 |
+
that is created must have "TritonPythonModel" as the class name.
|
| 78 |
+
"""
|
| 79 |
+
|
| 80 |
+
def __init__(self, n_mels: int = 128):
|
| 81 |
+
self.device = torch.device("cuda")
|
| 82 |
+
self.n_mels = n_mels
|
| 83 |
+
self.filters = mel_filters(self.device, n_mels=self.n_mels)
|
| 84 |
+
|
| 85 |
+
def compute_feature(self, wav, target: int = 3000):
|
| 86 |
+
mel = log_mel_spectrogram(wav, self.filters)
|
| 87 |
+
assert mel.shape[1] <= target, f"{mel.shape[1]} > {target}, audio is too long"
|
| 88 |
+
if mel.shape[1] < target:
|
| 89 |
+
mel = F.pad(mel, (0, target - mel.shape[1]), mode='constant')
|
| 90 |
+
mel = mel.unsqueeze(0)
|
| 91 |
+
return mel
|
whisper/1/gpt2.tiktoken
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
whisper/1/mel_filters.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
|
| 3 |
+
size 4271
|
whisper/1/model.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
import triton_python_backend_utils as pb_utils
|
| 3 |
+
import numpy as np
|
| 4 |
+
import json
|
| 5 |
+
import torch
|
| 6 |
+
from torch.utils.dlpack import from_dlpack, to_dlpack
|
| 7 |
+
import re
|
| 8 |
+
from .tokenizer import get_tokenizer
|
| 9 |
+
from .whisper_trtllm import WhisperTRTLLM
|
| 10 |
+
from .fbank import FeatureExtractor
|
| 11 |
+
|
| 12 |
+
class TritonPythonModel:
|
| 13 |
+
"""Your Python model must use the same class name. Every Python model
|
| 14 |
+
that is created must have "TritonPythonModel" as the class name.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
def initialize(self, args):
|
| 18 |
+
"""`initialize` is called only once when the model is being loaded.
|
| 19 |
+
Implementing `initialize` function is optional. This function allows
|
| 20 |
+
the model to initialize any state associated with this model.
|
| 21 |
+
|
| 22 |
+
Parameters
|
| 23 |
+
----------
|
| 24 |
+
args : dict
|
| 25 |
+
Both keys and values are strings. The dictionary keys and values are:
|
| 26 |
+
* model_config: A JSON string containing the model configuration
|
| 27 |
+
* model_instance_kind: A string containing model instance kind
|
| 28 |
+
* model_instance_device_id: A string containing model instance device ID
|
| 29 |
+
* model_repository: Model repository path
|
| 30 |
+
* model_version: Model version
|
| 31 |
+
* model_name: Model name
|
| 32 |
+
"""
|
| 33 |
+
self.model_config = model_config = json.loads(args['model_config'])
|
| 34 |
+
|
| 35 |
+
# Get OUTPUT0 configuration
|
| 36 |
+
output0_config = pb_utils.get_output_config_by_name(
|
| 37 |
+
model_config, "TRANSCRIPTS")
|
| 38 |
+
# Convert Triton types to numpy types
|
| 39 |
+
self.out0_dtype = pb_utils.triton_string_to_numpy(
|
| 40 |
+
output0_config['data_type'])
|
| 41 |
+
|
| 42 |
+
self.tokenizer = get_tokenizer(num_languages=100)
|
| 43 |
+
self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
|
| 44 |
+
self.device = torch.device("cuda")
|
| 45 |
+
self.init_model(self.model_config['parameters'])
|
| 46 |
+
|
| 47 |
+
def init_model(self, parameters):
|
| 48 |
+
for key,value in parameters.items():
|
| 49 |
+
parameters[key] = value["string_value"]
|
| 50 |
+
engine_dir = parameters["engine_dir"]
|
| 51 |
+
n_mels = int(parameters["n_mels"])
|
| 52 |
+
self.model = WhisperTRTLLM(engine_dir)
|
| 53 |
+
self.feature_extractor = FeatureExtractor(n_mels=n_mels)
|
| 54 |
+
|
| 55 |
+
def execute(self, requests):
|
| 56 |
+
"""`execute` must be implemented in every Python model. `execute`
|
| 57 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
| 58 |
+
argument. This function is called when an inference is requested
|
| 59 |
+
for this model.
|
| 60 |
+
|
| 61 |
+
Parameters
|
| 62 |
+
----------
|
| 63 |
+
requests : list
|
| 64 |
+
A list of pb_utils.InferenceRequest
|
| 65 |
+
|
| 66 |
+
Returns
|
| 67 |
+
-------
|
| 68 |
+
list
|
| 69 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
| 70 |
+
be the same as `requests`
|
| 71 |
+
"""
|
| 72 |
+
# Every Python backend must iterate through list of requests and create
|
| 73 |
+
# an instance of pb_utils.InferenceResponse class for each of them. You
|
| 74 |
+
# should avoid storing any of the input Tensors in the class attributes
|
| 75 |
+
# as they will be overridden in subsequent inference requests. You can
|
| 76 |
+
# make a copy of the underlying NumPy array and store it if it is
|
| 77 |
+
# required.
|
| 78 |
+
mel_list, text_prefix_list = [], []
|
| 79 |
+
for request in requests:
|
| 80 |
+
# Perform inference on the request and append it to responses list...
|
| 81 |
+
in_0 = pb_utils.get_input_tensor_by_name(request, "TEXT_PREFIX")
|
| 82 |
+
in_1 = pb_utils.get_input_tensor_by_name(request, "WAV")
|
| 83 |
+
|
| 84 |
+
wav = in_1.as_numpy()
|
| 85 |
+
assert wav.shape[0] == 1, "Only support batch size 1"
|
| 86 |
+
wav = torch.from_numpy(wav[0]).to(self.device)
|
| 87 |
+
mel = self.feature_extractor.compute_feature(wav)
|
| 88 |
+
mel_list.append(mel)
|
| 89 |
+
|
| 90 |
+
text_prefix_list.append(in_0.as_numpy().tolist())
|
| 91 |
+
# concat tensors in batch dimension
|
| 92 |
+
features = torch.cat(mel_list, dim=0)
|
| 93 |
+
features = features.to(self.device)
|
| 94 |
+
|
| 95 |
+
prompt_ids = []
|
| 96 |
+
for text_prefix in text_prefix_list:
|
| 97 |
+
text_prefix = text_prefix[0][0].decode('utf-8')
|
| 98 |
+
if text_prefix == "":
|
| 99 |
+
text_prefix = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
|
| 100 |
+
prompt_id = self.tokenizer.encode(text_prefix, allowed_special=self.tokenizer.special_tokens_set)
|
| 101 |
+
# convert prompt_id to tensor, tensor shape is [Seq]
|
| 102 |
+
prompt_id = torch.tensor(prompt_id)
|
| 103 |
+
prompt_ids.append(prompt_id)
|
| 104 |
+
# convert prompt_ids to tensor, tensor shape is [Batch, Seq], left padding with self.blank
|
| 105 |
+
tokens = torch.nn.utils.rnn.pad_sequence(prompt_ids, batch_first=True, padding_value=self.blank)
|
| 106 |
+
tokens = tokens.to(features.device)
|
| 107 |
+
print(features.shape)
|
| 108 |
+
output_ids = self.model.process_batch(features, tokens)
|
| 109 |
+
|
| 110 |
+
results = [output_ids[i][0] for i in range(len(output_ids))]
|
| 111 |
+
|
| 112 |
+
responses = []
|
| 113 |
+
for result in results:
|
| 114 |
+
s = self.tokenizer.decode(result)
|
| 115 |
+
s = re.sub(r'<\|.*?\|>', '', s)
|
| 116 |
+
sentence = np.array([s])
|
| 117 |
+
out0 = pb_utils.Tensor("TRANSCRIPTS", sentence.astype(self.out0_dtype))
|
| 118 |
+
inference_response = pb_utils.InferenceResponse(output_tensors=[out0])
|
| 119 |
+
responses.append(inference_response)
|
| 120 |
+
return responses
|
| 121 |
+
|
| 122 |
+
def finalize(self):
|
| 123 |
+
"""`finalize` is called only once when the model is being unloaded.
|
| 124 |
+
Implementing `finalize` function is optional. This function allows
|
| 125 |
+
the model to perform any necessary clean ups before exit.
|
| 126 |
+
"""
|
| 127 |
+
print('Cleaning up...')
|
whisper/1/multilingual.tiktoken
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
whisper/1/tokenizer.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
# Modified from https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
|
| 16 |
+
import base64
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
import tiktoken
|
| 20 |
+
|
| 21 |
+
LANGUAGES = {
|
| 22 |
+
"en": "english",
|
| 23 |
+
"zh": "chinese",
|
| 24 |
+
"de": "german",
|
| 25 |
+
"es": "spanish",
|
| 26 |
+
"ru": "russian",
|
| 27 |
+
"ko": "korean",
|
| 28 |
+
"fr": "french",
|
| 29 |
+
"ja": "japanese",
|
| 30 |
+
"pt": "portuguese",
|
| 31 |
+
"tr": "turkish",
|
| 32 |
+
"pl": "polish",
|
| 33 |
+
"ca": "catalan",
|
| 34 |
+
"nl": "dutch",
|
| 35 |
+
"ar": "arabic",
|
| 36 |
+
"sv": "swedish",
|
| 37 |
+
"it": "italian",
|
| 38 |
+
"id": "indonesian",
|
| 39 |
+
"hi": "hindi",
|
| 40 |
+
"fi": "finnish",
|
| 41 |
+
"vi": "vietnamese",
|
| 42 |
+
"he": "hebrew",
|
| 43 |
+
"uk": "ukrainian",
|
| 44 |
+
"el": "greek",
|
| 45 |
+
"ms": "malay",
|
| 46 |
+
"cs": "czech",
|
| 47 |
+
"ro": "romanian",
|
| 48 |
+
"da": "danish",
|
| 49 |
+
"hu": "hungarian",
|
| 50 |
+
"ta": "tamil",
|
| 51 |
+
"no": "norwegian",
|
| 52 |
+
"th": "thai",
|
| 53 |
+
"ur": "urdu",
|
| 54 |
+
"hr": "croatian",
|
| 55 |
+
"bg": "bulgarian",
|
| 56 |
+
"lt": "lithuanian",
|
| 57 |
+
"la": "latin",
|
| 58 |
+
"mi": "maori",
|
| 59 |
+
"ml": "malayalam",
|
| 60 |
+
"cy": "welsh",
|
| 61 |
+
"sk": "slovak",
|
| 62 |
+
"te": "telugu",
|
| 63 |
+
"fa": "persian",
|
| 64 |
+
"lv": "latvian",
|
| 65 |
+
"bn": "bengali",
|
| 66 |
+
"sr": "serbian",
|
| 67 |
+
"az": "azerbaijani",
|
| 68 |
+
"sl": "slovenian",
|
| 69 |
+
"kn": "kannada",
|
| 70 |
+
"et": "estonian",
|
| 71 |
+
"mk": "macedonian",
|
| 72 |
+
"br": "breton",
|
| 73 |
+
"eu": "basque",
|
| 74 |
+
"is": "icelandic",
|
| 75 |
+
"hy": "armenian",
|
| 76 |
+
"ne": "nepali",
|
| 77 |
+
"mn": "mongolian",
|
| 78 |
+
"bs": "bosnian",
|
| 79 |
+
"kk": "kazakh",
|
| 80 |
+
"sq": "albanian",
|
| 81 |
+
"sw": "swahili",
|
| 82 |
+
"gl": "galician",
|
| 83 |
+
"mr": "marathi",
|
| 84 |
+
"pa": "punjabi",
|
| 85 |
+
"si": "sinhala",
|
| 86 |
+
"km": "khmer",
|
| 87 |
+
"sn": "shona",
|
| 88 |
+
"yo": "yoruba",
|
| 89 |
+
"so": "somali",
|
| 90 |
+
"af": "afrikaans",
|
| 91 |
+
"oc": "occitan",
|
| 92 |
+
"ka": "georgian",
|
| 93 |
+
"be": "belarusian",
|
| 94 |
+
"tg": "tajik",
|
| 95 |
+
"sd": "sindhi",
|
| 96 |
+
"gu": "gujarati",
|
| 97 |
+
"am": "amharic",
|
| 98 |
+
"yi": "yiddish",
|
| 99 |
+
"lo": "lao",
|
| 100 |
+
"uz": "uzbek",
|
| 101 |
+
"fo": "faroese",
|
| 102 |
+
"ht": "haitian creole",
|
| 103 |
+
"ps": "pashto",
|
| 104 |
+
"tk": "turkmen",
|
| 105 |
+
"nn": "nynorsk",
|
| 106 |
+
"mt": "maltese",
|
| 107 |
+
"sa": "sanskrit",
|
| 108 |
+
"lb": "luxembourgish",
|
| 109 |
+
"my": "myanmar",
|
| 110 |
+
"bo": "tibetan",
|
| 111 |
+
"tl": "tagalog",
|
| 112 |
+
"mg": "malagasy",
|
| 113 |
+
"as": "assamese",
|
| 114 |
+
"tt": "tatar",
|
| 115 |
+
"haw": "hawaiian",
|
| 116 |
+
"ln": "lingala",
|
| 117 |
+
"ha": "hausa",
|
| 118 |
+
"ba": "bashkir",
|
| 119 |
+
"jw": "javanese",
|
| 120 |
+
"su": "sundanese",
|
| 121 |
+
"yue": "cantonese",
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def get_tokenizer(name: str = "multilingual",
|
| 126 |
+
num_languages: int = 99,
|
| 127 |
+
tokenizer_dir: str = None):
|
| 128 |
+
if tokenizer_dir is None:
|
| 129 |
+
vocab_path = os.path.join(os.path.dirname(__file__),
|
| 130 |
+
f"./{name}.tiktoken")
|
| 131 |
+
else:
|
| 132 |
+
vocab_path = os.path.join(tokenizer_dir, f"{name}.tiktoken")
|
| 133 |
+
ranks = {
|
| 134 |
+
base64.b64decode(token): int(rank)
|
| 135 |
+
for token, rank in (line.split() for line in open(vocab_path) if line)
|
| 136 |
+
}
|
| 137 |
+
n_vocab = len(ranks)
|
| 138 |
+
special_tokens = {}
|
| 139 |
+
|
| 140 |
+
specials = [
|
| 141 |
+
"<|endoftext|>",
|
| 142 |
+
"<|startoftranscript|>",
|
| 143 |
+
*[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
|
| 144 |
+
"<|translate|>",
|
| 145 |
+
"<|transcribe|>",
|
| 146 |
+
"<|startoflm|>",
|
| 147 |
+
"<|startofprev|>",
|
| 148 |
+
"<|nospeech|>",
|
| 149 |
+
"<|notimestamps|>",
|
| 150 |
+
*[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
|
| 151 |
+
]
|
| 152 |
+
|
| 153 |
+
for token in specials:
|
| 154 |
+
special_tokens[token] = n_vocab
|
| 155 |
+
n_vocab += 1
|
| 156 |
+
|
| 157 |
+
return tiktoken.Encoding(
|
| 158 |
+
name=os.path.basename(vocab_path),
|
| 159 |
+
explicit_n_vocab=n_vocab,
|
| 160 |
+
pat_str=
|
| 161 |
+
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
|
| 162 |
+
mergeable_ranks=ranks,
|
| 163 |
+
special_tokens=special_tokens,
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
if __name__ == "__main__":
|
| 168 |
+
enc = get_tokenizer()
|
| 169 |
+
mytest_str = "<|startofprev|> Nvidia<|startoftranscript|><|en|><|transcribe|>"
|
| 170 |
+
encoding = enc.encode(mytest_str, allowed_special=enc.special_tokens_set)
|
| 171 |
+
mystr = enc.decode([50361, 45, 43021, 50258, 50259, 50359])
|
| 172 |
+
mystr2 = enc.decode([50361, 46284, 50258, 50259, 50359])
|
| 173 |
+
#print(encoding, mystr, mystr2)
|
| 174 |
+
print(
|
| 175 |
+
enc.encode("<|startoftranscript|>",
|
| 176 |
+
allowed_special=enc.special_tokens_set)[0])
|
| 177 |
+
print(
|
| 178 |
+
enc.encode("<|endoftext|>",
|
| 179 |
+
allowed_special=enc.special_tokens_set)[0])
|
| 180 |
+
my_zh_str = "好好学习"
|
| 181 |
+
encoding = enc.encode(my_zh_str, allowed_special=enc.special_tokens_set)
|
| 182 |
+
decoding = enc.decode(encoding)
|
| 183 |
+
print(type(decoding))
|
| 184 |
+
#print(encoding, decoding)
|
whisper/1/whisper_trtllm.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
import json
|
| 16 |
+
from collections import OrderedDict
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import torch
|
| 21 |
+
|
| 22 |
+
import tensorrt_llm
|
| 23 |
+
import tensorrt_llm.logger as logger
|
| 24 |
+
from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
|
| 25 |
+
trt_dtype_to_torch)
|
| 26 |
+
from tensorrt_llm.runtime import ModelConfig, SamplingConfig
|
| 27 |
+
from tensorrt_llm.runtime.session import Session, TensorInfo
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class WhisperEncoding:
|
| 31 |
+
|
| 32 |
+
def __init__(self, engine_dir):
|
| 33 |
+
self.session = self.get_session(engine_dir)
|
| 34 |
+
|
| 35 |
+
def get_session(self, engine_dir):
|
| 36 |
+
config_path = engine_dir / 'encoder_config.json'
|
| 37 |
+
with open(config_path, 'r') as f:
|
| 38 |
+
config = json.load(f)
|
| 39 |
+
|
| 40 |
+
use_gpt_attention_plugin = config['plugin_config'][
|
| 41 |
+
'gpt_attention_plugin']
|
| 42 |
+
dtype = config['builder_config']['precision']
|
| 43 |
+
n_mels = config['builder_config']['n_mels']
|
| 44 |
+
num_languages = config['builder_config']['num_languages']
|
| 45 |
+
|
| 46 |
+
self.dtype = dtype
|
| 47 |
+
self.n_mels = n_mels
|
| 48 |
+
self.num_languages = num_languages
|
| 49 |
+
|
| 50 |
+
serialize_path = engine_dir / f'whisper_encoder_{self.dtype}_tp1_rank0.engine'
|
| 51 |
+
|
| 52 |
+
with open(serialize_path, 'rb') as f:
|
| 53 |
+
session = Session.from_serialized_engine(f.read())
|
| 54 |
+
|
| 55 |
+
return session
|
| 56 |
+
|
| 57 |
+
def get_audio_features(self, mel):
|
| 58 |
+
|
| 59 |
+
input_lengths = torch.tensor(
|
| 60 |
+
[mel.shape[2] // 2 for _ in range(mel.shape[0])],
|
| 61 |
+
dtype=torch.int32,
|
| 62 |
+
device=mel.device)
|
| 63 |
+
|
| 64 |
+
inputs = OrderedDict()
|
| 65 |
+
inputs['x'] = mel
|
| 66 |
+
inputs['input_lengths'] = input_lengths
|
| 67 |
+
|
| 68 |
+
output_list = [
|
| 69 |
+
TensorInfo('x', str_dtype_to_trt(self.dtype), mel.shape),
|
| 70 |
+
TensorInfo('input_lengths', str_dtype_to_trt('int32'),
|
| 71 |
+
input_lengths.shape)
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
output_info = (self.session).infer_shapes(output_list)
|
| 75 |
+
|
| 76 |
+
logger.debug(f'output info {output_info}')
|
| 77 |
+
outputs = {
|
| 78 |
+
t.name: torch.empty(tuple(t.shape),
|
| 79 |
+
dtype=trt_dtype_to_torch(t.dtype),
|
| 80 |
+
device='cuda')
|
| 81 |
+
for t in output_info
|
| 82 |
+
}
|
| 83 |
+
stream = torch.cuda.current_stream()
|
| 84 |
+
ok = self.session.run(inputs=inputs,
|
| 85 |
+
outputs=outputs,
|
| 86 |
+
stream=stream.cuda_stream)
|
| 87 |
+
assert ok, 'Engine execution failed'
|
| 88 |
+
stream.synchronize()
|
| 89 |
+
audio_features = outputs['output']
|
| 90 |
+
return audio_features
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class WhisperDecoding:
    """Wraps the TRT-LLM Whisper decoder: loads its config and serialized
    engine, then runs autoregressive generation conditioned on encoder
    output features."""

    def __init__(self, engine_dir, runtime_mapping, debug_mode=False):
        self.decoder_config = self.get_config(engine_dir)
        self.decoder_generation_session = self.get_session(
            engine_dir, runtime_mapping, debug_mode)

    def get_config(self, engine_dir):
        """Load decoder_config.json from `engine_dir` (a pathlib.Path) and
        flatten its plugin_config and builder_config sections into a single
        OrderedDict (builder keys overwrite plugin keys on collision)."""
        config_path = engine_dir / 'decoder_config.json'
        with open(config_path, 'r') as f:
            config = json.load(f)
        decoder_config = OrderedDict()
        decoder_config.update(config['plugin_config'])
        decoder_config.update(config['builder_config'])
        return decoder_config

    def get_session(self, engine_dir, runtime_mapping, debug_mode=False):
        """Deserialize the decoder engine and build a GenerationSession."""
        dtype = self.decoder_config['precision']
        serialize_path = engine_dir / f'whisper_decoder_{dtype}_tp1_rank0.engine'
        with open(serialize_path, "rb") as f:
            decoder_engine_buffer = f.read()

        # Whisper uses full multi-head attention, so num_kv_heads == num_heads.
        decoder_model_config = ModelConfig(
            max_batch_size=self.decoder_config['max_batch_size'],
            max_beam_width=self.decoder_config['max_beam_width'],
            num_heads=self.decoder_config['num_heads'],
            num_kv_heads=self.decoder_config['num_heads'],
            hidden_size=self.decoder_config['hidden_size'],
            vocab_size=self.decoder_config['vocab_size'],
            num_layers=self.decoder_config['num_layers'],
            gpt_attention_plugin=self.decoder_config['gpt_attention_plugin'],
            remove_input_padding=self.decoder_config['remove_input_padding'],
            cross_attention=self.decoder_config['cross_attention'],
            has_position_embedding=self.
            decoder_config['has_position_embedding'],
            has_token_type_embedding=self.
            decoder_config['has_token_type_embedding'],
        )
        decoder_generation_session = tensorrt_llm.runtime.GenerationSession(
            decoder_model_config,
            decoder_engine_buffer,
            runtime_mapping,
            debug_mode=debug_mode)

        return decoder_generation_session

    def generate(self,
                 decoder_input_ids,
                 encoder_outputs,
                 eot_id,
                 max_new_tokens=40,
                 num_beams=1):
        """Decode token ids for a batch.

        decoder_input_ids: (batch, prompt_len) int tensor of prompt tokens.
        encoder_outputs:   (batch, enc_seq, hidden) CUDA tensor from the
                           encoder — presumably padded to a common length,
                           since every sample gets the same encoder length.
        eot_id:            end-of-transcript token, used as both end and pad.
        Returns output_ids as a nested Python list.
        """
        batch_size = encoder_outputs.shape[0]
        encoder_seq_len = encoder_outputs.shape[1]
        # All batch entries share the (padded) encoder sequence length.
        encoder_input_lengths = torch.full((batch_size,),
                                           encoder_seq_len,
                                           dtype=torch.int32,
                                           device='cuda')

        decoder_input_lengths = torch.full((decoder_input_ids.shape[0],),
                                           decoder_input_ids.shape[-1],
                                           dtype=torch.int32,
                                           device='cuda')
        decoder_max_input_length = torch.max(decoder_input_lengths).item()

        # Attend to every encoder position for every query.
        cross_attention_mask = torch.ones(
            [batch_size, 1, encoder_seq_len]).int().cuda()

        # generation config
        sampling_config = SamplingConfig(end_id=eot_id,
                                         pad_id=eot_id,
                                         num_beams=num_beams)
        self.decoder_generation_session.setup(
            decoder_input_lengths.size(0),
            decoder_max_input_length,
            max_new_tokens,
            beam_width=num_beams,
            encoder_max_input_length=encoder_seq_len)

        torch.cuda.synchronize()

        decoder_input_ids = decoder_input_ids.type(torch.int32).cuda()
        output_ids = self.decoder_generation_session.decode(
            decoder_input_ids,
            decoder_input_lengths,
            sampling_config,
            encoder_output=encoder_outputs,
            encoder_input_lengths=encoder_input_lengths,
            cross_attention_mask=cross_attention_mask,
        )
        torch.cuda.synchronize()

        # get the list of int from output_ids tensor
        output_ids = output_ids.cpu().numpy().tolist()
        return output_ids
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
class WhisperTRTLLM(object):
    """End-to-end Whisper runner combining the TRT encoder and the
    TRT-LLM decoder built from the same engine directory."""

    def __init__(self, engine_dir):
        # Single-process deployment: world size is fixed at 1.
        rank = tensorrt_llm.mpi_rank()
        mapping = tensorrt_llm.Mapping(1, rank)
        torch.cuda.set_device(rank % mapping.gpus_per_node)

        engine_path = Path(engine_dir)
        self.encoder = WhisperEncoding(engine_path)
        self.decoder = WhisperDecoding(engine_path,
                                       mapping,
                                       debug_mode=False)

    def process_batch(self,
                      mel,
                      decoder_input_ids,
                      eot_id=50257,
                      max_new_tokens=96,
                      num_beams=1):
        """Encode a batch of mel features and decode token ids for it."""
        audio_features = self.encoder.get_audio_features(mel)
        return self.decoder.generate(decoder_input_ids,
                                     audio_features,
                                     eot_id,
                                     max_new_tokens=max_new_tokens,
                                     num_beams=num_beams)
|
whisper/config.pbtxt
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
name: "whisper"
|
| 16 |
+
backend: "python"
|
| 17 |
+
max_batch_size: 8
|
| 18 |
+
|
| 19 |
+
parameters [
|
| 20 |
+
{
|
| 21 |
+
key: "n_mels",
|
| 22 |
+
value: {string_value:"80"} # 128 dim for large-v3, 80 dim for large-v2
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
key: "engine_dir"
|
| 26 |
+
value: { string_value: "/whisper/model_repo_whisper_trtllm/whisper/1/distil_large_v2"}
|
| 27 |
+
}
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
input [
|
| 32 |
+
{
|
| 33 |
+
name: "TEXT_PREFIX"
|
| 34 |
+
data_type: TYPE_STRING
|
| 35 |
+
dims: [1]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
name: "WAV"
|
| 39 |
+
data_type: TYPE_FP32
|
| 40 |
+
dims: [-1]
|
| 41 |
+
}
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
output [
|
| 45 |
+
{
|
| 46 |
+
name: "TRANSCRIPTS"
|
| 47 |
+
data_type: TYPE_STRING
|
| 48 |
+
dims: [1]
|
| 49 |
+
}
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
dynamic_batching {
|
| 53 |
+
preferred_batch_size: [4,8]
|
| 54 |
+
max_queue_delay_microseconds: 250000
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
instance_group [
|
| 58 |
+
{
|
| 59 |
+
count: 1
|
| 60 |
+
kind: KIND_CPU
|
| 61 |
+
}
|
| 62 |
+
]
|
| 63 |
+
|