otherhalf-dev committed on
Commit
505e8ac
·
verified ·
1 Parent(s): f7625ff

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ whisper/1/distil_large_v2/whisper_decoder_float16_tp1_rank0.engine filter=lfs diff=lfs merge=lfs -text
37
+ whisper/1/distil_large_v2/whisper_encoder_float16_tp1_rank0.engine filter=lfs diff=lfs merge=lfs -text
whisper/1/__pycache__/fbank.cpython-310.pyc ADDED
Binary file (3.03 kB). View file
 
whisper/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (4.75 kB). View file
 
whisper/1/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (4.6 kB). View file
 
whisper/1/__pycache__/whisper_trtllm.cpython-310.pyc ADDED
Binary file (6.26 kB). View file
 
whisper/1/distil_large_v2/decoder_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_config": {
3
+ "apply_query_key_layer_scaling": false,
4
+ "cross_attention": true,
5
+ "has_position_embedding": true,
6
+ "has_token_type_embedding": false,
7
+ "hidden_act": "gelu",
8
+ "hidden_size": 1280,
9
+ "int8": false,
10
+ "max_batch_size": 8,
11
+ "max_beam_width": 4,
12
+ "max_input_len": 14,
13
+ "max_output_len": 100,
14
+ "max_position_embeddings": 448,
15
+ "name": "whisper_decoder",
16
+ "num_heads": 20,
17
+ "num_layers": 2,
18
+ "precision": "float16",
19
+ "strongly_typed": false,
20
+ "tensor_parallel": 1,
21
+ "use_refit": false,
22
+ "vocab_size": 51865
23
+ },
24
+ "plugin_config": {
25
+ "attention_qk_half_accumulation": false,
26
+ "bert_attention_plugin": null,
27
+ "context_fmha": true,
28
+ "context_fmha_fp32_acc": false,
29
+ "enable_xqa": false,
30
+ "gemm_plugin": "float16",
31
+ "gpt_attention_plugin": "float16",
32
+ "identity_plugin": null,
33
+ "layernorm_quantization_plugin": null,
34
+ "lookup_plugin": null,
35
+ "lora_plugin": null,
36
+ "moe_plugin": null,
37
+ "multi_block_mode": false,
38
+ "nccl_plugin": null,
39
+ "paged_kv_cache": false,
40
+ "quantize_per_token_plugin": false,
41
+ "quantize_tensor_plugin": false,
42
+ "remove_input_padding": false,
43
+ "rmsnorm_quantization_plugin": null,
44
+ "smooth_quant_gemm_plugin": null,
45
+ "tokens_per_block": 128,
46
+ "use_context_fmha_for_generation": false,
47
+ "use_custom_all_reduce": false,
48
+ "use_paged_context_fmha": false,
49
+ "weight_only_groupwise_quant_matmul_plugin": null,
50
+ "weight_only_quant_matmul_plugin": null
51
+ }
52
+ }
whisper/1/distil_large_v2/encoder_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_config": {
3
+ "hidden_size": 1280,
4
+ "int8": false,
5
+ "max_batch_size": 8,
6
+ "max_beam_width": 4,
7
+ "n_mels": 80,
8
+ "name": "whisper_encoder",
9
+ "num_heads": 20,
10
+ "num_languages": 99,
11
+ "num_layers": 32,
12
+ "precision": "float16",
13
+ "strongly_typed": false,
14
+ "tensor_parallel": 1,
15
+ "use_refit": false
16
+ },
17
+ "plugin_config": {
18
+ "attention_qk_half_accumulation": false,
19
+ "bert_attention_plugin": "float16",
20
+ "context_fmha": true,
21
+ "context_fmha_fp32_acc": false,
22
+ "enable_xqa": false,
23
+ "gemm_plugin": "float16",
24
+ "gpt_attention_plugin": null,
25
+ "identity_plugin": null,
26
+ "layernorm_quantization_plugin": null,
27
+ "lookup_plugin": null,
28
+ "lora_plugin": null,
29
+ "moe_plugin": null,
30
+ "multi_block_mode": false,
31
+ "nccl_plugin": null,
32
+ "paged_kv_cache": false,
33
+ "quantize_per_token_plugin": false,
34
+ "quantize_tensor_plugin": false,
35
+ "remove_input_padding": false,
36
+ "rmsnorm_quantization_plugin": null,
37
+ "smooth_quant_gemm_plugin": null,
38
+ "tokens_per_block": 128,
39
+ "use_context_fmha_for_generation": false,
40
+ "use_custom_all_reduce": false,
41
+ "use_paged_context_fmha": false,
42
+ "weight_only_groupwise_quant_matmul_plugin": null,
43
+ "weight_only_quant_matmul_plugin": null
44
+ }
45
+ }
whisper/1/distil_large_v2/whisper_decoder_float16_tp1_rank0.engine ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ef608047a26a2126a0705dbf42ddc614a246e7e59696e777a4f3ccc408acd80
3
+ size 372366004
whisper/1/distil_large_v2/whisper_encoder_float16_tp1_rank0.engine ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04520a320ea43e46cb83734eece8d966e29a651f97b7073a59dcd9bebe053be4
3
+ size 1278617212
whisper/1/fbank.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from typing import Union
19
+ import os
20
+
21
def mel_filters(device, n_mels: int = 128) -> torch.Tensor:
    """Load the precomputed mel filterbank matrix for projecting an STFT
    onto the Mel scale.

    The matrix ships in ``mel_filters.npz`` next to this module, which
    removes the runtime librosa dependency. It was produced with::

        np.savez_compressed(
            "mel_filters.npz",
            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
        )

    Parameters
    ----------
    device:
        Target device the returned tensor is moved to.
    n_mels:
        Number of mel bins; only 80 and 128 are stored in the archive.

    Returns
    -------
    torch.Tensor
        The filterbank matrix for the requested ``n_mels``.
    """
    assert n_mels in (80, 128), f"Unsupported n_mels: {n_mels}"
    archive_path = os.path.join(os.path.dirname(__file__), "mel_filters.npz")
    with np.load(archive_path) as archive:
        return torch.from_numpy(archive[f"mel_{n_mels}"]).to(device)
36
+
37
+
38
def log_mel_spectrogram(
    audio: torch.Tensor,
    filters: torch.Tensor,
    n_mels: int = 128,
    n_fft: int = 400,
    hop_length: int = 160,
):
    """
    Compute the normalized log-Mel spectrogram of an audio waveform.

    Parameters
    ----------
    audio: torch.Tensor, shape = (*)
        Tensor containing the audio waveform, sampled at 16 kHz.
        (The original annotation was a malformed one-argument
        ``Union[torch.Tensor]``; string/ndarray inputs were never handled
        by this body, so the annotation is corrected to ``torch.Tensor``.)

    filters: torch.Tensor
        Mel filterbank matrix (see ``mel_filters``). Its row count — not
        the ``n_mels`` argument — determines the mel dimension of the output.

    n_mels: int
        Unused by this implementation; kept for backward compatibility.

    n_fft: int
        FFT window size in samples.

    hop_length: int
        Hop between successive STFT frames, in samples.

    Returns
    -------
    torch.Tensor, shape = (filters.shape[0], n_frames), dtype = float16
        The log-Mel spectrogram, compressed and rescaled as in the
        openai/whisper reference implementation.
    """
    window = torch.hann_window(n_fft).to(audio.device)
    stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
    # Drop the final frame, matching openai/whisper's audio.py.
    magnitudes = stft[..., :-1].abs() ** 2

    mel_spec = filters @ magnitudes
    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    # Dynamic-range compression: clip to 8 dB below the peak, then rescale
    # to roughly [-1, 1].
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    # The TensorRT engines consume float16 features.
    log_spec = log_spec.half()
    return log_spec
74
+
75
class FeatureExtractor(torch.nn.Module):
    """Compute Whisper log-Mel spectrogram features on the GPU.

    Holds the mel filterbank on the CUDA device and pads the spectrogram
    to a fixed number of frames so it matches the encoder engine's input.
    (The original docstring was copy-pasted Triton boilerplate about
    "TritonPythonModel" and did not describe this class.)
    """

    def __init__(self, n_mels: int = 128):
        # Bug fix: torch.nn.Module subclasses must call super().__init__()
        # before assigning attributes; the original omission only happened
        # to work because no parameters/submodules were registered.
        super().__init__()
        self.device = torch.device("cuda")
        self.n_mels = n_mels
        self.filters = mel_filters(self.device, n_mels=self.n_mels)

    def compute_feature(self, wav, target: int = 3000):
        """Return a (1, n_mels, target) float16 mel tensor for `wav`.

        Parameters
        ----------
        wav: torch.Tensor
            Waveform on the same device as ``self.filters``.
        target: int
            Number of output frames to right-pad to (3000 frames ≈ 30 s at
            the 10 ms hop used by ``log_mel_spectrogram``).

        Raises
        ------
        AssertionError
            If the audio produces more than ``target`` frames.
        """
        mel = log_mel_spectrogram(wav, self.filters)
        assert mel.shape[1] <= target, f"{mel.shape[1]} > {target}, audio is too long"
        if mel.shape[1] < target:
            mel = F.pad(mel, (0, target - mel.shape[1]), mode='constant')
        mel = mel.unsqueeze(0)
        return mel
whisper/1/gpt2.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
whisper/1/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
3
+ size 4271
whisper/1/model.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import triton_python_backend_utils as pb_utils
3
+ import numpy as np
4
+ import json
5
+ import torch
6
+ from torch.utils.dlpack import from_dlpack, to_dlpack
7
+ import re
8
+ from .tokenizer import get_tokenizer
9
+ from .whisper_trtllm import WhisperTRTLLM
10
+ from .fbank import FeatureExtractor
11
+
12
class TritonPythonModel:
    """Triton Python-backend model: Whisper speech-to-text via TensorRT-LLM.

    Triton requires this exact class name. The model receives a waveform
    ("WAV") and an optional decoder prompt ("TEXT_PREFIX") and returns the
    decoded transcript ("TRANSCRIPTS").
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        self.model_config = model_config = json.loads(args['model_config'])

        # Get OUTPUT0 configuration
        output0_config = pb_utils.get_output_config_by_name(
            model_config, "TRANSCRIPTS")
        # Convert Triton types to numpy types
        self.out0_dtype = pb_utils.triton_string_to_numpy(
            output0_config['data_type'])

        self.tokenizer = get_tokenizer(num_languages=100)
        # Token id of a single space; used as the padding value when
        # batching prompts of different lengths.
        self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
        self.device = torch.device("cuda")
        self.init_model(self.model_config['parameters'])

    def init_model(self, parameters):
        """Build the TRT-LLM pipeline and feature extractor from config parameters."""
        # Triton delivers each parameter as {"string_value": "..."};
        # flatten to plain strings before use.
        for key,value in parameters.items():
            parameters[key] = value["string_value"]
        engine_dir = parameters["engine_dir"]
        n_mels = int(parameters["n_mels"])
        self.model = WhisperTRTLLM(engine_dir)
        self.feature_extractor = FeatureExtractor(n_mels=n_mels)

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        # Every Python backend must iterate through list of requests and create
        # an instance of pb_utils.InferenceResponse class for each of them. You
        # should avoid storing any of the input Tensors in the class attributes
        # as they will be overridden in subsequent inference requests. You can
        # make a copy of the underlying NumPy array and store it if it is
        # required.
        mel_list, text_prefix_list = [], []
        for request in requests:
            # Gather inputs from every queued request so they can be batched.
            in_0 = pb_utils.get_input_tensor_by_name(request, "TEXT_PREFIX")
            in_1 = pb_utils.get_input_tensor_by_name(request, "WAV")

            wav = in_1.as_numpy()
            assert wav.shape[0] == 1, "Only support batch size 1"
            wav = torch.from_numpy(wav[0]).to(self.device)
            mel = self.feature_extractor.compute_feature(wav)
            mel_list.append(mel)

            text_prefix_list.append(in_0.as_numpy().tolist())
        # concat tensors in batch dimension
        features = torch.cat(mel_list, dim=0)
        features = features.to(self.device)

        prompt_ids = []
        for text_prefix in text_prefix_list:
            text_prefix = text_prefix[0][0].decode('utf-8')
            if text_prefix == "":
                # Default Whisper prompt: English transcription, no timestamps.
                text_prefix = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
            prompt_id = self.tokenizer.encode(text_prefix, allowed_special=self.tokenizer.special_tokens_set)
            # convert prompt_id to tensor, tensor shape is [Seq]
            prompt_id = torch.tensor(prompt_id)
            prompt_ids.append(prompt_id)
        # Pad prompts into a [Batch, Seq] tensor with self.blank.
        # NOTE(review): pad_sequence pads on the RIGHT; the original comment
        # claimed left padding — confirm the decoder tolerates right padding.
        tokens = torch.nn.utils.rnn.pad_sequence(prompt_ids, batch_first=True, padding_value=self.blank)
        tokens = tokens.to(features.device)
        print(features.shape)  # NOTE(review): debug leftover; consider removing or routing through a logger
        output_ids = self.model.process_batch(features, tokens)

        # Keep only the first beam of each batch element.
        results = [output_ids[i][0] for i in range(len(output_ids))]

        responses = []
        for result in results:
            s = self.tokenizer.decode(result)
            # Strip special tokens such as <|en|>/<|transcribe|> from the text.
            s = re.sub(r'<\|.*?\|>', '', s)
            sentence = np.array([s])
            out0 = pb_utils.Tensor("TRANSCRIPTS", sentence.astype(self.out0_dtype))
            inference_response = pb_utils.InferenceResponse(output_tensors=[out0])
            responses.append(inference_response)
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
whisper/1/multilingual.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
whisper/1/tokenizer.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
16
+ import base64
17
+ import os
18
+
19
+ import tiktoken
20
+
21
# Language codes (mostly ISO 639-1) mapped to English names for the
# languages multilingual Whisper supports. Insertion order matters: it
# defines the order of the <|xx|> special tokens built in get_tokenizer().
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
}
123
+
124
+
125
def get_tokenizer(name: str = "multilingual",
                  num_languages: int = 99,
                  tokenizer_dir: str = None):
    """Build a tiktoken Encoding for Whisper.

    Parameters
    ----------
    name: str
        Base name of the ``<name>.tiktoken`` vocabulary file.
    num_languages: int
        How many of the LANGUAGES entries get a <|xx|> special token.
    tokenizer_dir: str, optional
        Directory containing the vocabulary file; defaults to this module's
        directory.

    Returns
    -------
    tiktoken.Encoding
        Encoding with Whisper's special tokens (start/end markers, language
        tags, task tags, and 1501 timestamp tokens) appended after the base
        vocabulary.
    """
    if tokenizer_dir is None:
        vocab_path = os.path.join(os.path.dirname(__file__),
                                  f"./{name}.tiktoken")
    else:
        vocab_path = os.path.join(tokenizer_dir, f"{name}.tiktoken")
    # Each vocabulary line is "<base64 token> <rank>". Use a context manager
    # so the file handle is closed (the original leaked it), and skip blank
    # lines defensively — the original `if line` never filtered "\n" lines,
    # which would crash the two-element unpack below.
    with open(vocab_path) as vocab_file:
        ranks = {
            base64.b64decode(token): int(rank)
            for token, rank in (line.split() for line in vocab_file
                                if line.strip())
        }
    n_vocab = len(ranks)
    special_tokens = {}

    # Special tokens are appended after the base vocabulary in the order
    # used by openai/whisper: end/start markers, one token per language,
    # task/control tokens, then timestamps 0.00s..30.00s in 0.02s steps.
    specials = [
        "<|endoftext|>",
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
    ]

    for token in specials:
        special_tokens[token] = n_vocab
        n_vocab += 1

    return tiktoken.Encoding(
        name=os.path.basename(vocab_path),
        explicit_n_vocab=n_vocab,
        pat_str=
        r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        mergeable_ranks=ranks,
        special_tokens=special_tokens,
    )
165
+
166
+
167
+ if __name__ == "__main__":
168
+ enc = get_tokenizer()
169
+ mytest_str = "<|startofprev|> Nvidia<|startoftranscript|><|en|><|transcribe|>"
170
+ encoding = enc.encode(mytest_str, allowed_special=enc.special_tokens_set)
171
+ mystr = enc.decode([50361, 45, 43021, 50258, 50259, 50359])
172
+ mystr2 = enc.decode([50361, 46284, 50258, 50259, 50359])
173
+ #print(encoding, mystr, mystr2)
174
+ print(
175
+ enc.encode("<|startoftranscript|>",
176
+ allowed_special=enc.special_tokens_set)[0])
177
+ print(
178
+ enc.encode("<|endoftext|>",
179
+ allowed_special=enc.special_tokens_set)[0])
180
+ my_zh_str = "好好学习"
181
+ encoding = enc.encode(my_zh_str, allowed_special=enc.special_tokens_set)
182
+ decoding = enc.decode(encoding)
183
+ print(type(decoding))
184
+ #print(encoding, decoding)
whisper/1/whisper_trtllm.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import json
16
+ from collections import OrderedDict
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import torch
21
+
22
+ import tensorrt_llm
23
+ import tensorrt_llm.logger as logger
24
+ from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
25
+ trt_dtype_to_torch)
26
+ from tensorrt_llm.runtime import ModelConfig, SamplingConfig
27
+ from tensorrt_llm.runtime.session import Session, TensorInfo
28
+
29
+
30
class WhisperEncoding:
    """Wraps the TensorRT-LLM Whisper encoder engine.

    Loads the serialized engine and its build-time configuration from
    ``engine_dir`` and converts mel spectrograms into audio features for
    the decoder.
    """

    def __init__(self, engine_dir):
        # engine_dir: pathlib.Path of the directory holding
        # encoder_config.json and the serialized engine.
        self.session = self.get_session(engine_dir)

    def get_session(self, engine_dir):
        """Read encoder_config.json, cache key settings, and deserialize the engine."""
        config_path = engine_dir / 'encoder_config.json'
        with open(config_path, 'r') as f:
            config = json.load(f)

        # NOTE(review): read but unused — kept from the original.
        use_gpt_attention_plugin = config['plugin_config'][
            'gpt_attention_plugin']
        dtype = config['builder_config']['precision']
        n_mels = config['builder_config']['n_mels']
        num_languages = config['builder_config']['num_languages']

        self.dtype = dtype
        self.n_mels = n_mels
        self.num_languages = num_languages

        # Engine file name encodes precision and tensor-parallel rank,
        # matching the files shipped in this repo.
        serialize_path = engine_dir / f'whisper_encoder_{self.dtype}_tp1_rank0.engine'

        with open(serialize_path, 'rb') as f:
            session = Session.from_serialized_engine(f.read())

        return session

    def get_audio_features(self, mel):
        """Run the encoder on a batch of mel spectrograms.

        mel is indexed as (batch, n_mels, frames) below; it must live on
        the CUDA device. Returns the engine's 'output' tensor.
        """

        # One length per batch element. The // 2 presumably reflects the
        # encoder conv stem halving the frame count — TODO confirm against
        # the engine build.
        input_lengths = torch.tensor(
            [mel.shape[2] // 2 for _ in range(mel.shape[0])],
            dtype=torch.int32,
            device=mel.device)

        inputs = OrderedDict()
        inputs['x'] = mel
        inputs['input_lengths'] = input_lengths

        # Despite the name, these are TensorInfo descriptions of the
        # *inputs*; infer_shapes uses them to derive the output shapes.
        output_list = [
            TensorInfo('x', str_dtype_to_trt(self.dtype), mel.shape),
            TensorInfo('input_lengths', str_dtype_to_trt('int32'),
                       input_lengths.shape)
        ]

        output_info = (self.session).infer_shapes(output_list)

        logger.debug(f'output info {output_info}')
        # Pre-allocate one CUDA tensor per engine output binding.
        outputs = {
            t.name: torch.empty(tuple(t.shape),
                                dtype=trt_dtype_to_torch(t.dtype),
                                device='cuda')
            for t in output_info
        }
        stream = torch.cuda.current_stream()
        ok = self.session.run(inputs=inputs,
                              outputs=outputs,
                              stream=stream.cuda_stream)
        assert ok, 'Engine execution failed'
        stream.synchronize()
        # 'output' must match the encoder engine's output binding name.
        audio_features = outputs['output']
        return audio_features
91
+
92
+
93
class WhisperDecoding:
    """Wraps the TensorRT-LLM Whisper decoder engine for autoregressive
    generation conditioned on encoder outputs (cross-attention)."""

    def __init__(self, engine_dir, runtime_mapping, debug_mode=False):

        # engine_dir: pathlib.Path with decoder_config.json and the engine;
        # runtime_mapping: tensorrt_llm.Mapping describing the TP/PP layout.
        self.decoder_config = self.get_config(engine_dir)
        self.decoder_generation_session = self.get_session(
            engine_dir, runtime_mapping, debug_mode)

    def get_config(self, engine_dir):
        """Merge plugin_config and builder_config from decoder_config.json
        into a single flat dict (builder keys win on collision)."""
        config_path = engine_dir / 'decoder_config.json'
        with open(config_path, 'r') as f:
            config = json.load(f)
        decoder_config = OrderedDict()
        decoder_config.update(config['plugin_config'])
        decoder_config.update(config['builder_config'])
        return decoder_config

    def get_session(self, engine_dir, runtime_mapping, debug_mode=False):
        """Deserialize the decoder engine into a GenerationSession."""
        dtype = self.decoder_config['precision']
        serialize_path = engine_dir / f'whisper_decoder_{dtype}_tp1_rank0.engine'
        with open(serialize_path, "rb") as f:
            decoder_engine_buffer = f.read()

        decoder_model_config = ModelConfig(
            max_batch_size=self.decoder_config['max_batch_size'],
            max_beam_width=self.decoder_config['max_beam_width'],
            num_heads=self.decoder_config['num_heads'],
            # No separate KV-head count in the config: reuse num_heads.
            num_kv_heads=self.decoder_config['num_heads'],
            hidden_size=self.decoder_config['hidden_size'],
            vocab_size=self.decoder_config['vocab_size'],
            num_layers=self.decoder_config['num_layers'],
            gpt_attention_plugin=self.decoder_config['gpt_attention_plugin'],
            remove_input_padding=self.decoder_config['remove_input_padding'],
            cross_attention=self.decoder_config['cross_attention'],
            has_position_embedding=self.
            decoder_config['has_position_embedding'],
            has_token_type_embedding=self.
            decoder_config['has_token_type_embedding'],
        )
        decoder_generation_session = tensorrt_llm.runtime.GenerationSession(
            decoder_model_config,
            decoder_engine_buffer,
            runtime_mapping,
            debug_mode=debug_mode)

        return decoder_generation_session

    def generate(self,
                 decoder_input_ids,
                 encoder_outputs,
                 eot_id,
                 max_new_tokens=40,
                 num_beams=1):
        """Autoregressively decode token ids conditioned on encoder outputs.

        Parameters
        ----------
        decoder_input_ids : torch.Tensor, indexed as [batch, prompt_len]
        encoder_outputs : torch.Tensor, indexed as [batch, frames, ...]
            (shapes inferred from the .shape[0]/.shape[1] uses below)
        eot_id : int
            End-of-transcript token id; also used as the pad id.
        max_new_tokens : int
        num_beams : int

        Returns
        -------
        list
            Nested list of output token ids — presumably [batch][beam][tokens];
            the caller in model.py takes index [i][0] (beam 0).
        """
        # All encoder sequences are treated as full length (no padding info).
        encoder_input_lengths = torch.tensor(
            [encoder_outputs.shape[1] for x in range(encoder_outputs.shape[0])],
            dtype=torch.int32,
            device='cuda')

        decoder_input_lengths = torch.tensor([
            decoder_input_ids.shape[-1]
            for _ in range(decoder_input_ids.shape[0])
        ],
                                             dtype=torch.int32,
                                             device='cuda')
        decoder_max_input_length = torch.max(decoder_input_lengths).item()

        # Attend to every encoder frame (all-ones mask).
        cross_attention_mask = torch.ones(
            [encoder_outputs.shape[0], 1,
             encoder_outputs.shape[1]]).int().cuda()

        # generation config
        sampling_config = SamplingConfig(end_id=eot_id,
                                         pad_id=eot_id,
                                         num_beams=num_beams)
        # Allocate session buffers for this batch/beam/length combination.
        self.decoder_generation_session.setup(
            decoder_input_lengths.size(0),
            decoder_max_input_length,
            max_new_tokens,
            beam_width=num_beams,
            encoder_max_input_length=encoder_outputs.shape[1])

        torch.cuda.synchronize()

        decoder_input_ids = decoder_input_ids.type(torch.int32).cuda()
        output_ids = self.decoder_generation_session.decode(
            decoder_input_ids,
            decoder_input_lengths,
            sampling_config,
            encoder_output=encoder_outputs,
            encoder_input_lengths=encoder_input_lengths,
            cross_attention_mask=cross_attention_mask,
        )
        torch.cuda.synchronize()

        # get the list of int from output_ids tensor
        output_ids = output_ids.cpu().numpy().tolist()
        return output_ids
190
+
191
+
192
class WhisperTRTLLM(object):
    """End-to-end Whisper pipeline: TensorRT-LLM encoder + decoder on one GPU."""

    def __init__(self, engine_dir):
        # Single-process setup (world_size fixed at 1); the CUDA device is
        # chosen from the MPI rank so the same code also works under mpirun.
        world_size = 1
        runtime_rank = tensorrt_llm.mpi_rank()
        runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank)
        torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
        engine_dir = Path(engine_dir)

        self.encoder = WhisperEncoding(engine_dir)
        self.decoder = WhisperDecoding(engine_dir,
                                       runtime_mapping,
                                       debug_mode=False)

    def process_batch(
            self,
            mel,
            decoder_input_ids,
            eot_id=50257,
            max_new_tokens=96,
            num_beams=1):
        """Encode a batch of mel features and decode transcript token ids.

        eot_id defaults to 50257 — presumably <|endoftext|> for the
        multilingual vocabulary (it is the first special token appended in
        tokenizer.get_tokenizer); confirm if the vocab changes.
        Returns the nested token-id list produced by WhisperDecoding.generate.
        """
        encoder_output = self.encoder.get_audio_features(mel)
        output_ids = self.decoder.generate(decoder_input_ids,
                                           encoder_output,
                                           eot_id,
                                           max_new_tokens=max_new_tokens,
                                           num_beams=num_beams)
        return output_ids
whisper/config.pbtxt ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "whisper"
16
+ backend: "python"
17
+ max_batch_size: 8
18
+
19
+ parameters [
20
+ {
21
+ key: "n_mels",
22
+ value: {string_value:"80"} # 128 dim for large-v3, 80 dim for large-v2
23
+ },
24
+ {
25
+ key: "engine_dir"
26
+ value: { string_value: "/whisper/model_repo_whisper_trtllm/whisper/1/distil_large_v2"}
27
+ }
28
+ ]
29
+
30
+
31
+ input [
32
+ {
33
+ name: "TEXT_PREFIX"
34
+ data_type: TYPE_STRING
35
+ dims: [1]
36
+ },
37
+ {
38
+ name: "WAV"
39
+ data_type: TYPE_FP32
40
+ dims: [-1]
41
+ }
42
+ ]
43
+
44
+ output [
45
+ {
46
+ name: "TRANSCRIPTS"
47
+ data_type: TYPE_STRING
48
+ dims: [1]
49
+ }
50
+ ]
51
+
52
+ dynamic_batching {
53
+ preferred_batch_size: [4,8]
54
+ max_queue_delay_microseconds: 250000
55
+ }
56
+
57
+ instance_group [
58
+ {
59
+ count: 1
60
+ kind: KIND_CPU
61
+ }
62
+ ]
63
+