Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- whisper/1/__pycache__/fbank.cpython-310.pyc +0 -0
- whisper/1/__pycache__/model.cpython-310.pyc +0 -0
- whisper/1/__pycache__/tokenizer.cpython-310.pyc +0 -0
- whisper/1/__pycache__/whisper_trtllm.cpython-310.pyc +0 -0
- whisper/1/distil_large_v2/decoder_config.json +52 -0
- whisper/1/distil_large_v2/encoder_config.json +45 -0
- whisper/1/distil_large_v2/whisper_decoder_float16_tp1_rank0.engine +3 -0
- whisper/1/distil_large_v2/whisper_encoder_float16_tp1_rank0.engine +3 -0
- whisper/1/fbank.py +91 -0
- whisper/1/gpt2.tiktoken +0 -0
- whisper/1/mel_filters.npz +3 -0
- whisper/1/model.py +127 -0
- whisper/1/multilingual.tiktoken +0 -0
- whisper/1/tokenizer.py +184 -0
- whisper/1/whisper_trtllm.py +219 -0
- whisper/config.pbtxt +63 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
whisper/1/distil_large_v2/whisper_decoder_float16_tp1_rank0.engine filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
whisper/1/distil_large_v2/whisper_encoder_float16_tp1_rank0.engine filter=lfs diff=lfs merge=lfs -text
|
whisper/1/__pycache__/fbank.cpython-310.pyc
ADDED
|
Binary file (3.03 kB). View file
|
|
|
whisper/1/__pycache__/model.cpython-310.pyc
ADDED
|
Binary file (4.75 kB). View file
|
|
|
whisper/1/__pycache__/tokenizer.cpython-310.pyc
ADDED
|
Binary file (4.6 kB). View file
|
|
|
whisper/1/__pycache__/whisper_trtllm.cpython-310.pyc
ADDED
|
Binary file (6.26 kB). View file
|
|
|
whisper/1/distil_large_v2/decoder_config.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_config": {
|
| 3 |
+
"apply_query_key_layer_scaling": false,
|
| 4 |
+
"cross_attention": true,
|
| 5 |
+
"has_position_embedding": true,
|
| 6 |
+
"has_token_type_embedding": false,
|
| 7 |
+
"hidden_act": "gelu",
|
| 8 |
+
"hidden_size": 1280,
|
| 9 |
+
"int8": false,
|
| 10 |
+
"max_batch_size": 8,
|
| 11 |
+
"max_beam_width": 4,
|
| 12 |
+
"max_input_len": 14,
|
| 13 |
+
"max_output_len": 100,
|
| 14 |
+
"max_position_embeddings": 448,
|
| 15 |
+
"name": "whisper_decoder",
|
| 16 |
+
"num_heads": 20,
|
| 17 |
+
"num_layers": 2,
|
| 18 |
+
"precision": "float16",
|
| 19 |
+
"strongly_typed": false,
|
| 20 |
+
"tensor_parallel": 1,
|
| 21 |
+
"use_refit": false,
|
| 22 |
+
"vocab_size": 51865
|
| 23 |
+
},
|
| 24 |
+
"plugin_config": {
|
| 25 |
+
"attention_qk_half_accumulation": false,
|
| 26 |
+
"bert_attention_plugin": null,
|
| 27 |
+
"context_fmha": true,
|
| 28 |
+
"context_fmha_fp32_acc": false,
|
| 29 |
+
"enable_xqa": false,
|
| 30 |
+
"gemm_plugin": "float16",
|
| 31 |
+
"gpt_attention_plugin": "float16",
|
| 32 |
+
"identity_plugin": null,
|
| 33 |
+
"layernorm_quantization_plugin": null,
|
| 34 |
+
"lookup_plugin": null,
|
| 35 |
+
"lora_plugin": null,
|
| 36 |
+
"moe_plugin": null,
|
| 37 |
+
"multi_block_mode": false,
|
| 38 |
+
"nccl_plugin": null,
|
| 39 |
+
"paged_kv_cache": false,
|
| 40 |
+
"quantize_per_token_plugin": false,
|
| 41 |
+
"quantize_tensor_plugin": false,
|
| 42 |
+
"remove_input_padding": false,
|
| 43 |
+
"rmsnorm_quantization_plugin": null,
|
| 44 |
+
"smooth_quant_gemm_plugin": null,
|
| 45 |
+
"tokens_per_block": 128,
|
| 46 |
+
"use_context_fmha_for_generation": false,
|
| 47 |
+
"use_custom_all_reduce": false,
|
| 48 |
+
"use_paged_context_fmha": false,
|
| 49 |
+
"weight_only_groupwise_quant_matmul_plugin": null,
|
| 50 |
+
"weight_only_quant_matmul_plugin": null
|
| 51 |
+
}
|
| 52 |
+
}
|
whisper/1/distil_large_v2/encoder_config.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_config": {
|
| 3 |
+
"hidden_size": 1280,
|
| 4 |
+
"int8": false,
|
| 5 |
+
"max_batch_size": 8,
|
| 6 |
+
"max_beam_width": 4,
|
| 7 |
+
"n_mels": 80,
|
| 8 |
+
"name": "whisper_encoder",
|
| 9 |
+
"num_heads": 20,
|
| 10 |
+
"num_languages": 99,
|
| 11 |
+
"num_layers": 32,
|
| 12 |
+
"precision": "float16",
|
| 13 |
+
"strongly_typed": false,
|
| 14 |
+
"tensor_parallel": 1,
|
| 15 |
+
"use_refit": false
|
| 16 |
+
},
|
| 17 |
+
"plugin_config": {
|
| 18 |
+
"attention_qk_half_accumulation": false,
|
| 19 |
+
"bert_attention_plugin": "float16",
|
| 20 |
+
"context_fmha": true,
|
| 21 |
+
"context_fmha_fp32_acc": false,
|
| 22 |
+
"enable_xqa": false,
|
| 23 |
+
"gemm_plugin": "float16",
|
| 24 |
+
"gpt_attention_plugin": null,
|
| 25 |
+
"identity_plugin": null,
|
| 26 |
+
"layernorm_quantization_plugin": null,
|
| 27 |
+
"lookup_plugin": null,
|
| 28 |
+
"lora_plugin": null,
|
| 29 |
+
"moe_plugin": null,
|
| 30 |
+
"multi_block_mode": false,
|
| 31 |
+
"nccl_plugin": null,
|
| 32 |
+
"paged_kv_cache": false,
|
| 33 |
+
"quantize_per_token_plugin": false,
|
| 34 |
+
"quantize_tensor_plugin": false,
|
| 35 |
+
"remove_input_padding": false,
|
| 36 |
+
"rmsnorm_quantization_plugin": null,
|
| 37 |
+
"smooth_quant_gemm_plugin": null,
|
| 38 |
+
"tokens_per_block": 128,
|
| 39 |
+
"use_context_fmha_for_generation": false,
|
| 40 |
+
"use_custom_all_reduce": false,
|
| 41 |
+
"use_paged_context_fmha": false,
|
| 42 |
+
"weight_only_groupwise_quant_matmul_plugin": null,
|
| 43 |
+
"weight_only_quant_matmul_plugin": null
|
| 44 |
+
}
|
| 45 |
+
}
|
whisper/1/distil_large_v2/whisper_decoder_float16_tp1_rank0.engine
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ef608047a26a2126a0705dbf42ddc614a246e7e59696e777a4f3ccc408acd80
|
| 3 |
+
size 372366004
|
whisper/1/distil_large_v2/whisper_encoder_float16_tp1_rank0.engine
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04520a320ea43e46cb83734eece8d966e29a651f97b7073a59dcd9bebe053be4
|
| 3 |
+
size 1278617212
|
whisper/1/fbank.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
# Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
import torch.nn.functional as F
|
| 18 |
+
from typing import Union
|
| 19 |
+
import os
|
| 20 |
+
|
| 21 |
+
def mel_filters(device, n_mels: int =128) -> torch.Tensor:
|
| 22 |
+
"""
|
| 23 |
+
load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
|
| 24 |
+
Allows decoupling librosa dependency; saved using:
|
| 25 |
+
|
| 26 |
+
np.savez_compressed(
|
| 27 |
+
"mel_filters.npz",
|
| 28 |
+
mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
|
| 29 |
+
)
|
| 30 |
+
"""
|
| 31 |
+
assert n_mels == 80 or n_mels == 128 , f"Unsupported n_mels: {n_mels}"
|
| 32 |
+
with np.load(
|
| 33 |
+
os.path.join(os.path.dirname(__file__), "mel_filters.npz")
|
| 34 |
+
) as f:
|
| 35 |
+
return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def log_mel_spectrogram(
|
| 39 |
+
audio: Union[torch.Tensor],
|
| 40 |
+
filters: torch.Tensor,
|
| 41 |
+
n_mels: int = 128,
|
| 42 |
+
n_fft: int = 400,
|
| 43 |
+
hop_length: int = 160,
|
| 44 |
+
):
|
| 45 |
+
"""
|
| 46 |
+
Compute the log-Mel spectrogram of
|
| 47 |
+
|
| 48 |
+
Parameters
|
| 49 |
+
----------
|
| 50 |
+
audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
|
| 51 |
+
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
|
| 52 |
+
|
| 53 |
+
n_mels: int
|
| 54 |
+
The number of Mel-frequency filters, only 80 or 128 is supported
|
| 55 |
+
|
| 56 |
+
filters: torch.Tensor
|
| 57 |
+
|
| 58 |
+
Returns
|
| 59 |
+
-------
|
| 60 |
+
torch.Tensor, shape = (128, n_frames)
|
| 61 |
+
A Tensor that contains the Mel spectrogram
|
| 62 |
+
"""
|
| 63 |
+
window = torch.hann_window(n_fft).to(audio.device)
|
| 64 |
+
stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
|
| 65 |
+
magnitudes = stft[..., :-1].abs() ** 2
|
| 66 |
+
|
| 67 |
+
mel_spec = filters @ magnitudes
|
| 68 |
+
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
|
| 69 |
+
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
|
| 70 |
+
log_spec = (log_spec + 4.0) / 4.0
|
| 71 |
+
# cast to float 16
|
| 72 |
+
log_spec = log_spec.half()
|
| 73 |
+
return log_spec
|
| 74 |
+
|
| 75 |
+
class FeatureExtractor(torch.nn.Module):
|
| 76 |
+
"""Your Python model must use the same class name. Every Python model
|
| 77 |
+
that is created must have "TritonPythonModel" as the class name.
|
| 78 |
+
"""
|
| 79 |
+
|
| 80 |
+
def __init__(self, n_mels: int = 128):
|
| 81 |
+
self.device = torch.device("cuda")
|
| 82 |
+
self.n_mels = n_mels
|
| 83 |
+
self.filters = mel_filters(self.device, n_mels=self.n_mels)
|
| 84 |
+
|
| 85 |
+
def compute_feature(self, wav, target: int = 3000):
|
| 86 |
+
mel = log_mel_spectrogram(wav, self.filters)
|
| 87 |
+
assert mel.shape[1] <= target, f"{mel.shape[1]} > {target}, audio is too long"
|
| 88 |
+
if mel.shape[1] < target:
|
| 89 |
+
mel = F.pad(mel, (0, target - mel.shape[1]), mode='constant')
|
| 90 |
+
mel = mel.unsqueeze(0)
|
| 91 |
+
return mel
|
whisper/1/gpt2.tiktoken
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
whisper/1/mel_filters.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
|
| 3 |
+
size 4271
|
whisper/1/model.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
import triton_python_backend_utils as pb_utils
|
| 3 |
+
import numpy as np
|
| 4 |
+
import json
|
| 5 |
+
import torch
|
| 6 |
+
from torch.utils.dlpack import from_dlpack, to_dlpack
|
| 7 |
+
import re
|
| 8 |
+
from .tokenizer import get_tokenizer
|
| 9 |
+
from .whisper_trtllm import WhisperTRTLLM
|
| 10 |
+
from .fbank import FeatureExtractor
|
| 11 |
+
|
| 12 |
+
class TritonPythonModel:
|
| 13 |
+
"""Your Python model must use the same class name. Every Python model
|
| 14 |
+
that is created must have "TritonPythonModel" as the class name.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
def initialize(self, args):
|
| 18 |
+
"""`initialize` is called only once when the model is being loaded.
|
| 19 |
+
Implementing `initialize` function is optional. This function allows
|
| 20 |
+
the model to initialize any state associated with this model.
|
| 21 |
+
|
| 22 |
+
Parameters
|
| 23 |
+
----------
|
| 24 |
+
args : dict
|
| 25 |
+
Both keys and values are strings. The dictionary keys and values are:
|
| 26 |
+
* model_config: A JSON string containing the model configuration
|
| 27 |
+
* model_instance_kind: A string containing model instance kind
|
| 28 |
+
* model_instance_device_id: A string containing model instance device ID
|
| 29 |
+
* model_repository: Model repository path
|
| 30 |
+
* model_version: Model version
|
| 31 |
+
* model_name: Model name
|
| 32 |
+
"""
|
| 33 |
+
self.model_config = model_config = json.loads(args['model_config'])
|
| 34 |
+
|
| 35 |
+
# Get OUTPUT0 configuration
|
| 36 |
+
output0_config = pb_utils.get_output_config_by_name(
|
| 37 |
+
model_config, "TRANSCRIPTS")
|
| 38 |
+
# Convert Triton types to numpy types
|
| 39 |
+
self.out0_dtype = pb_utils.triton_string_to_numpy(
|
| 40 |
+
output0_config['data_type'])
|
| 41 |
+
|
| 42 |
+
self.tokenizer = get_tokenizer(num_languages=100)
|
| 43 |
+
self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
|
| 44 |
+
self.device = torch.device("cuda")
|
| 45 |
+
self.init_model(self.model_config['parameters'])
|
| 46 |
+
|
| 47 |
+
def init_model(self, parameters):
|
| 48 |
+
for key,value in parameters.items():
|
| 49 |
+
parameters[key] = value["string_value"]
|
| 50 |
+
engine_dir = parameters["engine_dir"]
|
| 51 |
+
n_mels = int(parameters["n_mels"])
|
| 52 |
+
self.model = WhisperTRTLLM(engine_dir)
|
| 53 |
+
self.feature_extractor = FeatureExtractor(n_mels=n_mels)
|
| 54 |
+
|
| 55 |
+
def execute(self, requests):
|
| 56 |
+
"""`execute` must be implemented in every Python model. `execute`
|
| 57 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
| 58 |
+
argument. This function is called when an inference is requested
|
| 59 |
+
for this model.
|
| 60 |
+
|
| 61 |
+
Parameters
|
| 62 |
+
----------
|
| 63 |
+
requests : list
|
| 64 |
+
A list of pb_utils.InferenceRequest
|
| 65 |
+
|
| 66 |
+
Returns
|
| 67 |
+
-------
|
| 68 |
+
list
|
| 69 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
| 70 |
+
be the same as `requests`
|
| 71 |
+
"""
|
| 72 |
+
# Every Python backend must iterate through list of requests and create
|
| 73 |
+
# an instance of pb_utils.InferenceResponse class for each of them. You
|
| 74 |
+
# should avoid storing any of the input Tensors in the class attributes
|
| 75 |
+
# as they will be overridden in subsequent inference requests. You can
|
| 76 |
+
# make a copy of the underlying NumPy array and store it if it is
|
| 77 |
+
# required.
|
| 78 |
+
mel_list, text_prefix_list = [], []
|
| 79 |
+
for request in requests:
|
| 80 |
+
# Perform inference on the request and append it to responses list...
|
| 81 |
+
in_0 = pb_utils.get_input_tensor_by_name(request, "TEXT_PREFIX")
|
| 82 |
+
in_1 = pb_utils.get_input_tensor_by_name(request, "WAV")
|
| 83 |
+
|
| 84 |
+
wav = in_1.as_numpy()
|
| 85 |
+
assert wav.shape[0] == 1, "Only support batch size 1"
|
| 86 |
+
wav = torch.from_numpy(wav[0]).to(self.device)
|
| 87 |
+
mel = self.feature_extractor.compute_feature(wav)
|
| 88 |
+
mel_list.append(mel)
|
| 89 |
+
|
| 90 |
+
text_prefix_list.append(in_0.as_numpy().tolist())
|
| 91 |
+
# concat tensors in batch dimension
|
| 92 |
+
features = torch.cat(mel_list, dim=0)
|
| 93 |
+
features = features.to(self.device)
|
| 94 |
+
|
| 95 |
+
prompt_ids = []
|
| 96 |
+
for text_prefix in text_prefix_list:
|
| 97 |
+
text_prefix = text_prefix[0][0].decode('utf-8')
|
| 98 |
+
if text_prefix == "":
|
| 99 |
+
text_prefix = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
|
| 100 |
+
prompt_id = self.tokenizer.encode(text_prefix, allowed_special=self.tokenizer.special_tokens_set)
|
| 101 |
+
# convert prompt_id to tensor, tensor shape is [Seq]
|
| 102 |
+
prompt_id = torch.tensor(prompt_id)
|
| 103 |
+
prompt_ids.append(prompt_id)
|
| 104 |
+
# convert prompt_ids to tensor, tensor shape is [Batch, Seq], left padding with self.blank
|
| 105 |
+
tokens = torch.nn.utils.rnn.pad_sequence(prompt_ids, batch_first=True, padding_value=self.blank)
|
| 106 |
+
tokens = tokens.to(features.device)
|
| 107 |
+
print(features.shape)
|
| 108 |
+
output_ids = self.model.process_batch(features, tokens)
|
| 109 |
+
|
| 110 |
+
results = [output_ids[i][0] for i in range(len(output_ids))]
|
| 111 |
+
|
| 112 |
+
responses = []
|
| 113 |
+
for result in results:
|
| 114 |
+
s = self.tokenizer.decode(result)
|
| 115 |
+
s = re.sub(r'<\|.*?\|>', '', s)
|
| 116 |
+
sentence = np.array([s])
|
| 117 |
+
out0 = pb_utils.Tensor("TRANSCRIPTS", sentence.astype(self.out0_dtype))
|
| 118 |
+
inference_response = pb_utils.InferenceResponse(output_tensors=[out0])
|
| 119 |
+
responses.append(inference_response)
|
| 120 |
+
return responses
|
| 121 |
+
|
| 122 |
+
def finalize(self):
|
| 123 |
+
"""`finalize` is called only once when the model is being unloaded.
|
| 124 |
+
Implementing `finalize` function is optional. This function allows
|
| 125 |
+
the model to perform any necessary clean ups before exit.
|
| 126 |
+
"""
|
| 127 |
+
print('Cleaning up...')
|
whisper/1/multilingual.tiktoken
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
whisper/1/tokenizer.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
# Modified from https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
|
| 16 |
+
import base64
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
import tiktoken
|
| 20 |
+
|
| 21 |
+
LANGUAGES = {
|
| 22 |
+
"en": "english",
|
| 23 |
+
"zh": "chinese",
|
| 24 |
+
"de": "german",
|
| 25 |
+
"es": "spanish",
|
| 26 |
+
"ru": "russian",
|
| 27 |
+
"ko": "korean",
|
| 28 |
+
"fr": "french",
|
| 29 |
+
"ja": "japanese",
|
| 30 |
+
"pt": "portuguese",
|
| 31 |
+
"tr": "turkish",
|
| 32 |
+
"pl": "polish",
|
| 33 |
+
"ca": "catalan",
|
| 34 |
+
"nl": "dutch",
|
| 35 |
+
"ar": "arabic",
|
| 36 |
+
"sv": "swedish",
|
| 37 |
+
"it": "italian",
|
| 38 |
+
"id": "indonesian",
|
| 39 |
+
"hi": "hindi",
|
| 40 |
+
"fi": "finnish",
|
| 41 |
+
"vi": "vietnamese",
|
| 42 |
+
"he": "hebrew",
|
| 43 |
+
"uk": "ukrainian",
|
| 44 |
+
"el": "greek",
|
| 45 |
+
"ms": "malay",
|
| 46 |
+
"cs": "czech",
|
| 47 |
+
"ro": "romanian",
|
| 48 |
+
"da": "danish",
|
| 49 |
+
"hu": "hungarian",
|
| 50 |
+
"ta": "tamil",
|
| 51 |
+
"no": "norwegian",
|
| 52 |
+
"th": "thai",
|
| 53 |
+
"ur": "urdu",
|
| 54 |
+
"hr": "croatian",
|
| 55 |
+
"bg": "bulgarian",
|
| 56 |
+
"lt": "lithuanian",
|
| 57 |
+
"la": "latin",
|
| 58 |
+
"mi": "maori",
|
| 59 |
+
"ml": "malayalam",
|
| 60 |
+
"cy": "welsh",
|
| 61 |
+
"sk": "slovak",
|
| 62 |
+
"te": "telugu",
|
| 63 |
+
"fa": "persian",
|
| 64 |
+
"lv": "latvian",
|
| 65 |
+
"bn": "bengali",
|
| 66 |
+
"sr": "serbian",
|
| 67 |
+
"az": "azerbaijani",
|
| 68 |
+
"sl": "slovenian",
|
| 69 |
+
"kn": "kannada",
|
| 70 |
+
"et": "estonian",
|
| 71 |
+
"mk": "macedonian",
|
| 72 |
+
"br": "breton",
|
| 73 |
+
"eu": "basque",
|
| 74 |
+
"is": "icelandic",
|
| 75 |
+
"hy": "armenian",
|
| 76 |
+
"ne": "nepali",
|
| 77 |
+
"mn": "mongolian",
|
| 78 |
+
"bs": "bosnian",
|
| 79 |
+
"kk": "kazakh",
|
| 80 |
+
"sq": "albanian",
|
| 81 |
+
"sw": "swahili",
|
| 82 |
+
"gl": "galician",
|
| 83 |
+
"mr": "marathi",
|
| 84 |
+
"pa": "punjabi",
|
| 85 |
+
"si": "sinhala",
|
| 86 |
+
"km": "khmer",
|
| 87 |
+
"sn": "shona",
|
| 88 |
+
"yo": "yoruba",
|
| 89 |
+
"so": "somali",
|
| 90 |
+
"af": "afrikaans",
|
| 91 |
+
"oc": "occitan",
|
| 92 |
+
"ka": "georgian",
|
| 93 |
+
"be": "belarusian",
|
| 94 |
+
"tg": "tajik",
|
| 95 |
+
"sd": "sindhi",
|
| 96 |
+
"gu": "gujarati",
|
| 97 |
+
"am": "amharic",
|
| 98 |
+
"yi": "yiddish",
|
| 99 |
+
"lo": "lao",
|
| 100 |
+
"uz": "uzbek",
|
| 101 |
+
"fo": "faroese",
|
| 102 |
+
"ht": "haitian creole",
|
| 103 |
+
"ps": "pashto",
|
| 104 |
+
"tk": "turkmen",
|
| 105 |
+
"nn": "nynorsk",
|
| 106 |
+
"mt": "maltese",
|
| 107 |
+
"sa": "sanskrit",
|
| 108 |
+
"lb": "luxembourgish",
|
| 109 |
+
"my": "myanmar",
|
| 110 |
+
"bo": "tibetan",
|
| 111 |
+
"tl": "tagalog",
|
| 112 |
+
"mg": "malagasy",
|
| 113 |
+
"as": "assamese",
|
| 114 |
+
"tt": "tatar",
|
| 115 |
+
"haw": "hawaiian",
|
| 116 |
+
"ln": "lingala",
|
| 117 |
+
"ha": "hausa",
|
| 118 |
+
"ba": "bashkir",
|
| 119 |
+
"jw": "javanese",
|
| 120 |
+
"su": "sundanese",
|
| 121 |
+
"yue": "cantonese",
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def get_tokenizer(name: str = "multilingual",
|
| 126 |
+
num_languages: int = 99,
|
| 127 |
+
tokenizer_dir: str = None):
|
| 128 |
+
if tokenizer_dir is None:
|
| 129 |
+
vocab_path = os.path.join(os.path.dirname(__file__),
|
| 130 |
+
f"./{name}.tiktoken")
|
| 131 |
+
else:
|
| 132 |
+
vocab_path = os.path.join(tokenizer_dir, f"{name}.tiktoken")
|
| 133 |
+
ranks = {
|
| 134 |
+
base64.b64decode(token): int(rank)
|
| 135 |
+
for token, rank in (line.split() for line in open(vocab_path) if line)
|
| 136 |
+
}
|
| 137 |
+
n_vocab = len(ranks)
|
| 138 |
+
special_tokens = {}
|
| 139 |
+
|
| 140 |
+
specials = [
|
| 141 |
+
"<|endoftext|>",
|
| 142 |
+
"<|startoftranscript|>",
|
| 143 |
+
*[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
|
| 144 |
+
"<|translate|>",
|
| 145 |
+
"<|transcribe|>",
|
| 146 |
+
"<|startoflm|>",
|
| 147 |
+
"<|startofprev|>",
|
| 148 |
+
"<|nospeech|>",
|
| 149 |
+
"<|notimestamps|>",
|
| 150 |
+
*[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
|
| 151 |
+
]
|
| 152 |
+
|
| 153 |
+
for token in specials:
|
| 154 |
+
special_tokens[token] = n_vocab
|
| 155 |
+
n_vocab += 1
|
| 156 |
+
|
| 157 |
+
return tiktoken.Encoding(
|
| 158 |
+
name=os.path.basename(vocab_path),
|
| 159 |
+
explicit_n_vocab=n_vocab,
|
| 160 |
+
pat_str=
|
| 161 |
+
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
|
| 162 |
+
mergeable_ranks=ranks,
|
| 163 |
+
special_tokens=special_tokens,
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
if __name__ == "__main__":
|
| 168 |
+
enc = get_tokenizer()
|
| 169 |
+
mytest_str = "<|startofprev|> Nvidia<|startoftranscript|><|en|><|transcribe|>"
|
| 170 |
+
encoding = enc.encode(mytest_str, allowed_special=enc.special_tokens_set)
|
| 171 |
+
mystr = enc.decode([50361, 45, 43021, 50258, 50259, 50359])
|
| 172 |
+
mystr2 = enc.decode([50361, 46284, 50258, 50259, 50359])
|
| 173 |
+
#print(encoding, mystr, mystr2)
|
| 174 |
+
print(
|
| 175 |
+
enc.encode("<|startoftranscript|>",
|
| 176 |
+
allowed_special=enc.special_tokens_set)[0])
|
| 177 |
+
print(
|
| 178 |
+
enc.encode("<|endoftext|>",
|
| 179 |
+
allowed_special=enc.special_tokens_set)[0])
|
| 180 |
+
my_zh_str = "好好学习"
|
| 181 |
+
encoding = enc.encode(my_zh_str, allowed_special=enc.special_tokens_set)
|
| 182 |
+
decoding = enc.decode(encoding)
|
| 183 |
+
print(type(decoding))
|
| 184 |
+
#print(encoding, decoding)
|
whisper/1/whisper_trtllm.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
import json
|
| 16 |
+
from collections import OrderedDict
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import torch
|
| 21 |
+
|
| 22 |
+
import tensorrt_llm
|
| 23 |
+
import tensorrt_llm.logger as logger
|
| 24 |
+
from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
|
| 25 |
+
trt_dtype_to_torch)
|
| 26 |
+
from tensorrt_llm.runtime import ModelConfig, SamplingConfig
|
| 27 |
+
from tensorrt_llm.runtime.session import Session, TensorInfo
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class WhisperEncoding:
|
| 31 |
+
|
| 32 |
+
def __init__(self, engine_dir):
|
| 33 |
+
self.session = self.get_session(engine_dir)
|
| 34 |
+
|
| 35 |
+
def get_session(self, engine_dir):
|
| 36 |
+
config_path = engine_dir / 'encoder_config.json'
|
| 37 |
+
with open(config_path, 'r') as f:
|
| 38 |
+
config = json.load(f)
|
| 39 |
+
|
| 40 |
+
use_gpt_attention_plugin = config['plugin_config'][
|
| 41 |
+
'gpt_attention_plugin']
|
| 42 |
+
dtype = config['builder_config']['precision']
|
| 43 |
+
n_mels = config['builder_config']['n_mels']
|
| 44 |
+
num_languages = config['builder_config']['num_languages']
|
| 45 |
+
|
| 46 |
+
self.dtype = dtype
|
| 47 |
+
self.n_mels = n_mels
|
| 48 |
+
self.num_languages = num_languages
|
| 49 |
+
|
| 50 |
+
serialize_path = engine_dir / f'whisper_encoder_{self.dtype}_tp1_rank0.engine'
|
| 51 |
+
|
| 52 |
+
with open(serialize_path, 'rb') as f:
|
| 53 |
+
session = Session.from_serialized_engine(f.read())
|
| 54 |
+
|
| 55 |
+
return session
|
| 56 |
+
|
| 57 |
+
def get_audio_features(self, mel):
|
| 58 |
+
|
| 59 |
+
input_lengths = torch.tensor(
|
| 60 |
+
[mel.shape[2] // 2 for _ in range(mel.shape[0])],
|
| 61 |
+
dtype=torch.int32,
|
| 62 |
+
device=mel.device)
|
| 63 |
+
|
| 64 |
+
inputs = OrderedDict()
|
| 65 |
+
inputs['x'] = mel
|
| 66 |
+
inputs['input_lengths'] = input_lengths
|
| 67 |
+
|
| 68 |
+
output_list = [
|
| 69 |
+
TensorInfo('x', str_dtype_to_trt(self.dtype), mel.shape),
|
| 70 |
+
TensorInfo('input_lengths', str_dtype_to_trt('int32'),
|
| 71 |
+
input_lengths.shape)
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
output_info = (self.session).infer_shapes(output_list)
|
| 75 |
+
|
| 76 |
+
logger.debug(f'output info {output_info}')
|
| 77 |
+
outputs = {
|
| 78 |
+
t.name: torch.empty(tuple(t.shape),
|
| 79 |
+
dtype=trt_dtype_to_torch(t.dtype),
|
| 80 |
+
device='cuda')
|
| 81 |
+
for t in output_info
|
| 82 |
+
}
|
| 83 |
+
stream = torch.cuda.current_stream()
|
| 84 |
+
ok = self.session.run(inputs=inputs,
|
| 85 |
+
outputs=outputs,
|
| 86 |
+
stream=stream.cuda_stream)
|
| 87 |
+
assert ok, 'Engine execution failed'
|
| 88 |
+
stream.synchronize()
|
| 89 |
+
audio_features = outputs['output']
|
| 90 |
+
return audio_features
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class WhisperDecoding:
    """Wraps the TRT-LLM Whisper decoder: loads its config and serialized
    engine, then runs autoregressive generation conditioned on encoder
    output features."""

    def __init__(self, engine_dir, runtime_mapping, debug_mode=False):
        self.decoder_config = self.get_config(engine_dir)
        self.decoder_generation_session = self.get_session(
            engine_dir, runtime_mapping, debug_mode)

    def get_config(self, engine_dir):
        """Load decoder_config.json from `engine_dir` (a pathlib.Path) and
        flatten its plugin_config and builder_config sections into a single
        OrderedDict (builder keys overwrite plugin keys on collision)."""
        config_path = engine_dir / 'decoder_config.json'
        with open(config_path, 'r') as f:
            config = json.load(f)
        decoder_config = OrderedDict()
        decoder_config.update(config['plugin_config'])
        decoder_config.update(config['builder_config'])
        return decoder_config

    def get_session(self, engine_dir, runtime_mapping, debug_mode=False):
        """Deserialize the decoder engine and build a GenerationSession."""
        dtype = self.decoder_config['precision']
        serialize_path = engine_dir / f'whisper_decoder_{dtype}_tp1_rank0.engine'
        with open(serialize_path, "rb") as f:
            decoder_engine_buffer = f.read()

        # Whisper uses full multi-head attention, so num_kv_heads == num_heads.
        decoder_model_config = ModelConfig(
            max_batch_size=self.decoder_config['max_batch_size'],
            max_beam_width=self.decoder_config['max_beam_width'],
            num_heads=self.decoder_config['num_heads'],
            num_kv_heads=self.decoder_config['num_heads'],
            hidden_size=self.decoder_config['hidden_size'],
            vocab_size=self.decoder_config['vocab_size'],
            num_layers=self.decoder_config['num_layers'],
            gpt_attention_plugin=self.decoder_config['gpt_attention_plugin'],
            remove_input_padding=self.decoder_config['remove_input_padding'],
            cross_attention=self.decoder_config['cross_attention'],
            has_position_embedding=self.
            decoder_config['has_position_embedding'],
            has_token_type_embedding=self.
            decoder_config['has_token_type_embedding'],
        )
        decoder_generation_session = tensorrt_llm.runtime.GenerationSession(
            decoder_model_config,
            decoder_engine_buffer,
            runtime_mapping,
            debug_mode=debug_mode)

        return decoder_generation_session

    def generate(self,
                 decoder_input_ids,
                 encoder_outputs,
                 eot_id,
                 max_new_tokens=40,
                 num_beams=1):
        """Decode token ids for a batch.

        decoder_input_ids: (batch, prompt_len) int tensor of prompt tokens.
        encoder_outputs:   (batch, enc_seq, hidden) CUDA tensor from the
                           encoder — presumably padded to a common length,
                           since every sample gets the same encoder length.
        eot_id:            end-of-transcript token, used as both end and pad.
        Returns output_ids as a nested Python list.
        """
        batch_size = encoder_outputs.shape[0]
        encoder_seq_len = encoder_outputs.shape[1]
        # All batch entries share the (padded) encoder sequence length.
        encoder_input_lengths = torch.full((batch_size,),
                                           encoder_seq_len,
                                           dtype=torch.int32,
                                           device='cuda')

        decoder_input_lengths = torch.full((decoder_input_ids.shape[0],),
                                           decoder_input_ids.shape[-1],
                                           dtype=torch.int32,
                                           device='cuda')
        decoder_max_input_length = torch.max(decoder_input_lengths).item()

        # Attend to every encoder position for every query.
        cross_attention_mask = torch.ones(
            [batch_size, 1, encoder_seq_len]).int().cuda()

        # generation config
        sampling_config = SamplingConfig(end_id=eot_id,
                                         pad_id=eot_id,
                                         num_beams=num_beams)
        self.decoder_generation_session.setup(
            decoder_input_lengths.size(0),
            decoder_max_input_length,
            max_new_tokens,
            beam_width=num_beams,
            encoder_max_input_length=encoder_seq_len)

        torch.cuda.synchronize()

        decoder_input_ids = decoder_input_ids.type(torch.int32).cuda()
        output_ids = self.decoder_generation_session.decode(
            decoder_input_ids,
            decoder_input_lengths,
            sampling_config,
            encoder_output=encoder_outputs,
            encoder_input_lengths=encoder_input_lengths,
            cross_attention_mask=cross_attention_mask,
        )
        torch.cuda.synchronize()

        # get the list of int from output_ids tensor
        output_ids = output_ids.cpu().numpy().tolist()
        return output_ids
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
class WhisperTRTLLM(object):
    """End-to-end Whisper runner combining the TRT encoder and the
    TRT-LLM decoder built from the same engine directory."""

    def __init__(self, engine_dir):
        # Single-process deployment: world size is fixed at 1.
        rank = tensorrt_llm.mpi_rank()
        mapping = tensorrt_llm.Mapping(1, rank)
        torch.cuda.set_device(rank % mapping.gpus_per_node)

        engine_path = Path(engine_dir)
        self.encoder = WhisperEncoding(engine_path)
        self.decoder = WhisperDecoding(engine_path,
                                       mapping,
                                       debug_mode=False)

    def process_batch(self,
                      mel,
                      decoder_input_ids,
                      eot_id=50257,
                      max_new_tokens=96,
                      num_beams=1):
        """Encode a batch of mel features and decode token ids for it."""
        audio_features = self.encoder.get_audio_features(mel)
        return self.decoder.generate(decoder_input_ids,
                                     audio_features,
                                     eot_id,
                                     max_new_tokens=max_new_tokens,
                                     num_beams=num_beams)
|
whisper/config.pbtxt
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
name: "whisper"
|
| 16 |
+
backend: "python"
|
| 17 |
+
max_batch_size: 8
|
| 18 |
+
|
| 19 |
+
parameters [
|
| 20 |
+
{
|
| 21 |
+
key: "n_mels",
|
| 22 |
+
value: {string_value:"80"} # 128 dim for large-v3, 80 dim for large-v2
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
key: "engine_dir"
|
| 26 |
+
value: { string_value: "/whisper/model_repo_whisper_trtllm/whisper/1/distil_large_v2"}
|
| 27 |
+
}
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
input [
|
| 32 |
+
{
|
| 33 |
+
name: "TEXT_PREFIX"
|
| 34 |
+
data_type: TYPE_STRING
|
| 35 |
+
dims: [1]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
name: "WAV"
|
| 39 |
+
data_type: TYPE_FP32
|
| 40 |
+
dims: [-1]
|
| 41 |
+
}
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
output [
|
| 45 |
+
{
|
| 46 |
+
name: "TRANSCRIPTS"
|
| 47 |
+
data_type: TYPE_STRING
|
| 48 |
+
dims: [1]
|
| 49 |
+
}
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
dynamic_batching {
|
| 53 |
+
preferred_batch_size: [4,8]
|
| 54 |
+
max_queue_delay_microseconds: 250000
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
instance_group [
|
| 58 |
+
{
|
| 59 |
+
count: 1
|
| 60 |
+
kind: KIND_CPU
|
| 61 |
+
}
|
| 62 |
+
]
|
| 63 |
+
|