Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- ensemble/1/.tmp +0 -0
- ensemble/config.pbtxt +470 -0
- postprocessing/1/__pycache__/model.cpython-310.pyc +0 -0
- postprocessing/1/model.py +231 -0
- postprocessing/1/special_tokens_map.json +5 -0
- postprocessing/1/tokenizer.json +0 -0
- postprocessing/1/tokenizer.model +3 -0
- postprocessing/1/tokenizer_config.json +43 -0
- postprocessing/config.pbtxt +113 -0
- preprocessing/1/__pycache__/model.cpython-310.pyc +0 -0
- preprocessing/1/model.py +373 -0
- preprocessing/1/special_tokens_map.json +5 -0
- preprocessing/1/tokenizer.json +0 -0
- preprocessing/1/tokenizer.model +3 -0
- preprocessing/1/tokenizer_config.json +43 -0
- preprocessing/config.pbtxt +156 -0
- tensorrt_llm/1/.gitkeep +0 -0
- tensorrt_llm/1/config.json +148 -0
- tensorrt_llm/1/model.py +782 -0
- tensorrt_llm/1/rank0.engine +3 -0
- tensorrt_llm/config.pbtxt +537 -0
- tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/decode.py +333 -0
- tensorrt_llm_bls/1/lib/triton_decoder.py +440 -0
- tensorrt_llm_bls/1/model.py +131 -0
- tensorrt_llm_bls/config.pbtxt +253 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
|
ensemble/1/.tmp
ADDED
|
File without changes
|
ensemble/config.pbtxt
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
name: "ensemble"
|
| 28 |
+
platform: "ensemble"
|
| 29 |
+
max_batch_size: 16
|
| 30 |
+
input [
|
| 31 |
+
{
|
| 32 |
+
name: "text_input"
|
| 33 |
+
data_type: TYPE_STRING
|
| 34 |
+
dims: [ -1 ]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
name: "decoder_text_input"
|
| 38 |
+
data_type: TYPE_STRING
|
| 39 |
+
dims: [ -1 ]
|
| 40 |
+
optional: true
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
name: "max_tokens"
|
| 44 |
+
data_type: TYPE_INT32
|
| 45 |
+
dims: [ -1 ]
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
name: "bad_words"
|
| 49 |
+
data_type: TYPE_STRING
|
| 50 |
+
dims: [ -1 ]
|
| 51 |
+
optional: true
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
name: "stop_words"
|
| 55 |
+
data_type: TYPE_STRING
|
| 56 |
+
dims: [ -1 ]
|
| 57 |
+
optional: true
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
name: "end_id"
|
| 61 |
+
data_type: TYPE_INT32
|
| 62 |
+
dims: [ 1 ]
|
| 63 |
+
optional: true
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
name: "pad_id"
|
| 67 |
+
data_type: TYPE_INT32
|
| 68 |
+
dims: [ 1 ]
|
| 69 |
+
optional: true
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
name: "top_k"
|
| 73 |
+
data_type: TYPE_INT32
|
| 74 |
+
dims: [ 1 ]
|
| 75 |
+
optional: true
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
name: "top_p"
|
| 79 |
+
data_type: TYPE_FP32
|
| 80 |
+
dims: [ 1 ]
|
| 81 |
+
optional: true
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
name: "temperature"
|
| 85 |
+
data_type: TYPE_FP32
|
| 86 |
+
dims: [ 1 ]
|
| 87 |
+
optional: true
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
name: "length_penalty"
|
| 91 |
+
data_type: TYPE_FP32
|
| 92 |
+
dims: [ 1 ]
|
| 93 |
+
optional: true
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
name: "repetition_penalty"
|
| 97 |
+
data_type: TYPE_FP32
|
| 98 |
+
dims: [ 1 ]
|
| 99 |
+
optional: true
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
name: "min_length"
|
| 103 |
+
data_type: TYPE_INT32
|
| 104 |
+
dims: [ 1 ]
|
| 105 |
+
optional: true
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
name: "presence_penalty"
|
| 109 |
+
data_type: TYPE_FP32
|
| 110 |
+
dims: [ 1 ]
|
| 111 |
+
optional: true
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
name: "frequency_penalty"
|
| 115 |
+
data_type: TYPE_FP32
|
| 116 |
+
dims: [ 1 ]
|
| 117 |
+
optional: true
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
name: "random_seed"
|
| 121 |
+
data_type: TYPE_UINT64
|
| 122 |
+
dims: [ 1 ]
|
| 123 |
+
optional: true
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
name: "return_log_probs"
|
| 127 |
+
data_type: TYPE_BOOL
|
| 128 |
+
dims: [ 1 ]
|
| 129 |
+
optional: true
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
name: "return_context_logits"
|
| 133 |
+
data_type: TYPE_BOOL
|
| 134 |
+
dims: [ 1 ]
|
| 135 |
+
optional: true
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
name: "return_generation_logits"
|
| 139 |
+
data_type: TYPE_BOOL
|
| 140 |
+
dims: [ 1 ]
|
| 141 |
+
optional: true
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
name: "beam_width"
|
| 145 |
+
data_type: TYPE_INT32
|
| 146 |
+
dims: [ 1 ]
|
| 147 |
+
optional: true
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
name: "stream"
|
| 151 |
+
data_type: TYPE_BOOL
|
| 152 |
+
dims: [ 1 ]
|
| 153 |
+
optional: true
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
name: "prompt_embedding_table"
|
| 157 |
+
data_type: TYPE_FP16
|
| 158 |
+
dims: [ -1, -1 ]
|
| 159 |
+
optional: true
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
name: "prompt_vocab_size"
|
| 163 |
+
data_type: TYPE_INT32
|
| 164 |
+
dims: [ 1 ]
|
| 165 |
+
optional: true
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
name: "embedding_bias_words"
|
| 169 |
+
data_type: TYPE_STRING
|
| 170 |
+
dims: [ -1 ]
|
| 171 |
+
optional: true
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
name: "embedding_bias_weights"
|
| 175 |
+
data_type: TYPE_FP32
|
| 176 |
+
dims: [ -1 ]
|
| 177 |
+
optional: true
|
| 178 |
+
}
|
| 179 |
+
]
|
| 180 |
+
output [
|
| 181 |
+
{
|
| 182 |
+
name: "text_output"
|
| 183 |
+
data_type: TYPE_STRING
|
| 184 |
+
dims: [ -1 ]
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
name: "cum_log_probs"
|
| 188 |
+
data_type: TYPE_FP32
|
| 189 |
+
dims: [ -1 ]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
name: "output_log_probs"
|
| 193 |
+
data_type: TYPE_FP32
|
| 194 |
+
dims: [ -1, -1 ]
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
name: "context_logits"
|
| 198 |
+
data_type: TYPE_FP32
|
| 199 |
+
dims: [ -1, -1 ]
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
name: "generation_logits"
|
| 203 |
+
data_type: TYPE_FP32
|
| 204 |
+
dims: [ -1, -1, -1 ]
|
| 205 |
+
}
|
| 206 |
+
]
|
| 207 |
+
ensemble_scheduling {
|
| 208 |
+
step [
|
| 209 |
+
{
|
| 210 |
+
model_name: "preprocessing"
|
| 211 |
+
model_version: -1
|
| 212 |
+
input_map {
|
| 213 |
+
key: "QUERY"
|
| 214 |
+
value: "text_input"
|
| 215 |
+
}
|
| 216 |
+
input_map {
|
| 217 |
+
key: "DECODER_QUERY"
|
| 218 |
+
value: "decoder_text_input"
|
| 219 |
+
}
|
| 220 |
+
input_map {
|
| 221 |
+
key: "REQUEST_OUTPUT_LEN"
|
| 222 |
+
value: "max_tokens"
|
| 223 |
+
}
|
| 224 |
+
input_map {
|
| 225 |
+
key: "BAD_WORDS_DICT"
|
| 226 |
+
value: "bad_words"
|
| 227 |
+
}
|
| 228 |
+
input_map {
|
| 229 |
+
key: "STOP_WORDS_DICT"
|
| 230 |
+
value: "stop_words"
|
| 231 |
+
}
|
| 232 |
+
input_map {
|
| 233 |
+
key: "EMBEDDING_BIAS_WORDS"
|
| 234 |
+
value: "embedding_bias_words"
|
| 235 |
+
}
|
| 236 |
+
input_map {
|
| 237 |
+
key: "EMBEDDING_BIAS_WEIGHTS"
|
| 238 |
+
value: "embedding_bias_weights"
|
| 239 |
+
}
|
| 240 |
+
input_map {
|
| 241 |
+
key: "END_ID"
|
| 242 |
+
value: "end_id"
|
| 243 |
+
}
|
| 244 |
+
input_map {
|
| 245 |
+
key: "PAD_ID"
|
| 246 |
+
value: "pad_id"
|
| 247 |
+
}
|
| 248 |
+
output_map {
|
| 249 |
+
key: "REQUEST_INPUT_LEN"
|
| 250 |
+
value: "_REQUEST_INPUT_LEN"
|
| 251 |
+
}
|
| 252 |
+
output_map {
|
| 253 |
+
key: "INPUT_ID"
|
| 254 |
+
value: "_INPUT_ID"
|
| 255 |
+
}
|
| 256 |
+
output_map {
|
| 257 |
+
key: "REQUEST_DECODER_INPUT_LEN"
|
| 258 |
+
value: "_REQUEST_DECODER_INPUT_LEN"
|
| 259 |
+
}
|
| 260 |
+
output_map {
|
| 261 |
+
key: "DECODER_INPUT_ID"
|
| 262 |
+
value: "_DECODER_INPUT_ID"
|
| 263 |
+
}
|
| 264 |
+
output_map {
|
| 265 |
+
key: "REQUEST_OUTPUT_LEN"
|
| 266 |
+
value: "_REQUEST_OUTPUT_LEN"
|
| 267 |
+
}
|
| 268 |
+
output_map {
|
| 269 |
+
key: "STOP_WORDS_IDS"
|
| 270 |
+
value: "_STOP_WORDS_IDS"
|
| 271 |
+
}
|
| 272 |
+
output_map {
|
| 273 |
+
key: "BAD_WORDS_IDS"
|
| 274 |
+
value: "_BAD_WORDS_IDS"
|
| 275 |
+
}
|
| 276 |
+
output_map {
|
| 277 |
+
key: "EMBEDDING_BIAS"
|
| 278 |
+
value: "_EMBEDDING_BIAS"
|
| 279 |
+
}
|
| 280 |
+
output_map {
|
| 281 |
+
key: "OUT_END_ID"
|
| 282 |
+
value: "_PREPROCESSOR_END_ID"
|
| 283 |
+
}
|
| 284 |
+
output_map {
|
| 285 |
+
key: "OUT_PAD_ID"
|
| 286 |
+
value: "_PREPROCESSOR_PAD_ID"
|
| 287 |
+
}
|
| 288 |
+
},
|
| 289 |
+
{
|
| 290 |
+
model_name: "tensorrt_llm"
|
| 291 |
+
model_version: -1
|
| 292 |
+
input_map {
|
| 293 |
+
key: "input_ids"
|
| 294 |
+
value: "_INPUT_ID"
|
| 295 |
+
}
|
| 296 |
+
input_map {
|
| 297 |
+
key: "decoder_input_ids"
|
| 298 |
+
value: "_DECODER_INPUT_ID"
|
| 299 |
+
}
|
| 300 |
+
input_map {
|
| 301 |
+
key: "input_lengths"
|
| 302 |
+
value: "_REQUEST_INPUT_LEN"
|
| 303 |
+
}
|
| 304 |
+
input_map {
|
| 305 |
+
key: "decoder_input_lengths"
|
| 306 |
+
value: "_REQUEST_DECODER_INPUT_LEN"
|
| 307 |
+
}
|
| 308 |
+
input_map {
|
| 309 |
+
key: "request_output_len"
|
| 310 |
+
value: "_REQUEST_OUTPUT_LEN"
|
| 311 |
+
}
|
| 312 |
+
input_map {
|
| 313 |
+
key: "end_id"
|
| 314 |
+
value: "_PREPROCESSOR_END_ID"
|
| 315 |
+
}
|
| 316 |
+
input_map {
|
| 317 |
+
key: "pad_id"
|
| 318 |
+
value: "_PREPROCESSOR_PAD_ID"
|
| 319 |
+
}
|
| 320 |
+
input_map {
|
| 321 |
+
key: "embedding_bias"
|
| 322 |
+
value: "_EMBEDDING_BIAS"
|
| 323 |
+
}
|
| 324 |
+
input_map {
|
| 325 |
+
key: "runtime_top_k"
|
| 326 |
+
value: "top_k"
|
| 327 |
+
}
|
| 328 |
+
input_map {
|
| 329 |
+
key: "runtime_top_p"
|
| 330 |
+
value: "top_p"
|
| 331 |
+
}
|
| 332 |
+
input_map {
|
| 333 |
+
key: "temperature"
|
| 334 |
+
value: "temperature"
|
| 335 |
+
}
|
| 336 |
+
input_map {
|
| 337 |
+
key: "len_penalty"
|
| 338 |
+
value: "length_penalty"
|
| 339 |
+
}
|
| 340 |
+
input_map {
|
| 341 |
+
key: "repetition_penalty"
|
| 342 |
+
value: "repetition_penalty"
|
| 343 |
+
}
|
| 344 |
+
input_map {
|
| 345 |
+
key: "min_length"
|
| 346 |
+
value: "min_length"
|
| 347 |
+
}
|
| 348 |
+
input_map {
|
| 349 |
+
key: "presence_penalty"
|
| 350 |
+
value: "presence_penalty"
|
| 351 |
+
}
|
| 352 |
+
input_map {
|
| 353 |
+
key: "frequency_penalty"
|
| 354 |
+
value: "frequency_penalty"
|
| 355 |
+
}
|
| 356 |
+
input_map {
|
| 357 |
+
key: "random_seed"
|
| 358 |
+
value: "random_seed"
|
| 359 |
+
}
|
| 360 |
+
input_map {
|
| 361 |
+
key: "return_log_probs"
|
| 362 |
+
value: "return_log_probs"
|
| 363 |
+
}
|
| 364 |
+
input_map {
|
| 365 |
+
key: "return_context_logits"
|
| 366 |
+
value: "return_context_logits"
|
| 367 |
+
}
|
| 368 |
+
input_map {
|
| 369 |
+
key: "return_generation_logits"
|
| 370 |
+
value: "return_generation_logits"
|
| 371 |
+
}
|
| 372 |
+
input_map {
|
| 373 |
+
key: "beam_width"
|
| 374 |
+
value: "beam_width"
|
| 375 |
+
}
|
| 376 |
+
input_map {
|
| 377 |
+
key: "streaming"
|
| 378 |
+
value: "stream"
|
| 379 |
+
}
|
| 380 |
+
input_map {
|
| 381 |
+
key: "prompt_embedding_table"
|
| 382 |
+
value: "prompt_embedding_table"
|
| 383 |
+
}
|
| 384 |
+
input_map {
|
| 385 |
+
key: "prompt_vocab_size"
|
| 386 |
+
value: "prompt_vocab_size"
|
| 387 |
+
}
|
| 388 |
+
input_map {
|
| 389 |
+
key: "stop_words_list"
|
| 390 |
+
value: "_STOP_WORDS_IDS"
|
| 391 |
+
}
|
| 392 |
+
input_map {
|
| 393 |
+
key: "bad_words_list"
|
| 394 |
+
value: "_BAD_WORDS_IDS"
|
| 395 |
+
}
|
| 396 |
+
output_map {
|
| 397 |
+
key: "output_ids"
|
| 398 |
+
value: "_TOKENS_BATCH"
|
| 399 |
+
}
|
| 400 |
+
output_map {
|
| 401 |
+
key: "sequence_length"
|
| 402 |
+
value: "_SEQUENCE_LENGTH"
|
| 403 |
+
},
|
| 404 |
+
output_map {
|
| 405 |
+
key: "cum_log_probs"
|
| 406 |
+
value: "_CUM_LOG_PROBS"
|
| 407 |
+
}
|
| 408 |
+
output_map {
|
| 409 |
+
key: "output_log_probs"
|
| 410 |
+
value: "_OUTPUT_LOG_PROBS"
|
| 411 |
+
},
|
| 412 |
+
output_map {
|
| 413 |
+
key: "context_logits"
|
| 414 |
+
value: "_CONTEXT_LOGITS"
|
| 415 |
+
},
|
| 416 |
+
output_map {
|
| 417 |
+
key: "generation_logits"
|
| 418 |
+
value: "_GENERATION_LOGITS"
|
| 419 |
+
}
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
model_name: "postprocessing"
|
| 423 |
+
model_version: -1
|
| 424 |
+
input_map {
|
| 425 |
+
key: "TOKENS_BATCH"
|
| 426 |
+
value: "_TOKENS_BATCH"
|
| 427 |
+
}
|
| 428 |
+
input_map {
|
| 429 |
+
key: "CUM_LOG_PROBS"
|
| 430 |
+
value: "_CUM_LOG_PROBS"
|
| 431 |
+
}
|
| 432 |
+
input_map {
|
| 433 |
+
key: "OUTPUT_LOG_PROBS"
|
| 434 |
+
value: "_OUTPUT_LOG_PROBS"
|
| 435 |
+
}
|
| 436 |
+
input_map {
|
| 437 |
+
key: "CONTEXT_LOGITS"
|
| 438 |
+
value: "_CONTEXT_LOGITS"
|
| 439 |
+
}
|
| 440 |
+
input_map {
|
| 441 |
+
key: "GENERATION_LOGITS"
|
| 442 |
+
value: "_GENERATION_LOGITS"
|
| 443 |
+
}
|
| 444 |
+
input_map {
|
| 445 |
+
key: "SEQUENCE_LENGTH"
|
| 446 |
+
value: "_SEQUENCE_LENGTH"
|
| 447 |
+
}
|
| 448 |
+
output_map {
|
| 449 |
+
key: "OUTPUT"
|
| 450 |
+
value: "text_output"
|
| 451 |
+
}
|
| 452 |
+
output_map {
|
| 453 |
+
key: "OUT_OUTPUT_LOG_PROBS"
|
| 454 |
+
value: "output_log_probs"
|
| 455 |
+
}
|
| 456 |
+
output_map {
|
| 457 |
+
key: "OUT_CUM_LOG_PROBS"
|
| 458 |
+
value: "cum_log_probs"
|
| 459 |
+
}
|
| 460 |
+
output_map {
|
| 461 |
+
key: "OUT_CONTEXT_LOGITS"
|
| 462 |
+
value: "context_logits"
|
| 463 |
+
}
|
| 464 |
+
output_map {
|
| 465 |
+
key: "OUT_GENERATION_LOGITS"
|
| 466 |
+
value: "generation_logits"
|
| 467 |
+
}
|
| 468 |
+
}
|
| 469 |
+
]
|
| 470 |
+
}
|
postprocessing/1/__pycache__/model.cpython-310.pyc
ADDED
|
Binary file (5.33 kB). View file
|
|
|
postprocessing/1/model.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
import json
|
| 28 |
+
|
| 29 |
+
import numpy as np
|
| 30 |
+
import triton_python_backend_utils as pb_utils
|
| 31 |
+
from transformers import AutoTokenizer
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class TritonPythonModel:
    """Triton postprocessing model: decodes generated token IDs to text.

    Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.

    Receives TOKENS_BATCH / SEQUENCE_LENGTH from the tensorrt_llm step of
    the ensemble, decodes each beam with a HuggingFace tokenizer, and
    forwards the optional log-prob / logits tensors (substituting zero
    placeholders when they were not requested).
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. The dictionary keys and values are:
            * model_config: A JSON string containing the model configuration
            * model_instance_kind: A string containing model instance kind
            * model_instance_device_id: A string containing model instance device ID
            * model_repository: Model repository path
            * model_version: Model version
            * model_name: Model name
        """
        # Parse model configs
        model_config = json.loads(args['model_config'])
        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
            'string_value']

        self.skip_special_tokens = self._parse_skip_special_tokens(
            model_config['parameters'].get('skip_special_tokens'))

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                       legacy=False,
                                                       padding_side='left',
                                                       trust_remote_code=True)
        # Some tokenizers (e.g. Llama) ship without a pad token; reuse EOS
        # so downstream padding-dependent operations do not fail.
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Parse model output configs and convert the Triton data type of
        # OUTPUT to the matching numpy dtype.
        output_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT")
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])

    @staticmethod
    def _parse_skip_special_tokens(param):
        """Interpret the optional 'skip_special_tokens' config parameter.

        Accepts common boolean spellings (case-insensitive true/false, 1/0,
        t/f, y/n, yes/no). Returns True, with a warning, when the parameter
        is missing or its value is not recognised.
        """
        if param is None:
            print(
                f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
            )
            return True
        value = param['string_value'].lower()
        if value in [
                'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
        ]:
            return value in ['true', '1', 't', 'y', 'yes']
        print(
            f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {param['string_value']}). Set it as True by default."
        )
        return True

    @staticmethod
    def _forward_optional(name, tensor, placeholder):
        """Wrap an optional input tensor as an output pb_utils.Tensor.

        Emits the given zero-filled placeholder (whose rank matches the real
        tensor's) when the input was not provided with the request.
        """
        data = tensor.as_numpy() if tensor is not None else placeholder
        return pb_utils.Tensor(name, data)

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model, must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse. The length of this list must
            be the same as `requests`
        """
        responses = []

        # Every Python backend must iterate over everyone of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Required inputs.
            tokens_batch = pb_utils.get_input_tensor_by_name(
                request, 'TOKENS_BATCH').as_numpy()
            sequence_lengths = pb_utils.get_input_tensor_by_name(
                request, 'SEQUENCE_LENGTH').as_numpy()

            # Optional inputs: get_input_tensor_by_name returns None when a
            # tensor was not supplied, so check identity, not truthiness.
            cum_log_probs = pb_utils.get_input_tensor_by_name(
                request, 'CUM_LOG_PROBS')
            output_log_probs = pb_utils.get_input_tensor_by_name(
                request, 'OUTPUT_LOG_PROBS')
            context_logits = pb_utils.get_input_tensor_by_name(
                request, 'CONTEXT_LOGITS')
            generation_logits = pb_utils.get_input_tensor_by_name(
                request, 'GENERATION_LOGITS')

            # Decode token IDs into UTF-8 text, one entry per beam.
            decoded = self._postprocessing(tokens_batch, sequence_lengths)

            output_tensors = [
                pb_utils.Tensor('OUTPUT',
                                np.array(decoded).astype(self.output_dtype))
            ]
            output_tensors.append(
                self._forward_optional('OUT_CUM_LOG_PROBS', cum_log_probs,
                                       np.array([[0.0]], dtype=np.float32)))
            output_tensors.append(
                self._forward_optional('OUT_OUTPUT_LOG_PROBS',
                                       output_log_probs,
                                       np.array([[[0.0]]], dtype=np.float32)))
            output_tensors.append(
                self._forward_optional('OUT_CONTEXT_LOGITS', context_logits,
                                       np.array([[[0.0]]], dtype=np.float32)))
            output_tensors.append(
                self._forward_optional(
                    'OUT_GENERATION_LOGITS', generation_logits,
                    np.array([[[[0.0]]]], dtype=np.float32)))

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #     output_tensors=..., TritonError("An error occurred"))
            responses.append(
                pb_utils.InferenceResponse(output_tensors=output_tensors))

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')

    def _postprocessing(self, tokens_batch, sequence_lengths):
        """Decode each beam of each batch entry to UTF-8 bytes.

        tokens_batch is indexed [batch][beam][token]; only the first
        sequence_lengths[batch][beam] tokens of each beam are decoded.
        Returns a flat list of encoded strings, batch-major then beam.
        """
        outputs = []
        for batch_idx, beam_tokens in enumerate(tokens_batch):
            for beam_idx, tokens in enumerate(beam_tokens):
                seq_len = sequence_lengths[batch_idx][beam_idx]
                output = self.tokenizer.decode(
                    tokens[:seq_len],
                    skip_special_tokens=self.skip_special_tokens)
                outputs.append(output.encode('utf8'))
        return outputs
|
postprocessing/1/special_tokens_map.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"eos_token": "</s>",
|
| 4 |
+
"unk_token": "<unk>"
|
| 5 |
+
}
|
postprocessing/1/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
postprocessing/1/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
|
| 3 |
+
size 493443
|
postprocessing/1/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"0": {
|
| 6 |
+
"content": "<unk>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"1": {
|
| 14 |
+
"content": "<s>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"2": {
|
| 22 |
+
"content": "</s>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
"additional_special_tokens": [],
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"legacy": true,
|
| 35 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 36 |
+
"pad_token": null,
|
| 37 |
+
"sp_model_kwargs": {},
|
| 38 |
+
"spaces_between_special_tokens": false,
|
| 39 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 40 |
+
"unk_token": "<unk>",
|
| 41 |
+
"use_default_system_prompt": false,
|
| 42 |
+
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
|
| 43 |
+
}
|
postprocessing/config.pbtxt
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
name: "postprocessing"
|
| 28 |
+
backend: "python"
|
| 29 |
+
max_batch_size: 16
|
| 30 |
+
input [
|
| 31 |
+
{
|
| 32 |
+
name: "TOKENS_BATCH"
|
| 33 |
+
data_type: TYPE_INT32
|
| 34 |
+
dims: [ -1, -1 ]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
name: "SEQUENCE_LENGTH"
|
| 38 |
+
data_type: TYPE_INT32
|
| 39 |
+
dims: [ -1 ]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
name: "CUM_LOG_PROBS"
|
| 43 |
+
data_type: TYPE_FP32
|
| 44 |
+
dims: [ -1 ]
|
| 45 |
+
optional: true
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
name: "OUTPUT_LOG_PROBS"
|
| 49 |
+
data_type: TYPE_FP32
|
| 50 |
+
dims: [ -1, -1 ]
|
| 51 |
+
optional: true
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
name: "CONTEXT_LOGITS"
|
| 55 |
+
data_type: TYPE_FP32
|
| 56 |
+
dims: [ -1, -1 ]
|
| 57 |
+
optional: true
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
name: "GENERATION_LOGITS"
|
| 61 |
+
data_type: TYPE_FP32
|
| 62 |
+
dims: [ -1, -1, -1 ]
|
| 63 |
+
optional: true
|
| 64 |
+
}
|
| 65 |
+
]
|
| 66 |
+
output [
|
| 67 |
+
{
|
| 68 |
+
name: "OUTPUT"
|
| 69 |
+
data_type: TYPE_STRING
|
| 70 |
+
dims: [ -1 ]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
name: "OUT_CUM_LOG_PROBS"
|
| 74 |
+
data_type: TYPE_FP32
|
| 75 |
+
dims: [ -1 ]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
name: "OUT_OUTPUT_LOG_PROBS"
|
| 79 |
+
data_type: TYPE_FP32
|
| 80 |
+
dims: [ -1, -1 ]
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
name: "OUT_CONTEXT_LOGITS"
|
| 84 |
+
data_type: TYPE_FP32
|
| 85 |
+
dims: [ -1, -1 ]
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
name: "OUT_GENERATION_LOGITS"
|
| 89 |
+
data_type: TYPE_FP32
|
| 90 |
+
dims: [ -1, -1, -1 ]
|
| 91 |
+
}
|
| 92 |
+
]
|
| 93 |
+
|
| 94 |
+
parameters {
|
| 95 |
+
key: "tokenizer_dir"
|
| 96 |
+
value: {
|
| 97 |
+
string_value: "/all_models/inflight_batcher_llm/postprocessing/1"
|
| 98 |
+
}
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
parameters {
|
| 102 |
+
key: "skip_special_tokens"
|
| 103 |
+
value: {
|
| 104 |
+
string_value: "${skip_special_tokens}"
|
| 105 |
+
}
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
instance_group [
|
| 109 |
+
{
|
| 110 |
+
count: 1
|
| 111 |
+
kind: KIND_CPU
|
| 112 |
+
}
|
| 113 |
+
]
|
preprocessing/1/__pycache__/model.cpython-310.pyc
ADDED
|
Binary file (9.56 kB). View file
|
|
|
preprocessing/1/model.py
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
import json
|
| 28 |
+
from typing import List
|
| 29 |
+
|
| 30 |
+
import numpy as np
|
| 31 |
+
import triton_python_backend_utils as pb_utils
|
| 32 |
+
from transformers import AutoTokenizer, T5Tokenizer
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class TritonPythonModel:
|
| 36 |
+
"""Your Python model must use the same class name. Every Python model
|
| 37 |
+
that is created must have "TritonPythonModel" as the class name.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
def initialize(self, args):
    """Load the tokenizer and cache per-tensor dtypes for preprocessing.

    Called once by Triton when the model is loaded.  Reads the
    ``tokenizer_dir`` and ``add_special_tokens`` parameters out of the model
    configuration, instantiates the Hugging Face tokenizer, resolves the
    end/pad token ids, and records the numpy dtype of every configured
    input/output tensor as ``self.<tensor_name>_dtype``.

    Parameters
    ----------
    args : dict
        Triton-supplied mapping; ``args['model_config']`` holds the model
        configuration as a JSON string.
    """
    # Parse model configs
    model_config = json.loads(args['model_config'])
    tokenizer_dir = model_config['parameters']['tokenizer_dir'][
        'string_value']

    # 'add_special_tokens' arrives as a free-form string; accept the usual
    # truthy/falsy spellings and fall back to True for anything else.
    truthy = ['true', '1', 't', 'y', 'yes']
    falsy = ['false', '0', 'f', 'n', 'no']
    param = model_config['parameters'].get('add_special_tokens')
    if param is None:
        print(
            f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default."
        )
        self.add_special_tokens = True
    else:
        normalized = param['string_value'].lower()
        if normalized in truthy + falsy:
            self.add_special_tokens = normalized in truthy
        else:
            print(
                f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {param['string_value']}). Set it as True by default."
            )
            self.add_special_tokens = True

    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                   legacy=False,
                                                   padding_side='left',
                                                   trust_remote_code=True)
    if isinstance(self.tokenizer, T5Tokenizer):
        # SentencePiece-based tokenizers expose the BOS id directly.
        self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()

    if not self.tokenizer.pad_token:
        # No pad token configured: reuse EOS, a common convention.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    self.tokenizer_end_id = self.tokenizer.encode(
        self.tokenizer.eos_token, add_special_tokens=False)[0]
    self.tokenizer_pad_id = self.tokenizer.encode(
        self.tokenizer.pad_token, add_special_tokens=False)[0]

    # Parse model input/output configs and convert Triton type strings to
    # numpy dtypes, cached as e.g. self.input_id_dtype.
    for tensor_name in ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]:
        tensor_config = pb_utils.get_input_config_by_name(
            model_config, tensor_name)
        setattr(self,
                tensor_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(tensor_config['data_type']))

    for tensor_name in [
            "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
            "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
            "OUT_END_ID", "OUT_PAD_ID"
    ]:
        tensor_config = pb_utils.get_output_config_by_name(
            model_config, tensor_name)
        setattr(self,
                tensor_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(tensor_config['data_type']))
def execute(self, requests):
    """Tokenize text queries into the tensors the TensorRT-LLM backend expects.

    For each pb_utils.InferenceRequest: encodes QUERY (and the optional
    DECODER_QUERY), converts bad/stop word lists to the flat id/offset
    layout, builds the embedding-bias vector, and resolves end/pad token
    ids (falling back to the tokenizer's ids when not supplied).  Each
    request must carry batch size 1 (inflight-batching requirement);
    violations produce an error response rather than raising.

    Parameters
    ----------
    requests : list
        A list of pb_utils.InferenceRequest.

    Returns
    -------
    list
        A list of pb_utils.InferenceResponse, one per request, in order.
    """

    responses = []

    # Every Python backend must iterate over everyone of the requests
    # and create a pb_utils.InferenceResponse for each of them.
    logger = pb_utils.Logger
    for idx, request in enumerate(requests):
        # Get input tensors
        query = pb_utils.get_input_tensor_by_name(request,
                                                  'QUERY').as_numpy()
        decoder_query = pb_utils.get_input_tensor_by_name(
            request, 'DECODER_QUERY')
        if decoder_query is not None:
            decoder_query = decoder_query.as_numpy()

        # The inflight batcher only supports per-request batch size 1.
        batch_dim = query.shape[0]
        if batch_dim != 1:

            err_str = "Inflight batching backend expects requests with batch size of 1."
            logger.log_error(err_str)
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[],
                    error=pb_utils.TritonError(err_str)))
            continue

        request_output_len = pb_utils.get_input_tensor_by_name(
            request, 'REQUEST_OUTPUT_LEN').as_numpy()

        bad_words_dict = pb_utils.get_input_tensor_by_name(
            request, 'BAD_WORDS_DICT')
        if bad_words_dict is not None:
            bad_words_dict = bad_words_dict.as_numpy()

        stop_words_dict = pb_utils.get_input_tensor_by_name(
            request, 'STOP_WORDS_DICT')
        if stop_words_dict is not None:
            stop_words_dict = stop_words_dict.as_numpy()

        embedding_bias_words = pb_utils.get_input_tensor_by_name(
            request, 'EMBEDDING_BIAS_WORDS')
        if embedding_bias_words is not None:
            embedding_bias_words = embedding_bias_words.as_numpy()

        embedding_bias_weights = pb_utils.get_input_tensor_by_name(
            request, 'EMBEDDING_BIAS_WEIGHTS')
        if embedding_bias_weights is not None:
            embedding_bias_weights = embedding_bias_weights.as_numpy()

        # Take the end_id from the input tensors
        # If not specified, use tokenizer to get end_id
        end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
        if end_id is not None:
            end_id = end_id.as_numpy()
        else:
            end_id = [[self.tokenizer_end_id]]

        # Take the pad_id from the input tensors
        # If not specified, use tokenizer to get pad_id
        pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
        if pad_id is not None:
            pad_id = pad_id.as_numpy()
        else:
            pad_id = [[self.tokenizer_pad_id]]

        # Preprocessing input data.
        # FIX: removed leftover debug prints of input_id/request_input_len
        # that wrote to stdout on every request.
        input_id, request_input_len = self._create_request(query)
        if decoder_query is not None:
            decoder_input_id, request_decoder_input_len = self._create_request(
                decoder_query)
        else:
            # No decoder text supplied: a single pad token stands in.
            decoder_input_id = pad_id * np.ones((1, 1), np.int32)
            request_decoder_input_len = 1 * np.ones((1, 1), np.int32)

        bad_words = self._to_word_list_format(bad_words_dict)
        stop_words = self._to_word_list_format(stop_words_dict)

        embedding_bias = self._get_embedding_bias(
            embedding_bias_words, embedding_bias_weights,
            self.embedding_bias_weights_dtype)

        # Create output tensors. You need pb_utils.Tensor
        # objects to create pb_utils.InferenceResponse.
        input_id_tensor = pb_utils.Tensor(
            'INPUT_ID', input_id.astype(self.input_id_dtype))
        request_input_len_tensor = pb_utils.Tensor(
            'REQUEST_INPUT_LEN',
            request_input_len.astype(self.request_input_len_dtype))
        decoder_input_id_tensor = pb_utils.Tensor(
            'DECODER_INPUT_ID',
            decoder_input_id.astype(self.decoder_input_id_dtype))
        request_decoder_input_len_tensor = pb_utils.Tensor(
            'REQUEST_DECODER_INPUT_LEN',
            request_decoder_input_len.astype(
                self.request_decoder_input_len_dtype))
        request_output_len_tensor = pb_utils.Tensor(
            'REQUEST_OUTPUT_LEN', request_output_len)
        bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
        stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                stop_words)
        embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
                                                embedding_bias)
        end_id_tensor = pb_utils.Tensor('OUT_END_ID',
                                        np.array(end_id, dtype=np.int32))
        pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
                                        np.array(pad_id, dtype=np.int32))

        inference_response = pb_utils.InferenceResponse(output_tensors=[
            input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor,
            stop_words_ids_tensor, request_input_len_tensor,
            request_decoder_input_len_tensor, request_output_len_tensor,
            embedding_bias_tensor, end_id_tensor, pad_id_tensor
        ])
        responses.append(inference_response)

    # You should return a list of pb_utils.InferenceResponse. Length
    # of this list must match the length of `requests` list.
    return responses
def finalize(self):
    """Log a shutdown message when Triton unloads the model.

    No resources beyond the tokenizer are held, so there is nothing
    else to release here.
    """
    print('Cleaning up...')
def _create_request(self, query):
    """Tokenize a batch of raw strings into right-padded id arrays.

    Parameters
    ----------
    query : 2-D numpy array of bytes, shape (batch, 1) — each row holds one
        UTF-8 encoded query string.

    Returns
    -------
    tuple
        ``(start_ids, start_lengths)``: a rectangular int array of token
        ids padded on the right with the tokenizer pad id, and a
        ``(batch, 1)`` int array of each row's unpadded length.
    """
    encode = self.tokenizer.encode
    if isinstance(self.tokenizer, T5Tokenizer):
        # SentencePiece tokenizers need the BOS id prepended manually.
        token_rows = [
            np.array([self.tokenizer_bos_id] + encode(
                row[0].decode(),
                add_special_tokens=self.add_special_tokens)).astype(int)
            for row in query
        ]
    else:
        token_rows = [
            np.array(
                encode(row[0].decode(),
                       add_special_tokens=self.add_special_tokens)).astype(
                           int) for row in query
        ]

    start_lengths = np.array([[row.shape[0]]
                              for row in token_rows]).astype(int)

    # Right-pad every row with the pad id so the batch is rectangular.
    longest = max((row.shape[0] for row in token_rows), default=0)
    start_ids = np.stack([
        np.pad(row, (0, longest - row.shape[0]),
               'constant',
               constant_values=(0, self.tokenizer_pad_id))
        for row in token_rows
    ])

    return start_ids, start_lengths
def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
|
| 299 |
+
'''
|
| 300 |
+
word_lists format:
|
| 301 |
+
len(word_lists) == batch_size
|
| 302 |
+
word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
|
| 303 |
+
'''
|
| 304 |
+
assert self.tokenizer != None, "need to set tokenizer"
|
| 305 |
+
|
| 306 |
+
if word_lists is None:
|
| 307 |
+
# Return an empty array of shape (1,2,0)
|
| 308 |
+
return np.empty([1, 2, 0], dtype="int32")
|
| 309 |
+
|
| 310 |
+
flat_ids = []
|
| 311 |
+
offsets = []
|
| 312 |
+
for word_list in word_lists:
|
| 313 |
+
item_flat_ids = []
|
| 314 |
+
item_offsets = []
|
| 315 |
+
|
| 316 |
+
for word in word_list:
|
| 317 |
+
if isinstance(word, bytes):
|
| 318 |
+
word = word.decode()
|
| 319 |
+
|
| 320 |
+
ids = self.tokenizer.encode(word, add_special_tokens=False)
|
| 321 |
+
if len(ids) == 0:
|
| 322 |
+
continue
|
| 323 |
+
|
| 324 |
+
item_flat_ids += ids
|
| 325 |
+
item_offsets.append(len(ids))
|
| 326 |
+
|
| 327 |
+
flat_ids.append(np.array(item_flat_ids))
|
| 328 |
+
offsets.append(np.cumsum(np.array(item_offsets)))
|
| 329 |
+
|
| 330 |
+
pad_to = max(1, max(len(ids) for ids in flat_ids))
|
| 331 |
+
|
| 332 |
+
for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
|
| 333 |
+
flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
|
| 334 |
+
constant_values=0)
|
| 335 |
+
offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
|
| 336 |
+
constant_values=-1)
|
| 337 |
+
|
| 338 |
+
return np.array([flat_ids, offsets], dtype="int32").transpose(
|
| 339 |
+
(1, 0, 2))
|
| 340 |
+
|
| 341 |
+
def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
|
| 342 |
+
bias_dtype):
|
| 343 |
+
|
| 344 |
+
assert self.tokenizer != None, "need to set tokenizer"
|
| 345 |
+
|
| 346 |
+
if embedding_bias_words is None or embedding_bias_weights is None:
|
| 347 |
+
return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype)
|
| 348 |
+
|
| 349 |
+
batch_embedding_bias = []
|
| 350 |
+
for words, weights in zip(embedding_bias_words,
|
| 351 |
+
embedding_bias_weights):
|
| 352 |
+
|
| 353 |
+
vocab_size = self.tokenizer.vocab_size
|
| 354 |
+
embedding_bias = [0.] * vocab_size
|
| 355 |
+
|
| 356 |
+
assert len(words) == len(
|
| 357 |
+
weights
|
| 358 |
+
), "Embedding bias words must have same dimension as embedding bias weights"
|
| 359 |
+
|
| 360 |
+
for word, weight in zip(words, weights):
|
| 361 |
+
if isinstance(word, bytes):
|
| 362 |
+
word = word.decode()
|
| 363 |
+
ids = self.tokenizer.encode(word)
|
| 364 |
+
|
| 365 |
+
if len(ids) == 0:
|
| 366 |
+
continue
|
| 367 |
+
|
| 368 |
+
for id in ids:
|
| 369 |
+
embedding_bias[id] += weight
|
| 370 |
+
|
| 371 |
+
batch_embedding_bias.append(np.array(embedding_bias))
|
| 372 |
+
|
| 373 |
+
return np.array(batch_embedding_bias, dtype=bias_dtype)
|
preprocessing/1/special_tokens_map.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"eos_token": "</s>",
|
| 4 |
+
"unk_token": "<unk>"
|
| 5 |
+
}
|
preprocessing/1/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
preprocessing/1/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
|
| 3 |
+
size 493443
|
preprocessing/1/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"0": {
|
| 6 |
+
"content": "<unk>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"1": {
|
| 14 |
+
"content": "<s>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"2": {
|
| 22 |
+
"content": "</s>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
"additional_special_tokens": [],
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"legacy": true,
|
| 35 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 36 |
+
"pad_token": null,
|
| 37 |
+
"sp_model_kwargs": {},
|
| 38 |
+
"spaces_between_special_tokens": false,
|
| 39 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 40 |
+
"unk_token": "<unk>",
|
| 41 |
+
"use_default_system_prompt": false,
|
| 42 |
+
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
|
| 43 |
+
}
|
preprocessing/config.pbtxt
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
name: "preprocessing"
|
| 28 |
+
backend: "python"
|
| 29 |
+
max_batch_size: 16
|
| 30 |
+
input [
|
| 31 |
+
{
|
| 32 |
+
name: "QUERY"
|
| 33 |
+
data_type: TYPE_STRING
|
| 34 |
+
dims: [ -1 ]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
name: "DECODER_QUERY"
|
| 38 |
+
data_type: TYPE_STRING
|
| 39 |
+
dims: [ -1 ]
|
| 40 |
+
optional: true
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
name: "REQUEST_OUTPUT_LEN"
|
| 44 |
+
data_type: TYPE_INT32
|
| 45 |
+
dims: [ -1 ]
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
name: "BAD_WORDS_DICT"
|
| 49 |
+
data_type: TYPE_STRING
|
| 50 |
+
dims: [ -1 ]
|
| 51 |
+
optional: true
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
name: "STOP_WORDS_DICT"
|
| 55 |
+
data_type: TYPE_STRING
|
| 56 |
+
dims: [ -1 ]
|
| 57 |
+
optional: true
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
name: "EMBEDDING_BIAS_WORDS"
|
| 61 |
+
data_type: TYPE_STRING
|
| 62 |
+
dims: [ -1 ]
|
| 63 |
+
optional: true
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
name: "EMBEDDING_BIAS_WEIGHTS"
|
| 67 |
+
data_type: TYPE_FP32
|
| 68 |
+
dims: [ -1 ]
|
| 69 |
+
optional: true
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
name: "END_ID"
|
| 73 |
+
data_type: TYPE_INT32
|
| 74 |
+
dims: [ -1 ]
|
| 75 |
+
optional: true
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
name: "PAD_ID"
|
| 79 |
+
data_type: TYPE_INT32
|
| 80 |
+
dims: [ -1 ]
|
| 81 |
+
optional: true
|
| 82 |
+
}
|
| 83 |
+
]
|
| 84 |
+
output [
|
| 85 |
+
{
|
| 86 |
+
name: "INPUT_ID"
|
| 87 |
+
data_type: TYPE_INT32
|
| 88 |
+
dims: [ -1 ]
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
name: "REQUEST_INPUT_LEN"
|
| 92 |
+
data_type: TYPE_INT32
|
| 93 |
+
dims: [ 1 ]
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
name: "DECODER_INPUT_ID"
|
| 97 |
+
data_type: TYPE_INT32
|
| 98 |
+
dims: [ -1 ]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
name: "REQUEST_DECODER_INPUT_LEN"
|
| 102 |
+
data_type: TYPE_INT32
|
| 103 |
+
dims: [ 1 ]
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
name: "BAD_WORDS_IDS"
|
| 107 |
+
data_type: TYPE_INT32
|
| 108 |
+
dims: [ 2, -1 ]
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
name: "STOP_WORDS_IDS"
|
| 112 |
+
data_type: TYPE_INT32
|
| 113 |
+
dims: [ 2, -1 ]
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
name: "EMBEDDING_BIAS"
|
| 117 |
+
data_type: TYPE_FP32
|
| 118 |
+
dims: [ -1 ]
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
name: "REQUEST_OUTPUT_LEN"
|
| 122 |
+
data_type: TYPE_INT32
|
| 123 |
+
dims: [ -1 ]
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
name: "OUT_END_ID"
|
| 127 |
+
data_type: TYPE_INT32
|
| 128 |
+
dims: [ -1 ]
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
name: "OUT_PAD_ID"
|
| 132 |
+
data_type: TYPE_INT32
|
| 133 |
+
dims: [ -1 ]
|
| 134 |
+
}
|
| 135 |
+
]
|
| 136 |
+
|
| 137 |
+
parameters {
|
| 138 |
+
key: "tokenizer_dir"
|
| 139 |
+
value: {
|
| 140 |
+
string_value: "/all_models/inflight_batcher_llm/preprocessing/1"
|
| 141 |
+
}
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
parameters {
|
| 145 |
+
key: "add_special_tokens"
|
| 146 |
+
value: {
|
| 147 |
+
string_value: "${add_special_tokens}"
|
| 148 |
+
}
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
instance_group [
|
| 152 |
+
{
|
| 153 |
+
count: 1
|
| 154 |
+
kind: KIND_CPU
|
| 155 |
+
}
|
| 156 |
+
]
|
tensorrt_llm/1/.gitkeep
ADDED
|
File without changes
|
tensorrt_llm/1/config.json
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "0.11.0.dev2024062500",
|
| 3 |
+
"pretrained_config": {
|
| 4 |
+
"mlp_bias": false,
|
| 5 |
+
"attn_bias": false,
|
| 6 |
+
"rotary_base": 1000000.0,
|
| 7 |
+
"rotary_scaling": null,
|
| 8 |
+
"residual_mlp": false,
|
| 9 |
+
"disable_weight_only_quant_plugin": false,
|
| 10 |
+
"moe": {
|
| 11 |
+
"num_experts": 8,
|
| 12 |
+
"top_k": 2,
|
| 13 |
+
"normalization_mode": 1
|
| 14 |
+
},
|
| 15 |
+
"architecture": "LlamaForCausalLM",
|
| 16 |
+
"dtype": "float16",
|
| 17 |
+
"vocab_size": 32000,
|
| 18 |
+
"hidden_size": 4096,
|
| 19 |
+
"num_hidden_layers": 32,
|
| 20 |
+
"num_attention_heads": 32,
|
| 21 |
+
"hidden_act": "swiglu",
|
| 22 |
+
"logits_dtype": "float32",
|
| 23 |
+
"norm_epsilon": 1e-05,
|
| 24 |
+
"position_embedding_type": "rope_gpt_neox",
|
| 25 |
+
"max_position_embeddings": 32768,
|
| 26 |
+
"num_key_value_heads": 8,
|
| 27 |
+
"intermediate_size": 14336,
|
| 28 |
+
"mapping": {
|
| 29 |
+
"world_size": 1,
|
| 30 |
+
"gpus_per_node": 8,
|
| 31 |
+
"tp_size": 1,
|
| 32 |
+
"pp_size": 1,
|
| 33 |
+
"moe_tp_size": 1,
|
| 34 |
+
"moe_ep_size": 1
|
| 35 |
+
},
|
| 36 |
+
"quantization": {
|
| 37 |
+
"quant_algo": "W8A16",
|
| 38 |
+
"kv_cache_quant_algo": null,
|
| 39 |
+
"group_size": 128,
|
| 40 |
+
"smoothquant_val": null,
|
| 41 |
+
"has_zero_point": false,
|
| 42 |
+
"pre_quant_scale": false,
|
| 43 |
+
"exclude_modules": null
|
| 44 |
+
},
|
| 45 |
+
"use_parallel_embedding": false,
|
| 46 |
+
"embedding_sharding_dim": 0,
|
| 47 |
+
"share_embedding_table": false,
|
| 48 |
+
"head_size": 128,
|
| 49 |
+
"qk_layernorm": false
|
| 50 |
+
},
|
| 51 |
+
"build_config": {
|
| 52 |
+
"max_input_len": 28000,
|
| 53 |
+
"max_seq_len": 32000,
|
| 54 |
+
"opt_batch_size": null,
|
| 55 |
+
"max_batch_size": 16,
|
| 56 |
+
"max_beam_width": 1,
|
| 57 |
+
"max_num_tokens": 32000,
|
| 58 |
+
"opt_num_tokens": 16,
|
| 59 |
+
"max_prompt_embedding_table_size": 0,
|
| 60 |
+
"gather_context_logits": false,
|
| 61 |
+
"gather_generation_logits": false,
|
| 62 |
+
"strongly_typed": true,
|
| 63 |
+
"builder_opt": null,
|
| 64 |
+
"profiling_verbosity": "layer_names_only",
|
| 65 |
+
"enable_debug_output": false,
|
| 66 |
+
"max_draft_len": 0,
|
| 67 |
+
"speculative_decoding_mode": 1,
|
| 68 |
+
"use_refit": false,
|
| 69 |
+
"input_timing_cache": null,
|
| 70 |
+
"output_timing_cache": "model.cache",
|
| 71 |
+
"lora_config": {
|
| 72 |
+
"lora_dir": [],
|
| 73 |
+
"lora_ckpt_source": "hf",
|
| 74 |
+
"max_lora_rank": 64,
|
| 75 |
+
"lora_target_modules": [],
|
| 76 |
+
"trtllm_modules_to_hf_modules": {}
|
| 77 |
+
},
|
| 78 |
+
"auto_parallel_config": {
|
| 79 |
+
"world_size": 1,
|
| 80 |
+
"gpus_per_node": 8,
|
| 81 |
+
"cluster_key": "A100-SXM-80GB",
|
| 82 |
+
"cluster_info": null,
|
| 83 |
+
"sharding_cost_model": "alpha_beta",
|
| 84 |
+
"comm_cost_model": "alpha_beta",
|
| 85 |
+
"enable_pipeline_parallelism": false,
|
| 86 |
+
"enable_shard_unbalanced_shape": false,
|
| 87 |
+
"enable_shard_dynamic_shape": false,
|
| 88 |
+
"enable_reduce_scatter": true,
|
| 89 |
+
"builder_flags": null,
|
| 90 |
+
"debug_mode": false,
|
| 91 |
+
"infer_shape": true,
|
| 92 |
+
"validation_mode": false,
|
| 93 |
+
"same_buffer_io": {
|
| 94 |
+
"past_key_value_(\\d+)": "present_key_value_\\1"
|
| 95 |
+
},
|
| 96 |
+
"same_spec_io": {},
|
| 97 |
+
"sharded_io_allowlist": [
|
| 98 |
+
"past_key_value_\\d+",
|
| 99 |
+
"present_key_value_\\d*"
|
| 100 |
+
],
|
| 101 |
+
"fill_weights": false,
|
| 102 |
+
"parallel_config_cache": null,
|
| 103 |
+
"profile_cache": null,
|
| 104 |
+
"dump_path": null,
|
| 105 |
+
"debug_outputs": []
|
| 106 |
+
},
|
| 107 |
+
"weight_sparsity": false,
|
| 108 |
+
"weight_streaming": false,
|
| 109 |
+
"plugin_config": {
|
| 110 |
+
"dtype": "float16",
|
| 111 |
+
"bert_attention_plugin": "auto",
|
| 112 |
+
"gpt_attention_plugin": "auto",
|
| 113 |
+
"gemm_plugin": "float16",
|
| 114 |
+
"gemm_swiglu_plugin": null,
|
| 115 |
+
"smooth_quant_gemm_plugin": null,
|
| 116 |
+
"identity_plugin": null,
|
| 117 |
+
"layernorm_quantization_plugin": null,
|
| 118 |
+
"rmsnorm_quantization_plugin": null,
|
| 119 |
+
"nccl_plugin": null,
|
| 120 |
+
"lookup_plugin": null,
|
| 121 |
+
"lora_plugin": null,
|
| 122 |
+
"weight_only_groupwise_quant_matmul_plugin": null,
|
| 123 |
+
"weight_only_quant_matmul_plugin": "float16",
|
| 124 |
+
"quantize_per_token_plugin": false,
|
| 125 |
+
"quantize_tensor_plugin": false,
|
| 126 |
+
"moe_plugin": "auto",
|
| 127 |
+
"mamba_conv1d_plugin": "auto",
|
| 128 |
+
"context_fmha": true,
|
| 129 |
+
"context_fmha_fp32_acc": false,
|
| 130 |
+
"paged_kv_cache": true,
|
| 131 |
+
"remove_input_padding": true,
|
| 132 |
+
"use_custom_all_reduce": true,
|
| 133 |
+
"reduce_fusion": false,
|
| 134 |
+
"multi_block_mode": false,
|
| 135 |
+
"enable_xqa": true,
|
| 136 |
+
"attention_qk_half_accumulation": false,
|
| 137 |
+
"tokens_per_block": 64,
|
| 138 |
+
"use_paged_context_fmha": false,
|
| 139 |
+
"use_fp8_context_fmha": false,
|
| 140 |
+
"multiple_profiles": false,
|
| 141 |
+
"paged_state": true,
|
| 142 |
+
"streamingllm": false
|
| 143 |
+
},
|
| 144 |
+
"use_strip_plan": false,
|
| 145 |
+
"max_encoder_input_len": 1024,
|
| 146 |
+
"use_fused_mlp": false
|
| 147 |
+
}
|
| 148 |
+
}
|
tensorrt_llm/1/model.py
ADDED
|
@@ -0,0 +1,782 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from threading import Lock, Thread
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import triton_python_backend_utils as pb_utils
|
| 9 |
+
from torch import from_numpy
|
| 10 |
+
|
| 11 |
+
import tensorrt_llm.bindings.executor as trtllm
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_input_tensor_by_name(request, name):
    """Return the named request input as a numpy array, or None if absent."""
    raw = pb_utils.get_input_tensor_by_name(request, name)
    return None if raw is None else raw.as_numpy()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_input_scalar_by_name(request, name):
    """Return the named input as a Python scalar.

    Returns None when the input is absent; raises TritonModelException
    when the input holds more than one element.
    """
    arr = get_input_tensor_by_name(request, name)
    if arr is None:
        return None
    if arr.size == 1:
        return arr.item()
    raise pb_utils.TritonModelException(
        f"Expected a single value for {name}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def read_parameter_as_type(value, name, pytype=str):
    """Parse a config-parameter string into *pytype*.

    Returns None when the value is empty, is an unsubstituted ``${...}``
    template placeholder, or cannot be converted (a warning is logged).
    For bool, "1"/"true" (case-insensitive) map to True and anything
    else to False.
    """
    if value == "":
        return None
    if value.startswith("${") and value.endswith("}"):
        # Unsubstituted fill-template placeholder: treat as unset.
        return None
    if pytype is bool:
        return value.lower() in ["1", "true"]
    try:
        return pytype(value)
    except (ValueError, TypeError):
        # Narrowed from a bare `except`: only conversion failures are
        # expected here; anything else (e.g. KeyboardInterrupt) must
        # propagate instead of being silently swallowed.
        pb_utils.Logger.log_warning(
            f"Could not read parameter '{name}' with value '{value}', will use default."
        )
        return None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def get_parameter(model_config, name, pytype=str):
    """Look up a model-config parameter by name, parsed as *pytype*; None if missing."""
    params = model_config['parameters']
    if name in params:
        return read_parameter_as_type(params[name]['string_value'], name,
                                      pytype)
    return None
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def convert_word_list(word_list):
    """Convert a Triton [1, 2, n] word-list tensor into a list of token-id lists.

    Row 0 holds the concatenated token ids; row 1 holds cumulative end
    offsets, with -1 used as padding (skipped). Returns None for None input.
    """
    if word_list is None:
        return None
    as_list = word_list.tolist()
    if not as_list or len(as_list[0]) != 2:
        raise pb_utils.TritonModelException(f"Invalid format for word list.")
    tokens, offsets = as_list[0]
    words = []
    start = 0
    for end in offsets:
        if end == -1:
            # Padding entry.
            continue
        if end > len(tokens):
            raise pb_utils.TritonModelException(
                f"Invalid format for word list.")
        words.append(tokens[start:end])
        # The read position only ever advances (mirrors the original
        # cursor behavior for non-monotonic offsets).
        start = max(start, end)
    return words
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def parse_medusa_choices(medusa_choices):
    """Parse a "{...},{...}" medusa-choices string into a list of int lists.

    Returns None for None input; raises TritonModelException on any
    malformed input.
    """
    if medusa_choices is None:
        return None
    try:
        # Curly braces become JSON arrays, then the whole thing is wrapped
        # in one outer array.
        json_text = "[" + medusa_choices.replace("{", "[").replace(
            "}", "]") + "]"
        parsed = json.loads(json_text)
        assert isinstance(parsed, list) and len(parsed) > 0
        assert all(isinstance(path, list) for path in parsed)
        assert all(isinstance(tok, int) for path in parsed for tok in path)
    except Exception:
        raise pb_utils.TritonModelException(
            "Invalid format for medusa_choices")
    return parsed
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def get_sampling_config_from_request(request):
    """Build a trtllm.SamplingConfig from the optional sampling inputs of a request.

    Every field is optional; unset (None) values are dropped so the
    executor's own defaults apply. beam_width defaults to 1.
    """
    kwargs = {}
    # `or 1` also maps an explicit 0 to 1 (beam width must be >= 1).
    kwargs['beam_width'] = get_input_scalar_by_name(request, 'beam_width') or 1
    kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k')
    kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p')
    # Non-positive top_p is treated as "disabled" rather than passed through.
    kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
        'top_p'] <= 0 else kwargs['top_p']
    kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed')
    kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature')
    kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length')
    kwargs['repetition_penalty'] = get_input_scalar_by_name(
        request, 'repetition_penalty')
    kwargs['presence_penalty'] = get_input_scalar_by_name(
        request, 'presence_penalty')
    kwargs['frequency_penalty'] = get_input_scalar_by_name(
        request, 'frequency_penalty')
    # Note the Triton input name ('len_penalty') differs from the executor
    # kwarg name ('length_penalty').
    kwargs['length_penalty'] = get_input_scalar_by_name(request, 'len_penalty')
    kwargs['top_p_min'] = get_input_scalar_by_name(request,
                                                   'runtime_top_p_min')
    kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
        request, 'runtime_top_p_reset_ids')
    kwargs['top_p_decay'] = get_input_scalar_by_name(request,
                                                     'runtime_top_p_decay')
    kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
        request, 'beam_search_diversity_rate')
    kwargs['early_stopping'] = get_input_scalar_by_name(
        request, 'early_stopping')
    # Drop unset fields so SamplingConfig uses its defaults.
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.SamplingConfig(**kwargs)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def get_output_config_from_request(request, exclude_input_from_output):
    """Build a trtllm.OutputConfig from optional request flags.

    Unset (None) flags are dropped so the executor defaults apply.
    """
    opts = {
        "return_log_probs":
        get_input_scalar_by_name(request, 'return_log_probs'),
        "return_context_logits":
        get_input_scalar_by_name(request, 'return_context_logits'),
        "return_generation_logits":
        get_input_scalar_by_name(request, 'return_generation_logits'),
        "exclude_input_from_output": exclude_input_from_output,
    }
    return trtllm.OutputConfig(
        **{k: v for k, v in opts.items() if v is not None})
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def get_external_draft_tokens_config_from_request(request):
    """Build an ExternalDraftTokensConfig from draft inputs; None when none are set."""
    opts = {}
    draft_tokens = get_input_tensor_by_name(request, 'draft_input_ids')
    if draft_tokens is not None:
        opts['tokens'] = draft_tokens.tolist()
    draft_logits = get_input_tensor_by_name(request, 'draft_logits')
    if draft_logits is not None:
        opts['logits'] = from_numpy(draft_logits)
    threshold = get_input_scalar_by_name(request, 'draft_acceptance_threshold')
    if threshold is not None:
        opts['acceptance_threshold'] = threshold
    return trtllm.ExternalDraftTokensConfig(**opts) if opts else None
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def get_prompt_tuning_config_from_request(request):
    """Build a PromptTuningConfig from the prompt embedding table, if provided."""
    # prompt_vocab_size is unused by executor.
    table = get_input_tensor_by_name(request, 'prompt_embedding_table')
    if table is None:
        return None
    return trtllm.PromptTuningConfig(embedding_table=from_numpy(table))
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def get_lora_config_from_request(request):
    """Build a LoraConfig from optional LoRA inputs; None when none are set."""
    opts = {}
    task_id = get_input_scalar_by_name(request, 'lora_task_id')
    if task_id is not None:
        opts["task_id"] = task_id
    weights = get_input_tensor_by_name(request, 'lora_weights')
    if weights is not None:
        opts["weights"] = from_numpy(weights)
    config = get_input_tensor_by_name(request, 'lora_config')
    if config is not None:
        opts["config"] = from_numpy(config)
    return trtllm.LoraConfig(**opts) if opts else None
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def convert_request(request, exclude_input_from_output, decoupled):
    """Translate a Triton request into a trtllm.Request for the executor.

    Validates required inputs (input_ids, request_output_len) and that
    streaming is only requested in decoupled mode. Raises
    TritonModelException on any violation.
    """
    inputs = {}
    input_token_ids = get_input_tensor_by_name(request, 'input_ids')
    if input_token_ids is None:
        raise pb_utils.TritonModelException(
            "A value is required for input_ids")
    input_token_ids = input_token_ids.tolist()
    if len(input_token_ids) == 0:
        raise pb_utils.TritonModelException(f"Invalid format for input_ids")
    # Only the first batch row is used; batching is handled upstream.
    inputs['input_token_ids'] = input_token_ids[0]
    # input_lengths is not used by executor.
    inputs['max_new_tokens'] = get_input_scalar_by_name(
        request, 'request_output_len')
    if inputs['max_new_tokens'] is None:
        raise pb_utils.TritonModelException(
            "A value is required for request_output_len")
    inputs['streaming'] = get_input_scalar_by_name(request, 'streaming')
    if inputs['streaming'] and not decoupled:
        raise pb_utils.TritonModelException(
            "Streaming is only supported in decoupled mode.")
    inputs['end_id'] = get_input_scalar_by_name(request, 'end_id')
    inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id')
    inputs['stop_words'] = convert_word_list(
        get_input_tensor_by_name(request, 'stop_words_list'))
    inputs['bad_words'] = convert_word_list(
        get_input_tensor_by_name(request, 'bad_words_list'))
    embedding_bias = get_input_tensor_by_name(request, 'embedding_bias')
    if embedding_bias is not None and embedding_bias.size != 0:
        # Drop the leading batch dimension before handing to the executor.
        inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()

    # Sub-configs are each optional; their builders return None when the
    # corresponding inputs are absent.
    sampling_config = get_sampling_config_from_request(request)
    output_config = get_output_config_from_request(request,
                                                   exclude_input_from_output)
    external_draft_tokens_config = get_external_draft_tokens_config_from_request(
        request)
    prompt_tuning_config = get_prompt_tuning_config_from_request(request)
    lora_config = get_lora_config_from_request(request)

    return trtllm.Request(
        **inputs,
        sampling_config=sampling_config,
        output_config=output_config,
        external_draft_tokens_config=external_draft_tokens_config,
        prompt_tuning_config=prompt_tuning_config,
        lora_config=lora_config,
    )
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def convert_response(response):
    """Convert a trtllm executor response into a Triton InferenceResponse.

    Returns a tuple ``(inference_response, is_final)``. On executor error,
    an error response is returned with is_final=True. Optional outputs
    (log probs / logits) are always emitted, with zero-filled placeholders
    when the executor did not produce them.
    """
    if response.has_error():
        return pb_utils.InferenceResponse(output_tensors=[],
                                          error=pb_utils.TritonError(
                                              response.error_msg)), True
    result = response.result
    # Per-beam generated lengths, shaped [1, num_beams] (leading batch dim).
    beam_lengths = np.expand_dims(
        np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
    max_beam_length = max([len(beam) for beam in result.output_token_ids])
    # Pad ragged beams into a rectangular [1, num_beams, max_len] tensor,
    # using -1 as the padding token id.
    output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
                         -1, np.int32)
    for idx, beam in enumerate(result.output_token_ids):
        output_ids[0, idx, :len(beam)] = beam
    output_tensors = [
        pb_utils.Tensor("output_ids", output_ids),
        pb_utils.Tensor("sequence_length", beam_lengths),
    ]
    output_tensors.append(
        pb_utils.Tensor(
            "cum_log_probs",
            np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
            if result.cum_log_probs is not None else np.zeros(
                (1, 1), np.float32)))
    output_tensors.append(
        pb_utils.Tensor(
            "output_log_probs",
            np.expand_dims(np.array(result.log_probs, np.float32), 0) if
            result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
    output_tensors.append(
        pb_utils.Tensor(
            "context_logits",
            np.expand_dims(np.array(result.context_logits, np.float32), 0)
            if result.context_logits is not None else np.zeros(
                (1, 1, 1), np.float32)))
    output_tensors.append(
        pb_utils.Tensor(
            "generation_logits",
            np.expand_dims(np.array(result.generation_logits, np.float32), 0)
            if result.generation_logits is not None else np.zeros(
                (1, 1, 1, 1), np.float32)))
    return pb_utils.InferenceResponse(output_tensors), result.is_final
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def convert_scheduler_policy(batch_scheduler_policy: str):
    """Map a scheduler-policy name (case-insensitive) to the trtllm enum."""
    policy = batch_scheduler_policy.lower()
    if policy == "max_utilization":
        return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
    if policy == "guaranteed_no_evict":
        return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
    raise pb_utils.TritonModelException(
        f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
    )
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def convert_batching_type(gpt_model_type: str):
    """Map a gpt_model_type string to the trtllm BatchingType enum (None passes through)."""
    if gpt_model_type is None:
        return None
    kind = gpt_model_type.lower()
    if kind in ("inflight_fused_batching", "inflight_batching"):
        return trtllm.BatchingType.INFLIGHT
    if kind == "v1":
        return trtllm.BatchingType.STATIC
    raise pb_utils.TritonModelException(
        f"gpt_model_type value of '{gpt_model_type}' is not supported.")
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def convert_decoding_mode(decoding_mode: str):
    """Map a decoding-mode name to a trtllm.DecodingMode instance (None passes through)."""
    if decoding_mode is None:
        return None
    # Dispatch table instead of an if/elif chain; names are case-sensitive,
    # matching the original behavior.
    factories = {
        "auto": trtllm.DecodingMode.Auto,
        "top_k": trtllm.DecodingMode.TopK,
        "top_p": trtllm.DecodingMode.TopP,
        "top_k_top_p": trtllm.DecodingMode.TopKTopP,
        "beam_search": trtllm.DecodingMode.BeamSearch,
        "medusa": trtllm.DecodingMode.Medusa,
    }
    if decoding_mode in factories:
        return factories[decoding_mode]()
    raise pb_utils.TritonModelException(
        f"decoding_mode value of '{decoding_mode}' is not supported.")
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def convert_timestamp_to_seconds(timestamp: str):
    """Convert an "MM-DD-YYYY HH:MM:SS" local-time string to whole epoch seconds."""
    parsed = datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S")
    return int(parsed.timestamp())
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
class TritonPythonModel:
|
| 320 |
+
"""Your Python model must use the same class name. Every Python model
|
| 321 |
+
that is created must have "TritonPythonModel" as the class name.
|
| 322 |
+
"""
|
| 323 |
+
|
| 324 |
+
def get_scheduler_config(self, model_config):
    """Build a trtllm.SchedulerConfig from the optional batch_scheduler_policy parameter."""
    policy = get_parameter(model_config, "batch_scheduler_policy")
    if policy is None:
        # No policy configured: use the executor's default scheduler.
        return trtllm.SchedulerConfig()
    return trtllm.SchedulerConfig(convert_scheduler_policy(policy))
|
| 331 |
+
|
| 332 |
+
def get_kv_cache_config(self, model_config):
    """Build a trtllm.KvCacheConfig from optional model-config parameters.

    Unset parameters are omitted so the executor defaults apply.
    """
    # (kwarg name, config parameter name, parsed type)
    param_specs = [
        ("enable_block_reuse", "enable_kv_cache_reuse", bool),
        ("max_tokens", "max_tokens_in_paged_kv_cache", int),
        ("sink_token_length", "sink_token_length", int),
        ("max_attention_window", "max_attention_window_size", int),
        ("free_gpu_memory_fraction", "kv_cache_free_gpu_mem_fraction",
         float),
        ("host_cache_size", "kv_cache_host_memory_bytes", int),
        ("onboard_blocks", "kv_cache_onboard_blocks", bool),
    ]
    kwargs = {}
    for kwarg_name, param_name, param_type in param_specs:
        value = get_parameter(model_config, param_name, param_type)
        if value is not None:
            kwargs[kwarg_name] = value
    return trtllm.KvCacheConfig(**kwargs)
|
| 352 |
+
|
| 353 |
+
def get_parallel_config(self, model_config):
    """Build a trtllm.ParallelConfig, or None when nothing is configured.

    Side effect: sets ``self.use_orchestrator_mode`` from the
    TRTLLM_ORCHESTRATOR environment variable.
    """
    kwargs = {}
    gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
    if gpu_device_ids:
        # Comma-separated list, e.g. "0,1,2".
        kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
    self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
                                                "0") == "1"
    if self.use_orchestrator_mode:
        kwargs[
            "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
        # The legacy `worker_path` parameter is rejected outright to force
        # migration to `executor_worker_path`.
        worker_path = get_parameter(model_config, "worker_path")
        if worker_path is not None:
            raise pb_utils.TritonModelException(
                "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
            )
        executor_worker_path = get_parameter(model_config,
                                             "executor_worker_path")
        kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
            True, executor_worker_path)
    if len(kwargs) > 0:
        return trtllm.ParallelConfig(**kwargs)
    return None
|
| 375 |
+
|
| 376 |
+
def get_peft_cache_config(self, model_config):
    """Build a trtllm.PeftCacheConfig (LoRA cache sizing) from optional parameters."""
    # (kwarg name, config parameter name, parsed type)
    specs = [
        ("optimal_adapter_size", "lora_cache_optimal_adapter_size", int),
        ("max_adapter_size", "lora_cache_max_adapter_size", int),
        ("device_cache_percent", "lora_cache_gpu_memory_fraction", float),
        ("host_cache_size", "lora_cache_host_memory_bytes", int),
    ]
    kwargs = {}
    for key, param, ptype in specs:
        value = get_parameter(model_config, param, ptype)
        if value is not None:
            kwargs[key] = value
    return trtllm.PeftCacheConfig(**kwargs)
|
| 391 |
+
|
| 392 |
+
def get_decoding_config(self, model_config):
    """Build a trtllm.DecodingConfig from optional medusa_choices / decoding_mode parameters."""
    kwargs = {
        "medusa_choices":
        parse_medusa_choices(get_parameter(model_config,
                                           "medusa_choices")),
        "decoding_mode":
        convert_decoding_mode(get_parameter(model_config,
                                            "decoding_mode")),
    }
    # NOTE: removed a leftover debug `print(kwargs)` that wrote the raw
    # decoding config to stdout on every model load.
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.DecodingConfig(**kwargs)
|
| 404 |
+
|
| 405 |
+
def get_executor_config(self, model_config):
    """Assemble the top-level trtllm.ExecutorConfig from the Triton model config.

    Delegates each sub-config to its dedicated builder; unset (None)
    entries are dropped so executor defaults apply.
    """
    kwargs = {
        "max_beam_width":
        get_parameter(model_config, "max_beam_width", int),
        "scheduler_config":
        self.get_scheduler_config(model_config),
        "kv_cache_config":
        self.get_kv_cache_config(model_config),
        "enable_chunked_context":
        get_parameter(model_config, "enable_chunked_context", bool),
        "normalize_log_probs":
        get_parameter(model_config, "normalize_log_probs", bool),
        "batching_type":
        convert_batching_type(get_parameter(model_config,
                                            "gpt_model_type")),
        "parallel_config":
        self.get_parallel_config(model_config),
        "peft_cache_config":
        self.get_peft_cache_config(model_config),
        "decoding_config":
        self.get_decoding_config(model_config),
    }
    # Drop unset entries so ExecutorConfig falls back to its defaults.
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.ExecutorConfig(**kwargs)
|
| 429 |
+
|
| 430 |
+
def create_metrics(self, model: str, version: str, is_v1_model: bool):
|
| 431 |
+
self.request_metric_family = pb_utils.MetricFamily(
|
| 432 |
+
name="nv_trt_llm_request_metrics",
|
| 433 |
+
description="TRT LLM request metrics",
|
| 434 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
| 435 |
+
)
|
| 436 |
+
self.runtime_memory_metric_family = pb_utils.MetricFamily(
|
| 437 |
+
name="nv_trt_llm_runtime_memory_metrics",
|
| 438 |
+
description="TRT LLM runtime memory metrics",
|
| 439 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
| 440 |
+
)
|
| 441 |
+
self.kv_cache_metric_family = pb_utils.MetricFamily(
|
| 442 |
+
name="nv_trt_llm_kv_cache_block_metrics",
|
| 443 |
+
description="TRT LLM KV cache block metrics",
|
| 444 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
| 445 |
+
)
|
| 446 |
+
model_type = "v1" if is_v1_model else "inflight_batcher"
|
| 447 |
+
self.model_type_metric_family = pb_utils.MetricFamily(
|
| 448 |
+
name=f"nv_trt_llm_{model_type}_metrics",
|
| 449 |
+
description=f"TRT LLM {model_type}-specific metrics",
|
| 450 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
| 451 |
+
)
|
| 452 |
+
self.general_metric_family = pb_utils.MetricFamily(
|
| 453 |
+
name="nv_trt_llm_general_metrics",
|
| 454 |
+
description="General TRT LLM metrics",
|
| 455 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
| 456 |
+
)
|
| 457 |
+
common_labels = {"model": model, "version": version}
|
| 458 |
+
self.all_metrics = {
|
| 459 |
+
# Request metrics
|
| 460 |
+
"num_active_requests":
|
| 461 |
+
self.request_metric_family.Metric(labels={
|
| 462 |
+
"request_type": "active",
|
| 463 |
+
**common_labels
|
| 464 |
+
}),
|
| 465 |
+
"max_num_active_requests":
|
| 466 |
+
self.request_metric_family.Metric(labels={
|
| 467 |
+
"request_type": "max",
|
| 468 |
+
**common_labels
|
| 469 |
+
}),
|
| 470 |
+
"num_scheduled_requests":
|
| 471 |
+
self.request_metric_family.Metric(labels={
|
| 472 |
+
"request_type": "scheduled",
|
| 473 |
+
**common_labels
|
| 474 |
+
}),
|
| 475 |
+
"num_context_requests":
|
| 476 |
+
self.request_metric_family.Metric(labels={
|
| 477 |
+
"request_type": "context",
|
| 478 |
+
**common_labels
|
| 479 |
+
}),
|
| 480 |
+
# Runtime metrics
|
| 481 |
+
"cpu_mem_usage":
|
| 482 |
+
self.runtime_memory_metric_family.Metric(labels={
|
| 483 |
+
"memory_type": "cpu",
|
| 484 |
+
**common_labels
|
| 485 |
+
}),
|
| 486 |
+
"gpu_mem_usage":
|
| 487 |
+
self.runtime_memory_metric_family.Metric(labels={
|
| 488 |
+
"memory_type": "gpu",
|
| 489 |
+
**common_labels
|
| 490 |
+
}),
|
| 491 |
+
"pinned_mem_usage":
|
| 492 |
+
self.runtime_memory_metric_family.Metric(labels={
|
| 493 |
+
"memory_type": "pinned",
|
| 494 |
+
**common_labels
|
| 495 |
+
}),
|
| 496 |
+
# KV cache metrics
|
| 497 |
+
"max_num_blocks":
|
| 498 |
+
self.kv_cache_metric_family.Metric(labels={
|
| 499 |
+
"kv_cache_block_type": "max",
|
| 500 |
+
**common_labels
|
| 501 |
+
}),
|
| 502 |
+
"free_num_blocks":
|
| 503 |
+
self.kv_cache_metric_family.Metric(labels={
|
| 504 |
+
"kv_cache_block_type": "free",
|
| 505 |
+
**common_labels
|
| 506 |
+
}),
|
| 507 |
+
"used_num_blocks":
|
| 508 |
+
self.kv_cache_metric_family.Metric(labels={
|
| 509 |
+
"kv_cache_block_type": "used",
|
| 510 |
+
**common_labels
|
| 511 |
+
}),
|
| 512 |
+
"tokens_per_block":
|
| 513 |
+
self.kv_cache_metric_family.Metric(labels={
|
| 514 |
+
"kv_cache_block_type": "tokens_per",
|
| 515 |
+
**common_labels
|
| 516 |
+
}),
|
| 517 |
+
# General metrics
|
| 518 |
+
"timestamp":
|
| 519 |
+
self.general_metric_family.Metric(labels={
|
| 520 |
+
"general_type": "timestamp",
|
| 521 |
+
**common_labels
|
| 522 |
+
}),
|
| 523 |
+
"iter":
|
| 524 |
+
self.general_metric_family.Metric(labels={
|
| 525 |
+
"general_type": "iteration_counter",
|
| 526 |
+
**common_labels
|
| 527 |
+
}),
|
| 528 |
+
}
|
| 529 |
+
if is_v1_model:
|
| 530 |
+
self.all_metrics.update({
|
| 531 |
+
"num_ctx_tokens":
|
| 532 |
+
self.model_type_metric_family.Metric(labels={
|
| 533 |
+
"v1_specific_metric": "total_context_tokens",
|
| 534 |
+
**common_labels
|
| 535 |
+
}),
|
| 536 |
+
"num_gen_tokens":
|
| 537 |
+
self.model_type_metric_family.Metric(
|
| 538 |
+
labels={
|
| 539 |
+
"v1_specific_metric": "total_generation_tokens",
|
| 540 |
+
**common_labels
|
| 541 |
+
}),
|
| 542 |
+
"empty_gen_slots":
|
| 543 |
+
self.model_type_metric_family.Metric(
|
| 544 |
+
labels={
|
| 545 |
+
"v1_specific_metric": "empty_generation_slots",
|
| 546 |
+
**common_labels
|
| 547 |
+
}),
|
| 548 |
+
})
|
| 549 |
+
else:
|
| 550 |
+
self.all_metrics.update({
|
| 551 |
+
"num_ctx_tokens":
|
| 552 |
+
self.model_type_metric_family.Metric(
|
| 553 |
+
labels={
|
| 554 |
+
"inflight_batcher_specific_metric":
|
| 555 |
+
"total_context_tokens",
|
| 556 |
+
**common_labels
|
| 557 |
+
}),
|
| 558 |
+
"num_gen_requests":
|
| 559 |
+
self.model_type_metric_family.Metric(
|
| 560 |
+
labels={
|
| 561 |
+
"inflight_batcher_specific_metric":
|
| 562 |
+
"generation_requests",
|
| 563 |
+
**common_labels
|
| 564 |
+
}),
|
| 565 |
+
"micro_batch_id":
|
| 566 |
+
self.model_type_metric_family.Metric(
|
| 567 |
+
labels={
|
| 568 |
+
"inflight_batcher_specific_metric": "micro_batch_id",
|
| 569 |
+
**common_labels
|
| 570 |
+
}),
|
| 571 |
+
"num_paused_requests":
|
| 572 |
+
self.model_type_metric_family.Metric(
|
| 573 |
+
labels={
|
| 574 |
+
"inflight_batcher_specific_metric": "paused_requests",
|
| 575 |
+
**common_labels
|
| 576 |
+
}),
|
| 577 |
+
})
|
| 578 |
+
|
| 579 |
+
def initialize(self, args):
|
| 580 |
+
"""`initialize` is called only once when the model is being loaded.
|
| 581 |
+
Implementing `initialize` function is optional. This function allows
|
| 582 |
+
the model to initialize any state associated with this model.
|
| 583 |
+
|
| 584 |
+
Parameters
|
| 585 |
+
----------
|
| 586 |
+
args : dict
|
| 587 |
+
Both keys and values are strings. The dictionary keys and values are:
|
| 588 |
+
* model_config: A JSON string containing the model configuration
|
| 589 |
+
* model_instance_kind: A string containing model instance kind
|
| 590 |
+
* model_instance_device_id: A string containing model instance device ID
|
| 591 |
+
* model_repository: Model repository path
|
| 592 |
+
* model_version: Model version
|
| 593 |
+
* model_name: Model name
|
| 594 |
+
"""
|
| 595 |
+
model_config = json.loads(args['model_config'])
|
| 596 |
+
gpt_model_path = get_parameter(model_config, "gpt_model_path")
|
| 597 |
+
if get_parameter(model_config, "enable_trt_overlap", bool):
|
| 598 |
+
raise pb_utils.TritonModelException(
|
| 599 |
+
f"enable_trt_overlap=true is not supported.")
|
| 600 |
+
self.exclude_input_from_output = get_parameter(
|
| 601 |
+
model_config, "exclude_input_in_output", bool)
|
| 602 |
+
executor_config = self.get_executor_config(model_config)
|
| 603 |
+
self.executor = trtllm.Executor(gpt_model_path,
|
| 604 |
+
trtllm.ModelType.DECODER_ONLY,
|
| 605 |
+
executor_config)
|
| 606 |
+
self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
|
| 607 |
+
model_config)
|
| 608 |
+
self.cancellation_check_period_ms = get_parameter(
|
| 609 |
+
model_config, "cancellation_check_period_ms", int) or 100
|
| 610 |
+
self.stats_check_period_ms = get_parameter(
|
| 611 |
+
model_config, "stats_check_period_ms", int) or 100
|
| 612 |
+
|
| 613 |
+
if not self.decoupled:
|
| 614 |
+
raise pb_utils.TritonModelException(
|
| 615 |
+
"Please enable decoupled transaction policy in the model configuration to serve this model"
|
| 616 |
+
)
|
| 617 |
+
|
| 618 |
+
self.create_metrics(args["model_name"],
|
| 619 |
+
args["model_version"],
|
| 620 |
+
is_v1_model=executor_config.batching_type ==
|
| 621 |
+
trtllm.BatchingType.STATIC)
|
| 622 |
+
self.triton_id_to_req_id = {}
|
| 623 |
+
self.req_id_to_response_sender = {}
|
| 624 |
+
self.lock = Lock()
|
| 625 |
+
self.running = False
|
| 626 |
+
self.awaiter_thread = Thread(target=self.awaiter_loop)
|
| 627 |
+
self.cancellation_thread = Thread(target=self.cancellation_loop)
|
| 628 |
+
self.metrics_thread = Thread(target=self.metrics_loop)
|
| 629 |
+
if self.executor.can_enqueue_requests():
|
| 630 |
+
self.running = True
|
| 631 |
+
self.awaiter_thread.start()
|
| 632 |
+
self.cancellation_thread.start()
|
| 633 |
+
self.metrics_thread.start()
|
| 634 |
+
else:
|
| 635 |
+
# In leader mode, worker ranks will wait here until leader is done.
|
| 636 |
+
self.executor.shutdown()
|
| 637 |
+
|
| 638 |
+
def handle_stop_request(self, triton_id, response_sender):
|
| 639 |
+
if triton_id is None or triton_id == "":
|
| 640 |
+
response_sender.send(
|
| 641 |
+
pb_utils.InferenceResponse(error=pb_utils.TritonError(
|
| 642 |
+
"A request id must be provided for request cancellation")),
|
| 643 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
| 644 |
+
return
|
| 645 |
+
|
| 646 |
+
if triton_id in self.triton_id_to_req_id:
|
| 647 |
+
req_id = self.triton_id_to_req_id[triton_id]
|
| 648 |
+
self.executor.cancel_request(req_id)
|
| 649 |
+
|
| 650 |
+
response_sender.send(
|
| 651 |
+
pb_utils.InferenceResponse(),
|
| 652 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
| 653 |
+
|
| 654 |
+
def execute(self, requests):
|
| 655 |
+
"""`execute` must be implemented in every Python model. `execute`
|
| 656 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
| 657 |
+
argument. This function is called when an inference is requested
|
| 658 |
+
for this model.
|
| 659 |
+
|
| 660 |
+
Parameters
|
| 661 |
+
----------
|
| 662 |
+
requests : list
|
| 663 |
+
A list of pb_utils.InferenceRequest
|
| 664 |
+
|
| 665 |
+
Returns
|
| 666 |
+
-------
|
| 667 |
+
list
|
| 668 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
| 669 |
+
be the same as `requests`
|
| 670 |
+
"""
|
| 671 |
+
if not self.executor.can_enqueue_requests():
|
| 672 |
+
return
|
| 673 |
+
|
| 674 |
+
# Convert to executor requests.
|
| 675 |
+
triton_requests = []
|
| 676 |
+
executor_requests = []
|
| 677 |
+
for request in requests:
|
| 678 |
+
response_sender = request.get_response_sender()
|
| 679 |
+
if get_input_scalar_by_name(request, 'stop'):
|
| 680 |
+
self.handle_stop_request(request.request_id(), response_sender)
|
| 681 |
+
else:
|
| 682 |
+
try:
|
| 683 |
+
converted = convert_request(request,
|
| 684 |
+
self.exclude_input_from_output,
|
| 685 |
+
self.decoupled)
|
| 686 |
+
except Exception as e:
|
| 687 |
+
response_sender.send(
|
| 688 |
+
pb_utils.InferenceResponse(error=pb_utils.TritonError(
|
| 689 |
+
f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
|
| 690 |
+
)),
|
| 691 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
| 692 |
+
else:
|
| 693 |
+
triton_requests.append(request)
|
| 694 |
+
executor_requests.append(converted)
|
| 695 |
+
|
| 696 |
+
with self.lock:
|
| 697 |
+
request_ids = self.executor.enqueue_requests(executor_requests)
|
| 698 |
+
for req_id, request in zip(request_ids, triton_requests):
|
| 699 |
+
triton_id = request.request_id()
|
| 700 |
+
self.req_id_to_response_sender[
|
| 701 |
+
req_id] = triton_id, request.get_response_sender()
|
| 702 |
+
self.triton_id_to_req_id[triton_id] = req_id
|
| 703 |
+
return None
|
| 704 |
+
|
| 705 |
+
def awaiter_loop(self):
|
| 706 |
+
"""Gets responses from executor and returns the results."""
|
| 707 |
+
while self.running:
|
| 708 |
+
for response in self.executor.await_responses(
|
| 709 |
+
timeout=datetime.timedelta(milliseconds=1)):
|
| 710 |
+
req_id = response.request_id
|
| 711 |
+
with self.lock:
|
| 712 |
+
if req_id not in self.req_id_to_response_sender:
|
| 713 |
+
continue
|
| 714 |
+
triton_id, response_sender = self.req_id_to_response_sender[
|
| 715 |
+
req_id]
|
| 716 |
+
|
| 717 |
+
triton_response, is_final = convert_response(response)
|
| 718 |
+
response_sender.send(
|
| 719 |
+
triton_response,
|
| 720 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
|
| 721 |
+
if is_final else 0)
|
| 722 |
+
|
| 723 |
+
if is_final:
|
| 724 |
+
with self.lock:
|
| 725 |
+
del self.triton_id_to_req_id[triton_id]
|
| 726 |
+
del self.req_id_to_response_sender[req_id]
|
| 727 |
+
# Remove local reference so response_sender can be cleaned properly.
|
| 728 |
+
del response_sender
|
| 729 |
+
|
| 730 |
+
def cancellation_loop(self):
|
| 731 |
+
"""Checks if any pending requests have been cancelled."""
|
| 732 |
+
while self.running:
|
| 733 |
+
time.sleep(self.cancellation_check_period_ms / 1000.0)
|
| 734 |
+
with self.lock:
|
| 735 |
+
for req_id, (triton_id, response_sender
|
| 736 |
+
) in self.req_id_to_response_sender.items():
|
| 737 |
+
if response_sender.is_cancelled():
|
| 738 |
+
self.executor.cancel_request(req_id)
|
| 739 |
+
# Remove local reference so response_sender can be cleaned properly.
|
| 740 |
+
del response_sender
|
| 741 |
+
|
| 742 |
+
def metrics_loop(self):
|
| 743 |
+
"""Updates triton metrics using stats from the executor."""
|
| 744 |
+
while self.running:
|
| 745 |
+
time.sleep(self.stats_check_period_ms / 1000.0)
|
| 746 |
+
for stat in self.executor.get_latest_iteration_stats():
|
| 747 |
+
try:
|
| 748 |
+
for key, metric in self.all_metrics.items():
|
| 749 |
+
value = None
|
| 750 |
+
if hasattr(stat, key):
|
| 751 |
+
value = getattr(stat, key)
|
| 752 |
+
elif stat.kv_cache_stats is not None and hasattr(
|
| 753 |
+
stat.kv_cache_stats, key):
|
| 754 |
+
value = getattr(stat.kv_cache_stats, key)
|
| 755 |
+
elif stat.static_batching_stats is not None and hasattr(
|
| 756 |
+
stat.static_batching_stats, key):
|
| 757 |
+
value = getattr(stat.static_batching_stats, key)
|
| 758 |
+
elif stat.inflight_batching_stats is not None and hasattr(
|
| 759 |
+
stat.inflight_batching_stats, key):
|
| 760 |
+
value = getattr(stat.inflight_batching_stats, key)
|
| 761 |
+
if value is not None:
|
| 762 |
+
if key == "timestamp":
|
| 763 |
+
value = convert_timestamp_to_seconds(value)
|
| 764 |
+
metric.set(value)
|
| 765 |
+
else:
|
| 766 |
+
pb_utils.Logger.log_warn(
|
| 767 |
+
f"Metric \"{key}\" not found.")
|
| 768 |
+
except Exception as e:
|
| 769 |
+
pb_utils.Logger.log_warn(
|
| 770 |
+
f"Error while processing metrics: {e}")
|
| 771 |
+
|
| 772 |
+
def finalize(self):
|
| 773 |
+
"""`finalize` is called only once when the model is being unloaded.
|
| 774 |
+
Implementing `finalize` function is optional. This function allows
|
| 775 |
+
the model to perform any necessary clean ups before exit.
|
| 776 |
+
"""
|
| 777 |
+
if self.executor.can_enqueue_requests():
|
| 778 |
+
self.running = False
|
| 779 |
+
self.awaiter_thread.join()
|
| 780 |
+
self.cancellation_thread.join()
|
| 781 |
+
self.metrics_thread.join()
|
| 782 |
+
self.executor.shutdown()
|
tensorrt_llm/1/rank0.engine
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c545694cbc76c5a65d4650a2d7897cc98ab2382bce3d198acfa97c003bfea6c
|
| 3 |
+
size 47006220780
|
tensorrt_llm/config.pbtxt
ADDED
|
@@ -0,0 +1,537 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
name: "tensorrt_llm"
|
| 28 |
+
backend: "tensorrtllm"
|
| 29 |
+
max_batch_size: 16
|
| 30 |
+
|
| 31 |
+
model_transaction_policy {
|
| 32 |
+
decoupled: true
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
input [
|
| 37 |
+
{
|
| 38 |
+
name: "input_ids"
|
| 39 |
+
data_type: TYPE_INT32
|
| 40 |
+
dims: [ -1 ]
|
| 41 |
+
allow_ragged_batch: true
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
name: "input_lengths"
|
| 45 |
+
data_type: TYPE_INT32
|
| 46 |
+
dims: [ 1 ]
|
| 47 |
+
reshape: { shape: [ ] }
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
name: "request_output_len"
|
| 51 |
+
data_type: TYPE_INT32
|
| 52 |
+
dims: [ 1 ]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
name: "draft_input_ids"
|
| 56 |
+
data_type: TYPE_INT32
|
| 57 |
+
dims: [ -1 ]
|
| 58 |
+
optional: true
|
| 59 |
+
allow_ragged_batch: true
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
name: "decoder_input_ids"
|
| 63 |
+
data_type: TYPE_INT32
|
| 64 |
+
dims: [ -1 ]
|
| 65 |
+
optional: true
|
| 66 |
+
allow_ragged_batch: true
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
name: "decoder_input_lengths"
|
| 70 |
+
data_type: TYPE_INT32
|
| 71 |
+
dims: [ 1 ]
|
| 72 |
+
optional: true
|
| 73 |
+
reshape: { shape: [ ] }
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
name: "draft_logits"
|
| 77 |
+
data_type: TYPE_FP32
|
| 78 |
+
dims: [ -1, -1 ]
|
| 79 |
+
optional: true
|
| 80 |
+
allow_ragged_batch: true
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
name: "draft_acceptance_threshold"
|
| 84 |
+
data_type: TYPE_FP32
|
| 85 |
+
dims: [ 1 ]
|
| 86 |
+
reshape: { shape: [ ] }
|
| 87 |
+
optional: true
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
name: "end_id"
|
| 91 |
+
data_type: TYPE_INT32
|
| 92 |
+
dims: [ 1 ]
|
| 93 |
+
reshape: { shape: [ ] }
|
| 94 |
+
optional: true
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
name: "pad_id"
|
| 98 |
+
data_type: TYPE_INT32
|
| 99 |
+
dims: [ 1 ]
|
| 100 |
+
reshape: { shape: [ ] }
|
| 101 |
+
optional: true
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
name: "stop_words_list"
|
| 105 |
+
data_type: TYPE_INT32
|
| 106 |
+
dims: [ 2, -1 ]
|
| 107 |
+
optional: true
|
| 108 |
+
allow_ragged_batch: true
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
name: "bad_words_list"
|
| 112 |
+
data_type: TYPE_INT32
|
| 113 |
+
dims: [ 2, -1 ]
|
| 114 |
+
optional: true
|
| 115 |
+
allow_ragged_batch: true
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
name: "embedding_bias"
|
| 119 |
+
data_type: TYPE_FP32
|
| 120 |
+
dims: [ -1 ]
|
| 121 |
+
optional: true
|
| 122 |
+
allow_ragged_batch: true
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
name: "beam_width"
|
| 126 |
+
data_type: TYPE_INT32
|
| 127 |
+
dims: [ 1 ]
|
| 128 |
+
reshape: { shape: [ ] }
|
| 129 |
+
optional: true
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
name: "temperature"
|
| 133 |
+
data_type: TYPE_FP32
|
| 134 |
+
dims: [ 1 ]
|
| 135 |
+
reshape: { shape: [ ] }
|
| 136 |
+
optional: true
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
name: "runtime_top_k"
|
| 140 |
+
data_type: TYPE_INT32
|
| 141 |
+
dims: [ 1 ]
|
| 142 |
+
reshape: { shape: [ ] }
|
| 143 |
+
optional: true
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
name: "runtime_top_p"
|
| 147 |
+
data_type: TYPE_FP32
|
| 148 |
+
dims: [ 1 ]
|
| 149 |
+
reshape: { shape: [ ] }
|
| 150 |
+
optional: true
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
name: "runtime_top_p_min"
|
| 154 |
+
data_type: TYPE_FP32
|
| 155 |
+
dims: [ 1 ]
|
| 156 |
+
reshape: { shape: [ ] }
|
| 157 |
+
optional: true
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
name: "runtime_top_p_decay"
|
| 161 |
+
data_type: TYPE_FP32
|
| 162 |
+
dims: [ 1 ]
|
| 163 |
+
reshape: { shape: [ ] }
|
| 164 |
+
optional: true
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
name: "runtime_top_p_reset_ids"
|
| 168 |
+
data_type: TYPE_INT32
|
| 169 |
+
dims: [ 1 ]
|
| 170 |
+
reshape: { shape: [ ] }
|
| 171 |
+
optional: true
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
name: "len_penalty"
|
| 175 |
+
data_type: TYPE_FP32
|
| 176 |
+
dims: [ 1 ]
|
| 177 |
+
reshape: { shape: [ ] }
|
| 178 |
+
optional: true
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
name: "early_stopping"
|
| 182 |
+
data_type: TYPE_BOOL
|
| 183 |
+
dims: [ 1 ]
|
| 184 |
+
reshape: { shape: [ ] }
|
| 185 |
+
optional: true
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
name: "repetition_penalty"
|
| 189 |
+
data_type: TYPE_FP32
|
| 190 |
+
dims: [ 1 ]
|
| 191 |
+
reshape: { shape: [ ] }
|
| 192 |
+
optional: true
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
name: "min_length"
|
| 196 |
+
data_type: TYPE_INT32
|
| 197 |
+
dims: [ 1 ]
|
| 198 |
+
reshape: { shape: [ ] }
|
| 199 |
+
optional: true
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
name: "beam_search_diversity_rate"
|
| 203 |
+
data_type: TYPE_FP32
|
| 204 |
+
dims: [ 1 ]
|
| 205 |
+
reshape: { shape: [ ] }
|
| 206 |
+
optional: true
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
name: "presence_penalty"
|
| 210 |
+
data_type: TYPE_FP32
|
| 211 |
+
dims: [ 1 ]
|
| 212 |
+
reshape: { shape: [ ] }
|
| 213 |
+
optional: true
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
name: "frequency_penalty"
|
| 217 |
+
data_type: TYPE_FP32
|
| 218 |
+
dims: [ 1 ]
|
| 219 |
+
reshape: { shape: [ ] }
|
| 220 |
+
optional: true
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
name: "random_seed"
|
| 224 |
+
data_type: TYPE_UINT64
|
| 225 |
+
dims: [ 1 ]
|
| 226 |
+
reshape: { shape: [ ] }
|
| 227 |
+
optional: true
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
name: "return_log_probs"
|
| 231 |
+
data_type: TYPE_BOOL
|
| 232 |
+
dims: [ 1 ]
|
| 233 |
+
reshape: { shape: [ ] }
|
| 234 |
+
optional: true
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
name: "return_context_logits"
|
| 238 |
+
data_type: TYPE_BOOL
|
| 239 |
+
dims: [ 1 ]
|
| 240 |
+
reshape: { shape: [ ] }
|
| 241 |
+
optional: true
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
name: "return_generation_logits"
|
| 245 |
+
data_type: TYPE_BOOL
|
| 246 |
+
dims: [ 1 ]
|
| 247 |
+
reshape: { shape: [ ] }
|
| 248 |
+
optional: true
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
name: "stop"
|
| 252 |
+
data_type: TYPE_BOOL
|
| 253 |
+
dims: [ 1 ]
|
| 254 |
+
optional: true
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
name: "streaming"
|
| 258 |
+
data_type: TYPE_BOOL
|
| 259 |
+
dims: [ 1 ]
|
| 260 |
+
optional: true
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
name: "prompt_embedding_table"
|
| 264 |
+
data_type: TYPE_FP16
|
| 265 |
+
dims: [ -1, -1 ]
|
| 266 |
+
optional: true
|
| 267 |
+
allow_ragged_batch: true
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
name: "prompt_vocab_size"
|
| 271 |
+
data_type: TYPE_INT32
|
| 272 |
+
dims: [ 1 ]
|
| 273 |
+
reshape: { shape: [ ] }
|
| 274 |
+
optional: true
|
| 275 |
+
},
|
| 276 |
+
# the unique task ID for the given LoRA.
|
| 277 |
+
# To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
|
| 278 |
+
# The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
|
| 279 |
+
# If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
|
| 280 |
+
{
|
| 281 |
+
name: "lora_task_id"
|
| 282 |
+
data_type: TYPE_UINT64
|
| 283 |
+
dims: [ 1 ]
|
| 284 |
+
reshape: { shape: [ ] }
|
| 285 |
+
optional: true
|
| 286 |
+
},
|
| 287 |
+
# weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
|
| 288 |
+
# where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
|
| 289 |
+
# each of the in / out tensors are first flattened and then concatenated together in the format above.
|
| 290 |
+
# D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
|
| 291 |
+
{
|
| 292 |
+
name: "lora_weights"
|
| 293 |
+
data_type: TYPE_FP16
|
| 294 |
+
dims: [ -1, -1 ]
|
| 295 |
+
optional: true
|
| 296 |
+
allow_ragged_batch: true
|
| 297 |
+
},
|
| 298 |
+
# module identifier (same size a first dimension of lora_weights)
|
| 299 |
+
# See LoraModule::ModuleType for model id mapping
|
| 300 |
+
#
|
| 301 |
+
# "attn_qkv": 0 # compbined qkv adapter
|
| 302 |
+
# "attn_q": 1 # q adapter
|
| 303 |
+
# "attn_k": 2 # k adapter
|
| 304 |
+
# "attn_v": 3 # v adapter
|
| 305 |
+
# "attn_dense": 4 # adapter for the dense layer in attention
|
| 306 |
+
# "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
|
| 307 |
+
# "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
|
| 308 |
+
# "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
|
| 309 |
+
#
|
| 310 |
+
# last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
|
| 311 |
+
{
|
| 312 |
+
name: "lora_config"
|
| 313 |
+
data_type: TYPE_INT32
|
| 314 |
+
dims: [ -1, 3 ]
|
| 315 |
+
optional: true
|
| 316 |
+
allow_ragged_batch: true
|
| 317 |
+
}
|
| 318 |
+
]
|
| 319 |
+
output [
|
| 320 |
+
{
|
| 321 |
+
name: "output_ids"
|
| 322 |
+
data_type: TYPE_INT32
|
| 323 |
+
dims: [ -1, -1 ]
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
name: "sequence_length"
|
| 327 |
+
data_type: TYPE_INT32
|
| 328 |
+
dims: [ -1 ]
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
name: "cum_log_probs"
|
| 332 |
+
data_type: TYPE_FP32
|
| 333 |
+
dims: [ -1 ]
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
name: "output_log_probs"
|
| 337 |
+
data_type: TYPE_FP32
|
| 338 |
+
dims: [ -1, -1 ]
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
name: "context_logits"
|
| 342 |
+
data_type: TYPE_FP32
|
| 343 |
+
dims: [ -1, -1 ]
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
name: "generation_logits"
|
| 347 |
+
data_type: TYPE_FP32
|
| 348 |
+
dims: [ -1, -1, -1 ]
|
| 349 |
+
}
|
| 350 |
+
]
|
| 351 |
+
instance_group [
|
| 352 |
+
{
|
| 353 |
+
count: 1
|
| 354 |
+
kind : KIND_CPU
|
| 355 |
+
}
|
| 356 |
+
]
|
| 357 |
+
parameters: {
|
| 358 |
+
key: "max_beam_width"
|
| 359 |
+
value: {
|
| 360 |
+
string_value: "1"
|
| 361 |
+
}
|
| 362 |
+
}
|
| 363 |
+
parameters: {
|
| 364 |
+
key: "FORCE_CPU_ONLY_INPUT_TENSORS"
|
| 365 |
+
value: {
|
| 366 |
+
string_value: "no"
|
| 367 |
+
}
|
| 368 |
+
}
|
| 369 |
+
parameters: {
|
| 370 |
+
key: "gpt_model_type"
|
| 371 |
+
value: {
|
| 372 |
+
string_value: "inflight_batching"
|
| 373 |
+
}
|
| 374 |
+
}
|
| 375 |
+
parameters: {
|
| 376 |
+
key: "gpt_model_path"
|
| 377 |
+
value: {
|
| 378 |
+
string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
|
| 379 |
+
}
|
| 380 |
+
}
|
| 381 |
+
parameters: {
|
| 382 |
+
key: "encoder_model_path"
|
| 383 |
+
value: {
|
| 384 |
+
string_value: "${encoder_engine_dir}"
|
| 385 |
+
}
|
| 386 |
+
}
|
| 387 |
+
parameters: {
|
| 388 |
+
key: "max_tokens_in_paged_kv_cache"
|
| 389 |
+
value: {
|
| 390 |
+
string_value: "${max_tokens_in_paged_kv_cache}"
|
| 391 |
+
}
|
| 392 |
+
}
|
| 393 |
+
parameters: {
|
| 394 |
+
key: "max_attention_window_size"
|
| 395 |
+
value: {
|
| 396 |
+
string_value: "${max_attention_window_size}"
|
| 397 |
+
}
|
| 398 |
+
}
|
| 399 |
+
parameters: {
|
| 400 |
+
key: "sink_token_length"
|
| 401 |
+
value: {
|
| 402 |
+
string_value: "${sink_token_length}"
|
| 403 |
+
}
|
| 404 |
+
}
|
| 405 |
+
parameters: {
|
| 406 |
+
key: "batch_scheduler_policy"
|
| 407 |
+
value: {
|
| 408 |
+
string_value: "guaranteed_no_evict"
|
| 409 |
+
}
|
| 410 |
+
}
|
| 411 |
+
parameters: {
|
| 412 |
+
key: "kv_cache_free_gpu_mem_fraction"
|
| 413 |
+
value: {
|
| 414 |
+
string_value: "0.8"
|
| 415 |
+
}
|
| 416 |
+
}
|
| 417 |
+
parameters: {
|
| 418 |
+
key: "kv_cache_host_memory_bytes"
|
| 419 |
+
value: {
|
| 420 |
+
string_value: "${kv_cache_host_memory_bytes}"
|
| 421 |
+
}
|
| 422 |
+
}
|
| 423 |
+
parameters: {
|
| 424 |
+
key: "kv_cache_onboard_blocks"
|
| 425 |
+
value: {
|
| 426 |
+
string_value: "${kv_cache_onboard_blocks}"
|
| 427 |
+
}
|
| 428 |
+
}
|
| 429 |
+
# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
|
| 430 |
+
# parameters: {
|
| 431 |
+
# key: "enable_trt_overlap"
|
| 432 |
+
# value: {
|
| 433 |
+
# string_value: "${enable_trt_overlap}"
|
| 434 |
+
# }
|
| 435 |
+
# }
|
| 436 |
+
parameters: {
|
| 437 |
+
key: "exclude_input_in_output"
|
| 438 |
+
value: {
|
| 439 |
+
string_value: "true"
|
| 440 |
+
}
|
| 441 |
+
}
|
| 442 |
+
parameters: {
|
| 443 |
+
key: "cancellation_check_period_ms"
|
| 444 |
+
value: {
|
| 445 |
+
string_value: "${cancellation_check_period_ms}"
|
| 446 |
+
}
|
| 447 |
+
}
|
| 448 |
+
parameters: {
|
| 449 |
+
key: "stats_check_period_ms"
|
| 450 |
+
value: {
|
| 451 |
+
string_value: "${stats_check_period_ms}"
|
| 452 |
+
}
|
| 453 |
+
}
|
| 454 |
+
parameters: {
|
| 455 |
+
key: "iter_stats_max_iterations"
|
| 456 |
+
value: {
|
| 457 |
+
string_value: "${iter_stats_max_iterations}"
|
| 458 |
+
}
|
| 459 |
+
}
|
| 460 |
+
parameters: {
|
| 461 |
+
key: "request_stats_max_iterations"
|
| 462 |
+
value: {
|
| 463 |
+
string_value: "${request_stats_max_iterations}"
|
| 464 |
+
}
|
| 465 |
+
}
|
| 466 |
+
parameters: {
|
| 467 |
+
key: "enable_kv_cache_reuse"
|
| 468 |
+
value: {
|
| 469 |
+
string_value: "${enable_kv_cache_reuse}"
|
| 470 |
+
}
|
| 471 |
+
}
|
| 472 |
+
parameters: {
|
| 473 |
+
key: "normalize_log_probs"
|
| 474 |
+
value: {
|
| 475 |
+
string_value: "${normalize_log_probs}"
|
| 476 |
+
}
|
| 477 |
+
}
|
| 478 |
+
parameters: {
|
| 479 |
+
key: "enable_chunked_context"
|
| 480 |
+
value: {
|
| 481 |
+
string_value: "${enable_chunked_context}"
|
| 482 |
+
}
|
| 483 |
+
}
|
| 484 |
+
parameters: {
|
| 485 |
+
key: "gpu_device_ids"
|
| 486 |
+
value: {
|
| 487 |
+
string_value: "${gpu_device_ids}"
|
| 488 |
+
}
|
| 489 |
+
}
|
| 490 |
+
parameters: {
|
| 491 |
+
key: "lora_cache_optimal_adapter_size"
|
| 492 |
+
value: {
|
| 493 |
+
string_value: "${lora_cache_optimal_adapter_size}"
|
| 494 |
+
}
|
| 495 |
+
}
|
| 496 |
+
parameters: {
|
| 497 |
+
key: "lora_cache_max_adapter_size"
|
| 498 |
+
value: {
|
| 499 |
+
string_value: "${lora_cache_max_adapter_size}"
|
| 500 |
+
}
|
| 501 |
+
}
|
| 502 |
+
parameters: {
|
| 503 |
+
key: "lora_cache_gpu_memory_fraction"
|
| 504 |
+
value: {
|
| 505 |
+
string_value: "${lora_cache_gpu_memory_fraction}"
|
| 506 |
+
}
|
| 507 |
+
}
|
| 508 |
+
parameters: {
|
| 509 |
+
key: "lora_cache_host_memory_bytes"
|
| 510 |
+
value: {
|
| 511 |
+
string_value: "${lora_cache_host_memory_bytes}"
|
| 512 |
+
}
|
| 513 |
+
}
|
| 514 |
+
parameters: {
|
| 515 |
+
key: "decoding_mode"
|
| 516 |
+
value: {
|
| 517 |
+
string_value: "${decoding_mode}"
|
| 518 |
+
}
|
| 519 |
+
}
|
| 520 |
+
parameters: {
|
| 521 |
+
key: "executor_worker_path"
|
| 522 |
+
value: {
|
| 523 |
+
string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
|
| 524 |
+
}
|
| 525 |
+
}
|
| 526 |
+
parameters: {
|
| 527 |
+
key: "medusa_choices"
|
| 528 |
+
value: {
|
| 529 |
+
string_value: "${medusa_choices}"
|
| 530 |
+
}
|
| 531 |
+
}
|
| 532 |
+
parameters: {
|
| 533 |
+
key: "gpu_weights_percent"
|
| 534 |
+
value: {
|
| 535 |
+
string_value: "${gpu_weights_percent}"
|
| 536 |
+
}
|
| 537 |
+
}
|
tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc
ADDED
|
Binary file (2.72 kB). View file
|
|
|
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc
ADDED
|
Binary file (9.05 kB). View file
|
|
|
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc
ADDED
|
Binary file (9.73 kB). View file
|
|
|
tensorrt_llm_bls/1/lib/decode.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
from collections.abc import Generator
|
| 28 |
+
from dataclasses import dataclass
|
| 29 |
+
from typing import Optional
|
| 30 |
+
|
| 31 |
+
import numpy as np
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class RequestValidationError(Exception):
|
| 35 |
+
pass
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _validate_that(condition: bool, msg: str):
|
| 39 |
+
if not condition:
|
| 40 |
+
raise RequestValidationError(msg)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _validate_non_empty(data, msg: str):
|
| 44 |
+
_validate_that(data is not None and data.size > 0, msg)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _validate_single_gt_0(data, msg: str):
|
| 48 |
+
_validate_non_empty(data, msg)
|
| 49 |
+
_validate_that(data.flatten()[0] > 0, msg)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _single_value(data: Optional[np.ndarray]):
|
| 53 |
+
if data is None:
|
| 54 |
+
return None
|
| 55 |
+
return data.flatten()[0]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass
|
| 59 |
+
class Request:
|
| 60 |
+
text_input: np.ndarray = np.array([])
|
| 61 |
+
decoder_text_input: np.ndarray = None
|
| 62 |
+
max_tokens: np.ndarray = np.array([])
|
| 63 |
+
bad_words: Optional[np.ndarray] = None
|
| 64 |
+
stop_words: Optional[np.ndarray] = None
|
| 65 |
+
end_id: Optional[np.ndarray] = None
|
| 66 |
+
pad_id: Optional[np.ndarray] = None
|
| 67 |
+
top_k: Optional[np.ndarray] = None
|
| 68 |
+
top_p: Optional[np.ndarray] = None
|
| 69 |
+
temperature: Optional[np.ndarray] = None
|
| 70 |
+
length_penalty: Optional[np.ndarray] = None
|
| 71 |
+
repetition_penalty: Optional[np.ndarray] = None
|
| 72 |
+
min_length: Optional[np.ndarray] = None
|
| 73 |
+
return_log_probs: Optional[np.ndarray] = None
|
| 74 |
+
prompt_embedding_table: Optional[np.ndarray] = None
|
| 75 |
+
prompt_vocab_size: Optional[np.ndarray] = None
|
| 76 |
+
embedding_bias_words: Optional[np.ndarray] = None
|
| 77 |
+
embedding_bias_weights: Optional[np.ndarray] = None
|
| 78 |
+
num_draft_tokens: Optional[np.ndarray] = None
|
| 79 |
+
use_draft_logits: Optional[np.ndarray] = None
|
| 80 |
+
stream: Optional[np.ndarray] = None
|
| 81 |
+
beam_width: Optional[np.ndarray] = None
|
| 82 |
+
return_context_logits: Optional[np.ndarray] = None
|
| 83 |
+
return_generation_logits: Optional[np.ndarray] = None
|
| 84 |
+
random_seed: Optional[np.ndarray] = None
|
| 85 |
+
presence_penalty: Optional[np.ndarray] = None
|
| 86 |
+
frequency_penalty: Optional[np.ndarray] = None
|
| 87 |
+
|
| 88 |
+
def validate(self):
|
| 89 |
+
_validate_non_empty(self.text_input, "text_input is required")
|
| 90 |
+
_validate_single_gt_0(self.max_tokens,
|
| 91 |
+
"max_tokens must be a single value > 0")
|
| 92 |
+
|
| 93 |
+
num_draft_tokens = _single_value(self.num_draft_tokens)
|
| 94 |
+
stream = _single_value(self.stream)
|
| 95 |
+
_single_value(self.return_generation_logits)
|
| 96 |
+
context_logits = _single_value(self.return_context_logits)
|
| 97 |
+
|
| 98 |
+
if num_draft_tokens:
|
| 99 |
+
_validate_that(
|
| 100 |
+
not stream,
|
| 101 |
+
"streaming is not supported with speculative decoding")
|
| 102 |
+
_validate_that(
|
| 103 |
+
not context_logits,
|
| 104 |
+
"context logits are not supported with speculative decoding")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
@dataclass
|
| 108 |
+
class DraftRequest:
|
| 109 |
+
draft_input_ids: Optional[np.ndarray] = None
|
| 110 |
+
draft_logits: Optional[np.ndarray] = None
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@dataclass
|
| 114 |
+
class PreprocResponse:
|
| 115 |
+
input_ids: np.ndarray = np.array([])
|
| 116 |
+
decoder_input_ids: np.ndarray = None
|
| 117 |
+
input_lengths: np.ndarray = np.array([])
|
| 118 |
+
decoder_input_lengths: np.ndarray = None
|
| 119 |
+
bad_words_list: Optional[np.ndarray] = None
|
| 120 |
+
stop_words_list: Optional[np.ndarray] = None
|
| 121 |
+
embedding_bias: Optional[np.ndarray] = None
|
| 122 |
+
end_id: Optional[np.ndarray] = None
|
| 123 |
+
pad_id: Optional[np.ndarray] = None
|
| 124 |
+
|
| 125 |
+
@classmethod
|
| 126 |
+
def with_new_inputs(cls,
|
| 127 |
+
other,
|
| 128 |
+
input_ids: Optional[np.ndarray] = None,
|
| 129 |
+
input_lengths: Optional[np.ndarray] = None):
|
| 130 |
+
return cls(
|
| 131 |
+
input_ids=(input_ids
|
| 132 |
+
if input_ids is not None else other.input_ids),
|
| 133 |
+
input_lengths=(input_lengths if input_lengths is not None else
|
| 134 |
+
other.input_lengths),
|
| 135 |
+
decoder_input_ids=other.decoder_input_ids,
|
| 136 |
+
decoder_input_lengths=other.decoder_input_lengths,
|
| 137 |
+
bad_words_list=other.bad_words_list,
|
| 138 |
+
stop_words_list=other.stop_words_list,
|
| 139 |
+
end_id=other.end_id,
|
| 140 |
+
pad_id=other.pad_id,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@dataclass
|
| 145 |
+
class GenerationResponse:
|
| 146 |
+
output_ids: np.ndarray = np.array([])
|
| 147 |
+
sequence_length: np.ndarray = np.array([])
|
| 148 |
+
cum_log_probs: Optional[np.ndarray] = None
|
| 149 |
+
output_log_probs: Optional[np.ndarray] = None
|
| 150 |
+
context_logits: Optional[np.ndarray] = None
|
| 151 |
+
generation_logits: Optional[np.ndarray] = None
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
@dataclass
|
| 155 |
+
class Response:
|
| 156 |
+
text_output: np.ndarray = np.array([])
|
| 157 |
+
cum_log_probs: Optional[np.ndarray] = None
|
| 158 |
+
output_log_probs: Optional[np.ndarray] = None
|
| 159 |
+
context_logits: Optional[np.ndarray] = None
|
| 160 |
+
generation_logits: Optional[np.ndarray] = None
|
| 161 |
+
|
| 162 |
+
def __eq__(self, o) -> bool:
|
| 163 |
+
"""Just for testing"""
|
| 164 |
+
if not isinstance(o, Response):
|
| 165 |
+
return False
|
| 166 |
+
return (np.array_equal(self.text_output, o.text_output)
|
| 167 |
+
and np.array_equal(self.cum_log_probs, o.cum_log_probs)
|
| 168 |
+
and np.array_equal(self.output_log_probs, o.output_log_probs)
|
| 169 |
+
and np.array_equal(self.context_logits, o.context_logits) and
|
| 170 |
+
np.array_equal(self.generation_logits, o.generation_logits))
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class Decoder:
|
| 174 |
+
|
| 175 |
+
def __init__(self, streaming=False, accumulate=False):
|
| 176 |
+
self._streaming = streaming
|
| 177 |
+
self._accumulate = accumulate
|
| 178 |
+
|
| 179 |
+
self._accumulated_tokens = None
|
| 180 |
+
|
| 181 |
+
def decode(self,
|
| 182 |
+
request: Request,
|
| 183 |
+
speculative_decoding=False) -> Generator[Response, None, None]:
|
| 184 |
+
preproc_response = self.preprocess(request)
|
| 185 |
+
|
| 186 |
+
if speculative_decoding:
|
| 187 |
+
for gen_response in self._spec_generate(preproc_response, request):
|
| 188 |
+
yield self.postprocess(gen_response)
|
| 189 |
+
else:
|
| 190 |
+
if not self._streaming:
|
| 191 |
+
gen_response = self._generate_non_streaming(
|
| 192 |
+
preproc_response, request)
|
| 193 |
+
yield self.postprocess(gen_response)
|
| 194 |
+
else:
|
| 195 |
+
for gen_response in self._generate(preproc_response, request):
|
| 196 |
+
yield self.postprocess(gen_response)
|
| 197 |
+
|
| 198 |
+
def encountered_stop_words(self, input_ids, stop_words_ids):
|
| 199 |
+
for stop_word_ids in stop_words_ids:
|
| 200 |
+
if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
|
| 201 |
+
return True
|
| 202 |
+
return False
|
| 203 |
+
|
| 204 |
+
def _spec_generate(
|
| 205 |
+
self, preproc: PreprocResponse,
|
| 206 |
+
request: Request) -> Generator[GenerationResponse, None, None]:
|
| 207 |
+
|
| 208 |
+
prompt_input_ids: np.ndarray = preproc.input_ids[0]
|
| 209 |
+
input_ids: np.ndarray = prompt_input_ids
|
| 210 |
+
output_len: int = request.max_tokens[0][0]
|
| 211 |
+
last_input_ids: np.ndarray = None
|
| 212 |
+
draft_output_ids: np.ndarray = None
|
| 213 |
+
draft_logits: np.ndarray = None
|
| 214 |
+
|
| 215 |
+
target_response: GenerationResponse = None
|
| 216 |
+
|
| 217 |
+
cur_preproc = preproc
|
| 218 |
+
|
| 219 |
+
counter = 0
|
| 220 |
+
while True:
|
| 221 |
+
counter += 1
|
| 222 |
+
num_draft_tokens = min(
|
| 223 |
+
request.num_draft_tokens[0][0],
|
| 224 |
+
len(prompt_input_ids) + output_len - len(input_ids) - 1)
|
| 225 |
+
|
| 226 |
+
draft_request = None
|
| 227 |
+
if num_draft_tokens > 0:
|
| 228 |
+
draft_response: GenerationResponse = self._draft_generate_non_streaming(
|
| 229 |
+
cur_preproc, request, num_draft_tokens)
|
| 230 |
+
seq_len: int = draft_response.sequence_length[0][0]
|
| 231 |
+
# [1, beamWidth, outputLength] -> [outputLen]
|
| 232 |
+
draft_output_ids = draft_response.output_ids[0][0]
|
| 233 |
+
# [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
|
| 234 |
+
if request.use_draft_logits is not None and request.use_draft_logits[
|
| 235 |
+
0]:
|
| 236 |
+
if draft_response.generation_logits is not None:
|
| 237 |
+
draft_logits = draft_response.generation_logits[0][0]
|
| 238 |
+
|
| 239 |
+
input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
|
| 240 |
+
draft_request = DraftRequest(
|
| 241 |
+
draft_input_ids=np.expand_dims(input_draft_tokens, 0))
|
| 242 |
+
if request.use_draft_logits is not None and request.use_draft_logits[
|
| 243 |
+
0]:
|
| 244 |
+
draft_request.draft_logits = np.expand_dims(
|
| 245 |
+
draft_logits[-len(input_draft_tokens):], 0)
|
| 246 |
+
else:
|
| 247 |
+
draft_request = DraftRequest()
|
| 248 |
+
target_response = self._generate_non_streaming(
|
| 249 |
+
cur_preproc, request, draft_request)
|
| 250 |
+
last_input_ids = input_ids
|
| 251 |
+
input_ids = target_response.output_ids[0][0]
|
| 252 |
+
cur_preproc = PreprocResponse.with_new_inputs(
|
| 253 |
+
cur_preproc, np.expand_dims(input_ids, 0),
|
| 254 |
+
np.array([[len(input_ids)]], dtype=np.int32))
|
| 255 |
+
|
| 256 |
+
# Evaluate criteria to stop generation loop.
|
| 257 |
+
# If we've hit or exceeded the max output length, should stop
|
| 258 |
+
length_stop = (len(input_ids) >=
|
| 259 |
+
len(prompt_input_ids) + output_len)
|
| 260 |
+
if length_stop:
|
| 261 |
+
break
|
| 262 |
+
# If draft and target have same outputs, should stop. Normally target should return 1 more token.
|
| 263 |
+
# If they are the same length, they should differ at the last token
|
| 264 |
+
target_draft_equal = draft_output_ids is not None and np.array_equal(
|
| 265 |
+
draft_output_ids, input_ids)
|
| 266 |
+
if target_draft_equal:
|
| 267 |
+
break
|
| 268 |
+
# If tokens no longer change, should stop, means we have hit early stopping
|
| 269 |
+
last_current_equal = np.array_equal(last_input_ids, input_ids)
|
| 270 |
+
if last_current_equal:
|
| 271 |
+
break
|
| 272 |
+
# Need to check if stop words was encountered
|
| 273 |
+
hit_stop_words = self.encountered_stop_words(
|
| 274 |
+
input_ids, preproc.stop_words_list[0])
|
| 275 |
+
if hit_stop_words:
|
| 276 |
+
break
|
| 277 |
+
|
| 278 |
+
yield target_response
|
| 279 |
+
|
| 280 |
+
def _draft_generate_non_streaming(
|
| 281 |
+
self, preproc: PreprocResponse, request: Request,
|
| 282 |
+
num_draft_tokens: int) -> GenerationResponse:
|
| 283 |
+
raise NotImplementedError()
|
| 284 |
+
|
| 285 |
+
def _generate(
|
| 286 |
+
self,
|
| 287 |
+
preproc: PreprocResponse,
|
| 288 |
+
request: Request,
|
| 289 |
+
draft_request: Optional[DraftRequest] = None
|
| 290 |
+
) -> Generator[GenerationResponse, None, None]:
|
| 291 |
+
raise NotImplementedError()
|
| 292 |
+
|
| 293 |
+
def _generate_non_streaming(
|
| 294 |
+
self,
|
| 295 |
+
preproc: PreprocResponse,
|
| 296 |
+
request: Request,
|
| 297 |
+
draft_request: Optional[DraftRequest] = None
|
| 298 |
+
) -> GenerationResponse:
|
| 299 |
+
raise NotImplementedError()
|
| 300 |
+
|
| 301 |
+
def postprocess(self, gen_response: GenerationResponse) -> Response:
|
| 302 |
+
if self._accumulate and self._streaming:
|
| 303 |
+
new_tokens: np.ndarray = gen_response.output_ids
|
| 304 |
+
if new_tokens.ndim != 3:
|
| 305 |
+
raise Exception("Expected output_ids tensor to have 3 dims.")
|
| 306 |
+
if new_tokens.shape[0] != 1:
|
| 307 |
+
raise Exception("Expected batch size of 1")
|
| 308 |
+
if new_tokens.shape[1] != 1:
|
| 309 |
+
raise Exception(
|
| 310 |
+
"Accumulation of tokens is only implemented for beam width = 1"
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
self._accumulated_tokens = new_tokens if (
|
| 314 |
+
self._accumulated_tokens is None) else np.concatenate(
|
| 315 |
+
(self._accumulated_tokens, new_tokens), axis=2)
|
| 316 |
+
sequence_lengths = np.array([[self._accumulated_tokens.shape[2]]],
|
| 317 |
+
dtype=np.int32)
|
| 318 |
+
return self._postprocess(self._accumulated_tokens,
|
| 319 |
+
sequence_lengths, gen_response)
|
| 320 |
+
else:
|
| 321 |
+
return self._postprocess(gen_response.output_ids, None,
|
| 322 |
+
gen_response)
|
| 323 |
+
|
| 324 |
+
def _postprocess(self, tokens: np.ndarray,
|
| 325 |
+
sequence_lengths: Optional[np.ndarray],
|
| 326 |
+
gen_response: GenerationResponse) -> Response:
|
| 327 |
+
raise NotImplementedError()
|
| 328 |
+
|
| 329 |
+
def preprocess(self, request: Request) -> PreprocResponse:
|
| 330 |
+
raise NotImplementedError()
|
| 331 |
+
|
| 332 |
+
def reset_decoder(self):
|
| 333 |
+
self._accumulated_tokens = None
|
tensorrt_llm_bls/1/lib/triton_decoder.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
from collections.abc import Callable
|
| 28 |
+
from typing import Dict, Optional
|
| 29 |
+
|
| 30 |
+
import numpy as np
|
| 31 |
+
import triton_python_backend_utils as pb_utils
|
| 32 |
+
from lib.decode import *
|
| 33 |
+
from typing_extensions import override
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class TritonDecoder(Decoder):
|
| 37 |
+
|
| 38 |
+
def __init__(self,
|
| 39 |
+
streaming=False,
|
| 40 |
+
accumulate=False,
|
| 41 |
+
preproc_model_name="preprocessing",
|
| 42 |
+
postproc_model_name="postprocessing",
|
| 43 |
+
llm_model_name="tensorrt_llm",
|
| 44 |
+
draft_llm_model_name: Optional[str] = None):
|
| 45 |
+
super().__init__(streaming=streaming, accumulate=accumulate)
|
| 46 |
+
self.preproc_model_name = preproc_model_name
|
| 47 |
+
self.postproc_model_name = postproc_model_name
|
| 48 |
+
self.llm_model_name = llm_model_name
|
| 49 |
+
self.draft_llm_model_name = draft_llm_model_name
|
| 50 |
+
|
| 51 |
+
self._preproc_outputs = [
|
| 52 |
+
"INPUT_ID",
|
| 53 |
+
"DECODER_INPUT_ID",
|
| 54 |
+
"REQUEST_INPUT_LEN",
|
| 55 |
+
"REQUEST_DECODER_INPUT_LEN",
|
| 56 |
+
"BAD_WORDS_IDS",
|
| 57 |
+
"STOP_WORDS_IDS",
|
| 58 |
+
"EMBEDDING_BIAS",
|
| 59 |
+
"OUT_PAD_ID",
|
| 60 |
+
"OUT_END_ID",
|
| 61 |
+
]
|
| 62 |
+
|
| 63 |
+
self._llm_outputs = [
|
| 64 |
+
"output_ids",
|
| 65 |
+
"sequence_length",
|
| 66 |
+
"cum_log_probs",
|
| 67 |
+
"output_log_probs",
|
| 68 |
+
"context_logits",
|
| 69 |
+
"generation_logits",
|
| 70 |
+
]
|
| 71 |
+
|
| 72 |
+
self._postproc_outputs = [
|
| 73 |
+
"OUTPUT",
|
| 74 |
+
]
|
| 75 |
+
|
| 76 |
+
self.input_names = [
|
| 77 |
+
"text_input",
|
| 78 |
+
"decoder_text_input",
|
| 79 |
+
"max_tokens",
|
| 80 |
+
"bad_words",
|
| 81 |
+
"stop_words",
|
| 82 |
+
"end_id",
|
| 83 |
+
"pad_id",
|
| 84 |
+
"top_k",
|
| 85 |
+
"top_p",
|
| 86 |
+
"temperature",
|
| 87 |
+
"length_penalty",
|
| 88 |
+
"repetition_penalty",
|
| 89 |
+
"min_length",
|
| 90 |
+
"presence_penalty",
|
| 91 |
+
"frequency_penalty",
|
| 92 |
+
"random_seed",
|
| 93 |
+
"return_log_probs",
|
| 94 |
+
"return_context_logits",
|
| 95 |
+
"return_generation_logits",
|
| 96 |
+
"beam_width",
|
| 97 |
+
"stream",
|
| 98 |
+
"prompt_embedding_table",
|
| 99 |
+
"prompt_vocab_size",
|
| 100 |
+
"embedding_bias_words",
|
| 101 |
+
"embedding_bias_weights",
|
| 102 |
+
"num_draft_tokens",
|
| 103 |
+
"use_draft_logits",
|
| 104 |
+
]
|
| 105 |
+
|
| 106 |
+
self.__undo_reshape_whitelist = {
|
| 107 |
+
"max_tokens",
|
| 108 |
+
"end_id",
|
| 109 |
+
"pad_id",
|
| 110 |
+
"top_k",
|
| 111 |
+
"top_p",
|
| 112 |
+
"temperature",
|
| 113 |
+
"length_penalty",
|
| 114 |
+
"repetition_penalty",
|
| 115 |
+
"min_length",
|
| 116 |
+
"presence_penalty",
|
| 117 |
+
"frequency_penalty",
|
| 118 |
+
"random_seed",
|
| 119 |
+
"return_log_probs",
|
| 120 |
+
"return_context_logits",
|
| 121 |
+
"return_generation_logits",
|
| 122 |
+
"beam_width",
|
| 123 |
+
"stream",
|
| 124 |
+
"prompt_vocab_size",
|
| 125 |
+
"num_draft_tokens",
|
| 126 |
+
"use_draft_logits",
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
def _exec_triton_request(self, request):
|
| 130 |
+
responses = request.exec(decoupled=True)
|
| 131 |
+
for r in responses:
|
| 132 |
+
if r.has_error():
|
| 133 |
+
raise pb_utils.TritonModelException(r.error().message())
|
| 134 |
+
yield r
|
| 135 |
+
|
| 136 |
+
def _exec_triton_request_single(self, request):
|
| 137 |
+
responses = request.exec(decoupled=False)
|
| 138 |
+
if responses.has_error():
|
| 139 |
+
raise pb_utils.TritonModelException(responses.error().message())
|
| 140 |
+
return responses
|
| 141 |
+
|
| 142 |
+
def create_triton_response(self, response: Response):
|
| 143 |
+
name_map = {
|
| 144 |
+
"text_output": "text_output",
|
| 145 |
+
"cum_log_probs": "cum_log_probs",
|
| 146 |
+
"output_log_probs": "output_log_probs",
|
| 147 |
+
"context_logits": "context_logits",
|
| 148 |
+
"generation_logits": "generation_logits"
|
| 149 |
+
}
|
| 150 |
+
tensors = self.create_triton_tensors(response, name_map)
|
| 151 |
+
return pb_utils.InferenceResponse(output_tensors=tensors)
|
| 152 |
+
|
| 153 |
+
def convert_triton_request(self, triton_request) -> Request:
|
| 154 |
+
request = Request()
|
| 155 |
+
for triton_name in self.input_names:
|
| 156 |
+
tensor = pb_utils.get_input_tensor_by_name(triton_request,
|
| 157 |
+
triton_name)
|
| 158 |
+
target_name = triton_name
|
| 159 |
+
if tensor is None:
|
| 160 |
+
continue
|
| 161 |
+
if not hasattr(request, target_name):
|
| 162 |
+
raise AttributeError(
|
| 163 |
+
f"Request has no attribute '{target_name}'")
|
| 164 |
+
setattr(request, target_name, tensor.as_numpy())
|
| 165 |
+
return request
|
| 166 |
+
|
| 167 |
+
def convert_triton_response(self,
|
| 168 |
+
triton_response,
|
| 169 |
+
response_factory: Callable,
|
| 170 |
+
name_map=None):
|
| 171 |
+
response = response_factory()
|
| 172 |
+
for tensor in triton_response.output_tensors():
|
| 173 |
+
if tensor is None:
|
| 174 |
+
continue
|
| 175 |
+
triton_name = tensor.name()
|
| 176 |
+
value = tensor.as_numpy()
|
| 177 |
+
target_name = triton_name
|
| 178 |
+
if name_map and triton_name in name_map:
|
| 179 |
+
target_name = name_map[triton_name]
|
| 180 |
+
if name_map and not triton_name in name_map:
|
| 181 |
+
continue
|
| 182 |
+
if target_name is None:
|
| 183 |
+
# explicitly ignore this triton input
|
| 184 |
+
continue
|
| 185 |
+
if not hasattr(response, target_name):
|
| 186 |
+
raise AttributeError(
|
| 187 |
+
f"response object has not attribute '{target_name}'")
|
| 188 |
+
setattr(response, target_name, value)
|
| 189 |
+
return response
|
| 190 |
+
|
| 191 |
+
def __undo_reshape(self, x, name):
|
| 192 |
+
if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
|
| 193 |
+
# handle reshapes
|
| 194 |
+
return np.expand_dims(x, 0)
|
| 195 |
+
else:
|
| 196 |
+
return x
|
| 197 |
+
|
| 198 |
+
def create_triton_tensors(self, obj, name_map: dict):
|
| 199 |
+
tensors = []
|
| 200 |
+
for name, triton_name in name_map.items():
|
| 201 |
+
if triton_name is None:
|
| 202 |
+
continue
|
| 203 |
+
value = getattr(obj, name)
|
| 204 |
+
if value is None:
|
| 205 |
+
continue
|
| 206 |
+
t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name))
|
| 207 |
+
tensors.append(t)
|
| 208 |
+
return tensors
|
| 209 |
+
|
| 210 |
+
@override
|
| 211 |
+
def preprocess(self, request: Request) -> PreprocResponse:
|
| 212 |
+
input_tensors = self._get_preproc_tensors(request)
|
| 213 |
+
triton_req = pb_utils.InferenceRequest(
|
| 214 |
+
model_name=self.preproc_model_name,
|
| 215 |
+
inputs=input_tensors,
|
| 216 |
+
requested_output_names=self._preproc_outputs)
|
| 217 |
+
triton_output = self._exec_triton_request_single(triton_req)
|
| 218 |
+
return self._get_preproc_response(triton_output)
|
| 219 |
+
|
| 220 |
+
def _get_preproc_tensors(self, request: Request):
|
| 221 |
+
name_map = {
|
| 222 |
+
"text_input": "QUERY",
|
| 223 |
+
"decoder_text_input": "DECODER_QUERY",
|
| 224 |
+
"max_tokens": "REQUEST_OUTPUT_LEN",
|
| 225 |
+
"bad_words": "BAD_WORDS_DICT",
|
| 226 |
+
"stop_words": "STOP_WORDS_DICT",
|
| 227 |
+
"embedding_bias_words": "EMBEDDING_BIAS_WORDS",
|
| 228 |
+
"embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
|
| 229 |
+
"pad_id": "PAD_ID",
|
| 230 |
+
"end_id": "END_ID",
|
| 231 |
+
}
|
| 232 |
+
return self.create_triton_tensors(request, name_map)
|
| 233 |
+
|
| 234 |
+
def _get_preproc_response(self, triton_output):
|
| 235 |
+
name_map = {
|
| 236 |
+
"INPUT_ID": "input_ids",
|
| 237 |
+
"DECODER_INPUT_ID": "decoder_input_ids",
|
| 238 |
+
"REQUEST_INPUT_LEN": "input_lengths",
|
| 239 |
+
"REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
|
| 240 |
+
"BAD_WORDS_IDS": "bad_words_list",
|
| 241 |
+
"STOP_WORDS_IDS": "stop_words_list",
|
| 242 |
+
"EMBEDDING_BIAS": "embedding_bias",
|
| 243 |
+
"OUT_PAD_ID": "pad_id",
|
| 244 |
+
"OUT_END_ID": "end_id",
|
| 245 |
+
}
|
| 246 |
+
return self.convert_triton_response(triton_output, PreprocResponse,
|
| 247 |
+
name_map)
|
| 248 |
+
|
| 249 |
+
@override
|
| 250 |
+
def _draft_generate_non_streaming(
|
| 251 |
+
self, preproc: PreprocResponse, request: Request,
|
| 252 |
+
num_draft_tokens: int) -> GenerationResponse:
|
| 253 |
+
input_tensors = self._get_llm_tensors(preproc, request,
|
| 254 |
+
num_draft_tokens, None, True)
|
| 255 |
+
triton_req = pb_utils.InferenceRequest(
|
| 256 |
+
model_name=self.draft_llm_model_name,
|
| 257 |
+
inputs=input_tensors,
|
| 258 |
+
requested_output_names=self._llm_outputs)
|
| 259 |
+
triton_response = self._exec_triton_request_single(triton_req)
|
| 260 |
+
llm_response = self._get_llm_response(triton_response)
|
| 261 |
+
return llm_response
|
| 262 |
+
|
| 263 |
+
@override
|
| 264 |
+
def _generate(
|
| 265 |
+
self,
|
| 266 |
+
preproc: PreprocResponse,
|
| 267 |
+
request: Request,
|
| 268 |
+
draft_request: Optional[DraftRequest] = None
|
| 269 |
+
) -> Generator[GenerationResponse, None, None]:
|
| 270 |
+
input_tensors = self._get_llm_tensors(preproc, request, None,
|
| 271 |
+
draft_request)
|
| 272 |
+
triton_req = pb_utils.InferenceRequest(
|
| 273 |
+
model_name=self.llm_model_name,
|
| 274 |
+
inputs=input_tensors,
|
| 275 |
+
requested_output_names=self._llm_outputs)
|
| 276 |
+
for r in self._exec_triton_request(triton_req):
|
| 277 |
+
yield self._get_llm_response(r)
|
| 278 |
+
|
| 279 |
+
@override
|
| 280 |
+
def _generate_non_streaming(
|
| 281 |
+
self,
|
| 282 |
+
preproc: PreprocResponse,
|
| 283 |
+
request: Request,
|
| 284 |
+
draft_request: Optional[DraftRequest] = None
|
| 285 |
+
) -> GenerationResponse:
|
| 286 |
+
input_tensors = self._get_llm_tensors(preproc, request, None,
|
| 287 |
+
draft_request)
|
| 288 |
+
triton_req = pb_utils.InferenceRequest(
|
| 289 |
+
model_name=self.llm_model_name,
|
| 290 |
+
inputs=input_tensors,
|
| 291 |
+
requested_output_names=self._llm_outputs)
|
| 292 |
+
r = self._exec_triton_request_single(triton_req)
|
| 293 |
+
return self._get_llm_response(r)
|
| 294 |
+
|
| 295 |
+
def _get_llm_tensors(self,
|
| 296 |
+
preproc: PreprocResponse,
|
| 297 |
+
request: Request,
|
| 298 |
+
num_output_tokens: Optional[int] = None,
|
| 299 |
+
draft_request: Optional[DraftRequest] = None,
|
| 300 |
+
is_draft_model_request: bool = False):
|
| 301 |
+
tensors = []
|
| 302 |
+
tensors.extend(self._get_tensors_from_preproc(preproc))
|
| 303 |
+
tensors.extend(
|
| 304 |
+
self._get_llm_tensors_from_request(request, num_output_tokens,
|
| 305 |
+
draft_request,
|
| 306 |
+
is_draft_model_request))
|
| 307 |
+
return tensors
|
| 308 |
+
|
| 309 |
+
def _get_tensors_from_preproc(self, preproc: PreprocResponse):
|
| 310 |
+
name_map = {
|
| 311 |
+
"input_ids": "input_ids",
|
| 312 |
+
"decoder_input_ids": "decoder_input_ids",
|
| 313 |
+
"input_lengths": "input_lengths",
|
| 314 |
+
"bad_words_list": "bad_words_list",
|
| 315 |
+
"stop_words_list": "stop_words_list",
|
| 316 |
+
"embedding_bias": "embedding_bias",
|
| 317 |
+
"pad_id": "pad_id",
|
| 318 |
+
"end_id": "end_id",
|
| 319 |
+
}
|
| 320 |
+
return self.create_triton_tensors(preproc, name_map)
|
| 321 |
+
|
| 322 |
+
def _get_llm_tensors_from_request(
|
| 323 |
+
self,
|
| 324 |
+
request: Request,
|
| 325 |
+
num_output_tokens: Optional[int] = None,
|
| 326 |
+
draft_request: Optional[DraftRequest] = None,
|
| 327 |
+
is_draft_model_request: bool = False):
|
| 328 |
+
name_map: Dict[str, Optional[str]] = {
|
| 329 |
+
"beam_width": "beam_width",
|
| 330 |
+
"top_k": "runtime_top_k",
|
| 331 |
+
"top_p": "runtime_top_p",
|
| 332 |
+
"length_penalty": "len_penalty",
|
| 333 |
+
"repetition_penalty": "repetition_penalty",
|
| 334 |
+
"min_length": "min_length",
|
| 335 |
+
"presence_penalty": "presence_penalty",
|
| 336 |
+
"frequency_penalty": "frequency_penalty",
|
| 337 |
+
"random_seed": "random_seed",
|
| 338 |
+
"return_log_probs": "return_log_probs",
|
| 339 |
+
"stream": "streaming",
|
| 340 |
+
"prompt_embedding_table": "prompt_embedding_table",
|
| 341 |
+
"prompt_vocab_size": "prompt_vocab_size",
|
| 342 |
+
}
|
| 343 |
+
tensors = self.create_triton_tensors(request, name_map)
|
| 344 |
+
|
| 345 |
+
out_len = request.max_tokens[0][0] if request.max_tokens else None
|
| 346 |
+
if num_output_tokens is not None:
|
| 347 |
+
out_len = num_output_tokens
|
| 348 |
+
elif draft_request:
|
| 349 |
+
if draft_request.draft_input_ids is not None:
|
| 350 |
+
out_len = len(draft_request.draft_input_ids[0]) + 1
|
| 351 |
+
else:
|
| 352 |
+
out_len = 1
|
| 353 |
+
|
| 354 |
+
if out_len is None:
|
| 355 |
+
raise Exception("Could not determine request_output_len")
|
| 356 |
+
else:
|
| 357 |
+
tensors.append(
|
| 358 |
+
pb_utils.Tensor("request_output_len",
|
| 359 |
+
np.array([[out_len]], dtype=np.int32)))
|
| 360 |
+
|
| 361 |
+
if draft_request:
|
| 362 |
+
if draft_request.draft_input_ids is not None:
|
| 363 |
+
tensors.append(
|
| 364 |
+
pb_utils.Tensor("draft_input_ids",
|
| 365 |
+
draft_request.draft_input_ids))
|
| 366 |
+
if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
|
| 367 |
+
0]:
|
| 368 |
+
tensors.append(
|
| 369 |
+
pb_utils.Tensor("draft_logits",
|
| 370 |
+
draft_request.draft_logits))
|
| 371 |
+
|
| 372 |
+
return_context_logits = False
|
| 373 |
+
return_generation_logits = False
|
| 374 |
+
if draft_request is None:
|
| 375 |
+
if is_draft_model_request:
|
| 376 |
+
return_generation_logits = request.use_draft_logits[
|
| 377 |
+
0] if request.use_draft_logits is not None else False
|
| 378 |
+
else:
|
| 379 |
+
return_context_logits = request.return_context_logits[
|
| 380 |
+
0] if request.return_context_logits is not None else False
|
| 381 |
+
return_generation_logits = request.return_generation_logits[
|
| 382 |
+
0] if request.return_generation_logits is not None else False
|
| 383 |
+
|
| 384 |
+
tensors.append(
|
| 385 |
+
pb_utils.Tensor("return_context_logits",
|
| 386 |
+
np.array([[return_context_logits]])))
|
| 387 |
+
tensors.append(
|
| 388 |
+
pb_utils.Tensor("return_generation_logits",
|
| 389 |
+
np.array([[return_generation_logits]])))
|
| 390 |
+
return tensors
|
| 391 |
+
|
| 392 |
+
def _get_llm_response(self, triton_output):
|
| 393 |
+
name_map = {
|
| 394 |
+
"output_ids": "output_ids",
|
| 395 |
+
"sequence_length": "sequence_length",
|
| 396 |
+
"cum_log_probs": "cum_log_probs",
|
| 397 |
+
"output_log_probs": "output_log_probs",
|
| 398 |
+
"context_logits": "context_logits",
|
| 399 |
+
"generation_logits": "generation_logits",
|
| 400 |
+
}
|
| 401 |
+
return self.convert_triton_response(triton_output, GenerationResponse,
|
| 402 |
+
name_map)
|
| 403 |
+
|
| 404 |
+
def _postprocess(self, tokens: np.ndarray,
|
| 405 |
+
sequence_lengths: Optional[np.ndarray],
|
| 406 |
+
gen_response: GenerationResponse) -> Response:
|
| 407 |
+
input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
|
| 408 |
+
gen_response)
|
| 409 |
+
triton_req = pb_utils.InferenceRequest(
|
| 410 |
+
model_name=self.postproc_model_name,
|
| 411 |
+
inputs=input_tensors,
|
| 412 |
+
requested_output_names=self._postproc_outputs)
|
| 413 |
+
r = self._exec_triton_request_single(triton_req)
|
| 414 |
+
response = self._get_response(r, gen_response)
|
| 415 |
+
return response
|
| 416 |
+
|
| 417 |
+
def _get_postproc_tensors(self, tokens: np.ndarray,
|
| 418 |
+
sequence_lengths: Optional[np.ndarray],
|
| 419 |
+
gen_response: GenerationResponse):
|
| 420 |
+
tensors = [
|
| 421 |
+
pb_utils.Tensor("TOKENS_BATCH", tokens),
|
| 422 |
+
pb_utils.Tensor(
|
| 423 |
+
"SEQUENCE_LENGTH", sequence_lengths
|
| 424 |
+
if sequence_lengths else gen_response.sequence_length)
|
| 425 |
+
]
|
| 426 |
+
return tensors
|
| 427 |
+
|
| 428 |
+
def _get_response(self, triton_output, gen_res: GenerationResponse):
|
| 429 |
+
tensors = triton_output.output_tensors()
|
| 430 |
+
t_map = {}
|
| 431 |
+
for named_t in tensors:
|
| 432 |
+
name = named_t.name()
|
| 433 |
+
t = named_t.as_numpy()
|
| 434 |
+
t_map[name] = t
|
| 435 |
+
response = Response(text_output=t_map["OUTPUT"],
|
| 436 |
+
cum_log_probs=gen_res.cum_log_probs,
|
| 437 |
+
output_log_probs=gen_res.output_log_probs,
|
| 438 |
+
context_logits=gen_res.context_logits,
|
| 439 |
+
generation_logits=gen_res.generation_logits)
|
| 440 |
+
return response
|
tensorrt_llm_bls/1/model.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
import json
|
| 28 |
+
import traceback
|
| 29 |
+
|
| 30 |
+
import triton_python_backend_utils as pb_utils
|
| 31 |
+
from lib.triton_decoder import TritonDecoder
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class TritonPythonModel:
    """BLS model that orchestrates preprocessing, the TRT-LLM engine
    (optionally with a draft model for speculative decoding) and
    postprocessing through a TritonDecoder."""

    def initialize(self, args):
        """Parse the model config and construct the decoder pipeline."""
        model_config = json.loads(args['model_config'])
        params = model_config['parameters']

        # 'accumulate_tokens' is an optional string parameter; any of
        # true/yes/1/t (case-insensitive) enables accumulation.
        accumulate_str = params.get('accumulate_tokens',
                                    {}).get('string_value', '')
        self.accumulate_tokens = accumulate_str.lower() in (
            'true', 'yes', '1', 't')

        self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)
        self.logger = pb_utils.Logger

        # Target (and optional draft) engine model names, overridable via
        # config parameters.
        self.llm_model_name = "tensorrt_llm"
        if "tensorrt_llm_model_name" in params:
            self.llm_model_name = params["tensorrt_llm_model_name"][
                "string_value"]
        self.draft_llm_model_name = None
        if "tensorrt_llm_draft_model_name" in params:
            self.draft_llm_model_name = params[
                "tensorrt_llm_draft_model_name"]["string_value"]

        self.decoder = TritonDecoder(
            streaming=self.decoupled,
            accumulate=self.accumulate_tokens,
            preproc_model_name="preprocessing",
            postproc_model_name="postprocessing",
            llm_model_name=self.llm_model_name,
            draft_llm_model_name=self.draft_llm_model_name)

    def execute(self, requests):
        """Handle a batch of requests.

        In decoupled mode each response is pushed through the request's
        response sender (followed by a FINAL flag); otherwise one response
        per request is collected and returned. Errors are reported as error
        responses rather than propagated.
        """
        responses = []

        for request in requests:
            sender = (request.get_response_sender()
                      if self.decoupled else None)
            try:
                req = self.decoder.convert_triton_request(request)
                req.validate()

                # Speculative decoding requires a configured draft model.
                speculative = (req.num_draft_tokens is not None
                               and req.num_draft_tokens[0][0] > 0)
                if speculative and not self.draft_llm_model_name:
                    raise Exception(
                        "cannot perform speculative decoding without draft model"
                    )

                for res in self.decoder.decode(
                        req, speculative_decoding=speculative):
                    triton_response = self.decoder.create_triton_response(res)
                    if sender is not None:
                        sender.send(triton_response)
                    else:
                        responses.append(triton_response)

                if sender is not None:
                    sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

            except Exception:
                self.logger.log_error(traceback.format_exc())
                # On error, report the traceback back to the client.
                error_response = pb_utils.InferenceResponse(
                    output_tensors=[],
                    error=pb_utils.TritonError(traceback.format_exc()))

                if sender is not None:
                    sender.send(error_response)
                    sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
                else:
                    responses.append(error_response)

            # Clear per-request decoder state before the next request.
            self.decoder.reset_decoder()

        if self.decoupled:
            return None
        assert len(responses) == len(requests)
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
|
tensorrt_llm_bls/config.pbtxt
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Redistribution and use in source and binary forms, with or without
|
| 4 |
+
# modification, are permitted provided that the following conditions
|
| 5 |
+
# are met:
|
| 6 |
+
# * Redistributions of source code must retain the above copyright
|
| 7 |
+
# notice, this list of conditions and the following disclaimer.
|
| 8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
| 9 |
+
# notice, this list of conditions and the following disclaimer in the
|
| 10 |
+
# documentation and/or other materials provided with the distribution.
|
| 11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 12 |
+
# contributors may be used to endorse or promote products derived
|
| 13 |
+
# from this software without specific prior written permission.
|
| 14 |
+
#
|
| 15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 26 |
+
|
| 27 |
+
name: "tensorrt_llm_bls"
|
| 28 |
+
backend: "python"
|
| 29 |
+
max_batch_size: 16
|
| 30 |
+
|
| 31 |
+
model_transaction_policy {
|
| 32 |
+
decoupled: true
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
input [
|
| 36 |
+
{
|
| 37 |
+
name: "text_input"
|
| 38 |
+
data_type: TYPE_STRING
|
| 39 |
+
dims: [ -1 ]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
name: "decoder_text_input"
|
| 43 |
+
data_type: TYPE_STRING
|
| 44 |
+
dims: [ -1 ]
|
| 45 |
+
optional: true
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
name: "max_tokens"
|
| 49 |
+
data_type: TYPE_INT32
|
| 50 |
+
dims: [ -1 ]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
name: "bad_words"
|
| 54 |
+
data_type: TYPE_STRING
|
| 55 |
+
dims: [ -1 ]
|
| 56 |
+
optional: true
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
name: "stop_words"
|
| 60 |
+
data_type: TYPE_STRING
|
| 61 |
+
dims: [ -1 ]
|
| 62 |
+
optional: true
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
name: "end_id"
|
| 66 |
+
data_type: TYPE_INT32
|
| 67 |
+
dims: [ 1 ]
|
| 68 |
+
optional: true
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
name: "pad_id"
|
| 72 |
+
data_type: TYPE_INT32
|
| 73 |
+
dims: [ 1 ]
|
| 74 |
+
optional: true
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
name: "top_k"
|
| 78 |
+
data_type: TYPE_INT32
|
| 79 |
+
dims: [ 1 ]
|
| 80 |
+
optional: true
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
name: "top_p"
|
| 84 |
+
data_type: TYPE_FP32
|
| 85 |
+
dims: [ 1 ]
|
| 86 |
+
optional: true
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
name: "temperature"
|
| 90 |
+
data_type: TYPE_FP32
|
| 91 |
+
dims: [ 1 ]
|
| 92 |
+
optional: true
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
name: "length_penalty"
|
| 96 |
+
data_type: TYPE_FP32
|
| 97 |
+
dims: [ 1 ]
|
| 98 |
+
optional: true
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
name: "repetition_penalty"
|
| 102 |
+
data_type: TYPE_FP32
|
| 103 |
+
dims: [ 1 ]
|
| 104 |
+
optional: true
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
name: "min_length"
|
| 108 |
+
data_type: TYPE_INT32
|
| 109 |
+
dims: [ 1 ]
|
| 110 |
+
optional: true
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
name: "presence_penalty"
|
| 114 |
+
data_type: TYPE_FP32
|
| 115 |
+
dims: [ 1 ]
|
| 116 |
+
optional: true
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
name: "frequency_penalty"
|
| 120 |
+
data_type: TYPE_FP32
|
| 121 |
+
dims: [ 1 ]
|
| 122 |
+
optional: true
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
name: "random_seed"
|
| 126 |
+
data_type: TYPE_UINT64
|
| 127 |
+
dims: [ 1 ]
|
| 128 |
+
optional: true
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
name: "return_log_probs"
|
| 132 |
+
data_type: TYPE_BOOL
|
| 133 |
+
dims: [ 1 ]
|
| 134 |
+
reshape: { shape: [ ] }
|
| 135 |
+
optional: true
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
name: "return_context_logits"
|
| 139 |
+
data_type: TYPE_BOOL
|
| 140 |
+
dims: [ 1 ]
|
| 141 |
+
reshape: { shape: [ ] }
|
| 142 |
+
optional: true
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
name: "return_generation_logits"
|
| 146 |
+
data_type: TYPE_BOOL
|
| 147 |
+
dims: [ 1 ]
|
| 148 |
+
reshape: { shape: [ ] }
|
| 149 |
+
optional: true
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
name: "beam_width"
|
| 153 |
+
data_type: TYPE_INT32
|
| 154 |
+
dims: [ 1 ]
|
| 155 |
+
optional: true
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
name: "stream"
|
| 159 |
+
data_type: TYPE_BOOL
|
| 160 |
+
dims: [ 1 ]
|
| 161 |
+
optional: true
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
name: "prompt_embedding_table"
|
| 165 |
+
data_type: TYPE_FP16
|
| 166 |
+
dims: [ -1, -1 ]
|
| 167 |
+
optional: true
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
name: "prompt_vocab_size"
|
| 171 |
+
data_type: TYPE_INT32
|
| 172 |
+
dims: [ 1 ]
|
| 173 |
+
optional: true
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
name: "embedding_bias_words"
|
| 177 |
+
data_type: TYPE_STRING
|
| 178 |
+
dims: [ -1 ]
|
| 179 |
+
optional: true
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
name: "embedding_bias_weights"
|
| 183 |
+
data_type: TYPE_FP32
|
| 184 |
+
dims: [ -1 ]
|
| 185 |
+
optional: true
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
name: "num_draft_tokens",
|
| 189 |
+
data_type: TYPE_INT32,
|
| 190 |
+
dims: [ 1 ]
|
| 191 |
+
optional: true
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
name: "use_draft_logits",
|
| 195 |
+
data_type: TYPE_BOOL,
|
| 196 |
+
dims: [ 1 ]
|
| 197 |
+
reshape: { shape: [ ] }
|
| 198 |
+
optional: true
|
| 199 |
+
}
|
| 200 |
+
]
|
| 201 |
+
output [
|
| 202 |
+
{
|
| 203 |
+
name: "text_output"
|
| 204 |
+
data_type: TYPE_STRING
|
| 205 |
+
dims: [ -1 ]
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
name: "cum_log_probs"
|
| 209 |
+
data_type: TYPE_FP32
|
| 210 |
+
dims: [ -1 ]
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
name: "output_log_probs"
|
| 214 |
+
data_type: TYPE_FP32
|
| 215 |
+
dims: [ -1, -1 ]
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
name: "context_logits"
|
| 219 |
+
data_type: TYPE_FP32
|
| 220 |
+
dims: [ -1, -1 ]
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
name: "generation_logits"
|
| 224 |
+
data_type: TYPE_FP32
|
| 225 |
+
dims: [ -1, -1, -1 ]
|
| 226 |
+
}
|
| 227 |
+
]
|
| 228 |
+
|
| 229 |
+
parameters: {
|
| 230 |
+
key: "accumulate_tokens"
|
| 231 |
+
value: {
|
| 232 |
+
string_value: "${accumulate_tokens}"
|
| 233 |
+
}
|
| 234 |
+
}
|
| 235 |
+
parameters: {
|
| 236 |
+
key: "tensorrt_llm_model_name"
|
| 237 |
+
value: {
|
| 238 |
+
string_value: "${tensorrt_llm_model_name}"
|
| 239 |
+
}
|
| 240 |
+
}
|
| 241 |
+
parameters: {
|
| 242 |
+
key: "tensorrt_llm_draft_model_name"
|
| 243 |
+
value: {
|
| 244 |
+
string_value: "${tensorrt_llm_draft_model_name}"
|
| 245 |
+
}
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
instance_group [
|
| 249 |
+
{
|
| 250 |
+
count: 1
|
| 251 |
+
kind : KIND_CPU
|
| 252 |
+
}
|
| 253 |
+
]
|