| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| name: "tensorrt_llm" |
| backend: "tensorrtllm" |
| max_batch_size: 32 |
|
|
| model_transaction_policy { |
| decoupled: true |
| } |
|
|
| input [ |
| { |
| name: "input_ids" |
| data_type: TYPE_INT32 |
| dims: [ -1 ] |
| allow_ragged_batch: true |
| }, |
| { |
| name: "input_lengths" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| }, |
| { |
| name: "request_output_len" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| }, |
| { |
| name: "draft_input_ids" |
| data_type: TYPE_INT32 |
| dims: [ -1 ] |
| optional: true |
| allow_ragged_batch: true |
| }, |
| { |
| name: "decoder_input_ids" |
| data_type: TYPE_INT32 |
| dims: [ -1 ] |
| optional: true |
| allow_ragged_batch: true |
| }, |
| { |
| name: "decoder_input_lengths" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| optional: true |
| reshape: { shape: [ ] } |
| }, |
| { |
| name: "draft_logits" |
| data_type: TYPE_FP32 |
| dims: [ -1, -1 ] |
| optional: true |
| allow_ragged_batch: true |
| }, |
| { |
| name: "draft_acceptance_threshold" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "end_id" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "pad_id" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "stop_words_list" |
| data_type: TYPE_INT32 |
| dims: [ 2, -1 ] |
| optional: true |
| allow_ragged_batch: true |
| }, |
| { |
| name: "bad_words_list" |
| data_type: TYPE_INT32 |
| dims: [ 2, -1 ] |
| optional: true |
| allow_ragged_batch: true |
| }, |
| { |
| name: "embedding_bias" |
| data_type: TYPE_FP32 |
| dims: [ -1 ] |
| optional: true |
| allow_ragged_batch: true |
| }, |
| { |
| name: "beam_width" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "temperature" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "runtime_top_k" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "runtime_top_p" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "runtime_top_p_min" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "runtime_top_p_decay" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "runtime_top_p_reset_ids" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "len_penalty" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "early_stopping" |
| data_type: TYPE_BOOL |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "repetition_penalty" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "min_length" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "beam_search_diversity_rate" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "presence_penalty" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "frequency_penalty" |
| data_type: TYPE_FP32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "random_seed" |
| data_type: TYPE_UINT64 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "return_log_probs" |
| data_type: TYPE_BOOL |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "return_context_logits" |
| data_type: TYPE_BOOL |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "return_generation_logits" |
| data_type: TYPE_BOOL |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "stop" |
| data_type: TYPE_BOOL |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "streaming" |
| data_type: TYPE_BOOL |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| { |
| name: "prompt_embedding_table" |
| data_type: TYPE_FP16 |
| dims: [ -1, -1 ] |
| optional: true |
| allow_ragged_batch: true |
| }, |
| { |
| name: "prompt_vocab_size" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| |
| |
| |
| |
| { |
| name: "lora_task_id" |
| data_type: TYPE_UINT64 |
| dims: [ 1 ] |
| reshape: { shape: [ ] } |
| optional: true |
| }, |
| |
| |
| |
| |
| { |
| name: "lora_weights" |
| data_type: TYPE_FP16 |
| dims: [ -1, -1 ] |
| optional: true |
| allow_ragged_batch: true |
| }, |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| { |
| name: "lora_config" |
| data_type: TYPE_INT32 |
| dims: [ -1, 3 ] |
| optional: true |
| allow_ragged_batch: true |
| } |
| ] |
| output [ |
| { |
| name: "output_ids" |
| data_type: TYPE_INT32 |
| dims: [ -1, -1 ] |
| }, |
| { |
| name: "sequence_length" |
| data_type: TYPE_INT32 |
| dims: [ -1 ] |
| }, |
| { |
| name: "cum_log_probs" |
| data_type: TYPE_FP32 |
| dims: [ -1 ] |
| }, |
| { |
| name: "output_log_probs" |
| data_type: TYPE_FP32 |
| dims: [ -1, -1 ] |
| }, |
| { |
| name: "context_logits" |
| data_type: TYPE_FP32 |
| dims: [ -1, -1 ] |
| }, |
| { |
| name: "generation_logits" |
| data_type: TYPE_FP32 |
| dims: [ -1, -1, -1 ] |
| }, |
| { |
| name: "batch_index" |
| data_type: TYPE_INT32 |
| dims: [ 1 ] |
| } |
| ] |
| instance_group [ |
| { |
| count: 1 |
| kind: KIND_CPU |
| } |
| ] |
| parameters: { |
| key: "max_beam_width" |
| value: { |
| string_value: "1" |
| } |
| } |
| parameters: { |
| key: "FORCE_CPU_ONLY_INPUT_TENSORS" |
| value: { |
| string_value: "no" |
| } |
| } |
| parameters: { |
| key: "gpt_model_type" |
| value: { |
| string_value: "inflight_fused_batching" |
| } |
| } |
| parameters: { |
| key: "gpt_model_path" |
| value: { |
| string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1" |
| } |
| } |
| parameters: { |
| key: "encoder_model_path" |
| value: { |
| string_value: "${encoder_engine_dir}" |
| } |
| } |
| parameters: { |
| key: "max_tokens_in_paged_kv_cache" |
| value: { |
| string_value: "${max_tokens_in_paged_kv_cache}" |
| } |
| } |
| parameters: { |
| key: "max_attention_window_size" |
| value: { |
| string_value: "${max_attention_window_size}" |
| } |
| } |
| parameters: { |
| key: "sink_token_length" |
| value: { |
| string_value: "${sink_token_length}" |
| } |
| } |
| parameters: { |
| key: "batch_scheduler_policy" |
| value: { |
| string_value: "guaranteed_no_evict" |
| } |
| } |
| parameters: { |
| key: "kv_cache_free_gpu_mem_fraction" |
| value: { |
| string_value: "0.1" |
| } |
| } |
| parameters: { |
| key: "kv_cache_host_memory_bytes" |
| value: { |
| string_value: "${kv_cache_host_memory_bytes}" |
| } |
| } |
| parameters: { |
| key: "kv_cache_onboard_blocks" |
| value: { |
| string_value: "${kv_cache_onboard_blocks}" |
| } |
| } |
| |
| |
| |
| |
| |
| |
| |
| parameters: { |
| key: "exclude_input_in_output" |
| value: { |
| string_value: "True" |
| } |
| } |
| parameters: { |
| key: "cancellation_check_period_ms" |
| value: { |
| string_value: "${cancellation_check_period_ms}" |
| } |
| } |
| parameters: { |
| key: "stats_check_period_ms" |
| value: { |
| string_value: "${stats_check_period_ms}" |
| } |
| } |
| parameters: { |
| key: "iter_stats_max_iterations" |
| value: { |
| string_value: "${iter_stats_max_iterations}" |
| } |
| } |
| parameters: { |
| key: "request_stats_max_iterations" |
| value: { |
| string_value: "${request_stats_max_iterations}" |
| } |
| } |
| parameters: { |
| key: "enable_kv_cache_reuse" |
| value: { |
| string_value: "True" |
| } |
| } |
| parameters: { |
| key: "normalize_log_probs" |
| value: { |
| string_value: "${normalize_log_probs}" |
| } |
| } |
| parameters: { |
| key: "enable_chunked_context" |
| value: { |
| string_value: "${enable_chunked_context}" |
| } |
| } |
| parameters: { |
| key: "gpu_device_ids" |
| value: { |
| string_value: "0,1" |
| } |
| } |
| parameters: { |
| key: "lora_cache_optimal_adapter_size" |
| value: { |
| string_value: "${lora_cache_optimal_adapter_size}" |
| } |
| } |
| parameters: { |
| key: "lora_cache_max_adapter_size" |
| value: { |
| string_value: "${lora_cache_max_adapter_size}" |
| } |
| } |
| parameters: { |
| key: "lora_cache_gpu_memory_fraction" |
| value: { |
| string_value: "${lora_cache_gpu_memory_fraction}" |
| } |
| } |
| parameters: { |
| key: "lora_cache_host_memory_bytes" |
| value: { |
| string_value: "${lora_cache_host_memory_bytes}" |
| } |
| } |
| parameters: { |
| key: "decoding_mode" |
| value: { |
| string_value: "top_k_top_p" |
| } |
| } |
| parameters: { |
| key: "executor_worker_path" |
| value: { |
| string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" |
| } |
| } |
| parameters: { |
| key: "medusa_choices" |
| value: { |
| string_value: "${medusa_choices}" |
| } |
| } |
| parameters: { |
| key: "gpu_weights_percent" |
| value: { |
| string_value: "${gpu_weights_percent}" |
| } |
| } |
| parameters: { |
| key: "enable_context_fmha_fp32_acc" |
| value: { |
| string_value: "${enable_context_fmha_fp32_acc}" |
| } |
| } |
| parameters: { |
| key: "multi_block_mode" |
| value: { |
| string_value: "${multi_block_mode}" |
| } |
| } |
|
|