# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import sys
from pathlib import Path
from nemo.deploy import DeployPyTriton
LOGGER = logging.getLogger("NeMo")
multimodal_supported = True
try:
from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
except Exception as e:
LOGGER.warning(f"Cannot import the TensorRTMMExporter exporter, it will not be available. {type(e).__name__}: {e}")
multimodal_supported = False

def get_args(argv):
    """Parse command-line arguments for exporting a NeMo multimodal model and deploying it to Triton."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Deploy NeMo multimodal models to Triton",
    )
# default modality is vision, can be changed to audio
parser.add_argument(
"-mod",
"--modality",
type=str,
required=False,
default="vision",
choices=["vision", "audio"],
help="Modality of the model",
)
parser.add_argument("-vc", "--visual_checkpoint", type=str, help="Source .nemo file for visual model")
parser.add_argument(
"-lc",
"--llm_checkpoint",
type=str,
required=False,
help="Source .nemo file for llm",
)
parser.add_argument(
"-mt",
"--model_type",
type=str,
required=True,
choices=["neva", "video-neva", "lita", "vila", "vita", "salm", "mllama"],
help="Type of the model that is supported.",
)
parser.add_argument(
"-lmt",
"--llm_model_type",
type=str,
required=True,
choices=["gptnext", "gpt", "llama", "falcon", "starcoder", "mixtral", "gemma", "mllama"],
help="Type of LLM. gptnext, gpt, llama, falcon, and starcoder are only supported."
" gptnext and gpt are the same and keeping it for backward compatibility",
)
parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
parser.add_argument(
"-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
)
parser.add_argument(
"-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
)
parser.add_argument(
"-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion"
)
parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment")
parser.add_argument(
"-dt",
"--dtype",
choices=["bfloat16", "float16"],
default="bfloat16",
type=str,
help="dtype of the model on TensorRT",
)
parser.add_argument("-mil", "--max_input_len", default=4096, type=int, help="Max input length of the model")
parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model")
parser.add_argument("-mbs", "--max_batch_size", default=1, type=int, help="Max batch size of the llm model")
parser.add_argument("-mml", "--max_multimodal_len", default=3072, type=int, help="Max length of multimodal input")
parser.add_argument(
"-vmb",
"--vision_max_batch_size",
default=1,
type=int,
help="Max batch size of the visual inputs, for lita/vita model with video inference, this should be set to 256",
)
parser.add_argument(
'--use_lora_plugin',
nargs='?',
const=None,
choices=['float16', 'float32', 'bfloat16'],
help="Activates the lora plugin which enables embedding sharing.",
)
parser.add_argument(
'--lora_target_modules',
nargs='+',
default=None,
choices=[
"attn_qkv",
"attn_q",
"attn_k",
"attn_v",
"attn_dense",
"mlp_h_to_4h",
"mlp_gate",
"mlp_4h_to_h",
],
help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.",
)
parser.add_argument(
'--max_lora_rank',
type=int,
default=64,
        help='Maximum LoRA rank across the LoRA modules; used to compute the workspace size of the LoRA plugin.',
)
parser.add_argument("--lora_checkpoint_path", default=None, type=str, help="The checkpoint path of LoRA weights")
args = parser.parse_args(argv)
return args
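
# Example invocation (a sketch: the script name and all paths below are illustrative; the flags
# are the ones defined in get_args above):
#
#   python deploy_multimodal_to_triton.py \
#       --visual_checkpoint /path/to/neva.nemo \
#       --model_type neva \
#       --llm_model_type llama \
#       --triton_model_name neva \
#       --num_gpus 1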

def get_trt_deployable(args):
    """Create a TensorRTMMExporter for the model directory, exporting the .nemo checkpoint first when one is provided."""
if args.triton_model_repository is None:
trt_path = "/tmp/trt_model_dir/"
LOGGER.info(
"/tmp/trt_model_dir/ path will be used as the TensorRT folder. "
"Please set the --triton_model_repository parameter if you'd like to use a path that already "
"includes the TensorRT model files."
)
Path(trt_path).mkdir(parents=True, exist_ok=True)
else:
trt_path = args.triton_model_repository
    if args.visual_checkpoint is None and args.triton_model_repository is None:
        raise ValueError(
            "Either a --visual_checkpoint to export or a --triton_model_repository that already "
            "contains TensorRT model files must be provided."
        )
    if args.visual_checkpoint is None and not os.path.isdir(args.triton_model_repository):
        raise ValueError(
            "The provided --triton_model_repository is not a valid TensorRT model "
            "directory. Please provide a --visual_checkpoint to export from."
        )
    if args.visual_checkpoint is not None and args.model_type is None:
        raise ValueError("--model_type is required when a .nemo checkpoint is provided.")
    # Load an existing engine only when no checkpoint is given; otherwise export below.
exporter = TensorRTMMExporter(
model_dir=trt_path,
load_model=(args.visual_checkpoint is None),
modality=args.modality,
)
if args.visual_checkpoint is not None:
try:
LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT.")
exporter.export(
visual_checkpoint_path=args.visual_checkpoint,
llm_checkpoint_path=args.llm_checkpoint,
model_type=args.model_type,
llm_model_type=args.llm_model_type,
tensor_parallel_size=args.num_gpus,
max_input_len=args.max_input_len,
max_output_len=args.max_output_len,
vision_max_batch_size=args.vision_max_batch_size,
max_batch_size=args.max_batch_size,
max_multimodal_len=args.max_multimodal_len,
dtype=args.dtype,
use_lora_plugin=args.use_lora_plugin,
lora_target_modules=args.lora_target_modules,
max_lora_rank=args.max_lora_rank,
lora_checkpoint_path=args.lora_checkpoint_path,
)
        except Exception as error:
            raise RuntimeError("An error occurred during the model export. Error message: " + str(error)) from error
return exporter
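
# Reuse note (a sketch; flags as defined in get_args, script name illustrative): if a previous run
# exported engines into --triton_model_repository, a later run can skip the export by pointing at
# that directory and omitting --visual_checkpoint, e.g.
#
#   python deploy_multimodal_to_triton.py \
#       --triton_model_repository /tmp/trt_model_dir/ \
#       --model_type neva \
#       --llm_model_type llama \
#       --triton_model_name neva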

def nemo_deploy(argv):
    """Parse arguments, build or load the TensorRT deployable, then deploy and serve it on Triton."""
args = get_args(argv)
loglevel = logging.INFO
LOGGER.setLevel(loglevel)
LOGGER.info("Logging level set to {}".format(loglevel))
LOGGER.info(args)
triton_deployable = get_trt_deployable(args)
try:
nm = DeployPyTriton(
model=triton_deployable,
triton_model_name=args.triton_model_name,
triton_model_version=args.triton_model_version,
max_batch_size=args.max_batch_size,
http_port=args.triton_port,
address=args.triton_http_address,
)
LOGGER.info("Triton deploy function will be called.")
nm.deploy()
except Exception as error:
LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
return
try:
LOGGER.info("Model serving on Triton is will be started.")
nm.serve()
except Exception as error:
LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
return
LOGGER.info("Model serving will be stopped.")
nm.stop()

if __name__ == '__main__':
nemo_deploy(sys.argv[1:])
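
# Once the server is up, a minimal client sketch. This assumes NeMo's NemoQueryMultimodal helper
# (used by NeMo's multimodal query scripts); the exact module path and keyword arguments may vary
# across NeMo versions, and the model name and media path below are illustrative:
#
#   from nemo.deploy.multimodal import NemoQueryMultimodal
#
#   nq = NemoQueryMultimodal(url="localhost:8000", model_name="neva", model_type="neva")
#   output = nq.query(input_text="What is in this image?", input_media="/path/to/image.jpg")
#   print(output)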