NeMo_Canary / scripts /deploy /nlp /deploy_vllm_triton.py
Respair's picture
Upload folder using huggingface_hub
b386992 verified
# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import sys
import tempfile
from nemo.deploy import DeployPyTriton
# Configure the NeMo logger to look the same as vLLM
logging.basicConfig(format="%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s", datefmt="%m-%d %H:%M:%S")
LOGGER = logging.getLogger("NeMo")
try:
from nemo.export.vllm_exporter import vLLMExporter
except Exception as e:
LOGGER.error(f"Cannot import the vLLM exporter. {type(e).__name__}: {e}")
sys.exit(1)
def get_args(argv):
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description=f"Export NeMo models to vLLM and deploy them on Triton",
)
parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
parser.add_argument(
"-mt",
"--model_type",
type=str,
required=True,
choices=["llama", "mistral", "mixtral", "starcoder2", "gemma"],
help="Type of the model",
)
parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
parser.add_argument(
"-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
)
parser.add_argument(
"-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
)
parser.add_argument(
"-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the vLLM conversion"
)
parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
parser.add_argument(
"-dt",
"--dtype",
choices=["bfloat16", "float16", "fp8", "int8"],
default="bfloat16",
type=str,
help="dtype of the model on vLLM",
)
parser.add_argument(
"-mml", "--max_model_len", default=512, type=int, help="Max input + ouptut length of the model"
)
parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
parser.add_argument(
"-lc", "--lora_ckpt", default=[], type=str, nargs="+", help="List of LoRA checkpoints in HF format"
)
parser.add_argument(
"-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences."
)
parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
parser.add_argument(
'-ws',
'--weight_storage',
default='auto',
choices=['auto', 'cache', 'file', 'memory'],
help='Strategy for storing converted weights for vLLM: "file" - always write weights into a file, '
'"memory" - always do an in-memory conversion, "cache" - reuse existing files if they are '
'newer than the nemo checkpoint, "auto" - use "cache" for multi-GPU runs and "memory" '
'for single-GPU runs.',
)
parser.add_argument(
"-gmu",
'--gpu_memory_utilization',
default=0.9,
type=float,
help="GPU memory utilization percentage for vLLM.",
)
parser.add_argument(
"-q",
"--quantization",
choices=["fp8"],
help="Quantization method for vLLM.",
)
args = parser.parse_args(argv)
return args
def get_vllm_deployable(args, model_dir):
exporter = vLLMExporter()
exporter.export(
nemo_checkpoint=args.nemo_checkpoint,
model_dir=model_dir,
model_type=args.model_type,
tensor_parallel_size=args.tensor_parallelism_size,
max_model_len=args.max_model_len,
lora_checkpoints=args.lora_ckpt,
dtype=args.dtype,
weight_storage=args.weight_storage,
gpu_memory_utilization=args.gpu_memory_utilization,
quantization=args.quantization,
)
return exporter
def nemo_deploy(argv):
args = get_args(argv)
if args.debug_mode:
loglevel = logging.DEBUG
else:
loglevel = logging.INFO
LOGGER.setLevel(loglevel)
LOGGER.info("Logging level set to {}".format(loglevel))
LOGGER.info(args)
# If no model_dir was supplied, create a temporary directory.
# This directory should persist while the model is being served, becaue it may contain
# converted LoRA checkpoints, and those are accessed by vLLM at request time.
tempdir = None
model_dir = args.triton_model_repository
if model_dir is None:
tempdir = tempfile.TemporaryDirectory()
model_dir = tempdir.name
LOGGER.info(
f"{model_dir} will be used for the vLLM intermediate folder. "
+ "Please set the --triton_model_repository parameter if you'd like to use a path that already "
+ "includes the vLLM model files."
)
elif not os.path.exists(model_dir):
os.makedirs(model_dir)
try:
triton_deployable = get_vllm_deployable(args, model_dir=model_dir)
nm = DeployPyTriton(
model=triton_deployable,
triton_model_name=args.triton_model_name,
triton_model_version=args.triton_model_version,
max_batch_size=args.max_batch_size,
http_port=args.triton_port,
address=args.triton_http_address,
streaming=args.enable_streaming,
)
LOGGER.info("Starting the Triton server...")
nm.deploy()
nm.serve()
LOGGER.info("Stopping the Triton server...")
nm.stop()
except Exception as error:
LOGGER.error("An error has occurred while setting up or serving the model. Error message: " + str(error))
return
# Clean up the temporary directory
finally:
if tempdir is not None:
tempdir.cleanup()
if __name__ == '__main__':
nemo_deploy(sys.argv[1:])