NeMo_Canary / scripts /deploy /nlp /deploy_vllm_triton.py

Upload folder using huggingface_hub

b386992 verified 7 months ago

6.6 kB

	# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import argparse
	import logging
	import os
	import sys
	import tempfile

	from nemo.deploy import DeployPyTriton

	# Configure the NeMo logger to look the same as vLLM
	logging.basicConfig(format="%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s", datefmt="%m-%d %H:%M:%S")
	LOGGER = logging.getLogger("NeMo")

	try:
	from nemo.export.vllm_exporter import vLLMExporter
	except Exception as e:
	LOGGER.error(f"Cannot import the vLLM exporter. {type(e).__name__}: {e}")
	sys.exit(1)


	def get_args(argv):
	parser = argparse.ArgumentParser(
	formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	description=f"Export NeMo models to vLLM and deploy them on Triton",
	)
	parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
	parser.add_argument(
	"-mt",
	"--model_type",
	type=str,
	required=True,
	choices=["llama", "mistral", "mixtral", "starcoder2", "gemma"],
	help="Type of the model",
	)
	parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
	parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
	parser.add_argument(
	"-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
	)
	parser.add_argument(
	"-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
	)
	parser.add_argument(
	"-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the vLLM conversion"
	)
	parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
	parser.add_argument(
	"-dt",
	"--dtype",
	choices=["bfloat16", "float16", "fp8", "int8"],
	default="bfloat16",
	type=str,
	help="dtype of the model on vLLM",
	)
	parser.add_argument(
	"-mml", "--max_model_len", default=512, type=int, help="Max input + ouptut length of the model"
	)
	parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
	parser.add_argument(
	"-lc", "--lora_ckpt", default=[], type=str, nargs="+", help="List of LoRA checkpoints in HF format"
	)
	parser.add_argument(
	"-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences."
	)
	parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
	parser.add_argument(
	'-ws',
	'--weight_storage',
	default='auto',
	choices=['auto', 'cache', 'file', 'memory'],
	help='Strategy for storing converted weights for vLLM: "file" - always write weights into a file, '
	'"memory" - always do an in-memory conversion, "cache" - reuse existing files if they are '
	'newer than the nemo checkpoint, "auto" - use "cache" for multi-GPU runs and "memory" '
	'for single-GPU runs.',
	)
	parser.add_argument(
	"-gmu",
	'--gpu_memory_utilization',
	default=0.9,
	type=float,
	help="GPU memory utilization percentage for vLLM.",
	)
	parser.add_argument(
	"-q",
	"--quantization",
	choices=["fp8"],
	help="Quantization method for vLLM.",
	)
	args = parser.parse_args(argv)
	return args


	def get_vllm_deployable(args, model_dir):
	exporter = vLLMExporter()
	exporter.export(
	nemo_checkpoint=args.nemo_checkpoint,
	model_dir=model_dir,
	model_type=args.model_type,
	tensor_parallel_size=args.tensor_parallelism_size,
	max_model_len=args.max_model_len,
	lora_checkpoints=args.lora_ckpt,
	dtype=args.dtype,
	weight_storage=args.weight_storage,
	gpu_memory_utilization=args.gpu_memory_utilization,
	quantization=args.quantization,
	)
	return exporter


	def nemo_deploy(argv):
	args = get_args(argv)

	if args.debug_mode:
	loglevel = logging.DEBUG
	else:
	loglevel = logging.INFO

	LOGGER.setLevel(loglevel)
	LOGGER.info("Logging level set to {}".format(loglevel))
	LOGGER.info(args)

	# If no model_dir was supplied, create a temporary directory.
	# This directory should persist while the model is being served, becaue it may contain
	# converted LoRA checkpoints, and those are accessed by vLLM at request time.
	tempdir = None
	model_dir = args.triton_model_repository
	if model_dir is None:
	tempdir = tempfile.TemporaryDirectory()
	model_dir = tempdir.name
	LOGGER.info(
	f"{model_dir} will be used for the vLLM intermediate folder. "
	+ "Please set the --triton_model_repository parameter if you'd like to use a path that already "
	+ "includes the vLLM model files."
	)
	elif not os.path.exists(model_dir):
	os.makedirs(model_dir)

	try:
	triton_deployable = get_vllm_deployable(args, model_dir=model_dir)

	nm = DeployPyTriton(
	model=triton_deployable,
	triton_model_name=args.triton_model_name,
	triton_model_version=args.triton_model_version,
	max_batch_size=args.max_batch_size,
	http_port=args.triton_port,
	address=args.triton_http_address,
	streaming=args.enable_streaming,
	)

	LOGGER.info("Starting the Triton server...")
	nm.deploy()
	nm.serve()

	LOGGER.info("Stopping the Triton server...")
	nm.stop()

	except Exception as error:
	LOGGER.error("An error has occurred while setting up or serving the model. Error message: " + str(error))
	return

	# Clean up the temporary directory
	finally:
	if tempdir is not None:
	tempdir.cleanup()


	if __name__ == '__main__':
	nemo_deploy(sys.argv[1:])