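"""
Deploy a NeMo model checkpoint on a Triton Inference Server, using vLLM as the
inference backend. The script exports the given .nemo checkpoint to the vLLM
format and then serves it with PyTriton. See get_args() for the full option list.
"""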
import argparse
import logging
import os
import sys
import tempfile

from nemo.deploy import DeployPyTriton

logging.basicConfig(format="%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s", datefmt="%m-%d %H:%M:%S")
LOGGER = logging.getLogger("NeMo")

try:
    from nemo.export.vllm_exporter import vLLMExporter
except Exception as e:
    LOGGER.error(f"Cannot import the vLLM exporter. {type(e).__name__}: {e}")
    sys.exit(1)


def get_args(argv):
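    """Parse the command-line arguments that control the export and deployment settings."""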
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Export NeMo models to vLLM and deploy them on Triton",
    )
    parser.add_argument("-nc", "--nemo_checkpoint", type=str, required=True, help="Source .nemo file")
    parser.add_argument(
        "-mt",
        "--model_type",
        type=str,
        required=True,
        choices=["llama", "mistral", "mixtral", "starcoder2", "gemma"],
        help="Type of the model",
    )
    parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
    parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
    parser.add_argument(
        "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
    )
    parser.add_argument(
        "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
    )
    parser.add_argument(
        "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the intermediate vLLM model files"
    )
    parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
    parser.add_argument(
        "-dt",
        "--dtype",
        choices=["bfloat16", "float16", "fp8", "int8"],
        default="bfloat16",
        type=str,
        help="dtype of the model on vLLM",
    )
    parser.add_argument(
        "-mml", "--max_model_len", default=512, type=int, help="Max combined input + output length of the model"
    )
    parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
    parser.add_argument(
        "-lc", "--lora_ckpt", default=[], type=str, nargs="+", help="List of LoRA checkpoints in HF format"
    )
    parser.add_argument(
        "-es", "--enable_streaming", default=False, action="store_true", help="Enable streaming of the generated text"
    )
    parser.add_argument("-dm", "--debug_mode", default=False, action="store_true", help="Enable debug mode")
    parser.add_argument(
        "-ws",
        "--weight_storage",
        default="auto",
        choices=["auto", "cache", "file", "memory"],
        help='Strategy for storing converted weights for vLLM: "file" - always write weights into a file, '
        '"memory" - always do an in-memory conversion, "cache" - reuse existing files if they are '
        'newer than the nemo checkpoint, "auto" - use "cache" for multi-GPU runs and "memory" '
        'for single-GPU runs.',
    )
    parser.add_argument(
        "-gmu",
        "--gpu_memory_utilization",
        default=0.9,
        type=float,
        help="Fraction of GPU memory to reserve for vLLM (0.0-1.0)",
    )
    parser.add_argument(
        "-q",
        "--quantization",
        choices=["fp8"],
        help="Quantization method for vLLM",
    )
    args = parser.parse_args(argv)
    return args


def get_vllm_deployable(args, model_dir):
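    """Export the NeMo checkpoint into vLLM format in model_dir and return the deployable exporter."""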
    exporter = vLLMExporter()
    exporter.export(
        nemo_checkpoint=args.nemo_checkpoint,
        model_dir=model_dir,
        model_type=args.model_type,
        tensor_parallel_size=args.tensor_parallelism_size,
        max_model_len=args.max_model_len,
        lora_checkpoints=args.lora_ckpt,
        dtype=args.dtype,
        weight_storage=args.weight_storage,
        gpu_memory_utilization=args.gpu_memory_utilization,
        quantization=args.quantization,
    )
    return exporter


def nemo_deploy(argv):
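    """Parse arguments, export the model to vLLM, and serve it on Triton until the server stops."""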
    args = get_args(argv)

    if args.debug_mode:
        loglevel = logging.DEBUG
    else:
        loglevel = logging.INFO

    LOGGER.setLevel(loglevel)
    LOGGER.info(f"Logging level set to {loglevel}")
    LOGGER.info(args)

    # Use the given model repository folder if one was provided; otherwise convert into a temporary folder.
    tempdir = None
    model_dir = args.triton_model_repository
    if model_dir is None:
        tempdir = tempfile.TemporaryDirectory()
        model_dir = tempdir.name
        LOGGER.info(
            f"{model_dir} will be used for the vLLM intermediate folder. "
            + "Please set the --triton_model_repository parameter if you'd like to use a path that already "
            + "includes the vLLM model files."
        )
    elif not os.path.exists(model_dir):
        os.makedirs(model_dir)

    try:
        triton_deployable = get_vllm_deployable(args, model_dir=model_dir)

        nm = DeployPyTriton(
            model=triton_deployable,
            triton_model_name=args.triton_model_name,
            triton_model_version=args.triton_model_version,
            max_batch_size=args.max_batch_size,
            http_port=args.triton_port,
            address=args.triton_http_address,
            streaming=args.enable_streaming,
        )

        LOGGER.info("Starting the Triton server...")
        nm.deploy()
        nm.serve()  # Blocks until the server is shut down.

        LOGGER.info("Stopping the Triton server...")
        nm.stop()
    except Exception as error:
        LOGGER.error("An error has occurred while setting up or serving the model. Error message: " + str(error))
        return
    finally:
        # Remove the temporary conversion folder, if one was created.
        if tempdir is not None:
            tempdir.cleanup()


if __name__ == "__main__":
    nemo_deploy(sys.argv[1:])
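

# Example invocation (a minimal sketch: the script name, checkpoint path, and model
# name below are hypothetical placeholders, not values taken from this repository):
#
#   python deploy_vllm_triton.py \
#       --nemo_checkpoint /models/llama-8b.nemo \
#       --model_type llama \
#       --triton_model_name llama \
#       --tensor_parallelism_size 1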