# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os
import sys
import tempfile

from nemo.deploy import DeployPyTriton

# Configure the NeMo logger to use the same log format as vLLM
logging.basicConfig(format="%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s", datefmt="%m-%d %H:%M:%S")
LOGGER = logging.getLogger("NeMo")

try:
    from nemo.export.vllm_exporter import vLLMExporter
except Exception as e:
    LOGGER.error(f"Cannot import the vLLM exporter. {type(e).__name__}: {e}")
    sys.exit(1)
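
# Note: the vLLM exporter is an optional dependency; if the import above fails,
# installing the vLLM package is the usual fix (e.g. `pip install vllm` -- the
# exact package extras may vary by NeMo version, so treat this as a hint).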

def get_args(argv):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Export NeMo models to vLLM and deploy them on Triton",
    )
    parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
    parser.add_argument(
        "-mt",
        "--model_type",
        type=str,
        required=True,
        choices=["llama", "mistral", "mixtral", "starcoder2", "gemma"],
        help="Type of the model",
    )
    parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
    parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
    parser.add_argument(
        "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
    )
    parser.add_argument(
        "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
    )
    parser.add_argument(
        "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the vLLM conversion"
    )
    parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
    parser.add_argument(
        "-dt",
        "--dtype",
        choices=["bfloat16", "float16", "fp8", "int8"],
        default="bfloat16",
        type=str,
        help="dtype of the model on vLLM",
    )
    parser.add_argument(
        "-mml", "--max_model_len", default=512, type=int, help="Max input + output length of the model"
    )
    parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
    parser.add_argument(
        "-lc", "--lora_ckpt", default=[], type=str, nargs="+", help="List of LoRA checkpoints in HF format"
    )
    parser.add_argument(
        "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences."
    )
    parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
    parser.add_argument(
        '-ws',
        '--weight_storage',
        default='auto',
        choices=['auto', 'cache', 'file', 'memory'],
        help='Strategy for storing converted weights for vLLM: "file" - always write weights into a file, '
        '"memory" - always do an in-memory conversion, "cache" - reuse existing files if they are '
        'newer than the nemo checkpoint, "auto" - use "cache" for multi-GPU runs and "memory" '
        'for single-GPU runs.',
    )
    parser.add_argument(
        "-gmu",
        '--gpu_memory_utilization',
        default=0.9,
        type=float,
        help="Fraction of GPU memory (0.0-1.0) that vLLM is allowed to use.",
    )
    parser.add_argument(
        "-q",
        "--quantization",
        choices=["fp8"],
        help="Quantization method for vLLM.",
    )
    args = parser.parse_args(argv)
    return args
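
# Example invocation (a sketch: the checkpoint path and model name below are
# hypothetical, and the script name assumes this file is saved as
# deploy_vllm_triton.py):
#
#   python deploy_vllm_triton.py \
#       --nemo_checkpoint /models/llama-2-7b.nemo \
#       --model_type llama \
#       --triton_model_name llama \
#       --tensor_parallelism_size 1 \
#       --max_model_len 2048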

def get_vllm_deployable(args, model_dir):
    exporter = vLLMExporter()
    exporter.export(
        nemo_checkpoint=args.nemo_checkpoint,
        model_dir=model_dir,
        model_type=args.model_type,
        tensor_parallel_size=args.tensor_parallelism_size,
        max_model_len=args.max_model_len,
        lora_checkpoints=args.lora_ckpt,
        dtype=args.dtype,
        weight_storage=args.weight_storage,
        gpu_memory_utilization=args.gpu_memory_utilization,
        quantization=args.quantization,
    )
    return exporter
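
# Once the server is up, the deployment can be queried from a separate process.
# A minimal sketch, assuming NeMo's NemoQueryLLM client (the URL, model name,
# and prompt below are illustrative, not part of this script):
#
#   from nemo.deploy.nlp import NemoQueryLLM
#
#   nq = NemoQueryLLM(url="localhost:8000", model_name="llama")
#   output = nq.query_llm(prompts=["What color is a banana?"], max_output_len=64)
#   print(output)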

def nemo_deploy(argv):
    args = get_args(argv)

    if args.debug_mode:
        loglevel = logging.DEBUG
    else:
        loglevel = logging.INFO

    LOGGER.setLevel(loglevel)
    LOGGER.info("Logging level set to {}".format(loglevel))
    LOGGER.info(args)

    # If no model_dir was supplied, create a temporary directory.
    # This directory should persist while the model is being served, because it may contain
    # converted LoRA checkpoints, and those are accessed by vLLM at request time.
    tempdir = None
    model_dir = args.triton_model_repository
    if model_dir is None:
        tempdir = tempfile.TemporaryDirectory()
        model_dir = tempdir.name
        LOGGER.info(
            f"{model_dir} will be used for the vLLM intermediate folder. "
            + "Please set the --triton_model_repository parameter if you'd like to use a path that already "
            + "includes the vLLM model files."
        )
    elif not os.path.exists(model_dir):
        os.makedirs(model_dir)

    try:
        triton_deployable = get_vllm_deployable(args, model_dir=model_dir)

        nm = DeployPyTriton(
            model=triton_deployable,
            triton_model_name=args.triton_model_name,
            triton_model_version=args.triton_model_version,
            max_batch_size=args.max_batch_size,
            http_port=args.triton_port,
            address=args.triton_http_address,
            streaming=args.enable_streaming,
        )

        LOGGER.info("Starting the Triton server...")
        nm.deploy()
        nm.serve()

        LOGGER.info("Stopping the Triton server...")
        nm.stop()
    except Exception as error:
        LOGGER.error("An error has occurred while setting up or serving the model. Error message: " + str(error))
        return
    finally:
        # Clean up the temporary directory
        if tempdir is not None:
            tempdir.cleanup()


if __name__ == '__main__':
    nemo_deploy(sys.argv[1:])