use_pytriton = True
try:
    from pytriton.model_config import ModelConfig
    from pytriton.triton import Triton, TritonConfig
except Exception:
    # PyTriton is an optional dependency; the module stays importable
    # without it, but DeployPyTriton cannot be used.
    use_pytriton = False

from nemo.deploy.deploy_base import DeployBase


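# Hedged note: callers can gate on the flag above before constructing the
# server. The raise below is an illustrative assumption, not NeMo's behavior:
#
#   if not use_pytriton:
#       raise ImportError("nvidia-pytriton is required to use DeployPyTriton")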
class DeployPyTriton(DeployBase):
    """
    Deploys any model that implements the ITritonDeployable interface in
    nemo.deploy to Triton Inference Server.

    Example:
        from nemo.deploy import DeployPyTriton, NemoQueryLLM
        from nemo.export.tensorrt_llm import TensorRTLLM

        trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files")
        trt_llm_exporter.export(
            nemo_checkpoint_path="/path/for/nemo/checkpoint",
            model_type="llama",
            tensor_parallelism_size=1,
        )

        nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", http_port=8000)
        nm.deploy()
        nm.run()
        nq = NemoQueryLLM(url="localhost", model_name="model_name")

        prompts = ["hello, testing GPT inference", "another GPT inference test?"]
        output = nq.query_llm(prompts=prompts, max_output_len=100)
        print("prompts: ", prompts)
        print("")
        print("output: ", output)
        print("")

        prompts = [
            "Give me some info about Paris",
            "Do you think London is a good city to visit?",
            "What do you think about Rome?",
        ]
        output = nq.query_llm(prompts=prompts, max_output_len=250)
        print("prompts: ", prompts)
        print("")
        print("output: ", output)
        print("")
    """

    def __init__(
        self,
        triton_model_name: str,
        triton_model_version: int = 1,
        checkpoint_path: str = None,
        model=None,
        max_batch_size: int = 128,
        http_port: int = 8000,
        grpc_port: int = 8001,
        address="0.0.0.0",
        allow_grpc=True,
        allow_http=True,
        streaming=False,
        pytriton_log_verbose=0,
    ):
        """
        A nemo checkpoint or model is expected for serving on Triton Inference Server.

        Args:
            triton_model_name (str): name of the service.
            triton_model_version (int): version of the service.
            checkpoint_path (str): path of the .nemo file.
            model (ITritonDeployable): a model that implements the
                ITritonDeployable interface from nemo.deploy.
            max_batch_size (int): max batch size.
            http_port (int): HTTP port for the Triton server.
            grpc_port (int): gRPC port for the Triton server.
            address (str): address for the Triton server to bind.
            allow_grpc (bool): enable the gRPC endpoint.
            allow_http (bool): enable the HTTP endpoint.
            streaming (bool): serve in decoupled (streaming) mode.
            pytriton_log_verbose (int): pytriton log verbosity level.
        """
        super().__init__(
            triton_model_name=triton_model_name,
            triton_model_version=triton_model_version,
            checkpoint_path=checkpoint_path,
            model=model,
            max_batch_size=max_batch_size,
            http_port=http_port,
            grpc_port=grpc_port,
            address=address,
            allow_grpc=allow_grpc,
            allow_http=allow_http,
            streaming=streaming,
            pytriton_log_verbose=pytriton_log_verbose,
        )

    def deploy(self):
        """
        Deploys the model to Triton Inference Server.
        """

        self._init_nemo_model()

        try:
            if self.streaming:
                # Streaming responses require Triton's decoupled mode, which
                # is only available over gRPC.
                triton_config = TritonConfig(
                    log_verbose=self.pytriton_log_verbose,
                    allow_grpc=self.allow_grpc,
                    allow_http=self.allow_http,
                    grpc_address=self.address,
                )
                self.triton = Triton(config=triton_config)
                self.triton.bind(
                    model_name=self.triton_model_name,
                    model_version=self.triton_model_version,
                    infer_func=self.model.triton_infer_fn_streaming,
                    inputs=self.model.get_triton_input,
                    outputs=self.model.get_triton_output,
                    config=ModelConfig(decoupled=True),
                )
            else:
                triton_config = TritonConfig(
                    http_address=self.address,
                    http_port=self.http_port,
                    grpc_address=self.address,
                    grpc_port=self.grpc_port,
                    allow_grpc=self.allow_grpc,
                    allow_http=self.allow_http,
                )
                self.triton = Triton(config=triton_config)
                self.triton.bind(
                    model_name=self.triton_model_name,
                    model_version=self.triton_model_version,
                    infer_func=self.model.triton_infer_fn,
                    inputs=self.model.get_triton_input,
                    outputs=self.model.get_triton_output,
                    config=ModelConfig(max_batch_size=self.max_batch_size),
                )
        except Exception as e:
            self.triton = None
            print(e)

    def serve(self):
        """
        Starts serving the model and blocks, waiting for requests.
        """

        if self.triton is None:
            raise Exception("deploy should be called first.")

        try:
            self.triton.serve()
        except Exception as e:
            self.triton = None
            print(e)

    def run(self):
        """
        Starts serving the model asynchronously (returns immediately).
        """

        if self.triton is None:
            raise Exception("deploy should be called first.")

        self.triton.run()

    def stop(self):
        """
        Stops serving the model.
        """

        if self.triton is None:
            raise Exception("deploy should be called first.")

        self.triton.stop()
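
# A minimal end-to-end sketch of the lifecycle above, kept out of the class so
# the module stays importable. Paths, the model type, and the model name are
# placeholder assumptions taken from the docstring example; adjust them for a
# real checkpoint. run() returns immediately (unlike serve(), which blocks),
# so the model can be queried from the same process and shut down cleanly.
if __name__ == "__main__":
    from nemo.deploy import NemoQueryLLM
    from nemo.export.tensorrt_llm import TensorRTLLM

    # Export the checkpoint to a TensorRT-LLM engine (placeholder paths).
    exporter = TensorRTLLM(model_dir="/path/for/model/files")
    exporter.export(
        nemo_checkpoint_path="/path/for/nemo/checkpoint",
        model_type="llama",
        tensor_parallelism_size=1,
    )

    server = DeployPyTriton(model=exporter, triton_model_name="model_name", http_port=8000)
    server.deploy()
    server.run()  # non-blocking; serve() would block here instead
    try:
        nq = NemoQueryLLM(url="localhost", model_name="model_name")
        print(nq.query_llm(prompts=["hello, testing GPT inference"], max_output_len=100))
    finally:
        server.stop()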