|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
|
import sys |
|
|
|
|
|
from nemo.deploy.nlp import NemoQueryLLMHF |
|
|
|
|
|
|
|
|
def get_args(argv):
    """Build and run the CLI argument parser for this script.

    Args:
        argv (List[str]): Raw command-line argument strings, excluding the
            program name (i.e. ``sys.argv[1:]``).

    Returns:
        argparse.Namespace: Parsed arguments with defaults applied.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Query a HuggingFace model deployed on Triton Inference Server",
    )

    arg_parser.add_argument(
        "-u", "--url", default="0.0.0.0", type=str,
        help="URL of the Triton Inference Server (e.g. localhost or IP address)",
    )
    arg_parser.add_argument(
        "-mn", "--model_name", required=True, type=str,
        help="Name of the model as deployed on Triton server",
    )

    # Exactly one prompt source must be given: inline text or a file path.
    prompt_source = arg_parser.add_mutually_exclusive_group(required=True)
    prompt_source.add_argument(
        "-p", "--prompt", required=False, type=str,
        help="Text prompt to send to the model",
    )
    prompt_source.add_argument(
        "-pf", "--prompt_file", required=False, type=str,
        help="Path to file containing the prompt text",
    )

    # Generation / sampling controls.
    arg_parser.add_argument(
        "-mol", "--max_output_len", default=128, type=int,
        help="Maximum number of tokens to generate in the response",
    )
    arg_parser.add_argument(
        "-tk", "--top_k", default=1, type=int,
        help="Number of highest probability tokens to consider for sampling",
    )
    arg_parser.add_argument(
        "-tpp", "--top_p", default=0.0, type=float,
        help="Cumulative probability threshold for token sampling",
    )
    arg_parser.add_argument(
        "-t", "--temperature", default=1.0, type=float,
        help="Temperature for controlling randomness in sampling (higher = more random)",
    )

    # Server / output options.
    arg_parser.add_argument(
        "-it", "--init_timeout", default=60.0, type=float,
        help="Timeout in seconds when initializing connection to Triton server",
    )
    arg_parser.add_argument(
        "-ol", "--output_logits", default=False, action='store_true',
        help="Return raw logits from model output",
    )
    arg_parser.add_argument(
        "-os", "--output_scores", default=False, action='store_true',
        help="Return token probability scores from model output",
    )

    return arg_parser.parse_args(argv)
|
|
|
|
|
|
|
|
def query_llm(
    url,
    model_name,
    prompts,
    max_output_len=128,
    top_k=1,
    top_p=0.0,
    temperature=1.0,
    output_logits=False,
    output_scores=False,
    init_timeout=60.0,
):
    """Send prompts to a HuggingFace model served by Triton Inference Server.

    Thin wrapper around :class:`NemoQueryLLMHF` that maps this script's
    argument names onto the client's ``query_llm`` keyword arguments
    (note ``max_output_len`` becomes the client's ``max_length``).

    Args:
        url (str): URL of the Triton Inference Server (e.g. localhost or IP address)
        model_name (str): Name of the model as deployed on Triton server
        prompts (List[str]): List of text prompts to send to the model
        max_output_len (int, optional): Maximum number of tokens to generate. Defaults to 128.
        top_k (int, optional): Number of highest probability tokens to sample from. Defaults to 1.
        top_p (float, optional): Cumulative probability threshold for token sampling. Defaults to 0.0.
        temperature (float, optional): Sampling temperature (higher = more random). Defaults to 1.0.
        output_logits (bool, optional): Return raw logits from model output. Defaults to False.
        output_scores (bool, optional): Return token probability scores. Defaults to False.
        init_timeout (float, optional): Timeout in seconds when connecting to Triton. Defaults to 60.0.

    Returns:
        List[str]: Generated text responses for each input prompt
    """
    client = NemoQueryLLMHF(url, model_name)

    # Collect the client-side keyword arguments in one place, then dispatch.
    generation_kwargs = dict(
        prompts=prompts,
        max_length=max_output_len,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        output_logits=output_logits,
        output_scores=output_scores,
        init_timeout=init_timeout,
    )
    return client.query_llm(**generation_kwargs)
|
|
|
|
|
|
|
|
def query(argv):
    """Query a HuggingFace language model deployed on Triton Inference Server using command line arguments.

    This function parses command line arguments and sends queries to a deployed model. It supports
    reading prompts either directly from command line or from a file.

    Args:
        argv (List[str]): Command line arguments passed to the script, excluding the script name.
            Expected arguments include:
            - url: URL of Triton server
            - model_name: Name of deployed model
            - prompt: Text prompt or prompt_file: Path to file containing prompt
            - max_output_len: Maximum tokens to generate
            - top_k: Top-k sampling parameter
            - top_p: Top-p sampling parameter
            - temperature: Sampling temperature
            - output_logits: Whether to return logits
            - output_scores: Whether to return scores
            - init_timeout: Connection timeout

    Returns:
        List[str]: Generated text responses from the model
    """
    args = get_args(argv)

    # get_args() enforces that exactly one of --prompt / --prompt_file is set,
    # so if a file was given, load its contents as the prompt text.
    if args.prompt_file is not None:
        # Explicit encoding avoids platform-dependent default decoding.
        with open(args.prompt_file, "r", encoding="utf-8") as f:
            args.prompt = f.read()

    outputs = query_llm(
        url=args.url,
        model_name=args.model_name,
        prompts=[args.prompt],
        max_output_len=args.max_output_len,
        top_k=args.top_k,
        top_p=args.top_p,
        temperature=args.temperature,
        output_logits=args.output_logits,
        output_scores=args.output_scores,
        init_timeout=args.init_timeout,
    )
    print(outputs)
    # Fix: the docstring promises the generated responses, but the original
    # implementation only printed them and implicitly returned None.
    return outputs
|
|
|
|
|
|
|
|
# Script entry point: forward CLI arguments (minus the program name) to query().
if __name__ == '__main__':
    query(sys.argv[1:])
|
|
|