# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
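"""Deploy a NeMo checkpoint behind an OpenAI-compatible server and collect benchmark responses.

The script launches NeMo's in-framework deployment server for the given
checkpoint, sends every multiple-choice problem from the selected dataset as
a chat completion request, and writes one JSON result per line to an output
JSONL file.

Example invocation (the path is a placeholder; the dataset JSONL files are
expected in the current working directory):

    python deploy_and_get_responses.py --checkpoint_path <path> --dataset gpqa_main
"""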
import argparse
import json
import signal
import subprocess
import requests
from nemo.collections.llm.evaluation.base import wait_for_fastapi_server
from nemo.utils import logging

logging.setLevel(logging.INFO)

# Globals populated by load_model() and used by get_response() and by the
# cleanup logic in main().
deploy_process = None
base_url = None
chat_url = None
model_name = None


def parse_args():
parser = argparse.ArgumentParser(description='Evaluate model on benchmark dataset')
parser.add_argument('--checkpoint_path', type=str, required=True, help='Path to the model checkpoint')
parser.add_argument(
'--dataset',
type=str,
required=True,
choices=['gpqa_main', 'mmlu', 'gpqa_diamond'],
        help='Dataset to evaluate on (gpqa_main, mmlu, or gpqa_diamond)',
)
parser.add_argument(
'--output_prefix', type=str, default='evaluation_results', help='Prefix for the output file name'
)
parser.add_argument(
'--max_tokens', type=int, default=2048, help='Maximum number of tokens to generate in the response'
)
    return parser.parse_args()


def create_benchmark_prompt(question, choice1, choice2, choice3, choice4):
"""Create benchmark prompt in the specified format"""
prompt = f"""Given the following question and four candidate answers (A, B, C and D), choose the best answer.
Question: {question} A. {choice1} B. {choice2} C. {choice3} D. {choice4}
For simple problems, directly provide the answer with minimal explanation. For complex problems, use step-by-step format. Always conclude with: The final answer is [the_answer_letter], where the [the_answer_letter] is one of A, B, C or D."""
return prompt
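

# The prompt above asks the model to conclude with "The final answer is X".
# A minimal sketch of how a scoring step could recover that letter from a
# response; this helper is illustrative and is not called by main().
def extract_answer_letter(response_text):
    """Return the last answer letter stated as 'The final answer is X', or None."""
    import re  # local import to keep this illustrative helper self-contained

    matches = re.findall(r'The final answer is\s*\[?([ABCD])\]?', response_text)
    return matches[-1] if matches else None
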
def load_model(checkpoint_path):
    """Launch the in-framework deployment server and wait until it is ready."""
    global deploy_process, base_url, chat_url, model_name
    SCRIPTS_PATH = "/opt/NeMo/scripts"
    deploy_script = f"{SCRIPTS_PATH}/deploy/nlp/deploy_in_fw_oai_server_eval.py"
    # Launch the deployment script in a subprocess; it serves the checkpoint
    # behind an OpenAI-compatible FastAPI endpoint.
    deploy_process = subprocess.Popen(
        ['python', deploy_script, '--nemo_checkpoint', checkpoint_path],
    )
    base_url = "http://0.0.0.0:8886"
    model_name = "triton_model"
    chat_url = f"{base_url}/v1/chat/completions/"
    # Block until the server responds, retrying up to 600 times at 10 s intervals.
    wait_for_fastapi_server(base_url=base_url, max_retries=600, retry_interval=10)
    logging.info("Model loaded and server is ready for inference")


def get_response(prompt, max_tokens):
    """Send a chat completion request and return the raw JSON response body."""
    chat_payload = {
        # The system message asks the model to use its detailed reasoning
        # ("thinking") mode.
        "messages": [{"role": "system", "content": "detailed thinking on"}, {"role": "user", "content": prompt}],
        "model": model_name,
        "max_tokens": max_tokens,
    }
    response = requests.post(chat_url, json=chat_payload)
    return response.content.decode()
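

# get_response() returns the raw JSON body. A minimal sketch of pulling out the
# assistant message text, assuming the server follows the OpenAI chat completion
# schema ("choices" -> "message" -> "content"). main() stores the raw response,
# so this helper is illustrative only.
def extract_message_content(raw_response):
    """Parse a chat completion JSON body and return the assistant text."""
    try:
        return json.loads(raw_response)["choices"][0]["message"]["content"]
    except (json.JSONDecodeError, KeyError, IndexError):
        # Fall back to the raw body if the schema differs from the assumption.
        return raw_response
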
def main():
args = parse_args()
# Determine dataset file and output file based on dataset selection
dataset_files = {
'gpqa_main': 'gpqa_dataset.jsonl',
'mmlu': 'mmlu_dataset_test.jsonl',
'gpqa_diamond': 'gpqa_diamond_dataset.jsonl',
}
dataset_file = dataset_files[args.dataset]
output_file = f"{args.output_prefix}_{args.dataset}_evaluation.jsonl"
try:
with open(dataset_file, "r") as f:
problems = [json.loads(line) for line in f]
load_model(args.checkpoint_path)
# Open output file once before the loop
with open(output_file, "w") as f:
for i, problem in enumerate(problems):
print(f"\n{'='*70}")
print(f"Problem {i+1}/{len(problems)}")
prompt = create_benchmark_prompt(
problem['Question'],
problem['Choice 1'],
problem['Choice 2'],
problem['Choice 3'],
problem['Choice 4'],
)
response = get_response(prompt, args.max_tokens)
# Create result entry
result = {
"question": problem['Question'],
"choices": {
"A": problem['Choice 1'],
"B": problem['Choice 2'],
"C": problem['Choice 3'],
"D": problem['Choice 4'],
},
"expected_answer": problem['Answer'],
"model_response": response,
}
# Write to JSONL file
f.write(json.dumps(result) + "\n")
print(f"All results written to {output_file}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Shut down the deployment server if it was started.
        if deploy_process is not None:
            print("Killing the server...")
            deploy_process.send_signal(signal.SIGINT)


if __name__ == "__main__":
main()