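"""Evaluate a NeMo checkpoint on multiple-choice benchmarks (GPQA, MMLU).

Deploys the checkpoint behind an in-framework OpenAI-compatible server,
sends each benchmark question to the /v1/chat/completions endpoint, and
writes the raw model responses to a JSONL results file.
"""
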
import argparse
import json
import signal
import subprocess

import requests

from nemo.collections.llm.evaluation.base import wait_for_fastapi_server
from nemo.utils import logging

logging.setLevel(logging.INFO)

# Server handles shared across functions; populated by load_model().
deploy_process = None
base_url = None
chat_url = None
model_name = None


def parse_args():
    parser = argparse.ArgumentParser(description='Evaluate model on benchmark dataset')
    parser.add_argument('--checkpoint_path', type=str, required=True, help='Path to the model checkpoint')
    parser.add_argument(
        '--dataset',
        type=str,
        required=True,
        choices=['gpqa_main', 'mmlu', 'gpqa_diamond'],
        help='Dataset to evaluate on (gpqa_main, mmlu, or gpqa_diamond)',
    )
    parser.add_argument(
        '--output_prefix', type=str, default='evaluation_results', help='Prefix for the output file name'
    )
    parser.add_argument(
        '--max_tokens', type=int, default=2048, help='Maximum number of tokens to generate in the response'
    )
    return parser.parse_args()


def create_benchmark_prompt(question, choice1, choice2, choice3, choice4):
    """Create benchmark prompt in the specified format"""
    prompt = f"""Given the following question and four candidate answers (A, B, C and D), choose the best answer.
Question: {question} A. {choice1} B. {choice2} C. {choice3} D. {choice4}
For simple problems, directly provide the answer with minimal explanation. For complex problems, use step-by-step format. Always conclude with: The final answer is [the_answer_letter], where the [the_answer_letter] is one of A, B, C or D."""
    return prompt


def load_model(checkpoint_path):
    """Initialize and load the model for inference"""
    global deploy_process, base_url, chat_url, model_name

    SCRIPTS_PATH = "/opt/NeMo/scripts"

    # Launch the in-framework deployment script as a background process.
    deploy_script = f"{SCRIPTS_PATH}/deploy/nlp/deploy_in_fw_oai_server_eval.py"
    deploy_process = subprocess.Popen(
        ['python', deploy_script, '--nemo_checkpoint', checkpoint_path],
    )

    base_url = "http://0.0.0.0:8886"
    model_name = "triton_model"
    chat_url = f"{base_url}/v1/chat/completions/"

    # Block until the server responds (up to 600 retries, 10 s apart).
    wait_for_fastapi_server(base_url=base_url, max_retries=600, retry_interval=10)
    logging.info("Model loaded and server is ready for inference")


def get_response(prompt, max_tokens):
    """Send a chat completion request and return the raw response body."""
    chat_payload = {
        "messages": [{"role": "system", "content": "detailed thinking on"}, {"role": "user", "content": prompt}],
        "model": model_name,
        "max_tokens": max_tokens,
    }
    response = requests.post(chat_url, json=chat_payload)
    return response.content.decode()


def main():
    args = parse_args()

    dataset_files = {
        'gpqa_main': 'gpqa_dataset.jsonl',
        'mmlu': 'mmlu_dataset_test.jsonl',
        'gpqa_diamond': 'gpqa_diamond_dataset.jsonl',
    }

    dataset_file = dataset_files[args.dataset]
    output_file = f"{args.output_prefix}_{args.dataset}_evaluation.jsonl"

    try:
        with open(dataset_file, "r") as f:
            problems = [json.loads(line) for line in f]

        load_model(args.checkpoint_path)

        with open(output_file, "w") as f:
            for i, problem in enumerate(problems):
                print(f"\n{'='*70}")
                print(f"Problem {i+1}/{len(problems)}")

                prompt = create_benchmark_prompt(
                    problem['Question'],
                    problem['Choice 1'],
                    problem['Choice 2'],
                    problem['Choice 3'],
                    problem['Choice 4'],
                )

                response = get_response(prompt, args.max_tokens)

                result = {
                    "question": problem['Question'],
                    "choices": {
                        "A": problem['Choice 1'],
                        "B": problem['Choice 2'],
                        "C": problem['Choice 3'],
                        "D": problem['Choice 4'],
                    },
                    "expected_answer": problem['Answer'],
                    "model_response": response,
                }

                f.write(json.dumps(result) + "\n")

        print(f"All results written to {output_file}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Killing the server...")
        # Guard against failures that occur before load_model() starts the server.
        if deploy_process is not None:
            deploy_process.send_signal(signal.SIGINT)
            deploy_process.wait()


if __name__ == "__main__":
    main()
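
# Example invocation (script name and paths are illustrative):
#   python evaluate_benchmark.py --checkpoint_path /ckpts/model.nemo --dataset gpqa_main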