# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
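
"""Evaluate a NeMo checkpoint on a multiple-choice benchmark (GPQA or MMLU).

The script deploys the checkpoint behind an OpenAI-compatible FastAPI server,
sends one chat request per problem, and streams results to a JSONL file.

Example invocation (script and checkpoint paths are illustrative):

    python evaluate_benchmark.py --checkpoint_path /ckpts/model --dataset mmlu
"""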

import argparse
import json
import re
import signal
import subprocess

import requests

from nemo.collections.llm.evaluation.base import wait_for_fastapi_server
from nemo.utils import logging

logging.setLevel(logging.INFO)

deploy_process = None
base_url = None
chat_url = None
model_name = None


def parse_args():
    parser = argparse.ArgumentParser(description='Evaluate model on benchmark dataset')
    parser.add_argument('--checkpoint_path', type=str, required=True, help='Path to the model checkpoint')
    parser.add_argument(
        '--dataset',
        type=str,
        required=True,
        choices=['gpqa_main', 'mmlu', 'gpqa_diamond'],
        help='Dataset to evaluate on (gpqa_main, mmlu, gpqa_diamond)',
    )
    parser.add_argument(
        '--output_prefix', type=str, default='evaluation_results', help='Prefix for the output file name'
    )
    parser.add_argument(
        '--max_tokens', type=int, default=2048, help='Maximum number of tokens to generate in the response'
    )
    return parser.parse_args()


def create_benchmark_prompt(question, choice1, choice2, choice3, choice4):
    """Create a multiple-choice benchmark prompt in the specified format."""
    # Explicit concatenation keeps source indentation out of the prompt text.
    prompt = (
        "Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n"
        f"Question: {question} A. {choice1} B. {choice2} C. {choice3} D. {choice4}\n"
        "For simple problems, directly provide the answer with minimal explanation. For complex problems, "
        "use step-by-step format. Always conclude with: The final answer is [the_answer_letter], "
        "where the [the_answer_letter] is one of A, B, C or D."
    )
    return prompt


def load_model(checkpoint_path):
    """Initialize and load the model for inference"""
    global deploy_process, base_url, chat_url, model_name

    SCRIPTS_PATH = "/opt/NeMo/scripts"
    WORKSPACE = "."

    deploy_script = f"{SCRIPTS_PATH}/deploy/nlp/deploy_in_fw_oai_server_eval.py"
    deploy_process = subprocess.Popen(
        ['python', deploy_script, '--nemo_checkpoint', checkpoint_path],
    )

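    # These values must match what deploy_in_fw_oai_server_eval.py serves;
    # port 8886 is assumed here to be that script's default.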
    base_url = "http://0.0.0.0:8886"
    model_name = "triton_model"
    chat_url = f"{base_url}/v1/chat/completions/"

    wait_for_fastapi_server(base_url=base_url, max_retries=600, retry_interval=10)
    logging.info("Model loaded and server is ready for inference")


def get_response(prompt, max_tokens):
    """Send a chat-completion request and return the raw response body."""
    chat_payload = {
        "messages": [{"role": "system", "content": "detailed thinking on"}, {"role": "user", "content": prompt}],
        "model": model_name,
        "max_tokens": max_tokens,
    }
    response = requests.post(chat_url, json=chat_payload)
    return response.content.decode()

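
# A minimal post-processing sketch (not wired into the pipeline above): it
# assumes the server returns an OpenAI-style chat-completions JSON payload and
# that the model follows the prompt's "The final answer is <letter>" format.
def extract_answer_letter(raw_response):
    """Return the predicted letter (A-D) from a raw response, or None."""
    try:
        payload = json.loads(raw_response)
        content = payload["choices"][0]["message"]["content"]
    except (json.JSONDecodeError, KeyError, IndexError, TypeError):
        # Fall back to scanning the raw body if the schema assumption fails.
        content = raw_response
    match = re.search(r"final answer is \[?([ABCD])\]?", content, re.IGNORECASE)
    return match.group(1).upper() if match else None
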

def main():
    args = parse_args()

    # Determine dataset file and output file based on dataset selection
    dataset_files = {
        'gpqa_main': 'gpqa_dataset.jsonl',
        'mmlu': 'mmlu_dataset_test.jsonl',
        'gpqa_diamond': 'gpqa_diamond_dataset.jsonl',
    }

    dataset_file = dataset_files[args.dataset]
    output_file = f"{args.output_prefix}_{args.dataset}_evaluation.jsonl"

    try:
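        # Each dataset line is expected to be a JSON object with the keys
        # consumed below, e.g. (values illustrative):
        #   {"Question": "...", "Choice 1": "...", "Choice 2": "...",
        #    "Choice 3": "...", "Choice 4": "...", "Answer": "A"}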
        with open(dataset_file, "r") as f:
            problems = [json.loads(line) for line in f]

        load_model(args.checkpoint_path)

        # Open output file once before the loop
        with open(output_file, "w") as f:
            for i, problem in enumerate(problems):
                print(f"\n{'='*70}")
                print(f"Problem {i+1}/{len(problems)}")

                prompt = create_benchmark_prompt(
                    problem['Question'],
                    problem['Choice 1'],
                    problem['Choice 2'],
                    problem['Choice 3'],
                    problem['Choice 4'],
                )

                response = get_response(prompt, args.max_tokens)

                # Create result entry
                result = {
                    "question": problem['Question'],
                    "choices": {
                        "A": problem['Choice 1'],
                        "B": problem['Choice 2'],
                        "C": problem['Choice 3'],
                        "D": problem['Choice 4'],
                    },
                    "expected_answer": problem['Answer'],
                    "model_response": response,
                }

                # Write to JSONL file
                f.write(json.dumps(result) + "\n")

            print(f"All results written to {output_file}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Guard against failures before the server process was started.
        if deploy_process is not None:
            print("Killing the server...")
            deploy_process.send_signal(signal.SIGINT)


if __name__ == "__main__":
    main()