# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Convert a JSONL file of model responses to CSV and report accuracy."""

import argparse
import csv
import json
import re

import pandas as pd

# Marker returned for missing responses or responses that carry a server error;
# evaluate_results() counts rows holding this marker as refusals.
_SERVER_ERROR = "Internal Server Error"

# Compiled once: the answer letter that follows the fixed phrase.
_FINAL_ANSWER_RE = re.compile(r"The final answer is ([A-D])")

_CSV_HEADERS = [
    'Question',
    'Choice A',
    'Choice B',
    'Choice C',
    'Choice D',
    'Expected Answer',
    'Model Response',
    'Extracted Model Answer',
]


def extract_model_answer(response):
    """Return the answer letter stated in *response*.

    Args:
        response: Raw model response text; may be ``None`` or empty.

    Returns:
        ``"Internal Server Error"`` if the response is empty/``None`` or
        contains that phrase; otherwise the letter (A-D) of the first
        ``"The final answer is X"`` occurrence; otherwise ``""``.
    """
    if not response or _SERVER_ERROR in response:
        return _SERVER_ERROR

    match = _FINAL_ANSWER_RE.search(response)
    return match.group(1) if match else ""


def process_answers(input_file: str, output_file: str) -> str:
    """Write one CSV row per JSONL record, including the extracted answer.

    Each record is read with ``.get`` for the keys ``question``, ``choices``
    (a mapping with keys A-D), ``expected_answer`` and ``model_response``;
    missing keys become empty strings.

    Args:
        input_file: Path to the JSONL file with model responses.
        output_file: Path of the CSV file to create (overwritten if present).

    Returns:
        ``output_file``, unchanged.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The whole input is parsed before the output is opened, so a malformed
    # line never leaves a truncated CSV behind.
    # UTF-8 is explicit: pandas reads the CSV as UTF-8 later, whereas the
    # platform default encoding may differ.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(_CSV_HEADERS)

        for question_data in data:
            # `or {}` also covers an explicit `"choices": null` in the record.
            choices = question_data.get('choices') or {}
            model_response = question_data.get('model_response', '')

            writer.writerow(
                [
                    question_data.get('question', ''),
                    choices.get('A', ''),
                    choices.get('B', ''),
                    choices.get('C', ''),
                    choices.get('D', ''),
                    question_data.get('expected_answer', ''),
                    model_response,
                    extract_model_answer(model_response),
                ]
            )

    return output_file


def evaluate_results(csv_file, model_name):
    """Print total, correct, refusal and accuracy figures for *csv_file*.

    Args:
        csv_file: Path to a CSV with the columns ``Extracted Model Answer``
            and ``Expected Answer`` (as written by ``process_answers``).
        model_name: Label printed in the report header.
    """
    # Read every cell as a plain string and keep empty cells as "". With the
    # pandas defaults a column in which no answer could be extracted is parsed
    # as float NaN, and the `.str` accessor below then raises AttributeError.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    extracted = df['Extracted Model Answer']
    expected = df['Expected Answer']

    total = len(df)
    # An empty extraction never counts as correct, even if the expected answer
    # is empty as well.
    correct = int(((extracted == expected) & (extracted != '')).sum())
    refusals = int(extracted.str.contains(_SERVER_ERROR, case=False, na=False).sum())
    # Guard against an input with no records (would divide by zero).
    accuracy = correct / total * 100 if total else 0.0

    print(f"\nModel: {model_name}")
    print(f"Total problems: {total}")
    print(f"Correct answers: {correct}")
    print(f"Refusals: {refusals}")
    print(f"Accuracy: {accuracy:.1f}% ({correct}/{total})")


def main():
    """Parse command-line arguments, build the CSV and print the evaluation."""
    parser = argparse.ArgumentParser(description='Process and evaluate model responses')
    parser.add_argument(
        '--input_file', type=str, required=True, help='Path to the input JSONL file containing model responses'
    )
    parser.add_argument('--output_file', type=str, required=True, help='Path to the output CSV file')
    parser.add_argument('--model_name', type=str, required=True, help='Name of the model for reporting results')

    args = parser.parse_args()

    print(f"Processing answers from {args.input_file}...")
    csv_file = process_answers(args.input_file, args.output_file)
    print(f"CSV file has been generated: {csv_file}")

    print("\nEvaluating results...")
    evaluate_results(csv_file, args.model_name)


if __name__ == "__main__":
    main()