|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
|
import csv |
|
|
import json |
|
|
import re |
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
def extract_model_answer(response):
    """Pull the letter answer (A-D) out of a model's free-text response.

    Args:
        response: The raw model response text (may be None or empty).

    Returns:
        The single answer letter ("A"-"D") if the response contains the
        canonical phrase "The final answer is X" (optionally bolded as
        "**X**"); the sentinel string "Internal Server Error" for empty or
        errored responses; "" when no answer phrase is found.
    """
    # Empty/None responses and server errors are collapsed into one sentinel
    # that downstream reporting counts as a refusal.
    if not response or "Internal Server Error" in response:
        return "Internal Server Error"

    # \** tolerates markdown bold around the letter ("**A**") while still
    # matching the plain, unformatted phrase exactly as before.
    match = re.search(r"The final answer is \**([A-D])\**", response)
    if match:
        return match.group(1)
    return ""
|
|
|
|
|
|
|
|
def process_answers(input_file, output_file):
    """Convert a JSONL file of model responses into a flat CSV summary.

    Args:
        input_file: Path to a JSONL file; each non-blank line is a JSON object
            with keys 'question', 'choices' (dict of A-D), 'expected_answer',
            and 'model_response'.
        output_file: Path of the CSV file to write.

    Returns:
        The output_file path, for convenient chaining.
    """
    # Read one JSON object per non-blank line (JSONL). encoding is pinned to
    # UTF-8 so results do not depend on the platform's locale default.
    data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))

    headers = [
        'Question',
        'Choice A',
        'Choice B',
        'Choice C',
        'Choice D',
        'Expected Answer',
        'Model Response',
        'Extracted Model Answer',
    ]

    # newline='' is required by the csv module to avoid blank rows on Windows.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)

        for question_data in data:
            choices = question_data.get('choices', {})
            model_response = question_data.get('model_response', '')

            writer.writerow([
                question_data.get('question', ''),
                choices.get('A', ''),
                choices.get('B', ''),
                choices.get('C', ''),
                choices.get('D', ''),
                question_data.get('expected_answer', ''),
                model_response,
                extract_model_answer(model_response),
            ])

    return output_file
|
|
|
|
|
|
|
|
def evaluate_results(csv_file, model_name):
    """Load a results CSV and print accuracy statistics to stdout.

    Args:
        csv_file: Path to a CSV with at least the columns
            'Expected Answer' and 'Extracted Model Answer'
            (as produced by process_answers).
        model_name: Label used in the printed report.
    """
    df = pd.read_csv(csv_file)

    total = len(df)
    correct = len(df[df['Extracted Model Answer'] == df['Expected Answer']])
    # astype(str) protects the .str accessor: if every extracted answer is
    # missing, read_csv yields a float64 (all-NaN) column and .str would raise.
    # NaN stringifies to "nan", which never matches the sentinel.
    refusals = len(df[df['Extracted Model Answer'].astype(str).str.contains('Internal Server Error', case=False, na=False)])

    print(f"\nModel: {model_name}")
    print(f"Total problems: {total}")
    print(f"Correct answers: {correct}")
    print(f"Refusals: {refusals}")
    if total:
        print(f"Accuracy: {correct/total*100:.1f}% ({correct}/{total})")
    else:
        # Guard against ZeroDivisionError on an empty results file.
        print("Accuracy: N/A (no problems found)")
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: build the CSV from a JSONL file, then report accuracy."""
    arg_parser = argparse.ArgumentParser(description='Process and evaluate model responses')
    # All three flags share the same shape (required string), so declare them
    # data-driven rather than as three separate calls.
    for flag, help_text in (
        ('--input_file', 'Path to the input JSONL file containing model responses'),
        ('--output_file', 'Path to the output CSV file'),
        ('--model_name', 'Name of the model for reporting results'),
    ):
        arg_parser.add_argument(flag, type=str, required=True, help=help_text)
    args = arg_parser.parse_args()

    # Stage 1: flatten the JSONL responses into a CSV.
    print(f"Processing answers from {args.input_file}...")
    csv_path = process_answers(args.input_file, args.output_file)
    print(f"CSV file has been generated: {csv_path}")

    # Stage 2: score the CSV and print the summary.
    print("\nEvaluating results...")
    evaluate_results(csv_path, args.model_name)
|
|
|
|
|
|
|
|
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
|
|
|