| |
|
| | from grader import is_equal |
| | import json |
| | import re |
| |
|
| |
|
def get_gold_list(datapath, dataset_name):
    """Extract the gold (reference) answer for every question in a benchmark file.

    Args:
        datapath: Path to a JSONL file; each line is one JSON object whose
            answer field depends on the benchmark (see branches below).
        dataset_name: One of "gsm8k", "math", "minerva_math", "gaokao2023en",
            "olympiadbench", "collegemath".

    Returns:
        A list with one gold answer (str, or None when no boxed answer is
        found for minerva_math) per input line, in file order.

    Raises:
        ValueError: If dataset_name is not a supported benchmark.
    """
    supported = ["gsm8k", "math", "minerva_math", "gaokao2023en", "olympiadbench", "collegemath"]
    # Explicit validation instead of `assert`: asserts are stripped under -O.
    if dataset_name not in supported:
        raise ValueError("unknown dataset_name %r, expected one of %s" % (dataset_name, supported))

    # Hoist pattern compilation out of the per-line loop.
    # Matches the content of \boxed{...}, tolerating up to two levels of
    # nested braces (enough for e.g. \frac{1}{2}).
    boxed_pattern = re.compile(
        r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}", re.DOTALL
    )
    # Strips a single pair of wrapping dollar signs: "$x$" -> "x".
    dollar_pattern = re.compile(r'^\$(.*)\$$')

    gold_list = []
    with open(datapath, "r", encoding="utf-8") as f:
        for line in f:
            item = json.loads(line)

            if dataset_name == "gsm8k":
                # GSM8K marks the final answer after a "#### " separator.
                gold = item['answer'].split("#### ")[-1]

            elif dataset_name == "math":
                gold = item['answer']

            elif dataset_name == "minerva_math":
                # Take the LAST \boxed{...} in the worked solution; None when
                # the solution has no boxed answer at all.
                matches = boxed_pattern.findall(item['solution'])
                gold = matches[-1] if matches else None

            elif dataset_name == "gaokao2023en":
                gold = dollar_pattern.sub(r'\1', item['answer'])

            elif dataset_name == "olympiadbench":
                # olympiadbench stores answers as a list; use the first entry.
                gold = dollar_pattern.sub(r'\1', item['final_answer'][0])

            else:  # collegemath — the only remaining validated name
                gold = dollar_pattern.sub(r'\1', item['answer'])

            gold_list.append(gold)

    return gold_list
| |
|
| |
|
def get_scores_on_math_benchmarks(model_output_path, test_gold_path, dataset_name):
    """Score a model's outputs against a benchmark's gold answers and print accuracy."""
    # Reference answers, one per benchmark question, in file order.
    gold_list = get_gold_list(test_gold_path, dataset_name)

    """TODO
    Get the output_list from model_output_path
    output_list is a list of string (List[str])
    Each string represents the model's response for a corresponding question in the benchmark
    Therefore, the length of output_list must match the length of gold_list.

    output_list = ...
    """

    # Count responses that the grader judges equivalent to their gold answer.
    correct = sum(
        1
        for output, gold in zip(output_list, gold_list)
        if is_equal(output, gold, dataset_name)
    )

    print("accuracy on %s is %.4f" % (dataset_name, correct / len(gold_list)))
| |
|
| |
|
if __name__ == "__main__":
    """TODO
    Download test benchmarks from Qwen2.5-Math
    https://github.com/QwenLM/Qwen2.5-Math/tree/main/evaluation/data

    Prepare model_output_path and test_gold_path for each dataset
    """

    # Fill in these placeholders before running.
    dataset_name = "DATASET_NAME"
    test_gold_path = "PATH_OF_THE_BENCHMARK"
    model_output_path = "PATH_OF_YOUR_MODEL_OUTPUTS"

    get_scores_on_math_benchmarks(model_output_path, test_gold_path, dataset_name)
| |
|