diff --git a/build/lib/opencompass/configs/datasets/korbench/korbench_gen.py b/build/lib/opencompass/configs/datasets/korbench/korbench_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..0492922abc9fd9d1af1f92e0f3be76a916d52e30 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/korbench/korbench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .korbench_single_0_shot_gen import korbench_0shot_single_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py b/build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..c69cad5e0ba40068a64b3ed324c78b82225f5b03 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py @@ -0,0 +1,60 @@ +from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] + +korbench_0shot_single_datasets = [] + +for category in categories: + # Prompt template + prompt_template = dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='HUMAN', + prompt='' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' # f-string + ) + ] + ) + ) + + # Reader configuration + reader_cfg = dict( + input_columns=['prompt'], + output_column='answer', + ) + + # Inference configuration + infer_cfg = dict( + prompt_template=prompt_template, + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + # Evaluation configuration + eval_cfg = dict( + evaluator=dict(type=korbenchEvaluator), + pred_role='BOT', + ) + + korbench_dataset = dict( + type=korbenchDataset, + abbr=f'korbench_{category}', + path='opencompass/korbench', + prompt_mode='0_shot', + category=category, + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + ) + + korbench_0shot_single_datasets.append(korbench_dataset) diff --git a/build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py b/build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py new file mode 100644 index 0000000000000000000000000000000000000000..196a797847283c0da8f656ecf3537ffe6ec51eca --- /dev/null +++ b/build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py @@ -0,0 +1,116 @@ +from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{prompt}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +korbench_0shot_single_datasets = [] + +for category in categories: + # Prompt template + prompt_template = dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='HUMAN', + prompt='' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' # f-string + ) + ] + ) + ) + + # Reader configuration + reader_cfg = dict( + input_columns=['prompt'], + output_column='answer', + ) + + # Inference configuration + infer_cfg = dict( + prompt_template=prompt_template, + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), + ) + + # Evaluation configuration + eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=korbenchDataset, + path='opencompass/korbench', + prompt_mode='0_shot', + category=category, + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + # Dataset + korbench_dataset = dict( + type=korbenchDataset, + abbr=f'korbench_{category}', + path='opencompass/korbench', + prompt_mode='0_shot', + category=category, + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + mode='singlescore', + ) + + korbench_0shot_single_datasets.append(korbench_dataset) \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py b/build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..f62dc7a1dfe7f1f97ae7ba01077e9299c2c83b6c --- /dev/null +++ b/build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py @@ -0,0 +1,54 @@ +from opencompass.datasets.korbench.korbench import ( + korbenchDataset, + korbenchEvaluator, +) + +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] + +korbench_3shot_single_datasets = [] + +for category in categories: + # Prompt template + prompt_template = dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='HUMAN', prompt='')], + round=[dict(role='HUMAN', prompt='{prompt}')], # f-string + ), + ) + + # Reader configuration + reader_cfg = dict( + input_columns=['prompt'], + output_column='answer', + ) + + # Inference configuration + infer_cfg = dict( + prompt_template=prompt_template, + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), + ) + + # Evaluation configuration + eval_cfg = dict( + evaluator=dict(type=korbenchEvaluator), + pred_role='BOT', + ) + + korbench_dataset = dict( + type=korbenchDataset, + abbr=f'korbench_{category}', + path='opencompass/korbench', + prompt_mode='3_shot', + category=category, + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + ) + + korbench_3shot_single_datasets.append(korbench_dataset) diff --git a/build/lib/opencompass/configs/datasets/korbench/readme.md b/build/lib/opencompass/configs/datasets/korbench/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..38aba63b27e7051eb15da43ff707719cff3f9615 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/korbench/readme.md @@ -0,0 +1,71 @@ +# KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks + +KOR-Bench is a dataset designed to evaluate large language models (LLMs) on tasks that require reasoning independent of prior knowledge. Created to assess reasoning and planning abilities, KOR-Bench introduces rule-based tasks that minimize the influence of pretrained knowledge, enabling a focused evaluation of intrinsic model capabilities. + +## Overview + +### Purpose + +Large language models, such as GPT-4 and Claude, excel in knowledge-based tasks but face challenges in applying reasoning skills to unfamiliar scenarios. KOR-Bench is built to evaluate such reasoning capabilities across five categories: +- **Operation**: Arithmetic and logical operations. +- **Logic**: Complex deductive and inductive reasoning. +- **Cipher**: Code-breaking and pattern discovery. +- **Puzzle**: Problem-solving with creative and logical reasoning. +- **Counterfactual**: Hypothetical reasoning in alternate scenarios. + +### Dataset Construction + +KOR-Bench tasks are designed with novel rules and configurations, ensuring no reliance on pretrained knowledge. Each task includes: +- **Rules**: Custom rule sets to guide reasoning. +- **Questions**: Carefully crafted problems that require the application of rules. +- **Evaluation Scenarios**: Zero-shot, three-shot, and subquestion-specific configurations. + +The dataset is structured to assess multistep reasoning, pattern recognition, and adaptability to new rules. + +### Dataset Access + +KOR-Bench is publicly available with detailed usage instructions in the [GitHub Repository](https://github.com/KOR-Bench/KOR-Bench). Download the dataset and leverage predefined evaluation scripts or customize your own. + +### Evaluation + +1. Install dependencies and configure your environment. +2. Run evaluations using `opencompass examples/eval_korbench.py` to assess LLM performance. +3. Analyze model performance across various reasoning tasks. + +### Example Command +```bash +opencompass examples/eval_korbench.py +``` + +## Baselines and Results +KOR-Bench includes baseline results for leading LLMs evaluated across various configurations, including zero-shot (gen) and few-shot modes. Below is a summary of the results. +| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | internlm2_5-1_8b-chat-turbomind | llama-3_1-8b-instruct-turbomind | glm-4-9b-chat-turbomind | gemma-2-9b-it-turbomind | +|---------|---------|--------|------|--------------------------------|---------------------------------|---------------------------------|--------------------------|--------------------------| +| korbench_mixed_Multi-Q | 21f998 | accuracy | gen | 0.60 | 0.20 | 9.60 | 8.70 | 7.80 | +| korbench_mixed_Multi-R | 21f998 | accuracy | gen | 1.70 | 0.10 | 8.80 | 12.10 | 9.80 | +| korbench_mixed_Multi-RQ | 21f998 | accuracy | gen | 1.50 | 0.10 | 6.40 | 8.60 | 6.00 | +| korbench_cipher | 21f998 | accuracy | gen | 8.80 | 0.80 | 14.00 | 6.80 | 6.40 | +| korbench_counterfactual | 21f998 | accuracy | gen | 83.60 | 17.20 | 88.80 | 90.40 | 87.60 | +| korbench_logic | 21f998 | accuracy | gen | 8.40 | 3.60 | 37.60 | 38.80 | 40.80 | +| korbench_operation | 21f998 | accuracy | gen | 56.00 | 25.20 | 68.40 | 63.60 | 67.60 | +| korbench_puzzle | 21f998 | accuracy | gen | 3.60 | 0.00 | 3.20 | 3.20 | 5.60 | +| korbench_cipher | 21f998 | accuracy | fewshot | 8.40 | 3.20 | 9.60 | 9.20 | 9.60 | +| korbench_counterfactual | 21f998 | accuracy | fewshot | 87.60 | 58.00 | 23.60 | 89.60 | 84.40 | +| korbench_logic | 21f998 | accuracy | fewshot | 45.20 | 19.60 | 24.40 | 38.40 | 54.00 | +| korbench_operation | 21f998 | accuracy | fewshot | 24.80 | 11.20 | 73.20 | 67.20 | 23.20 | +| korbench_puzzle | 21f998 | accuracy | fewshot | 4.80 | 2.40 | 1.60 | 3.60 | 6.80 | + +### Citation + +**BibTeX:** +```bibtex +@misc{ma2024korbenchbenchmarkinglanguagemodels, +title={KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks}, +author={Kaijing Ma and Xinrun Du and Yunran Wang and Haoran Zhang and Zhoufutu Wen and Xingwei Qu and Jian Yang and Jiaheng Liu and Minghao Liu and Xiang Yue and Wenhao Huang and Ge Zhang}, +year={2024}, +eprint={2410.06526}, +archivePrefix={arXiv}, +primaryClass={cs.DB}, +url={https://arxiv.org/abs/2410.06526}, +} +``` diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py new file mode 100644 index 0000000000000000000000000000000000000000..b50b6ecc02874e3f214a3ed95bc28e8b44f28421 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py @@ -0,0 +1,166 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + n=5, + k=3 +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, +] diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..b1966fe989d4b80bd06d5d0c8575ce3f2798beb3 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livecodebench_gen_a4f90b import LCB_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py new file mode 100644 index 0000000000000000000000000000000000000000..6f2da11eaa32115bc2dd18310d8d3321b909db93 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py @@ -0,0 +1,164 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py new file mode 100644 index 0000000000000000000000000000000000000000..ceb3514f468d401f1cf1b940bbdd1318707d9067 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py @@ -0,0 +1,164 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py new file mode 100644 index 0000000000000000000000000000000000000000..c2591e759762b5b11707194b1a4c0611b8bb6669 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py @@ -0,0 +1,163 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py new file mode 100644 index 0000000000000000000000000000000000000000..b74e2c76ef8d109d4ab400b0ef7b925a1e35cf1a --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py @@ -0,0 +1,165 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + release_version='release_v4', + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation_v4', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_v4', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, +] diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py new file mode 100644 index 0000000000000000000000000000000000000000..cd65c3ae18d698a0d2ef796af823672f6aad038e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py @@ -0,0 +1,165 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + release_version='release_split_v4', + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation_split_v4', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_split_v4', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, +] diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py new file mode 100644 index 0000000000000000000000000000000000000000..89bd9eb1553c44f145f5a8c6a0da440e47ab7e62 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py @@ -0,0 +1,132 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import (LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator) + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501 + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt=prompt_template)])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict(type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + release_version='release_v5', + start_date='2024-08-01', + end_date='2025-02-01'), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_v5', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt= + 'You are an expert at Python programming, code execution, test case generation, and fuzzing.' # noqa: E501 + ), + ], + round=[dict(role='HUMAN', prompt='{prompt}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict(type=LCBCodeExecutionEvaluator, ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501 + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[dict(role='HUMAN', prompt='{prompt}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_test_output_eval_cfg = dict( + evaluator=dict(type=LCBTestOutputEvaluator, ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py new file mode 100644 index 0000000000000000000000000000000000000000..f2a0d77efd4de95267d9fd0be574e963dd7a2bed --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py @@ -0,0 +1,164 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation_v1', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_v1', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, +] diff --git a/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py new file mode 100644 index 0000000000000000000000000000000000000000..56a96db2b514880be068564b4a11ec8f72ef4ae4 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py @@ -0,0 +1,168 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + release_version='v6', + extractor_version='v2', + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation_repeat_6', + path='opencompass/code_generation_lite', + release_version='v6', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + n=6, +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' + ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/README.md b/build/lib/opencompass/configs/datasets/livemathbench/README.md new file mode 100644 index 0000000000000000000000000000000000000000..24949f204e60bee14cda9826692413d7da38a1aa --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/README.md @@ -0,0 +1,74 @@ +# LiveMathBench + +## v202412 + +### Details of Datsets + +| dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving | +| -- | -- | -- | -- | -- | -- | +| AMC | cn | 0 | 0 | 0 | 46 | +| AMC | en | 0 | 0 | 0 | 46 | +| CCEE | cn | 0 | 0 | 13 | 31 | +| CCEE | en | 0 | 0 | 13 | 31 | +| CNMO | cn | 0 | 0 | 0 | 18 | +| CNMO | en | 0 | 0 | 0 | 18 | +| WLPMC | cn | 0 | 0 | 0 | 11 | +| WLPMC | en | 0 | 0 | 0 | 11 | + + +### How to use + +#### G-Pass@k +```python +from mmengine.config import read_base + +with read_base(): + from opencompass.datasets.livemathbench_gen import livemathbench_datasets + +livemathbench_datasets[0]['eval_cfg']['evaluator'].update( + { + 'model_name': 'Qwen/Qwen2.5-72B-Instruct', + 'url': [ + 'http://0.0.0.0:23333/v1', + '...' + ] # set url of evaluation models + } +) +livemathbench_dataset['infer_cfg']['inferencer'].update(dict( + max_out_len=32768 # for o1-like models you need to update max_out_len +)) + +``` + +#### Greedy +```python +from mmengine.config import read_base + +with read_base(): + from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets + +livemathbench_datasets[0]['eval_cfg']['evaluator'].update( + { + 'model_name': 'Qwen/Qwen2.5-72B-Instruct', + 'url': [ + 'http://0.0.0.0:23333/v1', + '...' + ] # set url of evaluation models + } +) +livemathbench_dataset['infer_cfg']['inferencer'].update(dict( + max_out_len=32768 # for o1-like models you need to update max_out_len +)) + +``` + +### Output Samples + +| dataset | version | metric | mode | Qwen2.5-72B-Instruct | +|----- | ----- | ----- | ----- | -----| +| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx | + diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..4977f1e21e4d5d5a59c0a2328d20c733e26deaf6 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livemathbench_gen_9befbf import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py new file mode 100644 index 0000000000000000000000000000000000000000..4b8ca8fd3923de9f43ecaa3a0e891738375b3959 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py @@ -0,0 +1,49 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_reader_cfg = dict( + input_columns=['prompt'], + output_column='answer' +) + +livemathbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, + max_out_len=16384, + temperature=1.0 + ) +) + +livemathbench_eval_cfg = dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='Qwen/Qwen2.5-72B-Instruct', + url=['http://172.30.40.154:23333/v1/'] #'https://api.openai.com/v1/' + ) +) + +livemathbench_datasets = [ + dict( + type=LiveMathBenchDataset, + abbr='LiveMathBench-k1-n1', + path='opencompass/LiveMathBench202412', + k=1, # K@Pass + n=1, # Run times + reader_cfg=livemathbench_reader_cfg, + infer_cfg=livemathbench_infer_cfg, + eval_cfg=livemathbench_eval_cfg + ) +] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2da4ade8ade0361177f7e4d0cd460861b12317 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='', + k=16, + n=48, + dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'], + dataset_languages=['cn', 'en'], + cot=True, + version='202412', + abbr='LiveMathBench-v202412', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, + max_out_len=8192 + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py new file mode 100644 index 0000000000000000000000000000000000000000..d0f7302322f399718319f3761eb4632816ad670e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py @@ -0,0 +1,49 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_reader_cfg = dict( + input_columns=['prompt'], + output_column='answer' +) + +livemathbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, + max_out_len=2048, + temperature=1.0 + ) +) + +livemathbench_eval_cfg = dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='Qwen/Qwen2.5-72B-Instruct', + url=[] + ) +) + +livemathbench_datasets = [ + dict( + type=LiveMathBenchDataset, + abbr='LiveMathBench', + path='', + k=32, + n=5, + reader_cfg=livemathbench_reader_cfg, + infer_cfg=livemathbench_infer_cfg, + eval_cfg=livemathbench_eval_cfg + ) +] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..c1d72d150146d881a65796bb00abd0c5a20d4a53 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livemathbench_greedy_gen_9befbf import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f66615454242dcb5610a362408a010e764a036 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='', + k=1, + n=1, + dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'], + dataset_languages=['cn', 'en'], + cot=True, + version='202412', + abbr='LiveMathBench-v202412', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, + max_out_len=8192 + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0c421196ff604ef7001d8c47fcb5a688ad523e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py @@ -0,0 +1,120 @@ +""" +Summary: A config for LiveMathBench-Hard-202412 Dataset Evaluation. +Setting: + Shot: 0-shot + Evaluator: + - CascadeEvaluator + - MATHVerifyEvaluator + - GenericLLMEvaluator + Repeat: 32 +Avaliable Models: + - Instruct/Chat Models +""" +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import CustomDataset +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, + MATHVerifyEvaluator, +) + +livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + + +# Inference configuration +livemathbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{question}\nRemember to put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +# Template for the LLM judge +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + + +splits = ['hard_cn', 'hard_en'] +# Dataset configuration +livemathbench_datasets = [ + dict( + type=CustomDataset, + abbr=f'livemathbench_hard_custom_{split}', + path='data/LiveMathBench', + local_mode=True, + file_name=f'202412/{split}.jsonl', + reader_cfg=livemathbench_reader_cfg, + infer_cfg=livemathbench_infer_cfg, + eval_cfg=dict( + # Evaluation configuration using LLM as judge + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='data/LiveMathBench', + local_mode=True, + file_name=f'202412/{split}.jsonl', + reader_cfg=livemathbench_reader_cfg, + ), + judge_cfg={}, + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + parallel=False + ), + ), + n=1, # repeat n times + ) for split in splits +] diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py new file mode 100644 index 0000000000000000000000000000000000000000..1e0852736aea45438cd81fc86e06293fc7c69e52 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py @@ -0,0 +1,96 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import CustomDataset +from opencompass.datasets import generic_llmjudge_postprocess +from itertools import product + +livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + + +# Inference configuration +livemathbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{question}\n', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +# Template for the LLM judge +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + + +splits = ['hard_cn', 'hard_en'] +# Dataset configuration +livemathbench_datasets = [ + dict( + type=CustomDataset, + abbr=f'livemathbench_hard_custom_{split}_run{run_idx}', + path='data/LiveMathBench', + local_mode=True, + file_name=f'202412/{split}.jsonl', + reader_cfg=livemathbench_reader_cfg, + infer_cfg=livemathbench_infer_cfg, + eval_cfg=dict( + # # Evaluation configuration using LLM as judge + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='data/LiveMathBench', + local_mode=True, + file_name=f'202412/{split}.jsonl', + reader_cfg=livemathbench_reader_cfg, + ), + judge_cfg={}, + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ), + ) for split, run_idx in product(splits, range(8)) +] diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py new file mode 100644 index 0000000000000000000000000000000000000000..6b2f9f5a9d09024e9c2492b4c333260c12935603 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='', + k=16, + n=48, + dataset_splits=['hard'], + dataset_languages=['cn', 'en'], + cot=True, + version='202412', + abbr='LiveMathBench-v202412-Hard', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py new file mode 100644 index 0000000000000000000000000000000000000000..f956f83e0d3f4169623a90ce8ebe5f33d098d5c4 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='', + k=1, + n=1, + dataset_splits=['hard'], + dataset_languages=['cn', 'en'], + cot=True, + version='202412', + abbr='LiveMathBench-v202412-Hard', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py new file mode 100644 index 0000000000000000000000000000000000000000..6a51384764ebb14e26925708ea5603436886fcc6 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py @@ -0,0 +1,97 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets.livemathbench import LiveMathBenchDataset +from opencompass.datasets import generic_llmjudge_postprocess + +livemathbench_reader_cfg = dict( + input_columns=['question'], output_column='answer' +) + + +# Inference configuration +livemathbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{question}\n', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +# Template for the LLM judge +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +splits = ['hard'] +livemathbench_datasets = [] +for split in splits: + # Dataset configuration + livemathbench_datasets.append( + dict( + type=LiveMathBenchDataset, + abbr=f'livemathbench_{split}', + path='opencompass/LiveMathBench', + dataset_splits = [split], + dataset_languages= ['cn', 'en'], + reader_cfg=livemathbench_reader_cfg, + infer_cfg=livemathbench_infer_cfg, + eval_cfg=dict( + # # Evaluation configuration using LLM as judge + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=LiveMathBenchDataset, + path='opencompass/LiveMathBench202412', + dataset_splits = [split], + reader_cfg=livemathbench_reader_cfg, + ), + judge_cfg={}, + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ), + ) + ) diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py new file mode 100644 index 0000000000000000000000000000000000000000..1663b0ef52faf6fcb6371e2721164b7cdc649863 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='opencompass/LiveMathBench', + k=16, + n=48, + dataset_splits=['all'], + dataset_languages=['en'], + cot=True, + version='202505', + abbr='LiveMathBench-v202505', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, + max_out_len=8192 + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py new file mode 100644 index 0000000000000000000000000000000000000000..5c801b9613386d0541eb4e501964a6bd1796160f --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='opencompass/LiveMathBench', + k=1, + n=1, + dataset_splits=['all'], + dataset_languages=['en'], + cot=True, + version='202505', + abbr='LiveMathBench-v202505', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, + max_out_len=8192 + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py new file mode 100644 index 0000000000000000000000000000000000000000..917bc92723889dd24d70f6c7fff30a725e54ed6d --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='opencompass/LiveMathBench', + k=16, + n=48, + dataset_splits=['hard'], + dataset_languages=['en'], + cot=True, + version='202505', + abbr='LiveMathBench-v202505-Hard', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py new file mode 100644 index 0000000000000000000000000000000000000000..52e6a8b24790563fbf02fcf8579a00237267a7e8 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='opencompass/LiveMathBench', + k=1, + n=1, + dataset_splits=['hard'], + dataset_languages=['en'], + cot=True, + version='202505', + abbr='LiveMathBench-v202505-Hard', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py b/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f1dd30e53c4c029fef2303551db48c04a25bbe --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livereasonbench_gen_0283c3 import simpleqa_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py b/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py new file mode 100644 index 0000000000000000000000000000000000000000..56ed90d808caacb7cfda82bc517c96d1cd2b0009 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py @@ -0,0 +1,136 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +# from opencompass.datasets import SimpleQADataset, simpleqa_postprocess +from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess + + +GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. +Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. +Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED +Just return the letters "A", "B", or "C", with no text around it. + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. +``` +Question: {question} +Gold target: {answer} +Predicted answer: {prediction} +``` +""".strip() + +livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + +livereasonbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Question: {question}\n'), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=16384)) + +livereasonbench_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dict_postprocessor=dict(type=livereasonbench_postprocess), + ), + pred_role='BOT', +) + +livereasonbench_datasets = [ + dict( + abbr='LiveReasonBench-20241202', + type=LiveReasonBenchDataset, + path='opencompass/LiveReasonBench', + reader_cfg=livereasonbench_reader_cfg, + infer_cfg=livereasonbench_infer_cfg, + eval_cfg=livereasonbench_eval_cfg, + version='livereasonbench-20241202', + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py b/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py new file mode 100644 index 0000000000000000000000000000000000000000..d31f7a9958b13be33995c7c5d7b23f14b09b05c2 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py @@ -0,0 +1,142 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess + + +GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. +Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. +Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED +Just return the letters "A", "B", or "C", with no text around it. + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. +``` +Question: {question} +Gold target: {answer} +Predicted answer: {prediction} +``` +""".strip() + +livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + +livereasonbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Question: {question}\n'), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=16384)) + +livereasonbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=LiveReasonBenchDataset, + path='opencompass/LiveReasonBench', + reader_cfg=livereasonbench_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=livereasonbench_postprocess), + ), + pred_role='BOT', +) + +livereasonbench_datasets = [ + dict( + abbr='LiveReasonBench-20241202', + type=LiveReasonBenchDataset, + path='opencompass/LiveReasonBench', + reader_cfg=livereasonbench_reader_cfg, + infer_cfg=livereasonbench_infer_cfg, + eval_cfg=livereasonbench_eval_cfg, + version='livereasonbench-20241202', + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py b/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py new file mode 100644 index 0000000000000000000000000000000000000000..4cf710969aef85c6b50fbda9409b579dd4a90872 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py @@ -0,0 +1,142 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess + + +GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. +Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. +Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED +Just return the letters "A", "B", or "C", with no text around it. + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. +``` +Question: {question} +Gold target: {answer} +Predicted answer: {prediction} +``` +""".strip() + +livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + +livereasonbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Question: {question}\n'), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +livereasonbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=LiveReasonBenchDataset, + path='opencompass/LiveReasonBench', + reader_cfg=livereasonbench_reader_cfg, + version='livereasonbench-20250428', + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=livereasonbench_postprocess), + ), +) + +livereasonbench_datasets = [ + dict( + abbr='LiveReasonBench-20250428', + type=LiveReasonBenchDataset, + path='opencompass/LiveReasonBench', + reader_cfg=livereasonbench_reader_cfg, + infer_cfg=livereasonbench_infer_cfg, + eval_cfg=livereasonbench_eval_cfg, + version='livereasonbench-20250428', + n=1 + ) +] diff --git a/build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py b/build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py new file mode 100644 index 0000000000000000000000000000000000000000..c0d2d2e99c7b0fbd537de394f19e968c0f78a7b3 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py @@ -0,0 +1,152 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess + + +GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. +Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. +Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED +Just return the letters "A", "B", or "C", with no text around it. + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. +``` +Question: {question} +Gold target: {answer} +Predicted answer: {prediction} +``` +""".strip() + +livereasonbench_subsets = { + 'biology': 'livestembench_bio', + 'chemistry': 'livestembench_che', + 'physics': 'livestembench_phy', +} + +livestembench_datasets = [] + +for name, subset in livereasonbench_subsets.items(): + livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + + livereasonbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='问题: {question}\n 请回答这道问题'), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192)) + + livereasonbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=LiveStemBenchDataset, + path='opencompass/livestembench', + reader_cfg=livereasonbench_reader_cfg, + version=subset, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=livereasonbench_postprocess), + ), + pred_role='BOT', + ) + + livestembench_datasets.append( + dict( + abbr=f'LiveStemBench-{name}', + type=LiveStemBenchDataset, + path='opencompass/livestembench', + reader_cfg=livereasonbench_reader_cfg, + infer_cfg=livereasonbench_infer_cfg, + eval_cfg=livereasonbench_eval_cfg, + version=subset, + mode='singlescore', + ) + ) diff --git a/build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_xml_gen_2e6d10.py b/build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_xml_gen_2e6d10.py new file mode 100644 index 0000000000000000000000000000000000000000..6db0aa787e1646b2faed0fbd47c1531dde3e5d17 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_xml_gen_2e6d10.py @@ -0,0 +1,155 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess +from opencompass.utils import xml_tag_postprocessor + + +GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. +Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. +Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED +Just return the letters "A", "B", or "C", with no text around it. + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. +``` +Question: {question} +Gold target: {answer} +Predicted answer: {prediction} +``` +""".strip() + +livereasonbench_subsets = { + 'biology': 'livestembench_bio', + 'chemistry': 'livestembench_che', + 'physics': 'livestembench_phy', +} + +livestembench_datasets = [] + +for name, subset in livereasonbench_subsets.items(): + livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + + livereasonbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='问题: {question}\n 请回答这道问题'), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192)) + + livereasonbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=LiveStemBenchDataset, + path='opencompass/livestembench', + reader_cfg=livereasonbench_reader_cfg, + version=subset, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=livereasonbench_postprocess), + pred_postprocessor=dict(type=xml_tag_postprocessor, tag=''), + + ), + pred_role='BOT', + ) + + livestembench_datasets.append( + dict( + abbr=f'LiveStemBench-{name}', + type=LiveStemBenchDataset, + path='opencompass/livestembench', + reader_cfg=livereasonbench_reader_cfg, + infer_cfg=livereasonbench_infer_cfg, + eval_cfg=livereasonbench_eval_cfg, + version=subset, + mode='singlescore', + ) + ) diff --git a/build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py b/build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..0269471b69b2dd18e7133960e54c302d05604fec --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livestembench_gen_3e3c50 import livestembench_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py b/build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py new file mode 100644 index 0000000000000000000000000000000000000000..0ca0eaec24d10e78f536ac2e230be752586ebbee --- /dev/null +++ b/build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py @@ -0,0 +1,152 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess + + +GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. +Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. +Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED +Just return the letters "A", "B", or "C", with no text around it. + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. +``` +Question: {question} +Gold target: {answer} +Predicted answer: {prediction} +``` +""".strip() + +livereasonbench_subsets = { + 'biology': 'livestembench_bio', + 'chemistry': 'livestembench_che', + 'physics': 'livestembench_phy', +} + +livestembench_datasets = [] + +for name, subset in livereasonbench_subsets.items(): + livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + + livereasonbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='问题: {question}\n请逐步思考,并给出最终答案,答案放在 \\boxed{{}} 中。'), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192)) + + livereasonbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=LiveStemBenchDataset, + path='opencompass/livestembench', + reader_cfg=livereasonbench_reader_cfg, + version=subset, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=livereasonbench_postprocess), + ), + pred_role='BOT', + ) + + livestembench_datasets.append( + dict( + abbr=f'LiveStemBench-{name}', + type=LiveStemBenchDataset, + path='opencompass/livestembench', + reader_cfg=livereasonbench_reader_cfg, + infer_cfg=livereasonbench_infer_cfg, + eval_cfg=livereasonbench_eval_cfg, + version=subset, + mode='singlescore', + ) + ) diff --git a/build/lib/opencompass/configs/datasets/llm_compression/README.md b/build/lib/opencompass/configs/datasets/llm_compression/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b01bf89b87fda8d61adb8b0336e4095f662bb3d9 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/llm_compression/README.md @@ -0,0 +1,105 @@ +# LLM Compression + +## Introduction + +The following introduction comes from the abstract of [Compression Represents Intelligence Linearly](https://arxiv.org/abs/2404.09937): + +>There is a belief that learning to compress well will lead to intelligence. Recently, language modeling has been shown to be equivalent to compression, which offers a compelling rationale for the success of large language models (LLMs): the development of more advanced language models is essentially enhancing compression which facilitates intelligence. ...our findings suggest that compression efficiency, as an unsupervised metric derived from raw text corpora, serves as a reliable evaluation measure that is linearly associated with the model capabilities. We open-source our compression datasets as well as our data collection pipelines to facilitate future researchers to assess compression properly. + + +## Official Links + +- Paper: [Compression Represents Intelligence Linearly](https://arxiv.org/abs/2404.09937) +- GitHub Repository: [llm-compression-intelligence](https://github.com/hkust-nlp/llm-compression-intelligence) + + +## Overview and Usage + +### Dataset +The dataset, which consists of three external corpora, can be downloaded using the following python script: + +```python +from os import os.path as osp +from datasets import load_dataset + +data_path = "data/llm-compression" + +subset_mapping = { + 'arxiv_math': ['arxiv_math'], + 'commoncraw': ['cc'], + 'python': ['python'], +} + +for key, value in subset_mapping.items(): + llmc_dataset = load_dataset(r"hkust-nlp/llm-compression", name=value) + llmc_dataset["test"].to_json(osp.join(data_path, f"{key}.jsonl")) +``` + +Note: Refer to the original [repository](https://github.com/hkust-nlp/llm-compression-intelligence) for more details on data collection and design. + + +### Inference + +The inference stage (`SWCELossInferencer`) consists of the following key steps: + +1. For each candidate model, we obtain the encodings of each sample of the dataset using its tokenizer. +2. Concatenate the encodings of all samples into a single array and construct a PyTorch Dataset. Each item of `__getitem__` is a chunk of the array based on a sliding window. To reproduce results from the original paper, set `block_size=1900` and `stride=512`. +3. For each batch, calculate the cross entropy loss based on model logits and targets. The losses within each batch is reduced to a single loss by summation. +4. Output the losses and `total_chr_num` to `BPCEvaluator` for evaluation. + + +### Evaluation + +`BPCEvaluator`: Using the total loss for each batch and the total number of characters in the original dataset from the inference stage, calculate the Bits per Character (BPC) metric for each model: + +$$ BPC = \frac{TotalCrossEntropyLoss}{TotalCharacterNumber*log(2)} $$ + + +### Summarization + + + +### Config Files + +1. Dataset config: `configs/datasets/llm-compression.py` +2. Evaluation config: `examples/eval_llm_compression.py` + +## Evaluation Results +``` + metric version model commoncraw python arxiv_math average +0 bpc af04af qwen1.5-32b-hf 0.5910 0.2584 0.4080 0.4191 +1 bpc af04af qwen1.5-14b-hf 0.6459 0.2766 0.4310 0.4512 +2 bpc af04af qwen-14b-hf 0.6197 0.2849 0.4498 0.4515 +3 bpc af04af llama-30b-hf 0.5773 0.3212 0.4562 0.4516 +4 bpc af04af llama-2-13b-hf 0.5807 0.3336 0.4752 0.4632 +5 bpc af04af qwen1.5-7b-hf 0.6658 0.2935 0.4500 0.4698 +6 bpc af04af qwen-7b-hf 0.6453 0.3088 0.4830 0.4790 +7 bpc af04af llama-13b-hf 0.6083 0.3555 0.4865 0.4834 +8 bpc af04af llama-2-7b-hf 0.6117 0.3536 0.4995 0.4883 +9 bpc af04af llama-7b-hf 0.6285 0.3794 0.5096 0.5058 +10 bpc af04af qwen1.5-1.8b-hf 0.7448 0.4029 0.5625 0.5701 +11 bpc af04af qwen-1.8b-hf 0.7542 0.4175 0.5842 0.5853 +12 bpc af04af qwen1.5-0.5b-hf 0.8102 0.4520 0.6181 0.6268 +``` + + +## FAQ + +### I am getting this warning during inference, should I truncate long samples to `max_seq_len` to avoid further errors? +``` +Token indices sequence length is longer than the specified maximum sequence length for this model. Running this sequence through the model will result in indexing errors +``` +>A: This warning comes from the tokenizer indicating that the input sequence length exceeds the model's input length, but it does not affect the operation of the tokenizer. For loss calculation, as long as we set a `block_size` of the sliding window less than `max_seq_len`, we can safely ignore this warning. + + +## Reference +``` +@misc{huang2024compression, + title={Compression Represents Intelligence Linearly}, + author={Yuzhen Huang and Jinghan Zhang and Zifei Shan and Junxian He}, + year={2024}, + eprint={2404.09937}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py b/build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py new file mode 100644 index 0000000000000000000000000000000000000000..24681c01f375c15a910360afaf8799a325e99c3b --- /dev/null +++ b/build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py @@ -0,0 +1,50 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import SWCELossInferencer +from opencompass.openicl.icl_evaluator import BPCEvaluator +from opencompass.datasets import LLMCompressionDataset + + +# The three corpora for llm_compression used in the original paper +# See configs/datasets/llm_compression/README.md for more details +subset_mapping = { + 'arxiv_math': ['arxiv_math'], + 'commoncraw': ['cc'], + 'python': ['python'], +} + + +# Build LLM Compression datasets +llm_compression_datasets = [] +for _name in subset_mapping.keys(): + llm_cmp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{content}', + ), + # No in-context example, using ZeroRetriever + retriever=dict(type=ZeroRetriever), + # Calculates cross entropy loss for each batch based on a sliding context window + # Setting block_size=1900 and stride=512 according to the original paper + inferencer=dict(type=SWCELossInferencer, block_size=1900, stride=512), + ) + + # Calculates Bits per Character (BPC) based on the CE loss from the inference stage + llm_cmp_eval_cfg = dict(evaluator=dict(type=BPCEvaluator)) + + llm_compression_datasets.append( + dict( + abbr=f'llm_compression-{_name}', + type=LLMCompressionDataset, + path='./data/llm-compression', + name=_name, + samples=None, # Set small samples for testing + reader_cfg=dict( + input_columns=['content'], + output_column=None, + ), + infer_cfg=llm_cmp_infer_cfg, + eval_cfg=llm_cmp_eval_cfg, + )) + +del _name diff --git a/build/lib/opencompass/configs/datasets/longbench/longbench.py b/build/lib/opencompass/configs/datasets/longbench/longbench.py new file mode 100644 index 0000000000000000000000000000000000000000..12ee0489ac88982ec9db04738b603a398f58fbb4 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/longbench/longbench.py @@ -0,0 +1,26 @@ +from mmengine.config import read_base + +with read_base(): + from .longbench2wikimqa.longbench_2wikimqa_gen import LongBench_2wikimqa_datasets + from .longbenchhotpotqa.longbench_hotpotqa_gen import LongBench_hotpotqa_datasets + from .longbenchmusique.longbench_musique_gen import LongBench_musique_datasets + from .longbenchmultifieldqa_en.longbench_multifieldqa_en_gen import LongBench_multifieldqa_en_datasets + from .longbenchmultifieldqa_zh.longbench_multifieldqa_zh_gen import LongBench_multifieldqa_zh_datasets + from .longbenchnarrativeqa.longbench_narrativeqa_gen import LongBench_narrativeqa_datasets + from .longbenchqasper.longbench_qasper_gen import LongBench_qasper_datasets + from .longbenchtriviaqa.longbench_triviaqa_gen import LongBench_triviaqa_datasets + from .longbenchgov_report.longbench_gov_report_gen import LongBench_gov_report_datasets + from .longbenchqmsum.longbench_qmsum_gen import LongBench_qmsum_datasets + from .longbenchvcsum.longbench_vcsum_gen import LongBench_vcsum_datasets + from .longbenchdureader.longbench_dureader_gen import LongBench_dureader_datasets + from .longbenchlcc.longbench_lcc_gen import LongBench_lcc_datasets + from .longbenchrepobench.longbench_repobench_gen import LongBench_repobench_datasets + from .longbenchpassage_retrieval_en.longbench_passage_retrieval_en_gen import LongBench_passage_retrieval_en_datasets + from .longbenchpassage_retrieval_zh.longbench_passage_retrieval_zh_gen import LongBench_passage_retrieval_zh_datasets + from .longbenchpassage_count.longbench_passage_count_gen import LongBench_passage_count_datasets + from .longbenchtrec.longbench_trec_gen import LongBench_trec_datasets + from .longbenchlsht.longbench_lsht_gen import LongBench_lsht_datasets + from .longbenchmulti_news.longbench_multi_news_gen import LongBench_multi_news_datasets + from .longbenchsamsum.longbench_samsum_gen import LongBench_samsum_datasets + +longbench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py b/build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..b798d5f7a4d47b2c698d8b7e8f1b438f124bcf95 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .longbenchv2_gen_75fbba import LongBenchv2_datasets diff --git a/build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py b/build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py new file mode 100644 index 0000000000000000000000000000000000000000..4f2e214c7fba76f3c225fb26bff4f45dd05a966e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import LongBenchv2Dataset, LongBenchv2Evaluator +from opencompass.utils.text_postprocessors import first_option_postprocess + +LongBenchv2_reader_cfg = dict( + input_columns=['context', 'question', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'difficulty', 'length'], + output_column='answer', +) + +LongBenchv2_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='Please read the following text and answer the questions below.\n \n {context} \n \n \n What is the correct answer to this question: {question} \n \n Choices: \n (A) {choice_A} \n (B) {choice_B} \n (C) {choice_C} \n (D) {choice_D} \n Let’s think step by step. Based on the above, what is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)', + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +LongBenchv2_eval_cfg = dict( + evaluator=dict(type=LongBenchv2Evaluator), + pred_role='BOT', + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') +) + +LongBenchv2_datasets = [ + dict( + type=LongBenchv2Dataset, + abbr='LongBenchv2', + path='opencompass/longbenchv2', + reader_cfg=LongBenchv2_reader_cfg, + infer_cfg=LongBenchv2_infer_cfg, + eval_cfg=LongBenchv2_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/lveval/lveval.md b/build/lib/opencompass/configs/datasets/lveval/lveval.md new file mode 100644 index 0000000000000000000000000000000000000000..ae48b64ac1f5c6c676a7ce009b9165a2c450667d --- /dev/null +++ b/build/lib/opencompass/configs/datasets/lveval/lveval.md @@ -0,0 +1,165 @@ +# LVEval +## Introduction +The following introduction comes from the introduction in [LVEval](https://github.com/infinigence/LVEval) + +``` +LV-Eval是一个具备5个长度等级(16k、32k、64k、128k和256k)、最大文本测试长度达到256k的长文本评测基准。LV-Eval的平均文本长度达到102,380字,最小/最大文本长度为11,896/387,406字。LV-Eval主要有两类评测任务——单跳QA和多跳QA,共包含11个涵盖中英文的评测数据子集。LV-Eval设计时引入3个关键技术:干扰事实插入(Confusiong Facts Insertion,CFI)提高挑战性,关键词和短语替换(Keyword and Phrase Replacement,KPR)减少信息泄漏,以及基于关键词召回的评测指标(Answer Keywords,AK,指代结合答案关键词和字词黑名单的评价指标)提高评测数值客观性。我们希望LV-Eval为未来长文本大语言模型的研究发展提供有价值的性能参考。 +LV-Eval is a challenging long-context benchmark with five length levels (16k, 32k, 64k, 128k, and 256k) reaching up to 256k words. The average number of words is 102,380, and the Min/Max number of words is 11,896/387,406. LV-Eval features two main tasks, single-hop QA and multi-hop QA, comprising 11 bilingual datasets. The design of LV-Eval has incorporated three key techniques, namely confusing facts insertion (CFI), keyword and phrase replacement (KPR), and keyword-recall-based metrics (AK, short for metics with Answer Keywords and word blacklist) design, which jointly provide a challenging, mitigated-knowledge-leakege, and more accurate evaluation of the long-context capability of LLMs. We anticipate that LV-Eval will serve as a valuable resource for supporting future research on long-context LLMs. +``` + +## Official link + +### Paper + +[_LV_-Eval: A Balanced Long-Context Benchmark with 5 Length Levels Up to 256K](https://arxiv.org/abs/2402.05136) + +### Repository + +[LVEval](https://github.com/infinigence/LVEval) + +## Use cases + +In evaluation scripts, add LVEval dataset as other datasets by using +``` +from .datasets.lveval.lveval import LVEval_datasets as datasets +``` + +## Examples +Input example I (from lic_mixup datasets): +``` +请根据下面给定的文章回答问题,问题和答案只与其中一篇文章有关。 + +文章:......文章 9\n\n标题:腐质酸\n内容:腐植酸是自然界中广泛存在的大分子有机物质,广泛应用于农林牧、石油、化工、建材、医药卫生、环保等各个领域。横跨几十个行业。特别是眼下提倡生态农业建设、无公害农业生产、绿色食品、无污染环保产品等,更使\"腐植酸\"备受推崇,事实证明,人类的生活和生存离不开腐植酸,它的确是一个发展中的有希望的朝阳产业,属于一个新型的特殊行业...... + +请现在基于上述文章回答下面的问题,问题和答案只与其中一篇文章有关。 + +问题:中国的文学受到印度哪些方面的影响? +回答: +``` +Output example I (from chatglm3-6b-32k): +``` +中国文学自印度文学大量吸收营养,在佛教东流之后,从语汇到修辞,从题材到体裁,即便审美取向也深受佛教与印度文学的感染。 +``` +Input example II (from factrecall_zh datasets): +``` +请基于给定的文章回答下述问题。 + +文章:......庚子年间,贝多芬,乃一德裔美籍学士,研究于物理理学。彼其良图,探求相对论、量子力学,尤有大进。质能等价公式 E=mc²,千古独步,声名于当世。诺贝尔物理学奖、以资尊荣,兹矣荣耀之大典。论其学术,涉时空能量,影响深远,以其义非常人,广为当世所知,声名播于天下,实乃现代物理学之奠基者...... + +现在请基于上述文章回答下面的问题。 + +问题:被世人广泛推崇为现代物理学奠基人的科学家叫什么名字? +回答: +``` +Output example II (from chatglm3-6b-32k): +``` +贝多芬 +``` +## Evaluation results + +``` +dataset version metric mode bluelm-7b-chat-32k-hf +----------------------------------------- --------- ------------- ------ ----------------------- +---------------------------------------- - - - - +--------- LVEval All --------- - - - - +---------------------------------------- - - - - +LVEval_qa - naive_average gen 12.00 +---------------------------------------- - - - - +--------- LVEval Tasks All --------- - - - - +---------------------------------------- - - - - +LVEval_single_hop_qa - naive_average gen 15.11 +LVEval_single_hop_cqa - naive_average gen 9.21 +LVEval_multi_hop_qa - naive_average gen 6.99 +LVEval_multi_hop_cqa - naive_average gen 9.90 +LVEval_factrecall_cqa - naive_average gen 21.28 +---------------------------------------- - - - - +--------- LVEval Datasets All --------- - - - - +---------------------------------------- - - - - +LVEval_loogle_SD_mixup - naive_average gen 12.81 +LVEval_cmrc_mixup - naive_average gen 17.41 +LVEval_multifieldqa_en_mixup - naive_average gen 7.10 +LVEval_multifieldqa_zh_mixup - naive_average gen 11.31 +LVEval_dureader_mixup - naive_average gen 13.19 +LVEval_loogle_CR_mixup - naive_average gen 5.17 +LVEval_loogle_MIR_mixup - naive_average gen 2.60 +LVEval_hotpotwikiqa_mixup - naive_average gen 10.20 +LVEval_lic_mixup - naive_average gen 9.60 +LVEval_factrecall_en - naive_average gen 23.67 +LVEval_factrecall_zh - naive_average gen 18.90 +---------------------------------------- - - - - +--------- LVEval Single_Hop QA --------- - - - - +---------------------------------------- - - - - +LVEval_loogle_SD_mixup_16k 83bc25 LVEval_f1 gen 35.05 +LVEval_loogle_SD_mixup_32k 83bc25 LVEval_f1 gen 13.37 +LVEval_loogle_SD_mixup_64k 83bc25 LVEval_f1 gen 6.32 +LVEval_loogle_SD_mixup_128k 83bc25 LVEval_f1 gen 5.28 +LVEval_loogle_SD_mixup_256k 83bc25 LVEval_f1 gen 4.00 +---------------------------------------- - - - - +LVEval_cmrc_mixup_16k 8bac4e LVEval_f1 gen 46.45 +LVEval_cmrc_mixup_32k 8bac4e LVEval_f1 gen 19.41 +LVEval_cmrc_mixup_64k 8bac4e LVEval_f1 gen 11.10 +LVEval_cmrc_mixup_128k 8bac4e LVEval_f1 gen 5.89 +LVEval_cmrc_mixup_256k 8bac4e LVEval_f1 gen 4.22 +---------------------------------------- - - - - +--------- LVEval Single_Hop CQA --------- - - - - +---------------------------------------- - - - - +LVEval_multifieldqa_en_mixup_16k 83bc25 LVEval_f1 gen 12.28 +LVEval_multifieldqa_en_mixup_32k 83bc25 LVEval_f1 gen 4.64 +LVEval_multifieldqa_en_mixup_64k 83bc25 LVEval_f1 gen 8.30 +LVEval_multifieldqa_en_mixup_128k 83bc25 LVEval_f1 gen 5.63 +LVEval_multifieldqa_en_mixup_256k 83bc25 LVEval_f1 gen 4.64 +---------------------------------------- - - - - +LVEval_multifieldqa_zh_mixup_16k ac4a0d LVEval_f1 gen 22.30 +LVEval_multifieldqa_zh_mixup_32k ac4a0d LVEval_f1 gen 17.46 +LVEval_multifieldqa_zh_mixup_64k ac4a0d LVEval_f1 gen 6.27 +LVEval_multifieldqa_zh_mixup_128k ac4a0d LVEval_f1 gen 5.84 +LVEval_multifieldqa_zh_mixup_256k ac4a0d LVEval_f1 gen 4.71 +---------------------------------------- - - - - +--------- LVEval Multi_Hop QA --------- - - - - +---------------------------------------- - - - - +LVEval_dureader_mixup_16k 8bac4e LVEval_rouge gen 18.04 +LVEval_dureader_mixup_32k 8bac4e LVEval_rouge gen 18.33 +LVEval_dureader_mixup_64k 8bac4e LVEval_rouge gen 12.56 +LVEval_dureader_mixup_128k 8bac4e LVEval_rouge gen 10.33 +LVEval_dureader_mixup_256k 8bac4e LVEval_rouge gen 6.69 +---------------------------------------- - - - - +LVEval_loogle_CR_mixup_16k 83bc25 LVEval_f1 gen 9.35 +LVEval_loogle_CR_mixup_32k 83bc25 LVEval_f1 gen 7.42 +LVEval_loogle_CR_mixup_64k 83bc25 LVEval_f1 gen 3.18 +LVEval_loogle_CR_mixup_128k 83bc25 LVEval_f1 gen 2.65 +LVEval_loogle_CR_mixup_256k 83bc25 LVEval_f1 gen 3.27 +---------------------------------------- - - - - +LVEval_loogle_MIR_mixup_16k 83bc25 LVEval_f1 gen 4.50 +LVEval_loogle_MIR_mixup_32k 83bc25 LVEval_f1 gen 3.19 +LVEval_loogle_MIR_mixup_64k 83bc25 LVEval_f1 gen 2.34 +LVEval_loogle_MIR_mixup_128k 83bc25 LVEval_f1 gen 1.76 +LVEval_loogle_MIR_mixup_256k 83bc25 LVEval_f1 gen 1.20 +---------------------------------------- - - - - +--------- LVEval Multi_Hop CQA --------- - - - - +---------------------------------------- - - - - +LVEval_hotpotwikiqa_mixup_16k e3c368 LVEval_f1 gen 19.80 +LVEval_hotpotwikiqa_mixup_32k e3c368 LVEval_f1 gen 12.59 +LVEval_hotpotwikiqa_mixup_64k e3c368 LVEval_f1 gen 7.33 +LVEval_hotpotwikiqa_mixup_128k e3c368 LVEval_f1 gen 7.85 +LVEval_hotpotwikiqa_mixup_256k e3c368 LVEval_f1 gen 3.42 +---------------------------------------- - - - - +LVEval_lic_mixup_16k fdd540 LVEval_f1 gen 21.36 +LVEval_lic_mixup_32k fdd540 LVEval_f1 gen 12.92 +LVEval_lic_mixup_64k fdd540 LVEval_f1 gen 4.62 +LVEval_lic_mixup_128k fdd540 LVEval_f1 gen 4.25 +LVEval_lic_mixup_256k fdd540 LVEval_f1 gen 4.85 +---------------------------------------- - - - - +--------- LVEval Factrecall CQA --------- - - - - +---------------------------------------- - - - - +LVEval_factrecall_en_16k fba966 f1 gen 58.33 +LVEval_factrecall_en_32k fba966 f1 gen 32.17 +LVEval_factrecall_en_64k fba966 f1 gen 15.33 +LVEval_factrecall_en_128k fba966 f1 gen 8.50 +LVEval_factrecall_en_256k fba966 f1 gen 4.00 +---------------------------------------- - - - - +LVEval_factrecall_zh_16k ef3320 f1 gen 20.00 +LVEval_factrecall_zh_32k ef3320 f1 gen 38.00 +LVEval_factrecall_zh_64k ef3320 f1 gen 20.50 +LVEval_factrecall_zh_128k ef3320 f1 gen 11.00 +LVEval_factrecall_zh_256k ef3320 f1 gen 5.00 +``` diff --git a/build/lib/opencompass/configs/datasets/lveval/lveval.py b/build/lib/opencompass/configs/datasets/lveval/lveval.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd9f07d207172d80a92a36726d8cc367f534399 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/lveval/lveval.py @@ -0,0 +1,38 @@ +from mmengine.config import read_base + +with read_base(): + from .lvevalcmrc_mixup.lveval_cmrc_mixup_gen import ( + LVEval_cmrc_mixup_datasets, + ) + from .lvevaldureader_mixup.lveval_dureader_mixup_gen import ( + LVEval_dureader_mixup_datasets, + ) + from .lvevalfactrecall_en.lveval_factrecall_en_gen import ( + LVEval_factrecall_en_datasets, + ) + from .lvevalfactrecall_zh.lveval_factrecall_zh_gen import ( + LVEval_factrecall_zh_datasets, + ) + from .lvevalhotpotwikiqa_mixup.lveval_hotpotwikiqa_mixup_gen import ( + LVEval_hotpotwikiqa_mixup_datasets, + ) + from .lvevallic_mixup.lveval_lic_mixup_gen import LVEval_lic_mixup_datasets + from .lvevalloogle_CR_mixup.lveval_loogle_CR_mixup_gen import ( + LVEval_loogle_CR_mixup_datasets, + ) + from .lvevalloogle_MIR_mixup.lveval_loogle_MIR_mixup_gen import ( + LVEval_loogle_MIR_mixup_datasets, + ) + from .lvevalloogle_SD_mixup.lveval_loogle_SD_mixup_gen import ( + LVEval_loogle_SD_mixup_datasets, + ) + from .lvevalmultifieldqa_en_mixup.lveval_multifieldqa_en_mixup_gen import ( + LVEval_multifieldqa_en_mixup_datasets, + ) + from .lvevalmultifieldqa_zh_mixup.lveval_multifieldqa_zh_mixup_gen import ( + LVEval_multifieldqa_zh_mixup_datasets, + ) + +LVEval_datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets')), [] +) diff --git a/build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py b/build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..8f6ba03a4fc24e1043caae6409124baf749b4a36 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mastermath2024v1_gen_be6318 import mastermath2024v1_datasets diff --git a/build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py b/build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py new file mode 100644 index 0000000000000000000000000000000000000000..150acf5a2d1e23031a102298d33a7d3a9860c63e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MastermathDatasetv1, MastermathDatasetv1Evaluator +from opencompass.utils import first_option_postprocess + +mastermath2024v1_reader_cfg = dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer') + +mastermath2024v1_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\n选项:\n' + '(A){A}\n' + '(B){B}\n' + '(C){C}\n' + '(D){D}\n' + '你的回答格式如下: "正确答案是 (在这里插入你的答案)"'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +mastermath2024v1_eval_cfg = dict(evaluator=dict(type=MastermathDatasetv1Evaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + +mastermath2024v1_datasets = [dict( + abbr='Mastermath2024v1', + type=MastermathDatasetv1, + path='./data/mastermath2024v1/', + name='kaoyan_math_1_mcq_Sheet1.csv', + reader_cfg=mastermath2024v1_reader_cfg, + infer_cfg=mastermath2024v1_infer_cfg, + eval_cfg=mastermath2024v1_eval_cfg)] diff --git a/build/lib/opencompass/configs/datasets/matbench/matbench_gen.py b/build/lib/opencompass/configs/datasets/matbench/matbench_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..8e3e0f572259a8d5ad1714ec3fba3cc4202b83e5 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/matbench/matbench_gen.py @@ -0,0 +1,5 @@ +from mmengine.config import read_base + +with read_base(): + # from .matbench_gen_regex_judge import matbench_datasets # noqa: F401, F403 + from .matbench_llm_judge_gen_0e9276 import matbench_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py b/build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py new file mode 100644 index 0000000000000000000000000000000000000000..8b8a676b4db23a64d2995c4393ea3a7c5b8ad736 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py @@ -0,0 +1,55 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets.matbench.matbench import MatbenchDataset, MatbenchEvaluator_regression, MatbenchEvaluator_classification + + + +matbench_reader_cfg = dict( + input_columns=['problem'], output_column='answer') + + +matbench_tasks = ['matbench_steels','matbench_expt_gap', 'matbench_expt_is_metal','matbench_glass'] + +matbench_datasets = [] + +for task in matbench_tasks: + if task in ['matbench_expt_is_metal','matbench_glass']: + matbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by yes or no, do not output anything else.')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + matbench_eval_cfg = dict( + evaluator=dict(type=MatbenchEvaluator_classification), + pred_role='BOT') + + elif task in ['matbench_steels','matbench_expt_gap']: + matbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by one float number, do not output anything else.')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + + matbench_eval_cfg = dict( + evaluator=dict(type=MatbenchEvaluator_regression), + pred_role='BOT') + + + matbench_datasets.append( + dict( + type=MatbenchDataset, + path=f'opencompass/Matbench', + task=task, + abbr=task, + reader_cfg=matbench_reader_cfg, + infer_cfg=matbench_infer_cfg, + eval_cfg=matbench_eval_cfg)) + diff --git a/build/lib/opencompass/configs/datasets/matbench/matbench_llm_judge_gen_0e9276.py b/build/lib/opencompass/configs/datasets/matbench/matbench_llm_judge_gen_0e9276.py new file mode 100644 index 0000000000000000000000000000000000000000..45bf4206d7687fd294f9b52930378963ac19c915 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/matbench/matbench_llm_judge_gen_0e9276.py @@ -0,0 +1,146 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.matbench.matbench import MatbenchDataset +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets.matbench.post_process import numerical_llmjudge_postprocess +from opencompass.evaluator.generic_llm_evaluator import GenericLLMEvaluator + +JUDGE_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +REGRESSION_JUDGE_TEMPLATE = """ +Please function as a precise data extraction engine. Your sole task is to process the provided information and return a single specific numerical value as a float. + +Follow these strict output rules: + +1. Your output must be a single floating-point number. +2. Do not include any surrounding text, explanations, units, or additional characters. +3. If the provided answer is incomplete or does not yield a determined result, output 0. +4. Return only the numerical value without any reasoning or additional content. +Your specific task is to extract the float result from the following: + +: \n{problem}\n\n\n +: \n{prediction}\n\n\n + +Please provide the final numerical answer as a float: + +""".strip() + +matbench_reader_cfg = dict( + input_columns=['problem'], output_column='answer') + + +matbench_tasks = ['matbench_steels','matbench_expt_gap', 'matbench_expt_is_metal','matbench_glass'] + +matbench_datasets = [] + +for task in matbench_tasks: + if task in ['matbench_glass','matbench_expt_is_metal']: + matbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{problem}}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + matbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, # Use LLM as judge + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=JUDGE_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MatbenchDataset, + path='opencompass/Matbench', + task=task, + reader_cfg=matbench_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ) + + elif task in ['matbench_expt_gap','matbench_steels']: + matbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{problem}}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + matbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, # 使用LLM作为评估器 + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who extracts the answer as float of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=REGRESSION_JUDGE_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MatbenchDataset, + path='opencompass/Matbench', + task=task, + reader_cfg=matbench_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=numerical_llmjudge_postprocess), + ), + ) + + matbench_datasets.append( + dict( + type=MatbenchDataset, + path='opencompass/Matbench', + abbr=task, + task=task, + reader_cfg=matbench_reader_cfg, + infer_cfg=matbench_infer_cfg, + eval_cfg=matbench_eval_cfg)) + diff --git a/build/lib/opencompass/configs/datasets/matbench/matbench_regex_judge_gen_0e9276.py b/build/lib/opencompass/configs/datasets/matbench/matbench_regex_judge_gen_0e9276.py new file mode 100644 index 0000000000000000000000000000000000000000..26ee25fb5774f6a36025dadbfbe6e732c21d34f8 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/matbench/matbench_regex_judge_gen_0e9276.py @@ -0,0 +1,65 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.matbench.matbench import MatbenchDataset, MatbenchEvaluator_regression, MatbenchEvaluator_classification, MatbenchEvaluator_classification_glass + +matbench_reader_cfg = dict( + input_columns=['problem'], output_column='answer') + + +matbench_tasks = ['matbench_steels','matbench_expt_gap', 'matbench_expt_is_metal','matbench_glass'] + + +matbench_datasets = [] + +for task in matbench_tasks: + if task in ['matbench_expt_is_metal']: + matbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{problem}}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + matbench_eval_cfg = dict( + evaluator=dict(type=MatbenchEvaluator_classification), + pred_role='BOT') + + if task in ['matbench_glass']: + matbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{problem}}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + matbench_eval_cfg = dict( + evaluator=dict(type=MatbenchEvaluator_classification_glass), + pred_role='BOT') + + + elif task in ['matbench_expt_gap','matbench_steels']: + matbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[dict(role='HUMAN', prompt=f'{{problem}}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + matbench_eval_cfg = dict( + evaluator=dict(type=MatbenchEvaluator_regression), + pred_role='BOT') + + matbench_datasets.append( + dict( + type=MatbenchDataset, + path='opencompass/Matbench', + abbr=task, + task=task, + reader_cfg=matbench_reader_cfg, + infer_cfg=matbench_infer_cfg, + eval_cfg=matbench_eval_cfg)) + diff --git a/build/lib/opencompass/configs/datasets/math/README.md b/build/lib/opencompass/configs/datasets/math/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c498db344c0883b534e8305ee6a76237cec67dc0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/README.md @@ -0,0 +1,69 @@ +# MATH + +```bash +python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug +python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug +``` + +## Base Models + +| model | math | +|:------------------------:|-------:| +| llama-7b-turbomind | 2.94 | +| llama-13b-turbomind | 3.84 | +| llama-30b-turbomind | 6.54 | +| llama-65b-turbomind | 10.66 | +| llama-2-7b-turbomind | 3.58 | +| llama-2-13b-turbomind | 5.30 | +| llama-2-70b-turbomind | 13.26 | +| llama-3-8b-turbomind | 16.42 | +| llama-3-70b-turbomind | 39.64 | +| internlm2-1.8b-turbomind | 9.42 | +| internlm2-7b-turbomind | 25.16 | +| internlm2-20b-turbomind | 32.24 | +| qwen-1.8b-turbomind | 6.30 | +| qwen-7b-turbomind | 15.56 | +| qwen-14b-turbomind | 30.38 | +| qwen-72b-turbomind | 44.18 | +| qwen1.5-0.5b-hf | 4.16 | +| qwen1.5-1.8b-hf | 11.32 | +| qwen1.5-4b-hf | 17.50 | +| qwen1.5-7b-hf | 17.34 | +| qwen1.5-14b-hf | 36.18 | +| qwen1.5-32b-hf | 45.74 | +| qwen1.5-72b-hf | 41.56 | +| qwen1.5-moe-a2-7b-hf | 27.96 | +| mistral-7b-v0.1-hf | 13.44 | +| mistral-7b-v0.2-hf | 12.74 | +| mixtral-8x7b-v0.1-hf | 29.46 | +| mixtral-8x22b-v0.1-hf | 41.82 | +| yi-6b-hf | 6.60 | +| yi-34b-hf | 18.80 | +| deepseek-7b-base-hf | 4.66 | +| deepseek-67b-base-hf | 18.76 | + +## Chat Models + +| model | math | +|:-----------------------------:|-------:| +| qwen1.5-0.5b-chat-hf | 0.56 | +| qwen1.5-1.8b-chat-hf | 4.94 | +| qwen1.5-4b-chat-hf | 7.34 | +| qwen1.5-7b-chat-hf | 22.14 | +| qwen1.5-14b-chat-hf | 32.22 | +| qwen1.5-32b-chat-hf | 41.80 | +| qwen1.5-72b-chat-hf | 45.22 | +| qwen1.5-110b-chat-hf | 54.38 | +| internlm2-chat-1.8b-hf | 14.06 | +| internlm2-chat-1.8b-sft-hf | 13.10 | +| internlm2-chat-7b-hf | 28.08 | +| internlm2-chat-7b-sft-hf | 27.60 | +| internlm2-chat-20b-hf | 34.68 | +| internlm2-chat-20b-sft-hf | 32.54 | +| llama-3-8b-instruct-hf | 27.50 | +| llama-3-70b-instruct-hf | 47.52 | +| llama-3-8b-instruct-lmdeploy | 27.42 | +| llama-3-70b-instruct-lmdeploy | 46.90 | +| mistral-7b-instruct-v0.1-hf | 8.48 | +| mistral-7b-instruct-v0.2-hf | 10.82 | +| mixtral-8x7b-instruct-v0.1-hf | 27.02 | diff --git a/build/lib/opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py b/build/lib/opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py new file mode 100644 index 0000000000000000000000000000000000000000..8dd3e41e9f16d5665a8f1c666c93eade9c0c34e0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py @@ -0,0 +1,90 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import ( + MATHDataset, MATHAgentEvaluator, math_postprocess_v2 +) +# use pal format but not perform well +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # # ################################### NEW SHOT ################################### + dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify + +def solution(): + x = symbols('x') + expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2) + simplified_expr = simplify(expr) + + x3_coefficient = simplified_expr.as_coefficients_dict()[x**3] + result = x3_coefficient + return result"""), + dict(role='SYSTEM', prompt='Response:26'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'), + dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math + +def solution(): + radius = 6 + + # Surface area of the hemisphere + hemisphere_area = 2 * math.pi * radius**2 + + # Area of the circular base + base_area = math.pi * radius**2 + + # Total surface area + total_surface_area = hemisphere_area + base_area + + # Formatting the result in LaTeX + result = r'{}\pi'.format(total_surface_area / math.pi) + return result"""), + dict(role='SYSTEM', prompt='Response:108.0\\pi'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'), + dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution(): + # Probabilities of each outcome + prime_prob = 1 / 6 + composite_prob = 1 / 3 + otherwise_prob = 1 / 6 + + # Expected value of each outcome + prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob) + composite_expected_value = 0 * composite_prob + otherwise_expected_value = -3 * otherwise_prob + + # Total expected value + total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value + + # Dollar value to the nearest cent + result = "{:.2f}".format(total_expected_value) + return result"""), + dict(role='SYSTEM', prompt='Response:1.17'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'), + dict(role='HUMAN', prompt='{problem}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer), +) + +math_eval_cfg = dict( + evaluator=dict( + type=MATHAgentEvaluator, + version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + abbr='math-agent', + type=MATHDataset, + path='./data/math/math.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/deprecated_math_evaluatorv2_gen_265cce.py b/build/lib/opencompass/configs/datasets/math/deprecated_math_evaluatorv2_gen_265cce.py new file mode 100644 index 0000000000000000000000000000000000000000..5550769d4427a2ac87c7fdf81be58226a2e67bf3 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/deprecated_math_evaluatorv2_gen_265cce.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='./data/math/math.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_0shot_gen_11c4b5.py b/build/lib/opencompass/configs/datasets/math/math_0shot_gen_11c4b5.py new file mode 100644 index 0000000000000000000000000000000000000000..71597c2913036eef9d0843b278bcd10e0418bd88 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_0shot_gen_11c4b5.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_0shot_gen_393424.py b/build/lib/opencompass/configs/datasets/math/math_0shot_gen_393424.py new file mode 100644 index 0000000000000000000000000000000000000000..d2fef53cf734a8f2289638bfea98ff151a405227 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_0shot_gen_393424.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py b/build/lib/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py new file mode 100644 index 0000000000000000000000000000000000000000..eb302c854ae61b674d9611349c9204d8da2a63d0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py @@ -0,0 +1,78 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator +from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess +from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE + +# ----------------------------- Eval Parameters ----------------------------- +## Postprocess function +post_func = 're' # 're', 'xfinder_model', 'naive_model' + +## Evalute function +eval_func = 'naive_model' # 're', 'naive_model' + +## Model api url +xfinder_url = 'http://0.0.0.0:23333/v1' # for 'xFinder-qwen1505' if post_func is 'xfinder_model' +naive_model_name = 'Qwen/Qwen2.5-72B-Instruct' # replace with your model name +naive_model_url = ['http://22.8.6.22:23333/v1', 'http://22.8.67.84:23333/v1', 'http://22.8.72.81:23333/v1', 'http://22.9.42.143:23333/v1'] # Multi-apis for accerlation + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +if post_func == 're': + pred_postprocessor = dict(type=math_postprocess_v2) +elif post_func == 'xfinder_model': + pred_postprocessor = dict( + type=xfinder_postprocess, + question_type='math', + model_name='xFinder-qwen1505', + num_processes=128, + api_url=xfinder_url, + ) +elif post_func == 'naive_model': + pred_postprocessor = dict( + type=naive_model_postprocess, + custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, + model_name=naive_model_name, + num_processes=64, + api_url=naive_model_url, + ) + +if eval_func == 're': + evaluator = dict(type=MATHEvaluator, version='v2') +elif eval_func == 'naive_model': + evaluator = dict( + type=GaoKaoMATHEvaluator, + model_name=naive_model_name, + url=naive_model_url, + ) + +math_eval_cfg = dict( + evaluator=evaluator, pred_postprocessor=pred_postprocessor, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py b/build/lib/opencompass/configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py new file mode 100644 index 0000000000000000000000000000000000000000..b56db5db858f673d683401aab411140f67c2abdc --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_0shot_llm_judge_v2_gen_31d777.py @@ -0,0 +1,51 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, GaoKaoMATHEvaluator + +# ----------------------------- Model Eval Parameters ----------------------------- + +naive_model_name = 'dlc_model' # replace with your model name +naive_model_url = ['http://0.0.0.0:23333/v1'] # Multi-apis for accerlation + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), +) + +evaluator = dict( + type=GaoKaoMATHEvaluator, + model_name=naive_model_name, + url=naive_model_url, + language='en', + with_postprocess=True, + post_url=naive_model_url, + post_model_name=naive_model_name, +) + +math_eval_cfg = dict( + evaluator=evaluator, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py b/build/lib/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py new file mode 100644 index 0000000000000000000000000000000000000000..1e8696798b45fab4c38806ed1a54ca63f9523e46 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +with read_base(): + from .math_4shot_example_from_google_research import prompt + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048, stopping_criteria=['Problem', '问题:'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py b/build/lib/opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py new file mode 100644 index 0000000000000000000000000000000000000000..95dd620ef1e06fef3eabb0ff26d14378d83797be --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +with read_base(): + from .math_4shot_example_from_google_research import prompt + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=['Problem'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_4shot_example_from_google_research.py b/build/lib/opencompass/configs/datasets/math/math_4shot_example_from_google_research.py new file mode 100644 index 0000000000000000000000000000000000000000..80feee446404f4543da2be0b862a4baaf5422010 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_4shot_example_from_google_research.py @@ -0,0 +1,40 @@ +# Solving Quantitative Reasoning Problems with Language Models + +prompt = ''' +Problem: +Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$. + +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. + +Problem: +If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$ + +Solution: +We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? + +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: +\\begin{align*} +30n&=480\\ +\\Rightarrow\\qquad n&=480/30=\\boxed{16} +\\end{align*} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations +\\begin{align*} +6x-4y&=a,\\ +6y-9x &=b. +\\end{align*} +has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero. + +Solution: +If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$ +Final Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct. +'''.strip() diff --git a/build/lib/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py b/build/lib/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py new file mode 100644 index 0000000000000000000000000000000000000000..8c18b47b8af8a470f8c33880dbf6b1704cb665fa --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py @@ -0,0 +1,117 @@ +""" +Summary: A config for AIME-2024 Evaluation. +Setting: + Shot: 0-shot + Evaluator: + - CascadeEvaluator + - MATHVerifyEvaluator + - GenericLLMEvaluator +Avaliable Models: + - Instruct/Chat Models +""" + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, + MATHVerifyEvaluator +) + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +cascade_evaluator = dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator= dict( + dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + n=4, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ) + ), + parallel=False, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr=f'math_prm800k_500', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=dict( + evaluator=cascade_evaluator, + ), + n=1, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_500_gen.py b/build/lib/opencompass/configs/datasets/math/math_500_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..232916fd20a1c603f23bbf99116c9019ae3039be --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_500_gen.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CustomDataset +from opencompass.evaluator import MATHVerifyEvaluator + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +math_eval_cfg = dict( + evaluator=dict(type=MATHVerifyEvaluator), +) + +math_datasets = [ + dict( + type=CustomDataset, + abbr='math-500', + path='opencompass/math', + file_name='test_prm800k_500.jsonl', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_500_llmjudge_gen_6ff468.py b/build/lib/opencompass/configs/datasets/math/math_500_llmjudge_gen_6ff468.py new file mode 100644 index 0000000000000000000000000000000000000000..67d742661da101238b27c31a5671d51c0b652ff6 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_500_llmjudge_gen_6ff468.py @@ -0,0 +1,96 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py b/build/lib/opencompass/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py new file mode 100644 index 0000000000000000000000000000000000000000..996f9c68dd8bda0431699542e15ac82dc8454b8b --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py @@ -0,0 +1,100 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import ( + MATHDataset, MATHAgentEvaluator, math_postprocess_v2 +) + +# use pal format but not perform well +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # # ################################### NEW SHOT ################################### + dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter +Tool Input:```python +from sympy import symbols, simplify + +def solution(): + x = symbols('x') + expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2) + simplified_expr = simplify(expr) + + x3_coefficient = simplified_expr.as_coefficients_dict()[x**3] + result = x3_coefficient + return result +```"""), + dict(role='SYSTEM', prompt='Response:26'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'), + dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter +Tool Input:```python +import math + +def solution(): + radius = 6 + + # Surface area of the hemisphere + hemisphere_area = 2 * math.pi * radius**2 + + # Area of the circular base + base_area = math.pi * radius**2 + + # Total surface area + total_surface_area = hemisphere_area + base_area + + # Formatting the result in LaTeX + result = r'{}\pi'.format(total_surface_area / math.pi) + return result +```"""), + dict(role='SYSTEM', prompt='Response:108.0\\pi'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'), + dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter +Tool Input:```python +def solution(): + # Probabilities of each outcome + prime_prob = 1 / 6 + composite_prob = 1 / 3 + otherwise_prob = 1 / 6 + + # Expected value of each outcome + prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob) + composite_expected_value = 0 * composite_prob + otherwise_expected_value = -3 * otherwise_prob + + # Total expected value + total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value + + # Dollar value to the nearest cent + result = "{:.2f}".format(total_expected_value) + return result +```"""), + dict(role='SYSTEM', prompt='Response:1.17'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'), + dict(role='HUMAN', prompt='{problem}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer), +) + +math_eval_cfg = dict( + evaluator=dict( + type=MATHAgentEvaluator, + version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + abbr='math-agent', + type=MATHDataset, + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_agent_gen_0c1b4e.py b/build/lib/opencompass/configs/datasets/math/math_agent_gen_0c1b4e.py new file mode 100644 index 0000000000000000000000000000000000000000..0abbc0c0f54a569c9b5666cc5877cfaec05a83a1 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_agent_gen_0c1b4e.py @@ -0,0 +1,99 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import ( + MATHDataset, MATHAgentEvaluator, math_postprocess +) + +# use pal format but not perform well +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # # ################################### NEW SHOT ################################### + dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter +Tool Input:```python +from sympy import symbols, simplify + +def solution(): + x = symbols('x') + expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2) + simplified_expr = simplify(expr) + + x3_coefficient = simplified_expr.as_coefficients_dict()[x**3] + result = x3_coefficient + return result +```"""), + dict(role='SYSTEM', prompt='Response:26'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'), + dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter +Tool Input:```python +import math + +def solution(): + radius = 6 + + # Surface area of the hemisphere + hemisphere_area = 2 * math.pi * radius**2 + + # Area of the circular base + base_area = math.pi * radius**2 + + # Total surface area + total_surface_area = hemisphere_area + base_area + + # Formatting the result in LaTeX + result = r'{}\pi'.format(total_surface_area / math.pi) + return result +```"""), + dict(role='SYSTEM', prompt='Response:108.0\\pi'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'), + dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter +Tool Input:```python +def solution(): + # Probabilities of each outcome + prime_prob = 1 / 6 + composite_prob = 1 / 3 + otherwise_prob = 1 / 6 + + # Expected value of each outcome + prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob) + composite_expected_value = 0 * composite_prob + otherwise_expected_value = -3 * otherwise_prob + + # Total expected value + total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value + + # Dollar value to the nearest cent + result = "{:.2f}".format(total_expected_value) + return result +```"""), + dict(role='SYSTEM', prompt='Response:1.17'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'), + dict(role='HUMAN', prompt='{problem}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer), +) + +math_eval_cfg = dict( + evaluator=dict(type=MATHAgentEvaluator), + pred_postprocessor=dict(type=math_postprocess), +) + +math_datasets = [ + dict( + abbr='math-agent', + type=MATHDataset, + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_agent_gen_861b4f.py b/build/lib/opencompass/configs/datasets/math/math_agent_gen_861b4f.py new file mode 100644 index 0000000000000000000000000000000000000000..ad1a7272db810144db0bf177810117fa249a8991 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_agent_gen_861b4f.py @@ -0,0 +1,90 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import ( + MATHDataset, MATHAgentEvaluator, math_postprocess +) + +# use pal format but not perform well +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # # ################################### NEW SHOT ################################### + dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify + +def solution(): + x = symbols('x') + expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2) + simplified_expr = simplify(expr) + + x3_coefficient = simplified_expr.as_coefficients_dict()[x**3] + result = x3_coefficient + return result"""), + dict(role='SYSTEM', prompt='Response:26'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'), + dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math + +def solution(): + radius = 6 + + # Surface area of the hemisphere + hemisphere_area = 2 * math.pi * radius**2 + + # Area of the circular base + base_area = math.pi * radius**2 + + # Total surface area + total_surface_area = hemisphere_area + base_area + + # Formatting the result in LaTeX + result = r'{}\pi'.format(total_surface_area / math.pi) + return result"""), + dict(role='SYSTEM', prompt='Response:108.0\\pi'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'), + dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution(): + # Probabilities of each outcome + prime_prob = 1 / 6 + composite_prob = 1 / 3 + otherwise_prob = 1 / 6 + + # Expected value of each outcome + prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob) + composite_expected_value = 0 * composite_prob + otherwise_expected_value = -3 * otherwise_prob + + # Total expected value + total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value + + # Dollar value to the nearest cent + result = "{:.2f}".format(total_expected_value) + return result"""), + dict(role='SYSTEM', prompt='Response:1.17'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'), + dict(role='HUMAN', prompt='{problem}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer), +) + +math_eval_cfg = dict( + evaluator=dict(type=MATHAgentEvaluator), + pred_postprocessor=dict(type=math_postprocess), +) + +math_datasets = [ + dict( + abbr='math-agent', + type=MATHDataset, + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_agent_gen_af2293.py b/build/lib/opencompass/configs/datasets/math/math_agent_gen_af2293.py new file mode 100644 index 0000000000000000000000000000000000000000..51b3500b70c0fd40d76de61dedb25aefa2c125b9 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_agent_gen_af2293.py @@ -0,0 +1,103 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import ( + MATHDataset, MATHAgentEvaluator, math_postprocess +) + +# use pal format but not perform well +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # # ################################### NEW SHOT ################################### + dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify + +def solution(): + x = symbols('x') + expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2) + simplified_expr = simplify(expr) + + x3_coefficient = simplified_expr.as_coefficients_dict()[x**3] + result = x3_coefficient + return result"""), + dict(role='SYSTEM', prompt='Response:26'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'), + dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math + +def solution(): + radius = 6 + + # Surface area of the hemisphere + hemisphere_area = 2 * math.pi * radius**2 + + # Area of the circular base + base_area = math.pi * radius**2 + + # Total surface area + total_surface_area = hemisphere_area + base_area + + # Formatting the result in LaTeX + result = r'{}\pi'.format(total_surface_area / math.pi) + return result"""), + dict(role='SYSTEM', prompt='Response:108.0\\pi'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'), + dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution(): + # Probabilities of each outcome + prime_prob = 1 / 6 + composite_prob = 1 / 3 + otherwise_prob = 1 / 6 + + # Expected value of each outcome + prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob) + composite_expected_value = 0 * composite_prob + otherwise_expected_value = -3 * otherwise_prob + + # Total expected value + total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value + + # Dollar value to the nearest cent + result = "{:.2f}".format(total_expected_value) + return result"""), + dict(role='SYSTEM', prompt='Response:1.17'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'), + dict(role='HUMAN', prompt='Given $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import numpy as np + +def solution() + a = np.array([-7, 0, 1]) + b = np.array([4, 2, -1]) + + result = a - 3 * b + + result = r'\\begin{{pmatrix}} {} \ {} \ {} \end{{pmatrix}}'.format(result[0], result[1], result[2]) + return result"""), + dict(role='SYSTEM', prompt='Response:\\begin{pmatrix} -19 \\ -6 \\ 4 \\end{pmatrix}'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \\end{pmatrix}$. I hope it is correct.'), + dict(role='HUMAN', prompt='{problem}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer), +) + +math_eval_cfg = dict( + evaluator=dict(type=MATHAgentEvaluator), + pred_postprocessor=dict(type=math_postprocess), +) + +math_datasets = [ + dict( + abbr='math-agent', + type=MATHDataset, + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py b/build/lib/opencompass/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py new file mode 100644 index 0000000000000000000000000000000000000000..6405513ab0fe68cff9cb0c7387a70a9b87f3e60c --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template="""\ +Problem: +Find the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$. +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. + +Problem: +If $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$ +Solution: +We have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero. +Solution: +If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$ +Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct. + +Problem: +{problem} +Solution:""" + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Problem'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py b/build/lib/opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py new file mode 100644 index 0000000000000000000000000000000000000000..6ca42154b318f6196f6aff62d4d894ace29eea69 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen.py b/build/lib/opencompass/configs/datasets/math/math_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..b97669c288ebb506b956838d7bc158f4cfab517e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .math_gen_a58d9d import math_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_0957ff.py b/build/lib/opencompass/configs/datasets/math/math_gen_0957ff.py new file mode 100644 index 0000000000000000000000000000000000000000..c112f03534c0a6c215a8fde2962d150c3b1b1e9e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_0957ff.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{align*} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{16} \end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{align*} 6x-4y&=a,\\\\ 6y-9x &=b. \end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_1ed9c2.py b/build/lib/opencompass/configs/datasets/math/math_gen_1ed9c2.py new file mode 100644 index 0000000000000000000000000000000000000000..a168d511875612bc62d9fae0d61dac18b65e17f0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_1ed9c2.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.\nSolution:'), + dict(role='BOT', prompt='Combine like terms to simplify the expression. The coefficient of $x^3$ is calculated as $$(-3+2\cdot(2+1))+(-5)\cdot(-4))$ = 26$. Thus, the coefficient of $x^3$ is $\\boxed{26}$.\nFinal Answer: The final answer is $26$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nThe surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.\nSolution:'), + dict(role='BOT', prompt='The surface area of a hemisphere (not including the base) is half that of a sphere, so it is $2\pi r^2$. The area of the base is $\pi r^2$. Therefore, for a hemisphere with radius 6 cm, the total surface area is $2\pi (6)^2 + \pi (6)^2 = 108\pi$ square cm.\nFinal Answer: The final answer is $108\pi$ square cm. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nMonica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.\nSolution:'), + dict(role='BOT', prompt='The prime numbers rolled could be 2, 3, or 5, and each has a 1/6 chance of being rolled. The composite number 4 or 6 has a 2/6 chance of being rolled, but it results in $0 win. The remaining non-prime and non-composite number is 1 , and it results in a loss of $3, with a 1/6 chance. So, the expected winnings are $(2+3+5)(1/6)+0(2/6)+(-3)(1/6) = \$1.17$.\nFinal Answer: The final answer is $\$1.17$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nGiven $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$\nSolution:'), + dict(role='BOT', prompt='We find $3 \mathbf{b}$ first, which is $\\begin{pmatrix} 12 \\ 6 \\ -3 \end{pmatrix}$. Then we subtract this vector from $\mathbf{a}$. So, $\mathbf{a} - 3 \mathbf{b} = \\begin{pmatrix} -7 - 12 \\ 0 - 6 \\ 1 - (-3) \end{pmatrix} = \\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}.$\nFinal Answer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_265cce.py b/build/lib/opencompass/configs/datasets/math/math_gen_265cce.py new file mode 100644 index 0000000000000000000000000000000000000000..ef2a6af562a7ffc1b7a02530ed8d4de97b385b4b --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_265cce.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_559593.py b/build/lib/opencompass/configs/datasets/math/math_gen_559593.py new file mode 100644 index 0000000000000000000000000000000000000000..18da00285450af578088c3e7ac2b0fd1b8b46de8 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_559593.py @@ -0,0 +1,53 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='''Problem: +Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}} +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. + +Problem: +If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$ +Solution: +We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero. +Solution: +If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$ +Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct. + +Problem: +{problem} +Solution: +{solution}'''), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_5e8458.py b/build/lib/opencompass/configs/datasets/math/math_gen_5e8458.py new file mode 100644 index 0000000000000000000000000000000000000000..ed9c3e5fbe1d85d8bb39bbdb47aa4264db2bdf0e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_5e8458.py @@ -0,0 +1,53 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='''Problem: +Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}} +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. + +Problem: +If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$ +Solution: +We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero. +Solution: +If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$ +Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct. + +Problem: +{problem}Solution: +{solution}'''), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=dict( + input_columns=['problem'], + output_column='solution', + ), + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_736506.py b/build/lib/opencompass/configs/datasets/math/math_gen_736506.py new file mode 100644 index 0000000000000000000000000000000000000000..6f0b1de747f2d2a9cbde315c03be528c8c741066 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_736506.py @@ -0,0 +1,28 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHInternDataset, MATHInternEvaluator, math_intern_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt="Question: {problem}\nLet's think step by step\nAnswer:") + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHInternEvaluator), pred_postprocessor=dict(type=math_intern_postprocess)) + +math_datasets = [ + dict( + type=MATHInternDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_78ced2.py b/build/lib/opencompass/configs/datasets/math/math_gen_78ced2.py new file mode 100644 index 0000000000000000000000000000000000000000..9088b9758468efc396a82b8739656ad83c996805 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_78ced2.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +QUERY_TEMPLATE = """ +Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem. + +{problem} + +Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command. +""".strip() + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + + template=dict(round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_943d32.py b/build/lib/opencompass/configs/datasets/math/math_gen_943d32.py new file mode 100644 index 0000000000000000000000000000000000000000..ee22ba481fec136e53199814d13dd17679153047 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_943d32.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import MATHDataset, MATHAgentEvaluator, math_postprocess + +# This config is for code interpreter +math_example = """ +Example: + +Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$. +{thought} The domain restrictions are determined by: + +The square root in the numerator must be non-negative. +The square root in the denominator must be positive (because we can't have 0 in the denominator). +The value inside a square root (the radicand) must be non-negative. We can use `sympy` to determine the domain of the expression. +{action} PythonInterpreter +{action_input} +```python +from sympy import symbols, solveset, S, And + +def solution(): + # Define the variable + x = symbols('x') + + # Define the inequalities for the domain based on the expression + inequality1 = x-2 >= 0 # because of the square root in the numerator + inequality2 = 5-x > 0 # because of the square root in the denominator + + # Solve the inequalities + domain1 = solveset(inequality1, x, domain=S.Reals) + domain2 = solveset(inequality2, x, domain=S.Reals) + + # Find the intersection of the two domains + final_domain = domain1.intersect(domain2) + + return final_domain + +``` +{response}'Interval.Ropen(2, 5)' + {thought} Therefore, the domain of the expression is $\\boxed{{[2,5)}}$ +{finish} [2,5) +""" + +math_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='{problem}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer, example=math_example)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHAgentEvaluator), + pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=dict( + input_columns=['problem'], + output_column='solution', + ), + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_gen_a58d9d.py b/build/lib/opencompass/configs/datasets/math/math_gen_a58d9d.py new file mode 100644 index 0000000000000000000000000000000000000000..648ff0a5028bd7f63a512af5bd279eab1757467d --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_gen_a58d9d.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset +from opencompass.evaluator import MATHVerifyEvaluator + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHVerifyEvaluator) +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_intern_evaluator_gen_265cce.py b/build/lib/opencompass/configs/datasets/math/math_intern_evaluator_gen_265cce.py new file mode 100644 index 0000000000000000000000000000000000000000..1a6cbf379ea6f43e87992e76289364183a4602a8 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_intern_evaluator_gen_265cce.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHInternEvaluator, math_intern_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHInternEvaluator), pred_postprocessor=dict(type=math_intern_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_llm_judge_gen.py b/build/lib/opencompass/configs/datasets/math/math_llm_judge_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..fff23ef63702ff93492150685ae1ed9bf0fadbfa --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .math_llm_judge_gen_56606f import math_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/math/math_llm_judge_gen_56606f.py b/build/lib/opencompass/configs/datasets/math/math_llm_judge_gen_56606f.py new file mode 100644 index 0000000000000000000000000000000000000000..f655fa49abfad9441d9b5e2535bdb32e66048a41 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_llm_judge_gen_56606f.py @@ -0,0 +1,85 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt="Question: {problem}\nLet's think step by step\nAnswer:") + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..c23bc13699f81c0790eb4b30a83484aa1a676993 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py @@ -0,0 +1,100 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + MATHDataset, + MATHEvaluator, + math_postprocess_v2, + normalize_final_answer, +) +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_academic_postprocess + + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess), + ), + pred_role='BOT', +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name='test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..0faf86303b28c0883cb868d1290645346f863e43 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + MATHDataset, + MATHEvaluator, + math_postprocess_v2, + normalize_final_answer, +) + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name='test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py new file mode 100644 index 0000000000000000000000000000000000000000..0c2e516e1afa09be1d84a997b5249f110e1c143c --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import MATHVerifyEvaluator +from opencompass.datasets import ( + MATHDataset, + math_postprocess_v2, + normalize_final_answer, +) + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHVerifyEvaluator) +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name='test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py new file mode 100644 index 0000000000000000000000000000000000000000..c1d20a80c906fd79c85043435af93959b1cda81a --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_gen_b27274.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_63a000.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_63a000.py new file mode 100644 index 0000000000000000000000000000000000000000..25014220d7a01a4893f989482d36de9c2352e803 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_63a000.py @@ -0,0 +1,96 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py new file mode 100644 index 0000000000000000000000000000000000000000..67d742661da101238b27c31a5671d51c0b652ff6 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py @@ -0,0 +1,96 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_xml_gen_63a000.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_xml_gen_63a000.py new file mode 100644 index 0000000000000000000000000000000000000000..15892c18b7dcc0550e748fc86b4bf62471ccab55 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_xml_gen_63a000.py @@ -0,0 +1,99 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset +from opencompass.utils import xml_tag_postprocessor + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + pred_postprocessor=dict(type=xml_tag_postprocessor, tag=''), + + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py new file mode 100644 index 0000000000000000000000000000000000000000..3b4b991d71b0286a90cbac789b77b2347de28874 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_llmjudge_gen_63a000.py @@ -0,0 +1,89 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_gen.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..759b1b63a5be0cd2f584ab33f7e1f9cbe5a737bb --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .math_prm800k_500_0shot_cot_gen_11c4b5 import math_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_gen_393424.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_gen_393424.py new file mode 100644 index 0000000000000000000000000000000000000000..1b3bba2331d8a5b2715fa3906ce23bf75a7b1013 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_gen_393424.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..461b3a9a1870624925ef9394f7fb9b0308716d62 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468 import math_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llmverify_gen_6ff468.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llmverify_gen_6ff468.py new file mode 100644 index 0000000000000000000000000000000000000000..78e66452a228cb5177f4dec3c0660130ad119973 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llmverify_gen_6ff468.py @@ -0,0 +1,99 @@ +# CoT: No CoT +# K-Shot: 0-Shot +# Verify: LLM Verify +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py new file mode 100644 index 0000000000000000000000000000000000000000..1ac43b7c16d4a164e15cded1a05daf4e534003c7 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py @@ -0,0 +1,100 @@ +# CoT: No CoT +# K-Shot: 0-Shot +# Verify: LLM Verify +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr=f'math_prm800k_500-llmverify-run{idx}', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) + for idx in range(4) +] diff --git a/build/lib/opencompass/configs/datasets/math401/math401_gen.py b/build/lib/opencompass/configs/datasets/math401/math401_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..a009cd98e570ed90290657183d526a4ab9fc92a8 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math401/math401_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .math401_gen_ab5f39 import math401_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/math401/math401_gen_ab5f39.py b/build/lib/opencompass/configs/datasets/math401/math401_gen_ab5f39.py new file mode 100644 index 0000000000000000000000000000000000000000..47cfe41bc0787df0dbf8c3c8f5f2fc2b9836d53d --- /dev/null +++ b/build/lib/opencompass/configs/datasets/math401/math401_gen_ab5f39.py @@ -0,0 +1,47 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MathBenchDataset, Math401Evaluator, mathbench_postprocess + +cloze_prompt = [ + dict(role='HUMAN', prompt='Q: Calculate 2.9-0.11.'), + dict(role='BOT', prompt='A: Let\'s think step by step, 2.9 - 0.11 equals 2.7900. The answer is 2.7900.\n'), + dict(role='HUMAN', prompt='Q: Calculate 0.15-0.032.'), + dict(role='BOT', prompt='A: Let\'s think step by step, 0.15 - 0.032 equals 0.1180. The answer is 0.1180.\n'), + dict(role='HUMAN', prompt='Q: Calculate 78*64.'), + dict(role='BOT', prompt='A: Let\'s think step by step, 78 multiplied by 64 equals 4992. The answer is 4992.\n'), + dict(role='HUMAN', prompt='Q: Calculate 62×42.'), + dict(role='BOT', prompt='A: Let\'s think step by step, 62 multiplied by 42 equals 2604. The answer is 2604.\n'), + dict(role='HUMAN', prompt='Q: Calculate {question}'), + dict(role='BOT', prompt='A: {answer}\n')] + +math401_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=cloze_prompt, + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +math401_eval_cfg = dict( + evaluator=dict(type=Math401Evaluator), + pred_postprocessor=dict(type=mathbench_postprocess, name='en')) + +math401_datasets = [ + dict( + abbr='math401', + type=MathBenchDataset, + path=f'./data/math401/', + with_circular=False, + name='cloze_en', + reader_cfg=dict( + input_columns=['question'], + output_column='answer' + ), + infer_cfg=math401_infer_cfg, + eval_cfg=math401_eval_cfg, + )] diff --git a/build/lib/opencompass/configs/datasets/mbpp/README.md b/build/lib/opencompass/configs/datasets/mbpp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f2b61115d369facf9e7c47d97c8c822b1d91f78 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/README.md @@ -0,0 +1,69 @@ +# MBPP + +```bash +python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug +python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug +``` + +## Base Models + +| model | pass@1 | pass | timeout | failed | wrong_answer | +|:------------------------:|---------:|-------:|----------:|---------:|---------------:| +| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 | +| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 | +| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 | +| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 | +| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 | +| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 | +| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 | +| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 | +| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 | +| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 | +| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 | +| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 | +| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 | +| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 | +| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 | +| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 | +| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 | +| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 | +| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 | +| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 | +| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 | +| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 | +| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 | +| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 | +| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 | +| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 | +| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 | +| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 | +| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 | +| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 | +| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 | +| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 | + +## Chat Models + +| model | pass@1 | pass | timeout | failed | wrong_answer | +|:-----------------------------:|---------:|-------:|----------:|---------:|---------------:| +| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 | +| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 | +| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 | +| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 | +| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 | +| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 | +| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 | +| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 | +| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 | +| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 | +| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 | +| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 | +| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 | +| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 | +| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 | +| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 | +| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 | +| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 | +| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 | +| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 | +| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 | diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py new file mode 100644 index 0000000000000000000000000000000000000000..697cdf45184dd114efd350f45bdd923acd669933 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='./data/mbpp/mbpp.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_6590b0.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_6590b0.py new file mode 100644 index 0000000000000000000000000000000000000000..d09576a762713ea6fe50b3f7b0508649b7949b25 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_6590b0.py @@ -0,0 +1,28 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n[BEGIN]\n", + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator)) + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='./data/mbpp/mbpp.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_caa7ab.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_caa7ab.py new file mode 100644 index 0000000000000000000000000000000000000000..8ec7133e642ae335a9faaa0c0154658bb769e2ff --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_caa7ab.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n'), + dict(role='BOT', prompt='[BEGIN]\ndef similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)\n[DONE] \n\n '), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n'), + dict(role='BOT', prompt='[BEGIN]\nimport math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result\n[DONE] \n\n '), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n'), + dict(role='BOT', prompt='[BEGIN]\nimport heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums\n[DONE] \n\n '), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='./data/mbpp/mbpp.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py new file mode 100644 index 0000000000000000000000000000000000000000..c6dcbe5ae42679b4ed200b0a7b19bd5fd8a869a2 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDatasetV2, + abbr='mbpp_passk', + path='./data/mbpp/mbpp.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py new file mode 100644 index 0000000000000000000000000000000000000000..99263c9c5eafb805a7ccf55c856b4f6b1375c88f --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py @@ -0,0 +1,45 @@ +# This config is used for pass@k evaluation with dataset repetition +# That model cannot generate multiple response for single input +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDatasetV2, + abbr='mbpp_repeat10', + path='./data/mbpp/mbpp.jsonl', + num_repeats=10, + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_1e1056.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_1e1056.py new file mode 100644 index 0000000000000000000000000000000000000000..922786eb51fe0dc2a6f2b6cd5fcbc806a15150b6 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_1e1056.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n',), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n ",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n',), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n ",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n',), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n ",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n',), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='./data/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_cb43ef.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_cb43ef.py new file mode 100644 index 0000000000000000000000000000000000000000..edd3fe039fa7e18ca6a45052e5934300edf6160e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_cb43ef.py @@ -0,0 +1,81 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='''\ +You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests: + +assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) +assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) +assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) + +[BEGIN] + 'def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return (res)' +[DONE] + + + +You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests: + +assert is_not_prime(2) == False +assert is_not_prime(10) == True +assert is_not_prime(35) == True + +[BEGIN] + 'import math +def is_not_prime(n): + result = False + for i in range(2,int(math.sqrt(n)) + 1): + if n % i == 0: + result = True + return result' +[DONE] + + + +You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests: + +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] + +[BEGIN] + 'import heapq as hq +def heap_queue_largest(nums,n): + largest_nums = hq.nlargest(n, nums) + return largest_nums' +[DONE] + + + +You are an expert Python programmer, and here is your task: {text} Your code should pass these tests: + +{test_list} + +[BEGIN] +''' + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='./data/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_passk_gen_1e1056.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_passk_gen_1e1056.py new file mode 100644 index 0000000000000000000000000000000000000000..f53410fadcb7bba6967f7b0f0aa75e196b58feb0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_passk_gen_1e1056.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp_passk', + path='./data/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_repeat10_gen_1e1056.py b/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_repeat10_gen_1e1056.py new file mode 100644 index 0000000000000000000000000000000000000000..8663739409b84f9adf9fc85336e1e78ba08ef9dc --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_repeat10_gen_1e1056.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp_repeat10', + path='./data/mbpp/sanitized-mbpp.jsonl', + num_repeats=10, + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/mbpp_gen.py b/build/lib/opencompass/configs/datasets/mbpp/mbpp_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..e374ac0628b12d2e2de3d3bd582d78799c123e03 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/mbpp_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mbpp_gen_830460 import mbpp_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/mbpp/mbpp_gen_830460.py b/build/lib/opencompass/configs/datasets/mbpp/mbpp_gen_830460.py new file mode 100644 index 0000000000000000000000000000000000000000..693b7ff300149adbcb4f9896656fd8024ff1aea4 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/mbpp_gen_830460.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='opencompass/mbpp', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/mbpp_passk_gen_830460.py b/build/lib/opencompass/configs/datasets/mbpp/mbpp_passk_gen_830460.py new file mode 100644 index 0000000000000000000000000000000000000000..af5a105782cd6dc81c0a12120b71125579daf98e --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/mbpp_passk_gen_830460.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDatasetV2, + abbr='mbpp_passk', + path='opencompass/mbpp', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py b/build/lib/opencompass/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py new file mode 100644 index 0000000000000000000000000000000000000000..643022e3714197e8dea57f47d73a917d39626423 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py @@ -0,0 +1,45 @@ +# This config is used for pass@k evaluation with dataset repetition +# That model cannot generate multiple response for single input +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDatasetV2, + abbr='mbpp_repeat10', + path='opencompass/mbpp', + num_repeats=10, + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py b/build/lib/opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py new file mode 100644 index 0000000000000000000000000000000000000000..98eee52840a748ef1b6b098a6b64dc745967f12a --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='opencompass/mbpp', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + n=5, + k=3 + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py new file mode 100644 index 0000000000000000000000000000000000000000..fbd24dbb106574522f313e63bae4a4a3bc38e87c --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py @@ -0,0 +1,82 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +prompt = ''' +You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests: + +assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) +assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) +assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) + +[BEGIN] + '\ +def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return (res)\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests: + +assert is_not_prime(2) == False +assert is_not_prime(10) == True +assert is_not_prime(35) == True + +[BEGIN] + '\ +import math +def is_not_prime(n): + result = False + for i in range(2,int(math.sqrt(n)) + 1): + if n % i == 0: + result = True + return result\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests: + +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] + +[BEGIN] + '\ +import heapq as hq +def heap_queue_largest(nums,n): + largest_nums = hq.nlargest(n, nums) + return largest_nums\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: {text} Your code should pass these tests: + +{test_list} + +'''.strip() + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='opencompass/sanitized_mbpp', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py new file mode 100644 index 0000000000000000000000000000000000000000..1ff90404b5f690f936085c85a6bbbaf4dcb3e2dc --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n',), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n ",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n',), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n ",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n',), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n ",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n',), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='opencompass/sanitized_mbpp', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py new file mode 100644 index 0000000000000000000000000000000000000000..4890c3f0bb06d595e6ca856a3a8e405755b27fb8 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py @@ -0,0 +1,41 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',), + dict(role='BOT', prompt="[BEGIN]\n 'import math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n',), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='opencompass/sanitized_mbpp', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py new file mode 100644 index 0000000000000000000000000000000000000000..1700559356fb4908a9de5b3bee5c2950ebbc53f0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_0shot_nocot_gen_a2e416.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',), + # dict(role='BOT', prompt='```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)```',), + + # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',), + # dict(role='BOT', prompt='```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result```',), + + # dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',), + # dict(role='BOT', prompt='```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums```',), + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n You should submit your final solution in the following format: ```python\n\n```',), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='opencompass/sanitized_mbpp', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_gen_a447ff.py b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_gen_a447ff.py new file mode 100644 index 0000000000000000000000000000000000000000..a67e0e5e205ec8cf6aee141fae3f3b11bdacf927 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_mdblock_gen_a447ff.py @@ -0,0 +1,41 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',), + dict(role='BOT', prompt='```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)```',), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',), + dict(role='BOT', prompt='```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result```',), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',), + dict(role='BOT', prompt='```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums```',), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n',), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='opencompass/sanitized_mbpp', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py new file mode 100644 index 0000000000000000000000000000000000000000..477e8d6a944476e2a9a6418298cf1faceb15babd --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp_passk', + path='opencompass/sanitized_mbpp', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py new file mode 100644 index 0000000000000000000000000000000000000000..28fb62f69fce355e483877b66688b5300b3147c7 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp_repeat10', + path='opencompass/sanitized_mbpp', + num_repeats=10, + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_gen_1d1481.py b/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_gen_1d1481.py new file mode 100644 index 0000000000000000000000000000000000000000..231ae24a7e445e5bc7103cb8c23c2622b5d85a48 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_gen_1d1481.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n' + ), + dict(role='BOT', prompt='[BEGIN]\n'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +mbpp_cn_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp_cn', + path='./data/mbpp_cn/mbpp_cn.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py b/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py new file mode 100644 index 0000000000000000000000000000000000000000..fa4ad7d892c3a554f8d2140992ef3825bd8f69b7 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator + +mbpp_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='test_column') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n' + ), + dict(role='BOT', prompt='[BEGIN]\n'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +mbpp_cn_datasets = [ + dict( + type=MBPPDatasetV2, + abbr='mbpp_cn_passk', + path='./data/mbpp_cn/mbpp_cn.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py b/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py new file mode 100644 index 0000000000000000000000000000000000000000..9f1154931acb1ca0ae687cf8fd9094ab35cd8a93 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py @@ -0,0 +1,65 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator + +mbpp_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='test_column') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n' + ), + dict(role='BOT', prompt='[BEGIN]\n'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT') + +mbpp_cn_datasets = [ + dict( + type=MBPPDatasetV2, + abbr='mbpp_cn_repeat10', + path='./data/mbpp_cn/mbpp_cn.jsonl', + num_repeats=10, + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py b/build/lib/opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..49c18c32271ff634badcb6604fde861a177437eb --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mbpp_cn_gen_9114d5 import mbpp_cn_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py b/build/lib/opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py new file mode 100644 index 0000000000000000000000000000000000000000..b6466f57fbf519e9ebca4dd38d885b0f4f33f7ed --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py @@ -0,0 +1,65 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + '你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n' + ), + dict(role='BOT', prompt='[BEGIN]\n'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +mbpp_cn_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp_cn', + path='./data/mbpp_cn/mbpp_cn.jsonl', + local_mode=True, + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp_plus/deprecated_mbpp_plus_gen_94815c.py b/build/lib/opencompass/configs/datasets/mbpp_plus/deprecated_mbpp_plus_gen_94815c.py new file mode 100644 index 0000000000000000000000000000000000000000..9466da7bd161822bacccf0d3188208a3e4468889 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_plus/deprecated_mbpp_plus_gen_94815c.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPEvaluator, MBPPPlusDataset + +mbpp_plus_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='task_id') + +mbpp_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt= + 'You are an expert Python programmer, and here is your task: Write a function to find the shared elements from the given two lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n return tuple(set(test_tup1) & set(test_tup2))' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + 'You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import math\ndef is_not_prime(n):\n if n == 1:\n return True\n for i in range(2, int(math.sqrt(n))+1):\n if n % i == 0:\n return True\n return False' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + 'You are an expert Python programmer, and here is your task: Write a function to find the n largest integers from a given list of numbers, returned in descending order. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums: list,n: int) -> list:\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + 'You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n' + ), + dict(role='BOT', prompt='[BEGIN]\n'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_plus_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator, metric='MBPPPlus'), pred_role='BOT') + +mbpp_plus_datasets = [ + dict( + type=MBPPPlusDataset, + abbr='mbpp_plus', + path='./data/mbpp_plus/mbpp_plus.jsonl', + reader_cfg=mbpp_plus_reader_cfg, + infer_cfg=mbpp_plus_infer_cfg, + eval_cfg=mbpp_plus_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py b/build/lib/opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b4e408d4fef3d34b48514a4b5af9e1f7deb111 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from.mbpp_plus_gen_0b836a import mbpp_plus_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py b/build/lib/opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py new file mode 100644 index 0000000000000000000000000000000000000000..1a282b0eadec7b72bf9a6dab66c74b26d0b41bbb --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPEvaluator, MBPPPlusDataset + +mbpp_plus_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='task_id') + +mbpp_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt= + 'You are an expert Python programmer, and here is your task: Write a function to find the shared elements from the given two lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n return tuple(set(test_tup1) & set(test_tup2))' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + 'You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import math\ndef is_not_prime(n):\n if n == 1:\n return True\n for i in range(2, int(math.sqrt(n))+1):\n if n % i == 0:\n return True\n return False' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + 'You are an expert Python programmer, and here is your task: Write a function to find the n largest integers from a given list of numbers, returned in descending order. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n' + ), + dict( + role='BOT', + prompt= + "[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums: list,n: int) -> list:\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role='HUMAN', + prompt= + 'You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n' + ), + dict(role='BOT', prompt='[BEGIN]\n'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_plus_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator, metric='MBPPPlus'), pred_role='BOT') + +mbpp_plus_datasets = [ + dict( + type=MBPPPlusDataset, + abbr='mbpp_plus', + path='./data/mbpp_plus/mbpp_plus.jsonl', + reader_cfg=mbpp_plus_reader_cfg, + infer_cfg=mbpp_plus_infer_cfg, + eval_cfg=mbpp_plus_eval_cfg) +] diff --git a/build/lib/opencompass/configs/datasets/mbpp_pro/README.md b/build/lib/opencompass/configs/datasets/mbpp_pro/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d34980e16d5c406b8a287d39488b53fbca735268 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_pro/README.md @@ -0,0 +1,17 @@ +# MBPP pro + +## OC results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 66 | +| qwen2.5-14b-instruct-hf | 64 | +| deepseek-v2-lite-chat-hf | 36 | + +## CodeEval-pro results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 65 | +| qwen2.5-14b-instruct-hf | 65 | +| deepseek-v2-lite-chat-hf | 39 | \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py b/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..84d45d838c1e6b91f174ce8fd47b6c7f6ad09976 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mbpp_pro_gen_3dc067 import mbpppro_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py b/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py new file mode 100644 index 0000000000000000000000000000000000000000..c14b05cb1e63974a1f78fd85111774addc6a0e9c --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPProDataset, MBPPProEvaluator + + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. +```python +{raw_problem} +{new_problem} +``` +Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content: +```python +``` +""" + + +mbpppro_reader_cfg = dict( + input_columns=['raw_problem', 'new_problem'], output_column='test_code') + +mbpppro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=PROMPT_WRAPPER), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +mbpppro_eval_cfg = dict( + evaluator=dict(type=MBPPProEvaluator, + ip_address='https://opencompass-multiple-evaluator.hf.space'), +) + +mbpppro_datasets = [ + dict( + abbr='mbpp_pro', + type=MBPPProDataset, + path='opencompass/mbpp_pro', + reader_cfg=mbpppro_reader_cfg, + infer_cfg=mbpppro_infer_cfg, + eval_cfg=mbpppro_eval_cfg) +] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py b/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py new file mode 100644 index 0000000000000000000000000000000000000000..631713b8f9e6ed7374632780d27677fe31bfdae4 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPProDataset, MBPPProEvaluator + + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. +```python +{raw_problem} +{new_problem} +``` +Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content: +```python +``` +""" + + +mbpppro_reader_cfg = dict( + input_columns=['raw_problem', 'new_problem'], output_column='test_code') + +mbpppro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=PROMPT_WRAPPER), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +mbpppro_eval_cfg = dict( + evaluator=dict(type=MBPPProEvaluator, + ip_address='https://opencompass-multiple-evaluator.hf.space'), +) + +mbpppro_datasets = [ + dict( + abbr='mbpp_pro', + type=MBPPProDataset, + path='opencompass/mbpp_pro', + reader_cfg=mbpppro_reader_cfg, + infer_cfg=mbpppro_infer_cfg, + eval_cfg=mbpppro_eval_cfg, + n=5, + k=3) +] \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_gen.py b/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..68148ae73bc76436a4067191335c1ad6d2e894a2 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .medmcqa_gen_60c8f5 import medmcqa_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py b/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py new file mode 100644 index 0000000000000000000000000000000000000000..a0d8bb437209696d65f9e91dba414fe293552643 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py @@ -0,0 +1,58 @@ +from opencompass.datasets import MedmcqaDataset, MedmcqaEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? +ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n' + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'subject_name', + 'choice_type', + 'prompt_mode', + 'topic_name', + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=MedmcqaEvaluator), + pred_role='BOT', +) +medmcqa_dataset = dict( + type=MedmcqaDataset, + abbr='medmcqa', + path='openlifescienceai/medmcqa', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + +) + +medmcqa_datasets = [medmcqa_dataset] diff --git a/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py b/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..f9c1b8064286a54c648441be5a421927d75e3bf0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .medmcqa_llmjudge_gen_60c8f5 import medmcqa_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py b/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py new file mode 100644 index 0000000000000000000000000000000000000000..e96cfa28c55619ce59cb5807af81240d1ec56bf7 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py @@ -0,0 +1,105 @@ +from opencompass.datasets import MedmcqaDataset, medmcqa_llmjudge_postprocess +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? +ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n' +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'subject_name', + 'choice_type', + 'prompt_mode', + 'topic_name', + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MedmcqaDataset, + path='openlifescienceai/medmcqa', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=medmcqa_llmjudge_postprocess), + ), +) +medmcqa_dataset = dict( + type=MedmcqaDataset, + abbr='medmcqa', + path='openlifescienceai/medmcqa', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + +) + +medmcqa_datasets = [medmcqa_dataset] diff --git a/build/lib/opencompass/configs/datasets/mgsm/README.md b/build/lib/opencompass/configs/datasets/mgsm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..31a9c90d69d16f3ef2ef0c2a882090b40bc0d042 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mgsm/README.md @@ -0,0 +1,67 @@ +# MGSM + +## Introduction + +The following introduction comes from the abstract in [Language models are multilingual chain-of-thought reasoners](https://arxiv.org/abs/2210.03057) + +``` +We introduce the Multilingual Grade School Math (MGSM) benchmark, by manually translating 250 grade-school math problems from the GSM8K dataset into ten typologically diverse languages. +``` + +## Official link + +### Paper + +[Language models are multilingual chain-of-thought reasoners](https://arxiv.org/abs/2210.03057) + +### Repository + +[MGSM](https://github.com/google-research/url-nlp) + +## Examples + +Input example I: + +``` +Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of "Answer:". Do not add anything other than the integer answer after "Answer:". + +Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? +``` + +Output example I (from GPT-4): + +``` +Answer: 18 +``` + +## Evaluation results + +``` +dataset version metric mode llama-3-8b-instruct-hf +-------------- --------- ------------- ------ ------------------------ +mgsm_bn b65151 accuracy gen 14.4 +mgsm_de 2cc8ae accuracy gen 60 +mgsm_en 5de71e accuracy gen 76 +mgsm_es d6b459 accuracy gen 61.6 +mgsm_fr 813e3c accuracy gen 54.4 +mgsm_ja 04424f accuracy gen 42.8 +mgsm_ru 400469 accuracy gen 62.8 +mgsm_sw 9e41ed accuracy gen 0.8 +mgsm_te 346d97 accuracy gen 0 +mgsm_th e70bee accuracy gen 44 +mgsm_zh d5cf30 accuracy gen 28.4 +mgsm_latin - naive_average gen 50.56 +mgsm_non_latin - naive_average gen 32.07 +mgsm - naive_average gen 40.47 +``` + +## Reference + +``` +@article{shi2022language, + title={Language models are multilingual chain-of-thought reasoners}, + author={Shi, Freda and Suzgun, Mirac and Freitag, Markus and Wang, Xuezhi and Srivats, Suraj and Vosoughi, Soroush and Chung, Hyung Won and Tay, Yi and Ruder, Sebastian and Zhou, Denny and others}, + journal={arXiv preprint arXiv:2210.03057}, + year={2022} +} +``` diff --git a/build/lib/opencompass/configs/datasets/mgsm/mgsm_gen.py b/build/lib/opencompass/configs/datasets/mgsm/mgsm_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..8feea66c2679472dd492da412df6ad8035431d70 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mgsm/mgsm_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mgsm_gen_d967bc import mgsm_datasets diff --git a/build/lib/opencompass/configs/datasets/mgsm/mgsm_gen_d967bc.py b/build/lib/opencompass/configs/datasets/mgsm/mgsm_gen_d967bc.py new file mode 100644 index 0000000000000000000000000000000000000000..3c38f74a11c78b721f9d4bdc2ff608862d976249 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mgsm/mgsm_gen_d967bc.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator +from opencompass.datasets import MGSMSDataset, MGSM_Evaluator, mgsm_postprocess + + +ALL_LANGUAGES = ['bn', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'sw', 'te', 'th', 'zh'] + +LANG_TO_INSTRUCTIONS = { + 'en': """Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of "Answer:". Do not add anything other than the integer answer after "Answer:".\n\n{question}""", + 'bn': """এই গণিতের সমস্যাটি সমাধান করুন। চূড়ান্ত উত্তর দেওয়ার আগে যুক্তিসম্পন্ন পদক্ষেপ প্রদান করুন। চূড়ান্ত উত্তরটি একক সংখ্যা হিসাবে "উত্তর:" এর পরে শেষ লাইনে দিন। "উত্তর:" এর পরে অন্য কিছু যুক্ত করবেন না।.\n\n{question}""", + 'de': """Löse dieses Mathematikproblem. Gib die Schritte zur Begründung an, bevor du die endgültige Antwort in der letzten Zeile alleine im Format "Antwort:" gibst. Füge nichts anderes als die ganzzahlige Antwort nach "Antwort:" hinzu.\n\n{question}""", + 'es': """Resuelve este problema matemático. Proporciona los pasos de razonamiento antes de dar la respuesta final en la última línea por sí misma en el formato de "Respuesta:". No añadas nada más que la respuesta entera después de "Respuesta:".\n\n{question}""", + 'fr': """Résolvez ce problème de mathématiques. Donnez les étapes de raisonnement avant de fournir la réponse finale sur la dernière ligne elle-même dans le format de "Réponse:". N'ajoutez rien d'autre que la réponse entière après "Réponse:".\n\n{question}""", + 'ja': """の数学の問題を解いてください。最終的な答えを出す前に、解答の推論過程を記述してください。そして最後の行には "答え:" の形式で答えを記述し、その後には整数の答え以外何も追加しないでください。\n\n{question}""", + 'ru': """Решите эту математическую задачу. Объясните шаги рассуждения перед тем, как дать окончательный ответ в последней строке сам по себе в формате "Ответ:". Не добавляйте ничего, кроме целочисленного ответа после "Ответ:".\n\n{question}""", + 'sw': """Suluhisha tatizo hili la hesabu. Toa hatua za mantiki kabla ya kutoa jibu la mwisho kwenye mstari wa mwisho peke yake katika muundo wa "Jibu:". Usiongeze chochote kingine isipokuwa jibu la integer baada ya "Jibu:".\n\n{question}""", + 'te': """ఈ గణిత సమస్యను పరిష్కరించండి. చివరి సమాధానాన్ని ఇవ్వదానికి ముందు తర్కాత్మక అదుగులను ఇవ్వండి. చివరి పంక్తిలో మాత్రమే 'సమాధానం:' అనే ఆకారంలో చివరి సమాధానాద్ని ఇవ్వండి సమాధానం: తర్వాత పూర్ణాంక సమాధానానికి తప్పించి ఎదేనా చేర్చవద్దు.\n\n{question}""", + 'th': """แก้ปัญหาคณิตศาสตร์นี้ ให้ให้ขั้นตอนการใช้เหตุผลก่อนที่จะให้คำตอบสุดท้ายในบรรทัดสุดท้ายโดยอยู่ในรูปแบบ "คำตอบ:" ไม่ควรเพิ่มอะไรนอกจากคำตอบที่เป็นจำนวนเต็มหลังจาก "คำตอบ:"\n\n{question}""", + 'zh': """解决这个数学问题。在最后一行给出答案前,请提供推理步骤。最后一行应该以 "答案: " 的形式独立给出答案。在 "答案:" 后不要添加除整数答案之外的任何内容。\n\n{question}""", +} + +mgsm_datasets = [] +for lang in ALL_LANGUAGES: + mgsm_reader_cfg = dict(input_columns=['question'], output_column='answer') + + mgsm_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=LANG_TO_INSTRUCTIONS[lang]), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ) + + mgsm_eval_cfg = dict( + evaluator=dict(type=MGSM_Evaluator), + pred_role='BOT', + pred_postprocessor=dict(type=mgsm_postprocess, lang=lang), + ) + + mgsm_datasets.append( + dict( + type=MGSMSDataset, + abbr=f'mgsm_{lang}', + path=f'data/mgsm/mgsm_{lang}.tsv', + reader_cfg=mgsm_reader_cfg, + infer_cfg=mgsm_infer_cfg, + eval_cfg=mgsm_eval_cfg, + ) + ) diff --git a/build/lib/opencompass/configs/datasets/mmlu/README.md b/build/lib/opencompass/configs/datasets/mmlu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eaa2181e3334141c1387e5c633eb2e3a311aeb88 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/README.md @@ -0,0 +1,368 @@ +# MMLU + +```bash +python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug +python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug +``` + +## Base Models + +| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other | +|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:| +| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 | +| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 | +| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 | +| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 | +| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 | +| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 | +| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 | +| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 | +| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 | +| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 | +| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 | +| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 | +| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 | +| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 | +| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 | +| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 | +| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 | +| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 | +| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 | +| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 | +| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 | +| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 | +| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 | +| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 | +| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 | +| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 | +| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 | +| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 | +| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 | +| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 | +| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 | +| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 | + +### Details + +| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts | +|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:| +| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 | +| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 | +| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 | +| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 | +| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 | +| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 | +| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 | +| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 | +| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 | +| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 | +| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 | +| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 | +| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 | +| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 | +| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 | +| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 | +| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 | +| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 | +| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 | +| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 | +| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 | +| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 | +| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 | +| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 | +| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 | +| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 | +| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 | +| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 | +| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 | +| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 | +| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 | +| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 | + +| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology | +|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:| +| llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 | +| llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 | +| llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 | +| llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 | +| llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 | +| llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 | +| llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 | +| llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 | +| llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 | +| internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 | +| internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 | +| internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 | +| qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 | +| qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 | +| qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 | +| qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 | +| qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 | +| qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 | +| qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 | +| qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 | +| qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 | +| qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 | +| qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 | +| qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 | +| mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 | +| mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 | +| mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 | +| mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 | +| yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 | +| yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 | +| deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 | +| deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 | + +| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history | +|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:| +| llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 | +| llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 | +| llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 | +| llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 | +| llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 | +| llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 | +| llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 | +| llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 | +| llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 | +| internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 | +| internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 | +| internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 | +| qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 | +| qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 | +| qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 | +| qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 | +| qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 | +| qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 | +| qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 | +| qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 | +| qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 | +| qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 | +| qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 | +| qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 | +| mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 | +| mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 | +| mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 | +| mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 | +| yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 | +| yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 | +| deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 | +| deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 | + +| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine | +|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:| +| llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 | +| llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 | +| llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 | +| llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 | +| llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 | +| llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 | +| llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 | +| llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 | +| llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 | +| internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 | +| internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 | +| internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 | +| qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 | +| qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 | +| qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 | +| qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 | +| qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 | +| qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 | +| qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 | +| qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 | +| qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 | +| qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 | +| qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 | +| qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 | +| mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 | +| mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 | +| mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 | +| mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 | +| yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 | +| yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 | +| deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 | +| deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 | + +| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy | +|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:| +| llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 | +| llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 | +| llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 | +| llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 | +| llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 | +| llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 | +| llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 | +| llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 | +| llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 | +| internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 | +| internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 | +| internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 | +| qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 | +| qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 | +| qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 | +| qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 | +| qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 | +| qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 | +| qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 | +| qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 | +| qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 | +| qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 | +| qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 | +| qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 | +| mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 | +| mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 | +| mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 | +| mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 | +| yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 | +| yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 | +| deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 | +| deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 | + +## Chat Models + +| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other | +|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:| +| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 | +| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 | +| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 | +| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 | +| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 | +| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 | +| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 | +| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 | +| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 | +| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 | +| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 | +| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 | +| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 | +| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 | +| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 | +| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 | +| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 | +| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 | +| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 | +| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 | +| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 | + +### Details + +| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts | +|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:| +| qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 | +| qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 | +| qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 | +| qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 | +| qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 | +| qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 | +| qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 | +| qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 | +| internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 | +| internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 | +| internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 | +| internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 | +| internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 | +| internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 | +| llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 | +| llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 | +| llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 | +| llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 | +| mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 | +| mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 | +| mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 | + +| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology | +|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:| +| qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 | +| qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 | +| qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 | +| qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 | +| qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 | +| qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 | +| qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 | +| qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 | +| internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 | +| internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 | +| internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 | +| internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 | +| internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 | +| internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 | +| llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 | +| llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 | +| llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 | +| llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 | +| mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 | +| mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 | +| mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 | + +| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history | +|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:| +| qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 | +| qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 | +| qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 | +| qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 | +| qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 | +| qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 | +| qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 | +| qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 | +| internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 | +| internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 | +| internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 | +| internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 | +| internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 | +| internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 | +| llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 | +| llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 | +| llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 | +| llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 | +| mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 | +| mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 | +| mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 | + +| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine | +|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:| +| qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 | +| qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 | +| qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 | +| qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 | +| qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 | +| qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 | +| qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 | +| qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 | +| internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 | +| internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 | +| internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 | +| internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 | +| internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 | +| internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 | +| llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 | +| llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 | +| llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 | +| llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 | +| mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 | +| mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 | +| mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 | + +| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy | +|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:| +| qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 | +| qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 | +| qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 | +| qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 | +| qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 | +| qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 | +| qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 | +| qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 | +| internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 | +| internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 | +| internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 | +| internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 | +| internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 | +| internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 | +| llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 | +| llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 | +| llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 | +| llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 | +| mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 | +| mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 | +| mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 | diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_all_sets.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_all_sets.py new file mode 100644 index 0000000000000000000000000000000000000000..e5b68a7c10b12110f85f335b70313244c8f91b8d --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_all_sets.py @@ -0,0 +1,59 @@ +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py new file mode 100644 index 0000000000000000000000000000000000000000..52d23e29902216d9ef47e9c99a0d6f7d483330a5 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py @@ -0,0 +1,114 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccContaminationEvaluator +from opencompass.datasets import MMLUDatasetClean as MMLUDataset + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={ + opt: + f'{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}\n' + for opt in ['A', 'B', 'C', 'D'] + }, + ), + prompt_template=dict( + type=PromptTemplate, + template={ + opt: + f'{_hint}{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}' + for opt in ['A', 'B', 'C', 'D'] + }, + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=PPLInferencer), + ) + + mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator), + analyze_contamination=True) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..5c8303b84497b5f40abb822c96cb5887065dd76d --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py new file mode 100644 index 0000000000000000000000000000000000000000..71e3eba97c7b31e0074e3ada1e89c1235297aabc --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py @@ -0,0 +1,124 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_prompt_template = dict( + type='PromptTemplate', + template=None, + ice_token='') + +mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ])), + prompt_template=mmlu_prompt_template, + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer)) + +mmlu_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess)) + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg.copy(), + eval_cfg=mmlu_eval_cfg)) + + mmlu_datasets[-1]['infer_cfg'][ + 'prompt_template'] = mmlu_prompt_template.copy() + mmlu_datasets[-1]['infer_cfg']['prompt_template']['template'] = \ + dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.'), + '', + ], + round=[ + dict(role='HUMAN', prompt='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '), + ] + ) + +del _name diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py new file mode 100644 index 0000000000000000000000000000000000000000..36c0a21a5fb45e3b8aa49daf9d993094e877f9a8 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py @@ -0,0 +1,123 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py new file mode 100644 index 0000000000000000000000000000000000000000..73bd38956729b9ea08f9d2fcd139863876e410c7 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py @@ -0,0 +1,124 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess)) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae6a0e41724225ff4fe6a3fd0a73ac26615f8ba --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py @@ -0,0 +1,110 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template= + '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: {target}\n', + ), + prompt_template=dict( + type=PromptTemplate, + template= + f'{_hint}{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:', + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess), + ) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py new file mode 100644 index 0000000000000000000000000000000000000000..76a4f9b42b92a60d299d26fb11ee09a3bdad0096 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py @@ -0,0 +1,124 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..b2389fb23016b821cfdc18fec7c55b06e7836363 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_llmjudge_gen_f4336b import mmlu_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_llmjudge_gen_f4336b.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_llmjudge_gen_f4336b.py new file mode 100644 index 0000000000000000000000000000000000000000..645fa9f07ad826b28f272fdd1efc83616c6a8225 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_llmjudge_gen_f4336b.py @@ -0,0 +1,111 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + from .mmlu_all_sets import mmlu_all_sets +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev', +) + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + mode='singlescore', + ) + ) diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py new file mode 100644 index 0000000000000000000000000000000000000000..45cefb76218d67e2e303771dcfabf996ac4b7b2a --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py @@ -0,0 +1,141 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.utils.model_postprocessors import navie_model_postprocess +from opencompass.utils.postprocessors.naive import OPTION_NAVIE_PROMPT_TEMPLATE + + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + +# # You can write your own postprocess prompt like: +# MMLU_NAVIE_PROMPT_TEMPLATE = """ +# There is a detailed explanation of the final answer you should extract: +# 1. ... +# 2. ... +# ... +# """ + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + model_postprocessor=dict( + type=navie_model_postprocess, + custom_instruction=OPTION_NAVIE_PROMPT_TEMPLATE, + model_name='', + api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_openai_0shot_nocot_llmjudge_gen_216503.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_openai_0shot_nocot_llmjudge_gen_216503.py new file mode 100644 index 0000000000000000000000000000000000000000..4b77cec5ab8e15912edfae3c9dce8f67965f6e3f --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_openai_0shot_nocot_llmjudge_gen_216503.py @@ -0,0 +1,104 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + # from .....configs.datasets.mmlu.mmlu_all_sets import mmlu_all_sets + from .mmlu_stem_sets import mmlu_all_sets +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + mode='singlescore', + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py new file mode 100644 index 0000000000000000000000000000000000000000..b6b45d2fe1d8aa215ed3db837912a612aea411a2 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py @@ -0,0 +1,59 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern + +with read_base(): + from .mmlu_all_sets import mmlu_all_sets + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])')) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_ppl.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_ppl.py new file mode 100644 index 0000000000000000000000000000000000000000..3c081ed8e71fa15188ca37948360daba29f4cba3 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_ppl.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_ppl_ac766d import mmlu_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py new file mode 100644 index 0000000000000000000000000000000000000000..ea39d57df594add55006c5b8f9db49e524db465a --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py @@ -0,0 +1,106 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUDataset + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n' + question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={opt: f'{question_overall}\nAnswer: {opt}\n' for opt in ['A', 'B', 'C', 'D']}, + ), + prompt_template=dict( + type=PromptTemplate, + template={opt: f'{_hint}{question_overall}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']}, + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=PPLInferencer), + ) + + mmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator), ) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py new file mode 100644 index 0000000000000000000000000000000000000000..1f83098bfc89d6a12a449275d1e103a9a3d6c0df --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py @@ -0,0 +1,127 @@ +""" +Setting: 0-shot No-CoT +Evaluator: GenericLLMEvaluator +""" +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, +) + +with read_base(): + # from .....configs.datasets.mmlu.mmlu_all_sets import mmlu_all_sets + from .mmlu_stem_sets import mmlu_all_sets +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=AccEvaluator, + pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'), + ), + llm_evaluator = dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + ), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + judge_cfg=dict(), + ), + parallel=False + ), + ) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + mode='singlescore', + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_gen_216503.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_gen_216503.py new file mode 100644 index 0000000000000000000000000000000000000000..bc253a6ef14ce8a217d36ab24de0f744eecb578c --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_gen_216503.py @@ -0,0 +1,115 @@ +""" +Setting: 0-shot No-CoT +Evaluator: GenericLLMEvaluator +""" +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + # from .....configs.datasets.mmlu.mmlu_all_sets import mmlu_all_sets + from .mmlu_stem_sets import mmlu_all_sets +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + ), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + judge_cfg=dict(), + ), + ) + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + mode='singlescore', + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_xml_gen_216503.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_xml_gen_216503.py new file mode 100644 index 0000000000000000000000000000000000000000..bddb7cd94bcbafc330c9f8dd43c4b4f197c95a62 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_xml_gen_216503.py @@ -0,0 +1,116 @@ +""" +Setting: 0-shot No-CoT +Evaluator: GenericLLMEvaluator +""" +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern, xml_tag_postprocessor +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + # from .....configs.datasets.mmlu.mmlu_all_sets import mmlu_all_sets + from .mmlu_stem_sets import mmlu_all_sets +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + ), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + pred_postprocessor=dict(type=xml_tag_postprocessor, tag=''), + judge_cfg=dict(), + ), + ) + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + mode='singlescore', + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_sets.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_sets.py new file mode 100644 index 0000000000000000000000000000000000000000..192a4968bbb019af774ace9c395e4733516f8f40 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_stem_sets.py @@ -0,0 +1,3 @@ +mmlu_all_sets = [ + 'abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning' +] diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py new file mode 100644 index 0000000000000000000000000000000000000000..8dff3a57db9f610eccae81f503327fb16566cbe2 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_xfinder_gen_4d595a.py @@ -0,0 +1,130 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.utils.model_postprocessors import xfinder_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + model_postprocessor=dict( + type=xfinder_postprocess, + question_type='alphabet_option', + xfinder_model_name='xFinder-qwen1505', + xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') + ) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu/mmlu_zero_shot_gen_47e2c0.py b/build/lib/opencompass/configs/datasets/mmlu/mmlu_zero_shot_gen_47e2c0.py new file mode 100644 index 0000000000000000000000000000000000000000..b8096e8c1fbfaf983baaabeecd4e71737acb4fad --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu/mmlu_zero_shot_gen_47e2c0.py @@ -0,0 +1,123 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever, ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_all_sets = [ + 'college_biology', + 'college_chemistry', + 'college_computer_science', + 'college_mathematics', + 'college_physics', + 'electrical_engineering', + 'astronomy', + 'anatomy', + 'abstract_algebra', + 'machine_learning', + 'clinical_knowledge', + 'global_facts', + 'management', + 'nutrition', + 'marketing', + 'professional_accounting', + 'high_school_geography', + 'international_law', + 'moral_scenarios', + 'computer_security', + 'high_school_microeconomics', + 'professional_law', + 'medical_genetics', + 'professional_psychology', + 'jurisprudence', + 'world_religions', + 'philosophy', + 'virology', + 'high_school_chemistry', + 'public_relations', + 'high_school_macroeconomics', + 'human_sexuality', + 'elementary_mathematics', + 'high_school_physics', + 'high_school_computer_science', + 'high_school_european_history', + 'business_ethics', + 'moral_disputes', + 'high_school_statistics', + 'miscellaneous', + 'formal_logic', + 'high_school_government_and_politics', + 'prehistory', + 'security_studies', + 'high_school_biology', + 'logical_fallacies', + 'high_school_world_history', + 'professional_medicine', + 'high_school_mathematics', + 'college_medicine', + 'high_school_us_history', + 'sociology', + 'econometrics', + 'high_school_psychology', + 'human_aging', + 'us_foreign_policy', + 'conceptual_physics', +] + + +mmlu_datasets = [] +for _name in mmlu_all_sets: + _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.' + mmlu_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt= + f"{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nLet's think step by step. A: " + ), + ], + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=256), + ) + + mmlu_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{_name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=_name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py new file mode 100644 index 0000000000000000000000000000000000000000..ab8b198f40780f57f8b9c34a1d5c4cad59523780 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py @@ -0,0 +1,16 @@ +categories = [ + 'Math', + 'Physics', + 'Chemistry', + 'Law', + 'Engineering', + 'Other', + 'Economics', + 'Health', + 'Psychology', + 'Business', + 'Biology', + 'Philosophy', + 'Computer_Science', + 'History', +] diff --git a/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py new file mode 100644 index 0000000000000000000000000000000000000000..6500cee7f0847f59db30e6a02dd7f8e7d4e8f360 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question (with answers). Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='microsoft/MMLU-CF', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..5fbee8d9e59ed80c75820e804135dbcc99b8fdc7 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py new file mode 100644 index 0000000000000000000000000000000000000000..851fec913a4df973187cd2fe38fb3dcb8580f5c1 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='microsoft/MMLU-CF', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py new file mode 100644 index 0000000000000000000000000000000000000000..d084f4f0f8ead0425b1698b970d8e138eba4404f --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question (with answers). Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='microsoft/MMLU-CF', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_cot_gen_08c1de.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_cot_gen_08c1de.py new file mode 100644 index 0000000000000000000000000000000000000000..341e13cf57195860d229764488ec815c2514a98b --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_cot_gen_08c1de.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUProDataset +from opencompass.utils.text_postprocessors import match_answer_pattern + +with read_base(): + from .mmlu_pro_categories import categories + + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. + +Question:\n +{question} + +Options:\n +{options_str} + +""".strip() + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', + prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict( + type=match_answer_pattern, + answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])') + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py new file mode 100644 index 0000000000000000000000000000000000000000..e12f43fe66c2c65072bb22addab76de09eda8624 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py @@ -0,0 +1,106 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import MMLUProDataset, generic_llmjudge_postprocess + +with read_base(): + from .mmlu_pro_categories import categories + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. + +Question:\n +{question} + +Options:\n +{options_str} + +""".strip() + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {question}\n {options_str} \n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + ) + ) diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py new file mode 100644 index 0000000000000000000000000000000000000000..02766491dbcd5671e32cb3fd353f7d46aeea4df2 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py @@ -0,0 +1,60 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUProDataset +from opencompass.utils.text_postprocessors import match_answer_pattern + +categories = [ + 'health', +] + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. +Question:\n +{question} +Options:\n +{options_str} +""".strip() + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', + prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict( + type=match_answer_pattern, + answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])') + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py new file mode 100644 index 0000000000000000000000000000000000000000..58cd20b18067bf536180343b0c0c10732c1c62c3 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py @@ -0,0 +1,101 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import MMLUProDataset, generic_llmjudge_postprocess + +categories = [ + 'health', +] + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. +Question:\n +{question} +Options:\n +{options_str} +""".strip() + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + : {question}\n {options_str} \n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + ) + ) \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_categories.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_categories.py new file mode 100644 index 0000000000000000000000000000000000000000..eff38983e0a06b5a806239fd644e85ed19f141a0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_categories.py @@ -0,0 +1,16 @@ +categories = [ + 'math', + 'physics', + 'chemistry', + 'law', + 'engineering', + 'other', + 'economics', + 'health', + 'psychology', + 'business', + 'biology', + 'philosophy', + 'computer science', + 'history', +] diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py new file mode 100644 index 0000000000000000000000000000000000000000..dc12fd1d9d18e4b5ea744671e9a495285d1a20f2 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py @@ -0,0 +1,47 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator + +with read_base(): + from .mmlu_pro_categories import categories + +mmlu_pro_datasets = [] + +for category in categories: + hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.' + question_and_options = 'Question:\n{question}\nOptions:\n{options_str}' + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer_string', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=f'{question_and_options}\nAnswer: {{answer}}'), + prompt_template=dict( + type=PromptTemplate, + template=f'{hint}\n{question_and_options}\nAnswer: ', + ice_token='' + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer, max_out_len=100) + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=MMLUProBaseEvaluator) + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..228dad99cdcaf04da5894b93d4ae117f2ee05565 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py new file mode 100644 index 0000000000000000000000000000000000000000..42c30131ff8fcac716063df504fc6866adba0488 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py @@ -0,0 +1,59 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUProDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_pro_categories import categories + + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + + mmlu_pro_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Question:\n{question}\nOptions:\n{options_str}'), + dict(role='BOT', prompt="Answer: Let's think step by step. {cot_content}") + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict(role='HUMAN', prompt='Question:\n{question}\nOptions:\n{options_str}'), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCDEFGHIJKLMNOP'), + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) diff --git a/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..a895d5c281ad930e28b5382eb51098e927aa2a6d --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import mmlu_pro_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmmlu/README.md b/build/lib/opencompass/configs/datasets/mmmlu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8faef6b277314299a5419fb7e7e08232a47e4c16 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmmlu/README.md @@ -0,0 +1,35 @@ +# MMMLU +## Dataset Description +Multilingual Massive Multitask Language Understanding (MMMLU) +The MMLU is a widely recognized benchmark of general knowledge attained by AI models. It covers a broad range of topics from 57 different categories, covering elementary-level knowledge up to advanced professional subjects like law, physics, history, and computer science. + +We translated the MMLU’s test set into 14 languages using professional human translators. Relying on human translators for this evaluation increases confidence in the accuracy of the translations, especially for low-resource languages like Yoruba. We are publishing the professional human translations and the code we use to run the evaluations. + +This effort reflects our commitment to improving the multilingual capabilities of AI models, ensuring they perform accurately across languages, particularly for underrepresented communities. By prioritizing high-quality translations, we aim to make AI technology more inclusive and effective for users worldwide. +MMMLU contains the MMLU test set translated into the following locales: + +- AR_XY (Arabic) +- BN_BD (Bengali) +- DE_DE (German) +- ES_LA (Spanish) +- FR_FR (French) +- HI_IN (Hindi) +- ID_ID (Indonesian) +- IT_IT (Italian) +- JA_JP (Japanese) +- KO_KR (Korean) +- PT_BR (Brazilian Portuguese) +- SW_KE (Swahili) +- YO_NG (Yoruba) +- ZH_CH (Simplied Chinese) + + +## How to Use +Download file from [link](https://hf-mirror.com/datasets/openai/MMMLU) + +```python +from datasets import load_dataset +ds = load_dataset("openai/MMMLU", "default") +from datasets import load_dataset +ds = load_dataset("openai/MMMLU", "by_language") +``` \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_5_shot_gen_bcbeb3.py b/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_5_shot_gen_bcbeb3.py new file mode 100644 index 0000000000000000000000000000000000000000..2479c3d59efffa3ca19e9bab3995ae42a935d7cd --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_5_shot_gen_bcbeb3.py @@ -0,0 +1,138 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess +from mmengine.config import read_base + +with read_base(): + from .mmmlu_prompt import (get_few_shot_prompts_ar, + get_few_shot_prompts_bn, + get_few_shot_prompts_de, + get_few_shot_prompts_es, + get_few_shot_prompts_fr, + get_few_shot_prompts_hi, + get_few_shot_prompts_id, + get_few_shot_prompts_it, + get_few_shot_prompts_ja, + get_few_shot_prompts_ko, + get_few_shot_prompts_pt, + get_few_shot_prompts_zh, + get_few_shot_prompts_sw, + get_few_shot_prompts_yo) + +mmmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D','subject'], + output_column='target', + train_split='test') + +mmmlu_all_sets = [ + 'mmlu_AR-XY', + 'mmlu_BN-BD', + 'mmlu_DE-DE', + 'mmlu_ES-LA', + 'mmlu_FR-FR', + 'mmlu_HI-IN', + 'mmlu_ID-ID', + 'mmlu_IT-IT', + 'mmlu_JA-JP', + 'mmlu_KO-KR', + 'mmlu_PT-BR', + 'mmlu_SW-KE', + 'mmlu_YO-NG', + 'mmlu_ZH-CN', +] + +mmmlu_datasets = [] +for _name in mmmlu_all_sets: + if 'AR' in _name: + _hint = f'هناك سؤال اختيار واحد. أجب عن السؤال بالرد على A أو B أو C أو D, يرجى استخدام واحدة من الرموز A، B، C، أو D لتمثيل خيارات الإجابة في ردك' + _prompt = f'يتعلق بـ {{subject}} \nالسؤال: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالإجابة:' + _round = get_few_shot_prompts_ar(_hint, _prompt) + elif 'BN' in _name: + _hint = f'এটি একটি একক পছন্দের প্রশ্ন। এ, বি, সি বা ডি উত্তর দিয়ে প্রশ্নের উত্তর দিন।, আপনার উত্তরে ইংরেজি বর্ণ A, B, C এবং D এর মধ্যে একটি ব্যবহার করুন' + _prompt = f'এটি {{subject}} সম্পর্কে \nপ্রশ্ন: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nউত্তর:' + _round = get_few_shot_prompts_bn(_hint, _prompt) + elif 'DE' in _name: + _hint = f'Es gibt eine Einzelwahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.' + _prompt = f'Es geht um {{subject}} \nFrage: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAntwort:' + _round = get_few_shot_prompts_de(_hint, _prompt) + elif 'ES' in _name: + _hint = f'Hay una pregunta de elección única. Responde a la pregunta respondiendo A, B, C o D.' + _prompt = f'Se trata de {{subject}} \nPregunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRespuesta:' + _round = get_few_shot_prompts_es(_hint, _prompt) + elif 'FR' in _name: + _hint = f'Il y a une question à choix unique. Répondez à la question en répondant A, B, C ou D.' + _prompt = f'''C'est à propos de {{subject}} \nQuestion : {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRéponse :''' + _round = get_few_shot_prompts_fr(_hint, _prompt) + elif 'HI' in _name: + _hint = f'यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D में से कोई भी उत्तर देकर दें।' + _prompt = f'यह {{subject}} के बारे में है \nप्रश्न: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nउत्तर:' + _round = get_few_shot_prompts_hi(_hint, _prompt) + elif 'ID' in _name: + _hint = f'Ada pertanyaan pilihan tunggal. Jawablah pertanyaan dengan menjawab A, B, C, atau D.' + _prompt = f'Ini tentang {{subject}} \nPertanyaan: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nJawaban:' + _round = get_few_shot_prompts_id(_hint, _prompt) + elif 'IT' in _name: + _hint = f'Ci sono domande a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.' + _prompt = f'Si tratta di {{subject}} \nDomanda: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRisposta:' + _round = get_few_shot_prompts_it(_hint, _prompt) + elif 'JA' in _name: + _hint = f'単一選択肢の質問があります。この質問にはA、B、C、またはDで答えてください。' + _prompt = f'これは {{subject}} に関することです \n質問: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n回答:' + _round = get_few_shot_prompts_ja(_hint, _prompt) + elif 'KO' in _name: + _hint = f'단일 선택 질문이 있습니다. A, B, C 또는 D로 답변해 주세요.' + _prompt = f'이것은 {{subject}}에 관한 것입니다 \n질문: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n답변:' + _round = get_few_shot_prompts_ko(_hint, _prompt) + elif 'PT' in _name: + _hint = f'Há uma pergunta de escolha única. Responda à pergunta escolhendo A, B, C ou D.' + _prompt = f'É sobre {{subject}} \nPergunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nResposta:' + _round = get_few_shot_prompts_pt(_hint, _prompt) + elif 'ZH' in _name: + _hint = f'这里有一个单项选择题。请通过选择 A、B、C 或 D 来回答该问题。' + _prompt = f'这是关于 {{subject}} 的内容\n问题:{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:' + _round = get_few_shot_prompts_zh(_hint, _prompt) + elif 'SW' in _name: + _hint = f'Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.' + _prompt = f'Hii ni kuhusu {{subject}}.\nSwali: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nJibu:' + _round = get_few_shot_prompts_sw(_hint, _prompt) + elif 'YO' in _name: + _hint = f'Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.' + _prompt = f'Eyi jẹ nipa {{subject}}.\nIbeere: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nFesi:' + _round = get_few_shot_prompts_yo(_hint, _prompt) + else: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + _prompt = f'it is about {{subject}} \nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:' + _round = f'{_hint}\n{_prompt}\n'+'Please answer only with option A, B, C or D. \nAnswer:' + mmmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=_round + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmmlu_datasets.append( + dict( + abbr=f'openai_m{_name}', + type=MMMLUDataset, + path='openai/MMMLU', + name=_name, + reader_cfg=mmmlu_reader_cfg, + infer_cfg=mmmlu_infer_cfg, + eval_cfg=mmmlu_eval_cfg, + )) + +del _name, _hint, _prompt, _round diff --git a/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_gen.py b/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..557909780280a84934f0589f32840abdbeca2b94 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmmlu_gen_c51a84 import mmmlu_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_gen_c51a84.py b/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_gen_c51a84.py new file mode 100644 index 0000000000000000000000000000000000000000..85c196c23135b0650f79a90dfd110fc1a3deb475 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_gen_c51a84.py @@ -0,0 +1,105 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMMLUDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + + +mmmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D','subject'], + output_column='target', + train_split='test') + +mmmlu_all_sets = [ + 'mmlu_AR-XY', + 'mmlu_BN-BD', + 'mmlu_DE-DE', + 'mmlu_ES-LA', + 'mmlu_FR-FR', + 'mmlu_HI-IN', + 'mmlu_ID-ID', + 'mmlu_IT-IT', + 'mmlu_JA-JP', + 'mmlu_KO-KR', + 'mmlu_PT-BR', + 'mmlu_SW-KE', + 'mmlu_YO-NG', + 'mmlu_ZH-CN', +] + +mmmlu_datasets = [] +for _name in mmmlu_all_sets: + if 'AR' in _name: + _hint = f'هناك سؤال اختيار واحد. أجب عن السؤال بالرد على A أو B أو C أو D, يرجى استخدام واحدة من الرموز A، B، C، أو D لتمثيل خيارات الإجابة في ردك' + _prompt = f'يتعلق بـ {{subject}} \nالسؤال: {{input}}\nأ. {{A}}\nب. {{B}}\nج. {{C}}\nد. {{D}}\nالإجابة:' + elif 'BN' in _name: + _hint = f'এটি একটি একক পছন্দের প্রশ্ন। এ, বি, সি বা ডি উত্তর দিয়ে প্রশ্নের উত্তর দিন।, আপনার উত্তরে ইংরেজি বর্ণ A, B, C এবং D এর মধ্যে একটি ব্যবহার করুন' + _prompt = f'এটি {{subject}} এর সম্পর্কে \nপ্রশ্ন: {{input}}\nএ. {{A}}\nবি. {{B}}\nসি. {{C}}\nডি. {{D}}\nউত্তর:' + elif 'DE' in _name: + _hint = f'Es gibt eine Einzelwahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.' + _prompt = f'Es geht um {{subject}} \nFrage: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAntwort:' + elif 'ES' in _name: + _hint = f'Hay una pregunta de elección única. Responde a la pregunta respondiendo A, B, C o D.' + _prompt = f'Se trata de {{subject}} \nPregunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRespuesta:' + elif 'FR' in _name: + _hint = f'Il y a une question à choix unique. Répondez à la question en répondant A, B, C ou D.' + _prompt = f'''C'est à propos de {{subject}} \nQuestion : {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRéponse :''' + elif 'HI' in _name: + _hint = f'यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D में से कोई भी उत्तर देकर दें।' + _prompt = f'यह {{subject}} के बारे में है \nप्रश्न: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nउत्तर:' + elif 'ID' in _name: + _hint = f'Ada pertanyaan pilihan tunggal. Jawablah pertanyaan dengan menjawab A, B, C, atau D.' + _prompt = f'Ini tentang {{subject}} \nPertanyaan: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nJawaban:' + elif 'IT' in _name: + _hint = f'Ci sono domande a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.' + _prompt = f'Si tratta di {{subject}} \nDomanda: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRisposta:' + elif 'JA' in _name: + _hint = f'単一選択肢の質問があります。この質問にはA、B、C、またはDで答えてください。' + _prompt = f'これは {{subject}} に関することです \n質問: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n回答:' + elif 'KO' in _name: + _hint = f'단일 선택 질문이 있습니다. A, B, C 또는 D로 답변해 주세요.' + _prompt = f'이것은 {{subject}}에 관한 것입니다 \n질문: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n답변:' + elif 'PT' in _name: + _hint = f'Há uma pergunta de escolha única. Responda à pergunta escolhendo A, B, C ou D.' + _prompt = f'É sobre {{subject}} \nPergunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nResposta:' + elif 'ZH' in _name: + _hint = f'这里有一个单项选择题。请通过选择 A、B、C 或 D 来回答该问题。' + _prompt = f'这是关于 {{subject}} 的内容\n问题:{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:' + else: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + _prompt = f'it is about {{subject}} \nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:' + mmmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\n {_prompt}' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmmlu_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmmlu_datasets.append( + dict( + abbr=f'openai_m{_name}', + type=MMMLUDataset, + path='openai/MMMLU', + name=_name, + reader_cfg=mmmlu_reader_cfg, + infer_cfg=mmmlu_infer_cfg, + eval_cfg=mmmlu_eval_cfg, + )) + +del _name, _hint, _prompt \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_prompt.py b/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..f04f3c6276f68c7ce47990f582b581cdef3683ff --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmmlu/mmmlu_prompt.py @@ -0,0 +1,213 @@ +_hint = 'This is a hint that helps you solve the question.' + +_prompt = 'Please enter your question here.' + +def get_few_shot_prompts_ar(_hint, _prompt): + return [ + + dict(role='HUMAN', prompt='هناك سؤال اختيار من متعدد. أجب عن السؤال بالرد A أو B أو C أو D.\nيتعلق بـ الجبر المجرد\nالسؤال: ابحث عن أقصى حد ممكن لترتيب بعض العناصر في Z_4 x Z_6.\n A.4\nB.6\nC.12\nD.24\nلنفكر خطوة بخطوة\nالإجابة:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='هناك سؤال اختيار من متعدد. أجب عن السؤال بالرد A أو B أو C أو المدرسة الثانوية\nالسؤال: أي من الأديان أدناه هو دين عالمي؟ A. الطاوية\n B. الإسلام\n C. الشنتو\n D. الكونفوشيوسية\nلنفكر خطوة بخطوة\nالإجابة:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='هناك سؤال اختيار من متعدد. أجب عن السؤال بالرد A أو B أو C أو D.\nيتعلق بـ تعلم الآلة\nالسؤال: في كعكة يان لوكون، الكرز في الأعلى هو: \nA. التعلم المعزز\nB. التعلم الذاتي المراقب\nC. التعلم غير المراقب\nD. التعلم المراقب\nلنفكر خطوة بخطوة\nالإجابة:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='هناك سؤال اختيار من متعدد. أجب عن السؤال بالرد A أو B أو C أو D.\nيتعلق بـ الفلسفة\nالسؤال: يقترح سقراط أن المقدس هو جزء واحد من:\nA. ما هو حكيم.\nB. ما هو عادل.\nC. ما هو جميل.\nD. ما هو قانوني.\nلنفكر خطوة بخطوة\nالإجابة:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='هذه سؤال اختيار واحد. أجب عن السؤال بالرد A أو B أو C أو D.\nيتعلق الأمر بتاريخ الولايات المتحدة في المدارس الثانوية.\nسؤال: هذه السؤال يشير إلى المعلومات التالية. ربما، مع ذلك، أنا أكثر وعيًا بأهمية الحريات المدنية في هذه اللحظة المحددة من تاريخنا من أي شخص آخر، لأنني أسافر عبر البلاد وألتقي بالناس وأرى أشياء حدثت لأناس عاديين، أدرك ما يعنيه للديمقراطية الحفاظ على حرياتنا المدنية. طوال السنوات كان علينا أن نقاتل من أجل الحرية المدنية، ونعلم أن هناك أوقاتًا تصبح فيها الأضواء خافتة، وكلما حدث ذلك تكون الديمقراطية في خطر. الآن، إلى حد كبير بسبب الحالة المضطربة للعالم ككل، اختفت الحريات المدنية في العديد من البلدان الأخرى. من المستحيل، بالطبع، أن تكون في حالة حرب وأن تحافظ على حرية الصحافة وحرية التعبير وحرية التجمع. إنها تختفي تلقائيًا. وهكذا في العديد من البلدان التي كانت آمنة عادة، أصبحت اليوم قد اختفت. في بلدان أخرى، حتى قبل أن تأتي الحرب، لم تختف فقط حرية الصحافة وحرية التجمع وحرية التعبير، بل اختفت أيضًا حرية الدين. ولذلك، نحن هنا في هذا البلد، لدينا مسؤولية كبيرة. نحن في السلام. ليس لدينا سبب من المخاوف التي تتحكم في العديد من الشعوب الأخرى في جميع أنحاء العالم؛ لذلك يجب علينا أن نحافظ على حريات الديمقراطية. —إلينور روزفلت، خطاب إلى الاتحاد الأمريكي للحريات المدنية، شيكاغو، إلينوي، 14 مارس 1940.\nفي خطابها، أشارت إلينور روزفلت إلى التهديد السابق للحريات المدنية الذي أنشأته أي مما يلي؟\nA. الحرب العالمية الأولى\nB. الصفقة الجديدة\nC. الحرب الباردة\nD. الكساد العظيم\nدعونا نفكر خطوة بخطوة.\nالجواب:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'\nفقط يجب الرد على الخيار A أو B أو C أو D. \nالإجابة هي:'), + ] + +def get_few_shot_prompts_bn(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='এটি একটি একটি বিকল্প প্রশ্ন। A, B, C অথবা D এর মাধ্যমে উত্তর দিন।\nএটি মেশিন লার্নিং সম্পর্কে\nপ্রশ্ন: ইয়ান লেকুনের কেকের উপর চেরি হল: \nA. শক্তিশালীকরণ শেখা\nB. স্ব-নিরীক্ষিত শেখা\nC. অ-নিরীক্ষিত শেখা\nD. নিরীক্ষিত শেখা\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='এটি একটি একটি বিকল্প প্রশ্ন। A, B, C অথবা D এর মাধ্যমে উত্তর দিন।\nএটি বিমূর্ত বীজগণিত সম্পর্কে\nপ্রশ্ন: Z_4 x Z_6 এর কোন একটি উপাদানের জন্য সর্বাধিক সম্ভাব্য র‍্যাঙ্ক খুঁজুন।\nA.4\nB.6\nC.12\nD.24\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='এটি একটি একটি বিকল্প প্রশ্ন। A, B, C অথবা D এর মাধ্যমে উত্তর দিন।\nএটি উচ্চ বিদ্যালয়ের ভূগোল সম্পর্কে\nপ্রশ্ন: নিচের কোন ধর্ম একটি বিশ্বজনীন ধর্ম? \nA. তাওবাদ\nB. ইসলাম\nC. শিন্টোবাদ\nD. কনফুসিয়াসবাদ\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='এটি একটি একটি বিকল্প প্রশ্ন। A, B, C অথবা D এর মাধ্যমে উত্তর দিন।\nএটি দর্শনশাস্ত্র সম্পর্কে\nপ্রশ্ন: সক্রেটিস নির্দেশ করেন যে পবিত্র হচ্ছে:\nA. যা বিজ্ঞ\nB. যা ন্যায়িক\nC. যা সুন্দর\nD. যা আইনগত\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='এটি একটি একক পছন্দের প্রশ্ন। প্রশ্নের উত্তর A, B, C অথবা D দিয়ে দিন।\nএটি উচ্চ বিদ্যালয়ের মার্কিন ইতিহাস সম্পর্কে।\nপ্রশ্ন: এই প্রশ্নটি নিম্নলিখিত তথ্যের সাথে সম্পর্কিত। তবে, शायद আমি আমাদের ইতিহাসের এই নির্ভরযোগ্য মুহূর্তে নাগরিক স্বাধীনতার গুরুত্ব সম্পর্কে অন্য যে কারো চেয়ে বেশি সচেতন, কারণ আমি দেশজুড়ে ভ্রমণ করি এবং মানুষদের সঙ্গে দেখা করি এবং ছোট মানুষদের সাথে ঘটে যাওয়া ঘটনার কথা দেখি। আমি বুঝতে পারি যে আমাদের নাগরিক স্বাধীনতাগুলো রক্ষা করা কীভাবে গণতন্ত্রের জন্য গুরুত্বপূর্ণ। আমরা সাল জুড়ে নাগরিক স্বাধীনতার জন্য লড়াই করতে হয়েছে, এবং আমরা জানি যে এমন সময় আসে যখন আলো ধীরে ধীরে ম্লান হয়, এবং যখনই তা ঘটে, গণতন্ত্র বিপদে পড়ে। এখন, বিশালাংশে বিশ্বজুড়ে অস্থির পরিস্থিতির কারণে, অনেক অন্যান্য দেশে নাগরিক স্বাধীনতা হারিয়ে গেছে। যুদ্ধ চলাকালীন সংবাদপত্রের স্বাধীনতা, বক্তৃতার স্বাধীনতা এবং সমাবেশের স্বাধীনতা বজায় রাখা অবশ্যই অসম্ভব। সেগুলি স্বয়ংক্রিয়ভাবে消失 হয়ে যায়। এবং তাই বহু দেশে যেগুলি সাধারণত নিরাপদ ছিল, আজ তারা gone হয়ে গেছে। অন্যান্য দেশে, এমনকি যুদ্ধ আসার আগেই, শুধুমাত্র সংবাদপত্রের স্বাধীনতা, সমাবেশের স্বাধীনতা, এবং বক্তৃতার স্বাধীনতা হারিয়ে যায়নি, তবে ধর্মের স্বাধীনতাও消失 হয়ে গেছে। এবং তাই আমরা জানি যে এই দেশে আমাদের একটি গুরুতর দায়িত্ব আছে। আমরা শান্তিতে আছি। আমাদের কাছে বিশ্বের অনেক অন্যান্য লোকজনের নিয়ন্ত্রণ করা ভয়ের জন্য কোন কারণ নেই; অতএব, আমাদের গণতন্ত্রের স্বাধীনতাগুলোকে রক্ষা করতে হবে। —এলিনর রুজভেল্ট, আমেরিকান সিভিল লিবারটিজ ইউনিয়নের সম্বোধন, শিকাগো, ইলিনয়, ১৪ই মার্চ, ১৯৪০।\nতার বক্তৃতায়, এলিনর রুজভেল্ট পূর্ববর্তী নাগরিক স্বাধীনতার প্রতি হুমকি সম্পর্কে কোনটি উল্লেখ করেছেন?\nA. বিশ্বযুদ্ধ I\nB. নয়া চুক্তি\nC. ঠাণ্ডা যুদ্ধ\nD. গ্রেট ডিপ্রেশন\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'শুধু বিকল্প A, B, C বা D এর উত্তর দিন, \nউত্তর হলো:'), + ] + +def get_few_shot_prompts_de(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='Das ist eine einzelne Auswahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.\nEs geht um maschinelles Lernen.\nFrage: In Yann LeCuns Kuchen ist die Kirsche oben:\nA. Verstärkendes Lernen\nB. Selbstüberwachtes Lernen\nC. Unüberwachtes Lernen\nD. Überwachtes Lernen\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='Das ist eine einzelne Auswahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.\nEs geht um abstrakte Algebra.\nFrage: Finde die maximal mögliche Ordnung für ein Element von Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='Das ist eine einzelne Auswahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.\nEs geht um Geografie in der High School.\nFrage: Welche der folgenden Religionen ist eine universalisierende Religion? \nA. Taoismus\nB. Islam\nC. Shintoismus\nD. Konfuzianismus\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Das ist eine einzelne Auswahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.\nEs geht um Philosophie.\nFrage: Sokrates schlägt vor, dass das Heilige ein Teil von:\nA. was weise ist.\nB. was gerecht ist.\nC. was schön ist.\nD. was legal ist.\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Dies ist eine Einzelwahlfrage. Beantworten Sie die Frage, indem Sie A, B, C oder D antworten.\nEs geht um die amerikanische Geschichte in der High School.\nFrage: Diese Frage bezieht sich auf die folgenden Informationen. Vielleicht bin ich mir jedoch in diesem bestimmten Moment unserer Geschichte mehr bewusst, wie wichtig die Bürgerrechte sind als jeder andere, weil ich durch das Land reise, Menschen treffe und Dinge sehe, die kleinen Menschen passiert sind. Ich erkenne, was es bedeutet, die Bürgerrechte zu bewahren, um die Demokratie zu erhalten. Im Verlauf der Jahre mussten wir für die Bürgerrechte kämpfen, und wir wissen, dass es Zeiten gibt, in denen das Licht eher schwach wird, und jedes Mal, wenn das passiert, ist die Demokratie in Gefahr. Jetzt, größtenteils aufgrund des angespannten Zustands der Welt als Ganzes, sind die Bürgerrechte in vielen anderen Ländern verschwunden. Es ist unmöglich, im Krieg zu sein und gleichzeitig die Pressefreiheit, die Meinungsfreiheit und die Versammlungsfreiheit aufrechtzuerhalten. Sie verschwinden automatisch. Und so sind in vielen Ländern, in denen sie normalerweise sicher waren, heute verschwunden. In anderen Ländern verschwanden nicht nur die Pressefreiheit und die Versammlungsfreiheit und die Meinungsfreiheit, sondern auch die Religionsfreiheit, selbst bevor der Krieg kam. Und so wissen wir hier in diesem Land, dass wir eine ernsthafte Verantwortung haben. Wir sind in Frieden. Wir haben keinen Grund für die Ängste, die so viele andere Menschen auf der ganzen Welt regieren; daher müssen wir die Freiheiten der Demokratie schützen. —Eleanor Roosevelt, Ansprache an die Amerikanische Vereinigung für Bürgerrechte, Chicago, Illinois, 14. März 1940.\nIn ihrer Rede spielte Eleanor Roosevelt auf die frühere Bedrohung der Bürgerrechte an, die durch welches der folgenden Dinge geschaffen wurde?\nA. Erster Weltkrieg\nB. New Deal\nC. Kalter Krieg\nD. Große Depression\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'Bitte beantworten Sie nur mit Option A, B, C oder D. \nAntwort:'), + ] + +def get_few_shot_prompts_es(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de aprendizaje automático.\nPregunta: En el pastel de Yann LeCun, la cereza en la cima es: \nA. aprendizaje por refuerzo\nB. aprendizaje auto-supervisado\nC. aprendizaje no supervisado\nD. aprendizaje supervisado\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de álgebra abstracta.\nPregunta: Encuentra el orden máximo posible para algún elemento de Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de geografía de secundaria.\nPregunta: ¿Cuál de la siguiente religión es una religión universalizante? \nA. Taoísmo\nB. Islam\nC. Shintoísmo\nD. Confucianismo\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de filosofía.\nPregunta: Sócrates sugiere que lo sagrado es una parte de:\nA. lo que es prudente.\nB. lo que es justo.\nC. lo que es bello.\nD. lo que es legal.\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de la historia de EE.UU. en la escuela secundaria.\nPregunta: Esta pregunta se refiere a la siguiente información. Sin embargo, quizás soy más consciente de la importancia de las libertades civiles en este momento particular de nuestra historia que cualquier otra persona, porque viajo por el país, encuentro personas y veo cosas que han sucedido a las personas pequeñas, me doy cuenta de lo que significa para la democracia preservar nuestras libertades civiles. A lo largo de los años hemos tenido que luchar por la libertad civil, y sabemos que hay momentos en que la luz se vuelve bastante tenue, y cada vez que eso sucede, la democracia está en peligro. Ahora, en gran parte debido al estado problemático del mundo en su conjunto, las libertades civiles han desaparecido en muchos otros países. Es imposible, por supuesto, estar en guerra y mantener la libertad de prensa, la libertad de expresión y la libertad de reunión. Desaparecen automáticamente. Y así, en muchos países donde normalmente eran seguras, hoy han desaparecido. En otros países, incluso antes de que llegara la guerra, no solo la libertad de prensa y la libertad de reunión, y la libertad de expresión desaparecieron, sino que también desapareció la libertad de religión. Y así sabemos que aquí en este país, tenemos una grave responsabilidad. Estamos en paz. No tenemos razón para los temores que gobiernan a tantas otras personas en todo el mundo; por lo tanto, tenemos que proteger las libertades de la democracia. —Eleanor Roosevelt, Discurso ante la Unión Americana de Libertades Civiles, Chicago, Illinois, 14 de marzo de 1940.\nEn su discurso, Eleanor Roosevelt aludió a la amenaza anterior a las libertades civiles creada por cuál de las siguientes?\nA. Primera Guerra Mundial\nB. El New Deal\nC. La Guerra Fría\nD. La Gran Depresión\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'Solo necesitas responder con la opción A, B, C o D. \nRespuesta:'), + ] + +def get_few_shot_prompts_fr(_hint, _prompt): + return [ + dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nIl s'agit d'apprentissage automatique.\nQuestion : Dans le gâteau de Yann LeCun, la cerise sur le dessus est :\nA. apprentissage par renforcement\nB. apprentissage auto-supervisé\nC. apprentissage non supervisé\nD. apprentissage supervisé\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nIl s'agit d'algèbre abstraite.\nQuestion : Trouvez l'ordre maximum possible pour un élément de Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nIl s'agit de géographie de lycée.\nQuestion : Laquelle des religions suivantes est une religion universalisante ?\nA. Taoïsme\nB. Islam\nC. Shintoïsme\nD. Confucianisme\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nIl s'agit de philosophie.\nQuestion : Socrate suggère que le sacré est une partie de :\nA. ce qui est prudent.\nB. ce qui est juste.\nC. ce qui est beau.\nD. ce qui est légal.\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nC'est sur l'histoire des États-Unis au lycée.\nQuestion : Cette question se réfère aux informations suivantes. Peut-être, cependant, je suis plus conscient de l'importance des libertés civiles à ce moment particulier de notre histoire que quiconque, car je voyage à travers le pays, rencontre des gens et vois des choses qui sont arrivées à des petites personnes, je réalise ce que signifie pour la démocratie de préserver nos libertés civiles. Au fil des ans, nous avons dû nous battre pour la liberté civile, et nous savons qu'il y a des moments où la lumière devient plutôt faible, et chaque fois que cela se produit, la démocratie est en danger. Maintenant, en grande partie à cause de l'état troublé du monde dans son ensemble, les libertés civiles ont disparu dans de nombreux autres pays. Il est impossible, bien sûr, d'être en guerre et de maintenir la liberté de la presse, la liberté d'expression et la liberté de réunion. Elles disparaissent automatiquement. Et donc dans de nombreux pays où elles étaient normalement en sécurité, aujourd'hui, elles sont parties. Dans d'autres pays, même avant l'arrivée de la guerre, non seulement la liberté de la presse et la liberté de réunion, et la liberté d'expression ont disparu, mais la liberté de religion a aussi disparu. Et donc nous savons ici dans ce pays, nous avons une grave responsabilité. Nous sommes en paix. Nous n'avons aucune raison pour les peurs qui gouvernent tant d'autres peuples à travers le monde ; par conséquent, nous devons garder les libertés de la démocratie. —Eleanor Roosevelt, Adresse à l'Union Américaine pour les Libertés Civiles, Chicago, Illinois, 14 mars 1940\nDans son discours, Eleanor Roosevelt a fait allusion à la menace antérieure pour les libertés civiles créée par laquelle des suivantes ?\nA. Première Guerre Mondiale\nB. Le New Deal\nC. La Guerre froide\nD. La Grande Dépression\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"Vous n'avez qu'à répondre avec l'option A, B, C ou D. \nRéponse :"), + ] + +def get_few_shot_prompts_hi(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह मशीन लर्निंग के बारे में है\nप्रश्न: Yann LeCun की केक में, ऊपर का चेरी है: \nA.रिवॉर्ड लर्निंग\nB.सेल्फ-सुपरवाइज्ड लर्निंग\nC.अनसुपरवाइज्ड लर्निंग\nD.सुपरवाइज्ड लर्निंग\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह अमूर्त बीजगणित के बारे में है\nप्रश्न: Z_4 x Z_6 के कुछ तत्व के लिए अधिकतम संभावित क्रम ढूंढें।\n A.4\nB.6\nC.12\nD.24\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह उच्च विद्यालय भूगोल के बारे में है\nप्रश्न: नीचे कौन सा धर्म सार्वभौमिक धर्म है? A.ताओवाद\n B.इस्लाम\n C.शिंतो धर्म\n D.कन्फ्यूशियसवाद\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह दर्शनशास्त्र के बारे में है\nप्रश्न: सुकरात सुझाते हैं कि पवित्र एक भाग है:\nA.जो विवेकी है।\nB.जो न्यायपूर्ण है।\nC.जो सुंदर है।\nD.जो कानूनी है।\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह दर्शनशास्त्र के बारे में है\nप्रश्न: यह प्रश्न नीचे दी गई जानकारी के संदर्भ में है। शायद, फिर भी, मैं हमारे इतिहास के इस विशेष क्षण में नागरिक स्वतंत्रताओं के महत्व के प्रति अन्य किसी से अधिक जागरूक हूँ, क्योंकि मैं देश भर में यात्रा करता हूँ और लोगों से मिलता हूँ और उन चीज़ों को देखता हूँ जो छोटे लोगों के साथ हुई हैं, मैं समझता हूँ कि हमारी नागरिक स्वतंत्रताओं को बनाए रखना लोकतंत्र का क्या अर्थ है। वर्षों से, हमें नागरिक स्वतंत्रता के लिए लड़ना पड़ा है, और हम जानते हैं कि ऐसे क्षण होते हैं जब रोशनी कम हो जाती है, और हर बार ऐसा होने पर लोकतंत्र खतरे में होता है। अब, मुख्यतः पूरी दुनिया की चिंताजनक स्थिति के कारण, कई अन्य देशों में नागरिक स्वतंत्रताएँ गायब हो गई हैं। यह निश्चित रूप से असंभव है, युद्ध में रहकर प्रेस की स्वतंत्रता और भाषण की स्वतंत्रता और सभा की स्वतंत्रता को बनाए रखना। वे स्वचालित रूप से गायब हो जाती हैं। और इसलिए कई देशों में जहाँ सामान्यतः वे सुरक्षित थीं, आज वे गायब हो गई हैं। अन्य देशों में, यहाँ तक कि युद्ध आने से पहले, न केवल प्रेस की स्वतंत्रता और सभा की स्वतंत्रता और भाषण की स्वतंत्रता गायब हो गई, बल्कि धर्म की स्वतंत्रता भी गायब हो गई। और इसलिए हम यहाँ इस देश में जानते हैं, हमें एक गंभीर जिम्मेदारी है। हम शांति में हैं। हमारे पास उन डर के लिए कोई कारण नहीं है जो दुनिया भर में tantos अन्य लोगों को प्रभावित करते हैं; इसलिए, हमें लोकतंत्र की स्वतंत्रताओं की रक्षा करनी चाहिए। —Eleanor Roosevelt, American Civil Liberties Union को दिए गए संबोधन में, शिकागो, इलिनोइस, 14 मार्च, 1940 उसकी स्पीच में, Eleanor Roosevelt ने नागरिक स्वतंत्रताओं के लिए पूर्व के खतरे का उल्लेख किसके द्वारा किया?\nA.विश्व युद्ध I\nB.न्यू डील\nC.शीत युद्ध\nD.महान मंदी\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'आपको केवल विकल्प A, B, C या D के साथ जवाब देना है। \nउत्तर:'), + ] + +def get_few_shot_prompts_id(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='Ini adalah pertanyaan pilihan ganda. Jawablah pertanyaan ini dengan menjawab A, B, C, atau D.\nIni tentang pembelajaran mesin.\nPertanyaan: Dalam kue Yann LeCun, ceri di atas adalah:\nA. pembelajaran penguatan\nB. pembelajaran mandiri\nC. pembelajaran tak terawasi\nD. pembelajaran terawasi\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='Ini adalah pertanyaan pilihan ganda. Jawablah pertanyaan ini dengan menjawab A, B, C, atau D.\nIni tentang aljabar abstrak.\nPertanyaan: Temukan urutan maksimum yang mungkin untuk beberapa elemen dari Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='Ini adalah pertanyaan pilihan ganda. Jawablah pertanyaan ini dengan menjawab A, B, C, atau D.\nIni tentang geografi sekolah menengah.\nPertanyaan: Agama mana di bawah ini yang merupakan agama universal?\nA. Taoisme\nB. Islam\nC. Shintoisme\nD. Konfusianisme\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Ini adalah pertanyaan pilihan ganda. Jawablah pertanyaan ini dengan menjawab A, B, C, atau D.\nIni tentang filsafat.\nPertanyaan: Socrates menyarankan bahwa yang suci adalah salah satu bagian dari:\nA. apa yang bijak.\nB. apa yang adil.\nC. apa yang indah.\nD. apa yang legal.\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Ini adalah pertanyaan pilihan ganda. Jawab pertanyaan dengan menjawab A, B, C, atau D.\nIni tentang sejarah AS di sekolah menengah.\nPertanyaan: Pertanyaan ini merujuk pada informasi berikut. Namun, mungkin saya lebih sadar akan pentingnya kebebasan sipil pada momen tertentu dalam sejarah kita daripada siapa pun, karena saya berkeliling negara dan bertemu orang-orang serta melihat hal-hal yang terjadi pada orang-orang kecil, saya menyadari apa artinya bagi demokrasi untuk memelihara kebebasan sipil kita. Selama bertahun-tahun kita harus berjuang untuk kebebasan sipil, dan kita tahu bahwa ada kalanya cahaya menjadi redup, dan setiap kali itu terjadi, demokrasi berada dalam bahaya. Sekarang, sebagian besar karena keadaan dunia yang bermasalah secara keseluruhan, kebebasan sipil telah menghilang di banyak negara lain. Tentu saja, adalah mustahil untuk berperang dan tetap mempertahankan kebebasan pers, kebebasan berbicara, dan kebebasan berkumpul. Mereka menghilang secara otomatis. Dan jadi di banyak negara di mana biasanya mereka aman, hari ini mereka sudah hilang. Di negara-negara lain, bahkan sebelum perang datang, tidak hanya kebebasan pers dan kebebasan berkumpul, serta kebebasan berbicara yang hilang, tetapi kebebasan beragama juga hilang. Dan jadi kami tahu di negara ini, kami memiliki tanggung jawab yang berat. Kami berada dalam keadaan damai. Kami tidak punya alasan untuk ketakutan yang mengatur begitu banyak orang di seluruh dunia; oleh karena itu, kami harus menjaga kebebasan demokrasi. —Eleanor Roosevelt, Pidato kepada Asosiasi Kebebasan Sipil Amerika, Chicago, Illinois, 14 Maret 1940\nDalam pidatonya, Eleanor Roosevelt merujuk pada ancaman sebelumnya terhadap kebebasan sipil yang diciptakan oleh mana di antara berikut ini?\nA. Perang Dunia I\nB. New Deal\nC. Perang Dingin\nD. Depresi Besar\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'Anda hanya perlu menjawab dengan opsi A, B, C, atau D. \nJawaban:'), + ] + +def get_few_shot_prompts_it(_hint, _prompt): + return [ + dict(role='HUMAN', prompt="Si tratta di una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nÈ riguardo al machine learning.\nDomanda: Nella torta di Yann LeCun, la ciliegina sulla torta è:\nA. apprendimento per rinforzo\nB. apprendimento auto-supervisionato\nC. apprendimento non supervisionato\nD. apprendimento supervisionato\nÈ sufficiente rispondere con l'opzione A, B, C o D.\nRisposta:"), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt="Si tratta di una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nÈ riguardo all'algebra astratta.\nDomanda: Trova l'ordine massimo possibile per alcuni elementi di Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nPensiamo passo dopo passo.\nRisposta:"), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt="Si tratta di una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nÈ riguardo alla geografia delle scuole superiori.\nDomanda: Quale religione qui sotto è una religione universalista?\nA. Taoismo\nB. Islam\nC. Shintoismo\nD. Confucianesimo\nÈ sufficiente rispondere con l'opzione A, B, C o D.\nRisposta:"), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt="Si tratta di una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nÈ riguardo alla filosofia.\nDomanda: Socrate suggerisce che il sacro è una parte di:\nA. ciò che è prudente.\nB. ciò che è giusto.\nC. ciò che è bello.\nD. ciò che è legale.\nÈ sufficiente rispondere con l'opzione A, B, C o D.\nRisposta:"), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt="Questa è una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nRiguarda la storia degli Stati Uniti delle scuole superiori.\nDomanda: Questa domanda si riferisce alle seguenti informazioni. Tuttavia, forse sono più consapevole dell'importanza delle libertà civili in questo particolare momento della nostra storia rispetto a chiunque altro, perché viaggio per il paese, incontro persone e vedo cose che sono accadute a persone comuni, mi rendo conto di cosa significhi per la democrazia preservare le nostre libertà civili. Negli anni abbiamo dovuto combattere per la libertà civile e sappiamo che ci sono momenti in cui la luce si fa piuttosto fioca, e ogni volta che ciò accade, la democrazia è in pericolo. Ora, principalmente a causa dello stato travagliato del mondo nel suo insieme, le libertà civili sono scomparse in molti altri paesi. È impossibile, naturalmente, essere in guerra e mantenere la libertà di stampa, la libertà di parola e la libertà di riunione. Esse scompaiono automaticamente. E così, in molti paesi dove normalmente erano sicure, oggi sono svanite. In altri paesi, anche prima che arrivasse la guerra, non solo la libertà di stampa e la libertà di riunione, e la libertà di parola sono scomparse, ma anche la libertà di religione è scomparsa. E così sappiamo qui in questo paese, abbiamo una grave responsabilità. Siamo in pace. Non abbiamo ragione per le paure che governano così tante altre persone nel mondo; pertanto, dobbiamo difendere le libertà della democrazia. —Eleanor Roosevelt, Discorso all'Unione Americana per le Libertà Civili, Chicago, Illinois, 14 marzo 1940.\nNel suo discorso, Eleanor Roosevelt alluse alla minaccia precedente alle libertà civili creata da quale delle seguenti?\nA. Prima Guerra Mondiale\nB. Il New Deal\nC. Guerra Fredda\nD. Grande Depressione\nÈ sufficiente rispondere con l'opzione A, B, C o D.\nRisposta:"), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"È sufficiente rispondere con l'opzione A, B, C o D. \nRisposta:"), + ] + +def get_few_shot_prompts_ja(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれは機械学習に関するものです。\n質問:ヤン・ルカンのケーキにおいて、一番上のチェリーは:\nA. 強化学習\nB. 自己監督学習\nC. 教師なし学習\nD. 教師あり学習\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれは抽象代数学に関するものです。\n質問:Z_4 x Z_6 のいくつかの要素の最大可能な順序を求めなさい。\nA. 4\nB. 6\nC. 12\nD. 24\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれは高校の地理に関するものです。\n質問:以下のどの宗教が普遍化宗教ですか?\nA. 道教\nB. イスラム教\nC. 神道\nD. 儒教\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれは哲学に関するものです。\n質問:ソクラテスは、聖なるものが以下のどれの一部であると示唆していますか?\nA. 賢明なもの\nB. 正義のあるもの\nC. 美しいもの\nD. 合法的なもの\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれはアメリカの歴史についてです。\n質問:この質問は次の情報を参照しています。しかし、私はこの特定の歴史の瞬間における市民の自由の重要性に他の誰よりも敏感であるかもしれません。なぜなら、私は国を旅し、人々に会い、小さな人々に起こったことを見てきたからです。民主主義が市民の自由を守ることが何を意味するのかを理解しています。私たちは市民の自由のために戦わなければならなかった年月があり、光がかなり薄暗くなる時期があることを知っています。そのたびに民主主義は危険にさらされます。今、主に世界全体の不安定な状態によって、多くの他の国で市民の自由が消失しています。もちろん、戦争をしていては報道の自由、言論の自由、集会の自由を保つことは不可能です。それらは自動的に消えてしまいます。そして、通常は安全であった多くの国では、今日、これらはなくなりました。他の国々では、戦争が来る前から、報道の自由や集会の自由、言論の自由だけでなく、宗教の自由も消えていました。したがって、私たちはこの国で重大な責任を負っていることを知っています。私たちは平和です。他の国々の多くの人々が抱える恐れの理由はないので、私たちは民主主義の自由を守らなければなりません。 —エレノア・ルーズベルト、1940年3月14日イリノイ州シカゴでのアメリカ市民自由連合への演説\n彼女の演説で、エレノア・ルーズベルトは市民の自由に対する以前の脅威をどのように言及しましたか?\nA. 第一次世界大戦\nB. ニューディール\nC. 冷戦\nD. 大恐慌\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:'), + ] + +def get_few_shot_prompts_ko(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이 질문은 기계 학습에 관한 것입니다.\n질문: 얀 르쿤의 케이크에서 가장 위에 있는 체리는:\nA. 강화 학습\nB. 자기 지도 학습\nC. 비지도 학습\nD. 지도 학습\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이 질문은 추상 대수학에 관한 것입니다.\n질문: Z_4 x Z_6의 어떤 요소의 최대 가능한 순서를 찾으세요.\nA. 4\nB. 6\nC. 12\nD. 24\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이 질문은 고등학교 지리에 관한 것입니다.\n질문: 아래의 어떤 종교가 보편화 종교입니까?\nA. 도교\nB. 이슬람교\nC. 신도\nD. 유교\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이 질문은 철학에 관한 것입니다.\n질문: 소크라테스는 신성한 것이 다음 중 어떤 것의 일부라고 제안합니까?\nA. 신중한 것\nB. 정의로운 것\nC. 아름다운 것\nD. 합법적인 것\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='이것은 단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이는 미국 역사에 관한 것입니다.\n질문: 이 질문은 다음 정보를 참조합니다. 그러나 어쩌면 나는 이 특정 역사적 순간에 시민 권리의 중요성에 대해 다른 누구보다 더 잘 인식하고 있습니다. 왜냐하면 나는 나라를 여행하며 사람들을 만나고 작은 사람들에게 일어난 일들을 보았기 때문입니다. 나는 민주주의가 시민 권리를 보존한다는 것이 무엇을 의미하는지 깨닫습니다. 우리는 시민 권리를 위해 싸워야 했던 여러 해를 거쳐 왔으며, 빛이 흐려지는 순간이 있음을 알고 있습니다. 그럴 때마다 민주주의가 위험에 처한 것처럼 느껴집니다. 지금은 전 세계의 불안정한 상태로 인해 많은 다른 국가에서 시민 권리가 사라졌습니다. 전쟁 중에 언론의 자유와 표현의 자유, 집회의 자유를 유지하는 것은 불가능합니다. 그것들은 자동으로 사라집니다. 그리고 따라서 일반적으로 안전했던 많은 국가에서는 오늘날 그것들이 사라졌습니다. 다른 국가에서는 전쟁이 오기 전에도 언론의 자유와 집회의 자유, 표현의 자유가 사라졌을 뿐만 아니라 종교의 자유도 사라졌습니다. 그래서 우리는 이 나라에서 중대한 책임을 지고 있다는 것을 압니다. 우리는 평화로운 상태에 있습니다. 전 세계의 많은 사람들이 느끼는 두려움에 대한 이유가 없으므로 우리는 민주주의의 자유를 지켜야 합니다. —엘리노르 루즈벨트, 1940년 3월 14일 일리노이주 시카고에서 미국 시민 자유 연합에 대한 연설\n그녀의 연설에서 엘리노르 루즈벨트는 시민 권리에 대한 이전의 위협이 어떤 것에 의해 발생했는지 언급했습니다.\nA. 제1차 세계 대전\nB. 뉴딜\nC. 냉전\nD. 대공황\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:'), + ] + +def get_few_shot_prompts_pt(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre aprendizado de máquina.\nPergunta: No bolo de Yann LeCun, a cereja no topo é:\nA. aprendizado por reforço\nB. aprendizado auto-supervisionado\nC. aprendizado não supervisionado\nD. aprendizado supervisionado\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre álgebra abstrata.\nPergunta: Encontre a ordem máxima possível para algum elemento de Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre geografia do ensino médio.\nPergunta: Qual religião abaixo é uma religião universalizante?\nA. Taoísmo\nB. Islamismo\nC. Xintoísmo\nD. Confucionismo\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre filosofia.\nPergunta: Sócrates sugere que o sagrado é uma parte de:\nA. o que é prudente.\nB. o que é justo.\nC. o que é belo.\nD. o que é legal.\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre história dos Estados Unidos do ensino médio.\nPergunta: Esta pergunta se refere à seguinte informação. Talvez, no entanto, eu esteja mais consciente da importância das liberdades civis neste momento particular da nossa história do que qualquer outra pessoa, porque eu viajo pelo país e conheço pessoas e vejo coisas que aconteceram com pessoas pequenas, percebo o que significa para a democracia preservar nossas liberdades civis. Ao longo dos anos, tivemos que lutar pela liberdade civil, e sabemos que há momentos em que a luz fica bastante fraca, e toda vez que isso acontece, a democracia está em perigo. Agora, em grande parte por causa do estado problemático do mundo como um todo, as liberdades civis desapareceram em muitos outros países. É impossível, é claro, estar em guerra e manter a liberdade de imprensa, a liberdade de expressão e a liberdade de reunião. Elas desaparecem automaticamente. E assim, em muitos países onde normalmente estavam seguras, hoje desapareceram. Em outros países, mesmo antes da guerra chegar, não apenas a liberdade de imprensa e a liberdade de reunião, e a liberdade de expressão desapareceram, mas a liberdade de religião também desapareceu. E assim, sabemos aqui neste país, temos uma grave responsabilidade. Estamos em paz. Não temos razão para os medos que governam tantas outras pessoas ao redor do mundo; portanto, temos que proteger as liberdades da democracia. —Eleanor Roosevelt, Discurso à União Americana pelas Liberdades Civis, Chicago, Illinois, 14 de março de 1940\nEm seu discurso, Eleanor Roosevelt aludiu à ameaça anterior às liberdades civis criada por qual das seguintes?\nA. Primeira Guerra Mundial\nB. O Novo Pacto\nC. A Guerra Fria\nD. A Grande Depressão\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'Você só precisa responder com a opção A, B, C ou D. \nResposta:'), + ] + +def get_few_shot_prompts_zh(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于美国历史的。\n问题:这个问题参考以下信息。或许,我对我们历史这一特定时刻公民自由重要性的认识,比其他任何人都要深刻,因为我在全国各地旅行,见到人们,看到那些发生在小人物身上的事情,我意识到民主意味着要保护我们的公民自由。在这些年里,我们不得不为公民自由而奋斗,我们知道有时光芒会变得微弱,每当这种情况发生时,民主就处于危险之中。现在,主要由于整个世界的动荡状态,许多其他国家的公民自由已经消失。当然,在战争中保持新闻自由、言论自由和集会自由是不可能的。它们会自动消失。因此,在许多通常是安全的国家里,今天它们已经消失。在其他国家,即使在战争到来之前,不仅新闻自由、集会自由和言论自由消失了,宗教自由也消失了。因此,我们知道,在这个国家,我们有着重大的责任。我们处于和平状态。我们没有理由去害怕其他世界上许多人所感受到的恐惧;因此,我们必须保护民主的自由。——埃莉诺·罗斯福,1940年3月14日在伊利诺伊州芝加哥的美国公民自由联盟演讲\n在她的演讲中,埃莉诺·罗斯福提到了哪一事件对公民自由造成了早期威胁?\nA. 第一次世界大战\nB. 新政\nC. 冷战\nD. 大萧条\n只需选择 A、B、C 或 D 来回答该问题。\n答案:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于高中地理的。\n问题:以下哪个宗教是普世宗教?\nA. 道教\nB. 伊斯兰教\nC. 神道教\nD. 儒教\n只需选择 A、B、C 或 D 来回答该问题。\n答案:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于哲学的。\n问题:苏格拉底建议神圣是以下哪个部分:\nA. 什么是谨慎的。\nB. 什么是正义的。\nC. 什么是美的。\nD. 什么是合法的。\n只需选择 A、B、C 或 D 来回答该问题。\n答案:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于抽象代数的。\n问题:找到 Z_4 x Z_6 中某个元素的最大可能阶数。\nA. 4\nB. 6\nC. 12\nD. 24\n只需选择 A、B、C 或 D 来回答该问题。\n答案:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于机器学习的。\n问题:在 Yann LeCun 的蛋糕中,最上面的樱桃是:\nA. 强化学习\nB. 自监督学习\nC. 无监督学习\nD. 监督学习\n只需选择 A、B、C 或 D 来回答该问题。\n答案:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'只需选择 A、B、C 或 D 来回答该问题。 \n回答:'), + ] + +def get_few_shot_prompts_sw(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu filosofia\nSwali: Swali hili linahusiana na taarifa ifuatayo. Huenda, hata hivyo, mimi nina ufahamu zaidi wa umuhimu wa uhuru wa kiraia katika wakati huu maalum wa historia yetu kuliko mtu mwingine yeyote, kwa sababu nasafiri nchini na kukutana na watu na kuona mambo yanayotokea kwa watu wadogo, ninatambua inamaanisha nini kwa demokrasia kuhifadhi uhuru wetu wa kiraia. kwa kupitia miaka yote tumepaswa kupigania uhuru wa kiraia, na tunajua kuwa kuna nyakati ambapo mwanga unakuwa dhaifu, na kila wakati hii inatokea, demokrasia iko katika hatari. Sasa, hasa kwa sababu ya hali ya machafuko ya ulimwengu kwa jumla, uhuru wa kiraia umepotea katika nchi nyingi nyingine. Haiwezekani, kwa kweli, kuwa katika vita na kudumisha uhuru wa vyombo vya habari na uhuru wa kusema na uhuru wa kukusanyika. Vinapotea kiotomatiki. Na hivyo katika nchi nyingi ambapo kwa kawaida walikuwa salama, leo wameondoka. Katika nchi zingine, hata kabla ya vita kuja, si tu uhuru wa vyombo vya habari na uhuru wa kukusanyika, na uhuru wa kusema umepotea, bali pia uhuru wa dini umepotea. Na hivyo tunajua hapa katika nchi hii, tuna wajibu mzito. Tuko katika amani. Hatuna sababu ya hofu ambazo zinatawala watu wengi wengine duniani; kwa hiyo, ni lazima tulinde uhuru wa demokrasia. —Eleanor Roosevelt, Hotuba kwa Muungano wa Uhuru wa Kiraia wa Marekani, Chicago, Illinois, Machi 14, 1940 Katika hotuba yake, Eleanor Roosevelt alizungumzia tishio la awali kwa uhuru wa kiraia lililotolewa na ipi kati ya yafuatayo? \nA.Vita vya Kwanza vya Dunia\nB.Mkataba Mpya\nC.Vita vya Baridi\nD.Mapinduzi Makuu\nChagua tu A, B, C au D kujibu swali hili.\nJibu:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu jiografia ya shule ya sekondari\nSwali: Dini ipi hapa chini ni dini ya kueneza? \nA.Taoism\nB.Islam\nC.Shintoism\nD.Confucianism\nChagua tu A, B, C au D kujibu swali hili.\nJibu:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu filosofia\nSwali: Socrates anapendekeza kwamba kitakatifu ni sehemu moja ya:\nA. kile kilicho busara.\nB. kile kilicho haki.\nC. kile kilicho kizuri.\nD. kile kilicho halali.\nChagua tu A, B, C au D kujibu swali hili.\nJibu:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu algebra ya kiabstract\nSwali: Pata kipindi chenye uwezo mkubwa zaidi kwa kipengele baadhi ya Z_4 x Z_6.\nA.4\nB.6\nC.12\nD.24\nHebu tufikirie hatua kwa hatua\nJibu:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu kujifunza kwa mashine\nSwali: Katika keki ya Yann LeCun, cherii juu ni: \nA.kujifunza kwa nguvu\nB.kujifunza kwa kujisimamia\nC.kujifunza bila usimamizi\nD.kujifunza kwa usimamizi\nChagua tu A, B, C au D kujibu swali hili.\nJibu:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'Chagua tu A, B, C au D kujibu swali hili. \nJibu:'), + ] +def get_few_shot_prompts_yo(_hint, _prompt): + return [ + dict(role='HUMAN', prompt='Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa filosofía\nIbeere: Ibeere yii tọka si alaye ti n bọ. Boya, sibẹsibẹ, Mo ni imọ diẹ sii nipa pataki awọn ominira ilu ni akoko pataki yii ti itan wa ju ẹnikẹni miiran lọ, nitori Mo n rin irin-ajo kọja ilẹ naa ati pe Mo pade awọn eniyan ati pe Mo ri awọn nkan ti o ti ṣẹlẹ si awọn eniyan kekere, Mo mọ ohun ti o tumọ si fun ijọba mimọ lati pa awọn ominira ilu wa. Ni gbogbo ọdun, a ti ni lati ja fun ominira ilu, ati pe a mọ pe awọn akoko wa nigba ti ina di dimu, ati nigbami ti eyi ṣẹlẹ, ijọba wa ni ewu. Bayi, ni pataki nitori ipo iṣoro agbaye ni apapọ, awọn ominira ilu ti parẹ ni ọpọlọpọ awọn orilẹ-ede miiran. O jẹ alailẹgbẹ, dajudaju, lati wa ni ogun ati ki o ṣetọju ominira iwe irohin ati ominira ẹtọ ati ominira apejọ. Wọn parẹ laifọwọyi. Ati pe nitorina ni ọpọlọpọ awọn orilẹ-ede nibiti wọn ti jẹ ailewu ni deede, loni wọn ti parẹ. Ni awọn orilẹ-ede miiran, paapaa ṣaaju ki ogun wa, kii ṣe nikan ominira iwe irohin ati ominira apejọ , ati ominira ẹtọ ti parẹ, ṣugbọn ominira ẹsin ti parẹ. Ati pe nitorina a mọ nibi ninu orilẹ-ede yii, a ni ojuse pataki. A wa ni alaafia. A ko ni idi fun awọn bẹru ti o ṣakoso ọpọlọpọ awọn eniyan miiran ni gbogbo agbaye; nitorina, a ni lati daabobo awọn ominira ti ijọba mimọ. —Eleanor Roosevelt, Ikede si American Civil Liberties Union, Chicago, Illinois, Oṣu Kẹta 14, 1940 Ninu ọrọ rẹ, Eleanor Roosevelt daba pe ewu ti tẹlẹ si awọn ominira ilu ṣẹda nipasẹ eyi ti o tẹle? \nA.Ija Agbaye I\nB.Iwọn Ilana Tuntun\nC.Ijakadi Tutu\nD.Ipe wọn nla\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt='Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa геography ile-iwe gíga\nIbeere: Igbagbọ wo ni isalẹ jẹ igbagbọ agbaye? A.Taoism\n B.Islam\n C.Shintoism\n D.Confucianism\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa filosofía\nIbeere: Socrates daba pe mímọ̀ jẹ apakan kan ti:\nA. ohun ti o jẹ ọlọgbọn.\nB. ohun ti o jẹ ododo.\nC. ohun ti o jẹ ẹwa.\nD. ohun ti o jẹ ofin.\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:'), + dict(role='BOT', prompt='B'), + dict(role='HUMAN', prompt='Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa aljebra igba\nIbeere: Wa aṣẹ to pọju fun diẹ ninu awọn eroja ti Z_4 x Z_6.\n A.4\nB.6\nC.12\nD.24\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:'), + dict(role='BOT', prompt='C'), + dict(role='HUMAN', prompt='Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa ikẹkọ ẹrọ\nIbeere: Ninu akara Yann LeCun, eso cherry lori oke ni: \nA.ikẹkọ imudara\nB.ikẹkọ ara-ṣaaju\nC.ikẹkọ aibojumu\nD.ikẹkọ ti a fojusi\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:'), + dict(role='BOT', prompt='A'), + dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'Kan yan A, B, C tabi D lati fesi si ibeere naa. \nFesi:'), + ] diff --git a/build/lib/opencompass/configs/datasets/mmmlu_lite/README.md b/build/lib/opencompass/configs/datasets/mmmlu_lite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f7866a0b0e2aaaa46e0bf24c285a1edc92c07a0c --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmmlu_lite/README.md @@ -0,0 +1,38 @@ +# MMMLU-Lite + +## Introduction + +A lite version of the MMMLU dataset, which is an community version of the MMMLU dataset by [OpenCompass](https://github.com/open-compass/opencompass). Due to the large size of the original dataset (about 200k questions), we have created a lite version of the dataset to make it easier to use. We sample 25 examples from each language subject in the original dataset with fixed seed to ensure reproducibility, finally we have 19950 examples in the lite version of the dataset, which is about 10% of the original dataset. + + +## Dataset Description +Multilingual Massive Multitask Language Understanding (MMMLU) +The MMLU is a widely recognized benchmark of general knowledge attained by AI models. It covers a broad range of topics from 57 different categories, covering elementary-level knowledge up to advanced professional subjects like law, physics, history, and computer science. + +We translated the MMLU’s test set into 14 languages using professional human translators. Relying on human translators for this evaluation increases confidence in the accuracy of the translations, especially for low-resource languages like Yoruba. We are publishing the professional human translations and the code we use to run the evaluations. + +This effort reflects our commitment to improving the multilingual capabilities of AI models, ensuring they perform accurately across languages, particularly for underrepresented communities. By prioritizing high-quality translations, we aim to make AI technology more inclusive and effective for users worldwide. +MMMLU contains the MMLU test set translated into the following locales: + +- AR_XY (Arabic) +- BN_BD (Bengali) +- DE_DE (German) +- ES_LA (Spanish) +- FR_FR (French) +- HI_IN (Hindi) +- ID_ID (Indonesian) +- IT_IT (Italian) +- JA_JP (Japanese) +- KO_KR (Korean) +- PT_BR (Brazilian Portuguese) +- SW_KE (Swahili) +- YO_NG (Yoruba) +- ZH_CH (Simplied Chinese) + + +## How to Use + +```python +from datasets import load_dataset +ds = load_dataset("opencompass/mmmlu_lite", "AR_XY") +``` \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py b/build/lib/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..d918cc369fc6eaebcd7e00dd01e63343be037668 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmmlu_lite_gen_c51a84 import mmmlu_lite_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen_c51a84.py b/build/lib/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen_c51a84.py new file mode 100644 index 0000000000000000000000000000000000000000..9e9a8ab442b0f372f752e8eed0f91dfa7972c136 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen_c51a84.py @@ -0,0 +1,105 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMMLULiteDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + + +mmmlu_lite_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D','subject'], + output_column='target', + train_split='test') + +mmmlu_lite_all_sets = [ + 'mmlu_lite_AR-XY', + 'mmlu_lite_BN-BD', + 'mmlu_lite_DE-DE', + 'mmlu_lite_ES-LA', + 'mmlu_lite_FR-FR', + 'mmlu_lite_HI-IN', + 'mmlu_lite_ID-ID', + 'mmlu_lite_IT-IT', + 'mmlu_lite_JA-JP', + 'mmlu_lite_KO-KR', + 'mmlu_lite_PT-BR', + 'mmlu_lite_SW-KE', + 'mmlu_lite_YO-NG', + 'mmlu_lite_ZH-CN', +] + +mmmlu_lite_datasets = [] +for _name in mmmlu_lite_all_sets: + if 'AR' in _name: + _hint = f'هناك سؤال اختيار واحد. أجب عن السؤال بالرد على A أو B أو C أو D, يرجى استخدام واحدة من الرموز A، B، C، أو D لتمثيل خيارات الإجابة في ردك' + _prompt = f'يتعلق بـ {{subject}} \nالسؤال: {{input}}\nأ. {{A}}\nب. {{B}}\nج. {{C}}\nد. {{D}}\nالإجابة:' + elif 'BN' in _name: + _hint = f'এটি একটি একক পছন্দের প্রশ্ন। এ, বি, সি বা ডি উত্তর দিয়ে প্রশ্নের উত্তর দিন।, আপনার উত্তরে ইংরেজি বর্ণ A, B, C এবং D এর মধ্যে একটি ব্যবহার করুন' + _prompt = f'এটি {{subject}} এর সম্পর্কে \nপ্রশ্ন: {{input}}\nএ. {{A}}\nবি. {{B}}\nসি. {{C}}\nডি. {{D}}\nউত্তর:' + elif 'DE' in _name: + _hint = f'Es gibt eine Einzelwahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.' + _prompt = f'Es geht um {{subject}} \nFrage: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAntwort:' + elif 'ES' in _name: + _hint = f'Hay una pregunta de elección única. Responde a la pregunta respondiendo A, B, C o D.' + _prompt = f'Se trata de {{subject}} \nPregunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRespuesta:' + elif 'FR' in _name: + _hint = f'Il y a une question à choix unique. Répondez à la question en répondant A, B, C ou D.' + _prompt = f'''C'est à propos de {{subject}} \nQuestion : {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRéponse :''' + elif 'HI' in _name: + _hint = f'यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D में से कोई भी उत्तर देकर दें।' + _prompt = f'यह {{subject}} के बारे में है \nप्रश्न: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nउत्तर:' + elif 'ID' in _name: + _hint = f'Ada pertanyaan pilihan tunggal. Jawablah pertanyaan dengan menjawab A, B, C, atau D.' + _prompt = f'Ini tentang {{subject}} \nPertanyaan: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nJawaban:' + elif 'IT' in _name: + _hint = f'Ci sono domande a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.' + _prompt = f'Si tratta di {{subject}} \nDomanda: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRisposta:' + elif 'JA' in _name: + _hint = f'単一選択肢の質問があります。この質問にはA、B、C、またはDで答えてください。' + _prompt = f'これは {{subject}} に関することです \n質問: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n回答:' + elif 'KO' in _name: + _hint = f'단일 선택 질문이 있습니다. A, B, C 또는 D로 답변해 주세요.' + _prompt = f'이것은 {{subject}}에 관한 것입니다 \n질문: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n답변:' + elif 'PT' in _name: + _hint = f'Há uma pergunta de escolha única. Responda à pergunta escolhendo A, B, C ou D.' + _prompt = f'É sobre {{subject}} \nPergunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nResposta:' + elif 'ZH' in _name: + _hint = f'这里有一个单项选择题。请通过选择 A、B、C 或 D 来回答该问题。' + _prompt = f'这是关于 {{subject}} 的内容\n问题:{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:' + else: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + _prompt = f'it is about {{subject}} \nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:' + mmmlu_lite_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\n {_prompt}' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmmlu_lite_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmmlu_lite_datasets.append( + dict( + abbr=f'openai_m{_name}', + type=MMMLULiteDataset, + path='opencompass/mmmlu_lite', + name=f'openai_m{_name}', + reader_cfg=mmmlu_lite_reader_cfg, + infer_cfg=mmmlu_lite_infer_cfg, + eval_cfg=mmmlu_lite_eval_cfg, + )) + +del _name, _hint, _prompt \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/multipl_e/multiple_gen.py b/build/lib/opencompass/configs/datasets/multipl_e/multiple_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..b32af567f67ea23f60d8dc3f96e94af86e125159 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/multipl_e/multiple_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .multiple_top_ten_gen_f44aaf import multiple_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py b/build/lib/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py new file mode 100644 index 0000000000000000000000000000000000000000..040c5ba5bcf65cb7061161d4fb1ad882103118bd --- /dev/null +++ b/build/lib/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py @@ -0,0 +1,54 @@ +# Select the 10 most popular programming languages from MultiPL-E to compose the test set. + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MultiplEDataset, MultiplEEvaluator + + +_TOP_TEN_LANGUAGE_ = ['cpp', 'cs', 'go', 'java', 'rb', 'js', 'php', 'r', 'rs', 'sh'] + +multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests') + +multiple_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +multiple_eval_cfg = { + lang: dict( + evaluator=dict( + type=MultiplEEvaluator, + language=lang, + ip_address='https://opencompass-multiple-evaluator.hf.space', + ), + pred_role='BOT', + ) for lang in _TOP_TEN_LANGUAGE_ +} + +multiple_datasets = [ + dict( + type=MultiplEDataset, + abbr=f'humaneval-multiple-{lang}', + language=lang, + path='opencompass/multipl_e', + tag='humaneval', + reader_cfg=multiple_reader_cfg, + infer_cfg=multiple_infer_cfg, + eval_cfg=multiple_eval_cfg[lang], + ) for lang in _TOP_TEN_LANGUAGE_ +] + +multiple_datasets += [ + dict( + type=MultiplEDataset, + abbr=f'mbpp-multiple-{lang}', + language=lang, + path='opencompass/multipl_e', + tag='mbpp', + reader_cfg=multiple_reader_cfg, + infer_cfg=multiple_infer_cfg, + eval_cfg=multiple_eval_cfg[lang], + ) for lang in _TOP_TEN_LANGUAGE_ +] diff --git a/build/lib/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py b/build/lib/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py new file mode 100644 index 0000000000000000000000000000000000000000..1a603d32605b04cdd267032a80eb4a7ce8da1879 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py @@ -0,0 +1,58 @@ +# Select the 10 most popular programming languages from MultiPL-E to compose the test set. + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MultiplEDataset, MultiplEEvaluator + + +_TOP_TEN_LANGUAGE_ = ['cpp'] + +multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests') + +multiple_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +multiple_eval_cfg = { + lang: dict( + evaluator=dict( + type=MultiplEEvaluator, + language=lang, + ip_address='https://opencompass-multiple-evaluator.hf.space', + ), + pred_role='BOT', + ) for lang in _TOP_TEN_LANGUAGE_ +} + +multiple_datasets = [ + dict( + type=MultiplEDataset, + abbr=f'humaneval-multiple-{lang}', + language=lang, + path='opencompass/multipl_e', + tag='humaneval', + reader_cfg=multiple_reader_cfg, + infer_cfg=multiple_infer_cfg, + eval_cfg=multiple_eval_cfg[lang], + n=5, + k=3 + ) for lang in _TOP_TEN_LANGUAGE_ +] + +multiple_datasets += [ + dict( + type=MultiplEDataset, + abbr=f'mbpp-multiple-{lang}', + language=lang, + path='opencompass/multipl_e', + tag='mbpp', + reader_cfg=multiple_reader_cfg, + infer_cfg=multiple_infer_cfg, + eval_cfg=multiple_eval_cfg[lang], + n=5, + k=3 + ) for lang in _TOP_TEN_LANGUAGE_ +] diff --git a/build/lib/opencompass/configs/datasets/musr/README.md b/build/lib/opencompass/configs/datasets/musr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b1ff5816528cc408692e031f56435e5e02d431c0 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/musr/README.md @@ -0,0 +1,75 @@ + +# MuSR: Multistep Soft Reasoning Dataset + +MuSR (Multistep Soft Reasoning) is a dataset designed to evaluate language models (LLMs) on complex reasoning tasks embedded in natural language narratives. Created to challenge state-of-the-art models like GPT-4 and others, MuSR emphasizes nuanced reasoning across different domains, including social and physical reasoning, commonsense reasoning, and planning, with tasks framed within realistic scenarios such as murder mysteries, object placements, and team allocations. + +## Overview + +### Purpose + +Current large language models can perform complex tasks through prompting techniques like chain-of-thought reasoning. However, robust multistep reasoning remains challenging. MuSR addresses these limitations by evaluating LLM performance on tasks involving multistep reasoning in three domains: +- **Murder Mysteries**: Requires social and physical deductive reasoning. +- **Object Placements**: Tests observational and theory-of-mind reasoning. +- **Team Allocations**: Focuses on social reasoning and constraint satisfaction. + +### Dataset Construction + +MuSR instances are generated using a neurosymbolic synthetic-to-natural narrative generation algorithm. This approach allows for the creation of complex reasoning instances that combine structured reasoning trees with natural language narratives, challenging both direct and nuanced inference capabilities in LLMs. + +MuSR's dataset consists of: +- **Murder Mysteries**: Scenarios with suspects, motives, and opportunities requiring deductive inference. +- **Object Placements**: Scenarios where individuals' observations inform reasoning about object locations. +- **Team Allocations**: Scenarios that simulate social relationships and teamwork for optimal task assignments. + + +### Dataset Access +MuSR dataset is publicly available, with instructions provided on the [GitHub Project](https://github.com/Zayne-Sprague/MuSR). You can download the dataset and use pre-defined prompts or create your own configurations. + +### Evaluation + +1. Install dependencies and configure the environment. +2. Run evaluations using `opencompass examples/eval_musr.py` to assess LLM performance. +3. Analyze results against human performance benchmarks. + +### Example Command +```bash +opencompass examples/eval_musr.py +``` + +## Baselines and Results + +MuSR includes baseline results for multiple LLMs evaluated with chain-of-thought and advanced reasoning strategies. These benchmarks assess model accuracy on reasoning tasks across the three domains. + +| Domain | Baseline Accuracy (GPT-4) | Human Performance | +|------------------|---------------------------|--------------------| +| Murder Mystery | 80.4% | 94.1% | +| Object Placement | 60.9% | 95.0% | +| Team Allocation | 68.4% | 100% | + + +| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | qwen2.5-7b-instruct-turbomind | qwen2.5-14b-instruct-turbomind | yi-1.5-9b-chat-turbomind | qwen2.5-32b-instruct-turbomind | glm-4-9b-chat-turbomind | llama-3_1-8b-instruct-turbomind | ministral-8B-instruct-2410-turbomind | gemma-2-9b-it-turbomind | gemma-2-27b-it-turbomind | +|----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----| +| musr_murder_mysteries | a5ce30 | accuracy | gen | 59.20 | 63.20 | 76.00 | 68.80 | 78.80 | 71.20 | 73.60 | 73.60 | 74.80 | 77.20 | +| musr_object_placements | a5ce30 | accuracy | gen | 54.69 | 56.25 | 57.42 | 52.73 | 66.02 | 49.22 | 57.42 | 60.94 | 60.94 | 62.11 | +| musr_team_allocation | a5ce30 | accuracy | gen | 39.20 | 32.40 | 55.60 | 40.00 | 67.60 | 50.40 | 46.00 | 36.40 | 40.80 | 41.20 | +| musr_average | - | naive_average | gen | 51.03 | 50.62 | 63.01 | 53.84 | 70.81 | 56.94 | 59.01 | 56.98 | 58.85 | 60.17 | + + +## Citation + +If you use MuSR in your research, please cite: +```bibtex +@misc{sprague2024musrtestinglimitschainofthought, + title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft Reasoning}, + author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett}, + year={2024}, + eprint={2310.16049}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2310.16049}, +} +``` + +## Details + +For further details, please refer to the MuSR paper [here](https://arxiv.org/abs/2310.16049). diff --git a/build/lib/opencompass/configs/datasets/musr/musr_gen.py b/build/lib/opencompass/configs/datasets/musr/musr_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..d8b8994e6ba6ed6d22931be3261cec708828ad80 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/musr/musr_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .musr_gen_b47fd3 import musr_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/musr/musr_gen_3622bb.py b/build/lib/opencompass/configs/datasets/musr/musr_gen_3622bb.py new file mode 100644 index 0000000000000000000000000000000000000000..93c065f0764471730ca22ef90c47a57cbf3288ea --- /dev/null +++ b/build/lib/opencompass/configs/datasets/musr/musr_gen_3622bb.py @@ -0,0 +1,135 @@ +from opencompass.datasets import MusrDataset, MusrEvaluator +from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer + + +DATASET_CONFIGS = { + 'murder_mysteries': { + 'abbr': 'musr_murder_mysteries', + 'name': 'murder_mysteries', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'object_placements': { + 'abbr': 'musr_object_placements', + 'name': 'object_placements', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'team_allocation': { + 'abbr': 'musr_team_allocation', + 'name': 'team_allocation', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, +} + + +musr_datasets = [] + +for config in DATASET_CONFIGS.values(): + dataset = dict( + abbr=config['abbr'], + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=config['reader_cfg'], + infer_cfg=config['infer_cfg'], + eval_cfg=config['eval_cfg'], + ) + musr_datasets.append(dataset) diff --git a/build/lib/opencompass/configs/datasets/musr/musr_gen_3c6e15.py b/build/lib/opencompass/configs/datasets/musr/musr_gen_3c6e15.py new file mode 100644 index 0000000000000000000000000000000000000000..2d57392b54029a6d35ba1672bc91d0c516e9403c --- /dev/null +++ b/build/lib/opencompass/configs/datasets/musr/musr_gen_3c6e15.py @@ -0,0 +1,135 @@ +from opencompass.datasets import MusrDataset, MusrEvaluator +from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer + + +DATASET_CONFIGS = { + 'murder_mysteries': { + 'abbr': 'musr_murder_mysteries', + 'name': 'murder_mysteries', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'object_placements': { + 'abbr': 'musr_object_placements', + 'name': 'object_placements', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'team_allocation': { + 'abbr': 'musr_team_allocation', + 'name': 'team_allocation', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, +} + + +musr_datasets = [] + +for config in DATASET_CONFIGS.values(): + dataset = dict( + abbr=config['abbr'], + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=config['reader_cfg'], + infer_cfg=config['infer_cfg'], + eval_cfg=config['eval_cfg'], + ) + musr_datasets.append(dataset) diff --git a/build/lib/opencompass/configs/datasets/musr/musr_gen_b47fd3.py b/build/lib/opencompass/configs/datasets/musr/musr_gen_b47fd3.py new file mode 100644 index 0000000000000000000000000000000000000000..70d7622751cf5b261a4e0c4cd98bb2ce1d1172e4 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/musr/musr_gen_b47fd3.py @@ -0,0 +1,135 @@ +from opencompass.datasets import MusrDataset, MusrEvaluator +from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer + + +DATASET_CONFIGS = { + 'murder_mysteries': { + 'abbr': 'musr_murder_mysteries', + 'name': 'murder_mysteries', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'object_placements': { + 'abbr': 'musr_object_placements', + 'name': 'object_placements', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, + 'team_allocation': { + 'abbr': 'musr_team_allocation', + 'name': 'team_allocation', + 'path': 'opencompass/musr', + 'reader_cfg': dict( + input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'], + output_column='gold_answer', + ), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ), + 'eval_cfg': dict( + evaluator=dict( + type=MusrEvaluator, + answer_index_modifier=1, + self_consistency_n=1 + ), + ), + }, +} + + +musr_datasets = [] + +for config in DATASET_CONFIGS.values(): + dataset = dict( + abbr=config['abbr'], + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=config['reader_cfg'], + infer_cfg=config['infer_cfg'], + eval_cfg=config['eval_cfg'], + ) + musr_datasets.append(dataset) diff --git a/build/lib/opencompass/configs/datasets/musr/musr_llm_judge_gen.py b/build/lib/opencompass/configs/datasets/musr/musr_llm_judge_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..29bc39dc0bff5b06258640f0f0d0f18dc119c214 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/musr/musr_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .musr_llmjudge_gen_b47fd3 import musr_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/build/lib/opencompass/configs/datasets/musr/musr_llmjudge_gen_b47fd3.py b/build/lib/opencompass/configs/datasets/musr/musr_llmjudge_gen_b47fd3.py new file mode 100644 index 0000000000000000000000000000000000000000..8f72fbd9e1f1883611b4f2e52ecf0016d8d345b2 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/musr/musr_llmjudge_gen_b47fd3.py @@ -0,0 +1,131 @@ +from opencompass.datasets import MusrDataset, generic_llmjudge_postprocess +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : {system_prompt}\n{prompt}\n\n\n + : \n{gold_answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Common configuration components +reader_cfg = dict( + input_columns=[ + 'context', + 'question_text', + 'question', + 'answer', + 'choices', + 'choices_str', + 'intermediate_trees', + 'intermediate_data', + 'prompt', + 'system_prompt', + 'gold_answer', + 'scidx', + 'self_consistency_n', + 'ablation_name', + ], + output_column='gold_answer', +) + +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}', + ) + ], + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +# Dataset configurations +DATASET_CONFIGS = { + 'murder_mysteries': { + 'abbr': 'musr_murder_mysteries', + 'name': 'murder_mysteries', + 'path': 'opencompass/musr', + }, + 'object_placements': { + 'abbr': 'musr_object_placements', + 'name': 'object_placements', + 'path': 'opencompass/musr', + }, + 'team_allocation': { + 'abbr': 'musr_team_allocation', + 'name': 'team_allocation', + 'path': 'opencompass/musr', + }, +} + +# Create dataset configurations +musr_datasets = [] + +for config in DATASET_CONFIGS.values(): + dataset = dict( + abbr=config['abbr'], + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ), + ) + musr_datasets.append(dataset) diff --git a/build/lib/opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py b/build/lib/opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..6b7ead2ce17335a89a335ce0aaca9633026b2577 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .narrativeqa_gen_db6413 import narrativeqa_datasets # noqa: F401, F403 diff --git a/build/lib/opencompass/configs/datasets/narrativeqa/narrativeqa_gen_a2d88a.py b/build/lib/opencompass/configs/datasets/narrativeqa/narrativeqa_gen_a2d88a.py new file mode 100644 index 0000000000000000000000000000000000000000..13f2593312934d1342cd98e4ead6baa361744012 --- /dev/null +++ b/build/lib/opencompass/configs/datasets/narrativeqa/narrativeqa_gen_a2d88a.py @@ -0,0 +1,30 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import NarrativeQADataset, TriviaQAEvaluator + +narrativeqa_reader_cfg = dict( + input_columns=['question', 'evidence'], + output_column='answer', + train_split='valid', + test_split='valid') + +narrativeqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{evidence}\nAnswer these questions:\nQ: {question}?\nA:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, max_out_len=50, max_seq_len=8192, batch_size=4)) + +narrativeqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator)) + +narrativeqa_datasets = [ + dict( + type=NarrativeQADataset, + abbr='NarrativeQA', + path='./data/narrativeqa/', + reader_cfg=narrativeqa_reader_cfg, + infer_cfg=narrativeqa_infer_cfg, + eval_cfg=narrativeqa_eval_cfg) +]