msj19 commited on
Commit
a20260e
·
verified ·
1 Parent(s): f47d3f6

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. build/lib/opencompass/configs/datasets/korbench/korbench_gen.py +4 -0
  2. build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py +60 -0
  3. build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py +116 -0
  4. build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py +54 -0
  5. build/lib/opencompass/configs/datasets/korbench/readme.md +71 -0
  6. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py +166 -0
  7. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py +4 -0
  8. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py +164 -0
  9. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py +164 -0
  10. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py +163 -0
  11. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py +165 -0
  12. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py +165 -0
  13. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py +132 -0
  14. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py +164 -0
  15. build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py +168 -0
  16. build/lib/opencompass/configs/datasets/livemathbench/README.md +74 -0
  17. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py +4 -0
  18. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py +49 -0
  19. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py +45 -0
  20. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py +49 -0
  21. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py +4 -0
  22. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py +45 -0
  23. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py +120 -0
  24. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py +96 -0
  25. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py +44 -0
  26. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py +44 -0
  27. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py +97 -0
  28. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py +45 -0
  29. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py +45 -0
  30. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py +44 -0
  31. build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py +43 -0
  32. build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py +4 -0
  33. build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py +136 -0
  34. build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py +142 -0
  35. build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py +142 -0
  36. build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py +152 -0
  37. build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_xml_gen_2e6d10.py +155 -0
  38. build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py +4 -0
  39. build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py +152 -0
  40. build/lib/opencompass/configs/datasets/llm_compression/README.md +105 -0
  41. build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py +50 -0
  42. build/lib/opencompass/configs/datasets/longbench/longbench.py +26 -0
  43. build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py +4 -0
  44. build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py +43 -0
  45. build/lib/opencompass/configs/datasets/lveval/lveval.md +165 -0
  46. build/lib/opencompass/configs/datasets/lveval/lveval.py +38 -0
  47. build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py +4 -0
  48. build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py +36 -0
  49. build/lib/opencompass/configs/datasets/matbench/matbench_gen.py +5 -0
  50. build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py +55 -0
build/lib/opencompass/configs/datasets/korbench/korbench_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
"""Default KOR-Bench entry point.

Thin alias config: re-exports the 0-shot single-task dataset list so that
`korbench_gen` resolves to the current recommended configuration.
"""
from mmengine.config import read_base

with read_base():
    from .korbench_single_0_shot_gen import korbench_0shot_single_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""KOR-Bench single-task 0-shot generation configs, rule-based evaluator."""
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

# One dataset config is emitted per KOR-Bench task category.
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']

korbench_0shot_single_datasets = []

for category in categories:
    # The dataset loader renders the complete task text into the 'prompt'
    # column, so the template just forwards it via the {prompt} placeholder.
    prompt_template = dict(
        type=PromptTemplate,
        template=dict(
            begin=[dict(role='HUMAN', prompt='')],
            round=[dict(role='HUMAN', prompt='{prompt}')],
        ),
    )

    # Reader configuration: model input comes from 'prompt', gold from 'answer'.
    reader_cfg = dict(
        input_columns=['prompt'],
        output_column='answer',
    )

    # Inference configuration: zero-shot, plain generation.
    infer_cfg = dict(
        prompt_template=prompt_template,
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    # Evaluation configuration: KOR-Bench's rule-based evaluator.
    eval_cfg = dict(
        evaluator=dict(type=korbenchEvaluator),
        pred_role='BOT',
    )

    korbench_dataset = dict(
        type=korbenchDataset,
        abbr=f'korbench_{category}',
        path='opencompass/korbench',
        prompt_mode='0_shot',
        category=category,
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
    )

    korbench_0shot_single_datasets.append(korbench_dataset)
build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""KOR-Bench single-task 0-shot configs graded by an LLM judge.

Instead of the rule-based `korbenchEvaluator`, each dataset is scored with
`GenericLLMEvaluator`: the judge model receives the original question, the
gold answer, and the prediction, and replies A (correct) / B (incorrect).
"""
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']

GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


<Original Question Begin>: \n{prompt}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

korbench_0shot_single_datasets = []

for category in categories:
    # Prompt template: the dataset pre-renders the full task into 'prompt'.
    prompt_template = dict(
        type=PromptTemplate,
        template=dict(
            begin=[dict(role='HUMAN', prompt='')],
            round=[dict(role='HUMAN', prompt='{prompt}')],
        ),
    )

    # Reader configuration.
    reader_cfg = dict(
        input_columns=['prompt'],
        output_column='answer',
    )

    # Inference configuration.
    infer_cfg = dict(
        prompt_template=prompt_template,
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024),
    )

    # Evaluation configuration: LLM-as-judge. `dataset_cfg` re-loads the same
    # split so the judge sees the original question and gold answer;
    # `judge_cfg` is intentionally empty and is filled in by the runner.
    eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=korbenchDataset,
                path='opencompass/korbench',
                prompt_mode='0_shot',
                category=category,
                reader_cfg=reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )

    # Dataset
    korbench_dataset = dict(
        type=korbenchDataset,
        abbr=f'korbench_{category}',
        path='opencompass/korbench',
        prompt_mode='0_shot',
        category=category,
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
        mode='singlescore',
    )

    korbench_0shot_single_datasets.append(korbench_dataset)
build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""KOR-Bench single-task 3-shot generation configs, rule-based evaluator."""
from opencompass.datasets.korbench.korbench import (
    korbenchDataset,
    korbenchEvaluator,
)

from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']

korbench_3shot_single_datasets = []

for category in categories:
    # Prompt template: few-shot examples are baked into the 'prompt' column by
    # the dataset loader (prompt_mode='3_shot'), so only {prompt} is forwarded.
    prompt_template = dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='HUMAN',
                    prompt=''
                )
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    )

    # Reader configuration.
    reader_cfg = dict(
        input_columns=['prompt'],
        output_column='answer',
    )

    # Inference configuration.
    infer_cfg = dict(
        prompt_template=prompt_template,
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024),
    )

    # Evaluation configuration: rule-based scoring.
    eval_cfg = dict(
        evaluator=dict(type=korbenchEvaluator),
        pred_role='BOT',
    )

    korbench_dataset = dict(
        type=korbenchDataset,
        abbr=f'korbench_{category}',
        path='opencompass/korbench',
        prompt_mode='3_shot',
        category=category,
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
    )

    korbench_3shot_single_datasets.append(korbench_dataset)
build/lib/opencompass/configs/datasets/korbench/readme.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks
2
+
3
+ KOR-Bench is a dataset designed to evaluate large language models (LLMs) on tasks that require reasoning independent of prior knowledge. Created to assess reasoning and planning abilities, KOR-Bench introduces rule-based tasks that minimize the influence of pretrained knowledge, enabling a focused evaluation of intrinsic model capabilities.
4
+
5
+ ## Overview
6
+
7
+ ### Purpose
8
+
9
+ Large language models, such as GPT-4 and Claude, excel in knowledge-based tasks but face challenges in applying reasoning skills to unfamiliar scenarios. KOR-Bench is built to evaluate such reasoning capabilities across five categories:
10
+ - **Operation**: Arithmetic and logical operations.
11
+ - **Logic**: Complex deductive and inductive reasoning.
12
+ - **Cipher**: Code-breaking and pattern discovery.
13
+ - **Puzzle**: Problem-solving with creative and logical reasoning.
14
+ - **Counterfactual**: Hypothetical reasoning in alternate scenarios.
15
+
16
+ ### Dataset Construction
17
+
18
+ KOR-Bench tasks are designed with novel rules and configurations, ensuring no reliance on pretrained knowledge. Each task includes:
19
+ - **Rules**: Custom rule sets to guide reasoning.
20
+ - **Questions**: Carefully crafted problems that require the application of rules.
21
+ - **Evaluation Scenarios**: Zero-shot, three-shot, and subquestion-specific configurations.
22
+
23
+ The dataset is structured to assess multistep reasoning, pattern recognition, and adaptability to new rules.
24
+
25
+ ### Dataset Access
26
+
27
+ KOR-Bench is publicly available, with detailed usage instructions in the [GitHub Repository](https://github.com/KOR-Bench/KOR-Bench). Download the dataset and use the predefined evaluation scripts, or write your own.
28
+
29
+ ### Evaluation
30
+
31
+ 1. Install dependencies and configure your environment.
32
+ 2. Run evaluations using `opencompass examples/eval_korbench.py` to assess LLM performance.
33
+ 3. Analyze model performance across various reasoning tasks.
34
+
35
+ ### Example Command
36
+ ```bash
37
+ opencompass examples/eval_korbench.py
38
+ ```
39
+
40
+ ## Baselines and Results
41
+ KOR-Bench includes baseline results for leading LLMs evaluated across various configurations, including zero-shot (gen) and few-shot modes. Below is a summary of the results.
42
+ | dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | internlm2_5-1_8b-chat-turbomind | llama-3_1-8b-instruct-turbomind | glm-4-9b-chat-turbomind | gemma-2-9b-it-turbomind |
43
+ |---------|---------|--------|------|--------------------------------|---------------------------------|---------------------------------|--------------------------|--------------------------|
44
+ | korbench_mixed_Multi-Q | 21f998 | accuracy | gen | 0.60 | 0.20 | 9.60 | 8.70 | 7.80 |
45
+ | korbench_mixed_Multi-R | 21f998 | accuracy | gen | 1.70 | 0.10 | 8.80 | 12.10 | 9.80 |
46
+ | korbench_mixed_Multi-RQ | 21f998 | accuracy | gen | 1.50 | 0.10 | 6.40 | 8.60 | 6.00 |
47
+ | korbench_cipher | 21f998 | accuracy | gen | 8.80 | 0.80 | 14.00 | 6.80 | 6.40 |
48
+ | korbench_counterfactual | 21f998 | accuracy | gen | 83.60 | 17.20 | 88.80 | 90.40 | 87.60 |
49
+ | korbench_logic | 21f998 | accuracy | gen | 8.40 | 3.60 | 37.60 | 38.80 | 40.80 |
50
+ | korbench_operation | 21f998 | accuracy | gen | 56.00 | 25.20 | 68.40 | 63.60 | 67.60 |
51
+ | korbench_puzzle | 21f998 | accuracy | gen | 3.60 | 0.00 | 3.20 | 3.20 | 5.60 |
52
+ | korbench_cipher | 21f998 | accuracy | fewshot | 8.40 | 3.20 | 9.60 | 9.20 | 9.60 |
53
+ | korbench_counterfactual | 21f998 | accuracy | fewshot | 87.60 | 58.00 | 23.60 | 89.60 | 84.40 |
54
+ | korbench_logic | 21f998 | accuracy | fewshot | 45.20 | 19.60 | 24.40 | 38.40 | 54.00 |
55
+ | korbench_operation | 21f998 | accuracy | fewshot | 24.80 | 11.20 | 73.20 | 67.20 | 23.20 |
56
+ | korbench_puzzle | 21f998 | accuracy | fewshot | 4.80 | 2.40 | 1.60 | 3.60 | 6.80 |
57
+
58
+ ### Citation
59
+
60
+ **BibTeX:**
61
+ ```bibtex
62
+ @misc{ma2024korbenchbenchmarkinglanguagemodels,
63
+ title={KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks},
64
+ author={Kaijing Ma and Xinrun Du and Yunran Wang and Haoran Zhang and Zhoufutu Wen and Xingwei Qu and Jian Yang and Jiaheng Liu and Minghao Liu and Xiang Yue and Wenhao Huang and Ge Zhang},
65
+ year={2024},
66
+ eprint={2410.06526},
67
+ archivePrefix={arXiv},
68
+ primaryClass={cs.DB},
69
+ url={https://arxiv.org/abs/2410.06526},
70
+ }
71
+ ```
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""LiveCodeBench config: repeated code-generation sampling (n=5, pass@k, k=3).

Only the code-generation split is enabled in `LCB_datasets`; the code-execution
and test-output-prediction splits are fully configured below but commented out
of the exported list.
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

# Plain string literal: the original used an f-string with no placeholders
# (ruff F541); dropping the `f` prefix is behavior-identical.
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

# n: number of sampled completions per problem; k: pass@k cutoff.
LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    n=5,
    k=3
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test-Output-Prediction Dataset (comment typo 'TestOuputput' fixed)
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    # LCBCodeExecution_dataset,
    # LCBTestOutput_dataset,
]
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
"""Default LiveCodeBench entry point.

Thin alias config: re-exports `LCB_datasets` from the current recommended
variant (`a4f90b`).
"""
from mmengine.config import read_base

with read_base():
    from .livecodebench_gen_a4f90b import LCB_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""LiveCodeBench configs (variant 6966bc): all three splits enabled.

Generation, code-execution, and test-output-prediction datasets are all
exported in `LCB_datasets`; inferencers cap output at 1024 tokens.
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

# Plain string literal: the original used an f-string with no placeholders
# (ruff F541); dropping the `f` prefix is behavior-identical.
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test-Output-Prediction Dataset (comment typo 'TestOuputput' fixed)
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""LiveCodeBench configs (variant a4f90b): all three splits enabled.

Identical to variant 6966bc except no `max_out_len` cap is set on the
inferencers (generation length follows the model/runner default).
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

# Plain string literal: the original used an f-string with no placeholders
# (ruff F541); dropping the `f` prefix is behavior-identical.
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test-Output-Prediction Dataset (comment typo 'TestOuputput' fixed)
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import (
5
+ LCBCodeGenerationDataset,
6
+ LCBCodeExecutionDataset,
7
+ LCBTestOutputPredictionDataset,
8
+ LCBCodeGenerationEvaluator,
9
+ LCBCodeExecutionEvaluator,
10
+ LCBTestOutputEvaluator
11
+ )
12
+ from opencompass.datasets.livecodebench import TestOutputPromptConstants
13
+
14
+
15
+ lcb_code_generation_reader_cfg = dict(
16
+ input_columns=[
17
+ 'question_content',
18
+ 'format_prompt',
19
+ ],
20
+ # output_column='evaluation_sample',
21
+ output_column='question_id',
22
+ )
23
+
24
+ SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
25
+
26
+ prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
27
+ '### Answer: (use the provided format with backticks)\n\n'
28
+
29
+
30
+ # Code Generation Tasks
31
+ lcb_code_generation_infer_cfg = dict(
32
+ prompt_template=dict(
33
+ type=PromptTemplate,
34
+ template=dict(
35
+ round=[
36
+ dict(
37
+ role='HUMAN',
38
+ prompt=prompt_template
39
+ )
40
+ ]
41
+ )
42
+ ),
43
+ retriever=dict(type=ZeroRetriever),
44
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
45
+ )
46
+
47
+ lcb_code_generation_eval_cfg = dict(
48
+ evaluator=dict(
49
+ type=LCBCodeGenerationEvaluator,
50
+ num_process_evaluate=4,
51
+ timeout=6,
52
+ ),
53
+ pred_role='BOT',
54
+ )
55
+
56
+ LCBCodeGeneration_dataset = dict(
57
+ type=LCBCodeGenerationDataset,
58
+ abbr='lcb_code_generation',
59
+ path='opencompass/code_generation_lite',
60
+ reader_cfg=lcb_code_generation_reader_cfg,
61
+ infer_cfg=lcb_code_generation_infer_cfg,
62
+ eval_cfg=lcb_code_generation_eval_cfg
63
+ )
64
+
65
+ # Code Execution Dataset
66
+ lcb_code_execution_reader_cfg = dict(
67
+ input_columns=[
68
+ 'prompt',
69
+ ],
70
+ output_column='evaluation_sample',
71
+ )
72
+
73
+ lcb_code_execution_infer_cfg = dict(
74
+ prompt_template=dict(
75
+ type=PromptTemplate,
76
+ template=dict(
77
+ begin=[
78
+ dict(
79
+ role='SYSTEM',
80
+ prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
81
+ ),
82
+ ],
83
+ round=[
84
+ dict(
85
+ role='HUMAN',
86
+ prompt='{prompt}'
87
+ )
88
+ ]
89
+ )
90
+ ),
91
+ retriever=dict(type=ZeroRetriever),
92
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
93
+ )
94
+
95
+ lcb_code_execution_eval_cfg = dict(
96
+ evaluator=dict(
97
+ type=LCBCodeExecutionEvaluator,
98
+ ),
99
+ pred_role='BOT',
100
+ )
101
+
102
+ LCBCodeExecution_dataset = dict(
103
+ type=LCBCodeExecutionDataset,
104
+ abbr='lcb_code_execution',
105
+ path='opencompass/execution-v2',
106
+ reader_cfg=lcb_code_execution_reader_cfg,
107
+ infer_cfg=lcb_code_execution_infer_cfg,
108
+ eval_cfg=lcb_code_execution_eval_cfg,
109
+ )
110
+
111
+ # TestOuputput Dataset
112
+ lcb_test_output_reader_cfg = dict(
113
+ input_columns=[
114
+ 'prompt',
115
+ ],
116
+ output_column='evaluation_sample',
117
+ )
118
+
119
+ system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
120
+
121
+ lcb_test_output_infer_cfg = dict(
122
+ prompt_template=dict(
123
+ type=PromptTemplate,
124
+ template=dict(
125
+ # begin=[
126
+ # dict(
127
+ # role='SYSTEM',
128
+ # prompt=system_prompt
129
+ # ),
130
+ # ],
131
+ round=[
132
+ dict(
133
+ role='HUMAN',
134
+ prompt='{prompt}'
135
+ )
136
+ ]
137
+ )
138
+ ),
139
+ retriever=dict(type=ZeroRetriever),
140
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
141
+ )
142
+
143
+ lcb_test_output_eval_cfg = dict(
144
+ evaluator=dict(
145
+ type=LCBTestOutputEvaluator,
146
+ ),
147
+ pred_role='BOT',
148
+ )
149
+
150
+ LCBTestOutput_dataset = dict(
151
+ type=LCBTestOutputPredictionDataset,
152
+ abbr='lcb_test_output',
153
+ path='opencompass/test_generation',
154
+ reader_cfg=lcb_test_output_reader_cfg,
155
+ infer_cfg=lcb_test_output_infer_cfg,
156
+ eval_cfg=lcb_test_output_eval_cfg,
157
+ )
158
+
159
+ LCB_datasets = [
160
+ LCBCodeGeneration_dataset,
161
+ LCBCodeExecution_dataset,
162
+ LCBTestOutput_dataset,
163
+ ]
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import (
5
+ LCBCodeGenerationDataset,
6
+ LCBCodeExecutionDataset,
7
+ LCBTestOutputPredictionDataset,
8
+ LCBCodeGenerationEvaluator,
9
+ LCBCodeExecutionEvaluator,
10
+ LCBTestOutputEvaluator
11
+ )
12
+ from opencompass.datasets.livecodebench import TestOutputPromptConstants
13
+
14
+
15
+ lcb_code_generation_reader_cfg = dict(
16
+ input_columns=[
17
+ 'question_content',
18
+ 'format_prompt',
19
+ ],
20
+ # output_column='evaluation_sample',
21
+ output_column='question_id',
22
+ )
23
+
24
+ SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
25
+
26
+ prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
27
+ '### Answer: (use the provided format with backticks)\n\n'
28
+
29
+
30
+ # Code Generation Tasks
31
+ lcb_code_generation_infer_cfg = dict(
32
+ prompt_template=dict(
33
+ type=PromptTemplate,
34
+ template=dict(
35
+ round=[
36
+ dict(
37
+ role='HUMAN',
38
+ prompt=prompt_template
39
+ )
40
+ ]
41
+ )
42
+ ),
43
+ retriever=dict(type=ZeroRetriever),
44
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
45
+ )
46
+
47
+ lcb_code_generation_eval_cfg = dict(
48
+ evaluator=dict(
49
+ type=LCBCodeGenerationEvaluator,
50
+ num_process_evaluate=4,
51
+ timeout=6,
52
+ release_version='release_v4',
53
+ ),
54
+ pred_role='BOT',
55
+ )
56
+
57
+ LCBCodeGeneration_dataset = dict(
58
+ type=LCBCodeGenerationDataset,
59
+ abbr='lcb_code_generation_v4',
60
+ path='opencompass/code_generation_lite',
61
+ reader_cfg=lcb_code_generation_reader_cfg,
62
+ infer_cfg=lcb_code_generation_infer_cfg,
63
+ eval_cfg=lcb_code_generation_eval_cfg,
64
+ release_version='release_v4',
65
+ )
66
+
67
+ # Code Execution Dataset
68
+ lcb_code_execution_reader_cfg = dict(
69
+ input_columns=[
70
+ 'prompt',
71
+ ],
72
+ output_column='evaluation_sample',
73
+ )
74
+
75
+ lcb_code_execution_infer_cfg = dict(
76
+ prompt_template=dict(
77
+ type=PromptTemplate,
78
+ template=dict(
79
+ begin=[
80
+ dict(
81
+ role='SYSTEM',
82
+ prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
83
+ ),
84
+ ],
85
+ round=[
86
+ dict(
87
+ role='HUMAN',
88
+ prompt='{prompt}'
89
+ )
90
+ ]
91
+ )
92
+ ),
93
+ retriever=dict(type=ZeroRetriever),
94
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
95
+ )
96
+
97
+ lcb_code_execution_eval_cfg = dict(
98
+ evaluator=dict(
99
+ type=LCBCodeExecutionEvaluator,
100
+ ),
101
+ pred_role='BOT',
102
+ )
103
+
104
+ LCBCodeExecution_dataset = dict(
105
+ type=LCBCodeExecutionDataset,
106
+ abbr='lcb_code_execution',
107
+ path='opencompass/execution-v2',
108
+ reader_cfg=lcb_code_execution_reader_cfg,
109
+ infer_cfg=lcb_code_execution_infer_cfg,
110
+ eval_cfg=lcb_code_execution_eval_cfg,
111
+ )
112
+
113
+ # TestOuputput Dataset
114
+ lcb_test_output_reader_cfg = dict(
115
+ input_columns=[
116
+ 'prompt',
117
+ ],
118
+ output_column='evaluation_sample',
119
+ )
120
+
121
+ system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
122
+
123
+ lcb_test_output_infer_cfg = dict(
124
+ prompt_template=dict(
125
+ type=PromptTemplate,
126
+ template=dict(
127
+ # begin=[
128
+ # dict(
129
+ # role='SYSTEM',
130
+ # prompt=system_prompt
131
+ # ),
132
+ # ],
133
+ round=[
134
+ dict(
135
+ role='HUMAN',
136
+ prompt='{prompt}'
137
+ )
138
+ ]
139
+ )
140
+ ),
141
+ retriever=dict(type=ZeroRetriever),
142
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
143
+ )
144
+
145
+ lcb_test_output_eval_cfg = dict(
146
+ evaluator=dict(
147
+ type=LCBTestOutputEvaluator,
148
+ ),
149
+ pred_role='BOT',
150
+ )
151
+
152
+ LCBTestOutput_dataset = dict(
153
+ type=LCBTestOutputPredictionDataset,
154
+ abbr='lcb_test_output',
155
+ path='opencompass/test_generation',
156
+ reader_cfg=lcb_test_output_reader_cfg,
157
+ infer_cfg=lcb_test_output_infer_cfg,
158
+ eval_cfg=lcb_test_output_eval_cfg,
159
+ )
160
+
161
+ LCB_datasets = [
162
+ LCBCodeGeneration_dataset,
163
+ # LCBCodeExecution_dataset,
164
+ # LCBTestOutput_dataset,
165
+ ]
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import (
5
+ LCBCodeGenerationDataset,
6
+ LCBCodeExecutionDataset,
7
+ LCBTestOutputPredictionDataset,
8
+ LCBCodeGenerationEvaluator,
9
+ LCBCodeExecutionEvaluator,
10
+ LCBTestOutputEvaluator
11
+ )
12
+ from opencompass.datasets.livecodebench import TestOutputPromptConstants
13
+
14
+
15
+ lcb_code_generation_reader_cfg = dict(
16
+ input_columns=[
17
+ 'question_content',
18
+ 'format_prompt',
19
+ ],
20
+ # output_column='evaluation_sample',
21
+ output_column='question_id',
22
+ )
23
+
24
+ SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
25
+
26
+ prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
27
+ '### Answer: (use the provided format with backticks)\n\n'
28
+
29
+
30
+ # Code Generation Tasks
31
+ lcb_code_generation_infer_cfg = dict(
32
+ prompt_template=dict(
33
+ type=PromptTemplate,
34
+ template=dict(
35
+ round=[
36
+ dict(
37
+ role='HUMAN',
38
+ prompt=prompt_template
39
+ )
40
+ ]
41
+ )
42
+ ),
43
+ retriever=dict(type=ZeroRetriever),
44
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
45
+ )
46
+
47
+ lcb_code_generation_eval_cfg = dict(
48
+ evaluator=dict(
49
+ type=LCBCodeGenerationEvaluator,
50
+ num_process_evaluate=4,
51
+ timeout=6,
52
+ release_version='release_split_v4',
53
+ ),
54
+ pred_role='BOT',
55
+ )
56
+
57
+ LCBCodeGeneration_dataset = dict(
58
+ type=LCBCodeGenerationDataset,
59
+ abbr='lcb_code_generation_split_v4',
60
+ path='opencompass/code_generation_lite',
61
+ reader_cfg=lcb_code_generation_reader_cfg,
62
+ infer_cfg=lcb_code_generation_infer_cfg,
63
+ eval_cfg=lcb_code_generation_eval_cfg,
64
+ release_version='release_split_v4',
65
+ )
66
+
67
+ # Code Execution Dataset
68
+ lcb_code_execution_reader_cfg = dict(
69
+ input_columns=[
70
+ 'prompt',
71
+ ],
72
+ output_column='evaluation_sample',
73
+ )
74
+
75
+ lcb_code_execution_infer_cfg = dict(
76
+ prompt_template=dict(
77
+ type=PromptTemplate,
78
+ template=dict(
79
+ begin=[
80
+ dict(
81
+ role='SYSTEM',
82
+ prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
83
+ ),
84
+ ],
85
+ round=[
86
+ dict(
87
+ role='HUMAN',
88
+ prompt='{prompt}'
89
+ )
90
+ ]
91
+ )
92
+ ),
93
+ retriever=dict(type=ZeroRetriever),
94
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
95
+ )
96
+
97
+ lcb_code_execution_eval_cfg = dict(
98
+ evaluator=dict(
99
+ type=LCBCodeExecutionEvaluator,
100
+ ),
101
+ pred_role='BOT',
102
+ )
103
+
104
+ LCBCodeExecution_dataset = dict(
105
+ type=LCBCodeExecutionDataset,
106
+ abbr='lcb_code_execution',
107
+ path='opencompass/execution-v2',
108
+ reader_cfg=lcb_code_execution_reader_cfg,
109
+ infer_cfg=lcb_code_execution_infer_cfg,
110
+ eval_cfg=lcb_code_execution_eval_cfg,
111
+ )
112
+
113
+ # TestOuputput Dataset
114
+ lcb_test_output_reader_cfg = dict(
115
+ input_columns=[
116
+ 'prompt',
117
+ ],
118
+ output_column='evaluation_sample',
119
+ )
120
+
121
+ system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
122
+
123
+ lcb_test_output_infer_cfg = dict(
124
+ prompt_template=dict(
125
+ type=PromptTemplate,
126
+ template=dict(
127
+ # begin=[
128
+ # dict(
129
+ # role='SYSTEM',
130
+ # prompt=system_prompt
131
+ # ),
132
+ # ],
133
+ round=[
134
+ dict(
135
+ role='HUMAN',
136
+ prompt='{prompt}'
137
+ )
138
+ ]
139
+ )
140
+ ),
141
+ retriever=dict(type=ZeroRetriever),
142
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
143
+ )
144
+
145
+ lcb_test_output_eval_cfg = dict(
146
+ evaluator=dict(
147
+ type=LCBTestOutputEvaluator,
148
+ ),
149
+ pred_role='BOT',
150
+ )
151
+
152
+ LCBTestOutput_dataset = dict(
153
+ type=LCBTestOutputPredictionDataset,
154
+ abbr='lcb_test_output',
155
+ path='opencompass/test_generation',
156
+ reader_cfg=lcb_test_output_reader_cfg,
157
+ infer_cfg=lcb_test_output_infer_cfg,
158
+ eval_cfg=lcb_test_output_eval_cfg,
159
+ )
160
+
161
+ LCB_datasets = [
162
+ LCBCodeGeneration_dataset,
163
+ # LCBCodeExecution_dataset,
164
+ # LCBTestOutput_dataset,
165
+ ]
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import (LCBCodeGenerationDataset,
5
+ LCBCodeExecutionDataset,
6
+ LCBTestOutputPredictionDataset,
7
+ LCBCodeGenerationEvaluator,
8
+ LCBCodeExecutionEvaluator,
9
+ LCBTestOutputEvaluator)
10
+
11
+ lcb_code_generation_reader_cfg = dict(
12
+ input_columns=[
13
+ 'question_content',
14
+ 'format_prompt',
15
+ ],
16
+ # output_column='evaluation_sample',
17
+ output_column='question_id',
18
+ )
19
+
20
+ SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501
21
+
22
+ prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
23
+ '### Answer: (use the provided format with backticks)\n\n'
24
+
25
+ # Code Generation Tasks
26
+ lcb_code_generation_infer_cfg = dict(prompt_template=dict(
27
+ type=PromptTemplate,
28
+ template=dict(round=[dict(role='HUMAN', prompt=prompt_template)])),
29
+ retriever=dict(type=ZeroRetriever),
30
+ inferencer=dict(type=GenInferencer))
31
+
32
+ lcb_code_generation_eval_cfg = dict(
33
+ evaluator=dict(type=LCBCodeGenerationEvaluator,
34
+ num_process_evaluate=4,
35
+ timeout=6,
36
+ release_version='release_v5',
37
+ start_date='2024-08-01',
38
+ end_date='2025-02-01'),
39
+ pred_role='BOT',
40
+ )
41
+
42
+ LCBCodeGeneration_dataset = dict(
43
+ type=LCBCodeGenerationDataset,
44
+ abbr='lcb_code_generation',
45
+ path='opencompass/code_generation_lite',
46
+ reader_cfg=lcb_code_generation_reader_cfg,
47
+ infer_cfg=lcb_code_generation_infer_cfg,
48
+ eval_cfg=lcb_code_generation_eval_cfg,
49
+ release_version='release_v5',
50
+ )
51
+
52
+ # Code Execution Dataset
53
+ lcb_code_execution_reader_cfg = dict(
54
+ input_columns=[
55
+ 'prompt',
56
+ ],
57
+ output_column='evaluation_sample',
58
+ )
59
+
60
+ lcb_code_execution_infer_cfg = dict(
61
+ prompt_template=dict(
62
+ type=PromptTemplate,
63
+ template=dict(
64
+ begin=[
65
+ dict(
66
+ role='SYSTEM',
67
+ fallback_role='HUMAN',
68
+ prompt=
69
+ 'You are an expert at Python programming, code execution, test case generation, and fuzzing.' # noqa: E501
70
+ ),
71
+ ],
72
+ round=[dict(role='HUMAN', prompt='{prompt}')])),
73
+ retriever=dict(type=ZeroRetriever),
74
+ inferencer=dict(type=GenInferencer))
75
+
76
+ lcb_code_execution_eval_cfg = dict(
77
+ evaluator=dict(type=LCBCodeExecutionEvaluator, ),
78
+ pred_role='BOT',
79
+ )
80
+
81
+ LCBCodeExecution_dataset = dict(
82
+ type=LCBCodeExecutionDataset,
83
+ abbr='lcb_code_execution',
84
+ path='opencompass/execution-v2',
85
+ reader_cfg=lcb_code_execution_reader_cfg,
86
+ infer_cfg=lcb_code_execution_infer_cfg,
87
+ eval_cfg=lcb_code_execution_eval_cfg,
88
+ )
89
+
90
+ # TestOuputput Dataset
91
+ lcb_test_output_reader_cfg = dict(
92
+ input_columns=[
93
+ 'prompt',
94
+ ],
95
+ output_column='evaluation_sample',
96
+ )
97
+
98
+ system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501
99
+
100
+ lcb_test_output_infer_cfg = dict(
101
+ prompt_template=dict(
102
+ type=PromptTemplate,
103
+ template=dict(
104
+ # begin=[
105
+ # dict(
106
+ # role='SYSTEM',
107
+ # prompt=system_prompt
108
+ # ),
109
+ # ],
110
+ round=[dict(role='HUMAN', prompt='{prompt}')])),
111
+ retriever=dict(type=ZeroRetriever),
112
+ inferencer=dict(type=GenInferencer))
113
+
114
+ lcb_test_output_eval_cfg = dict(
115
+ evaluator=dict(type=LCBTestOutputEvaluator, ),
116
+ pred_role='BOT',
117
+ )
118
+
119
+ LCBTestOutput_dataset = dict(
120
+ type=LCBTestOutputPredictionDataset,
121
+ abbr='lcb_test_output',
122
+ path='opencompass/test_generation',
123
+ reader_cfg=lcb_test_output_reader_cfg,
124
+ infer_cfg=lcb_test_output_infer_cfg,
125
+ eval_cfg=lcb_test_output_eval_cfg,
126
+ )
127
+
128
+ LCB_datasets = [
129
+ LCBCodeGeneration_dataset,
130
+ LCBCodeExecution_dataset,
131
+ LCBTestOutput_dataset,
132
+ ]
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import (
5
+ LCBCodeGenerationDataset,
6
+ LCBCodeExecutionDataset,
7
+ LCBTestOutputPredictionDataset,
8
+ LCBCodeGenerationEvaluator,
9
+ LCBCodeExecutionEvaluator,
10
+ LCBTestOutputEvaluator
11
+ )
12
+ from opencompass.datasets.livecodebench import TestOutputPromptConstants
13
+
14
+
15
+ lcb_code_generation_reader_cfg = dict(
16
+ input_columns=[
17
+ 'question_content',
18
+ 'format_prompt',
19
+ ],
20
+ # output_column='evaluation_sample',
21
+ output_column='question_id',
22
+ )
23
+
24
+ SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
25
+
26
+ prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
27
+ '### Answer: (use the provided format with backticks)\n\n'
28
+
29
+
30
+ # Code Generation Tasks
31
+ lcb_code_generation_infer_cfg = dict(
32
+ prompt_template=dict(
33
+ type=PromptTemplate,
34
+ template=dict(
35
+ round=[
36
+ dict(
37
+ role='HUMAN',
38
+ prompt=prompt_template
39
+ )
40
+ ]
41
+ )
42
+ ),
43
+ retriever=dict(type=ZeroRetriever),
44
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
45
+ )
46
+
47
+ lcb_code_generation_eval_cfg = dict(
48
+ evaluator=dict(
49
+ type=LCBCodeGenerationEvaluator,
50
+ num_process_evaluate=4,
51
+ timeout=6,
52
+ ),
53
+ pred_role='BOT',
54
+ )
55
+
56
+ LCBCodeGeneration_dataset = dict(
57
+ type=LCBCodeGenerationDataset,
58
+ abbr='lcb_code_generation_v1',
59
+ path='opencompass/code_generation_lite',
60
+ reader_cfg=lcb_code_generation_reader_cfg,
61
+ infer_cfg=lcb_code_generation_infer_cfg,
62
+ eval_cfg=lcb_code_generation_eval_cfg,
63
+ release_version='release_v1',
64
+ )
65
+
66
+ # Code Execution Dataset
67
+ lcb_code_execution_reader_cfg = dict(
68
+ input_columns=[
69
+ 'prompt',
70
+ ],
71
+ output_column='evaluation_sample',
72
+ )
73
+
74
+ lcb_code_execution_infer_cfg = dict(
75
+ prompt_template=dict(
76
+ type=PromptTemplate,
77
+ template=dict(
78
+ begin=[
79
+ dict(
80
+ role='SYSTEM',
81
+ prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
82
+ ),
83
+ ],
84
+ round=[
85
+ dict(
86
+ role='HUMAN',
87
+ prompt='{prompt}'
88
+ )
89
+ ]
90
+ )
91
+ ),
92
+ retriever=dict(type=ZeroRetriever),
93
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
94
+ )
95
+
96
+ lcb_code_execution_eval_cfg = dict(
97
+ evaluator=dict(
98
+ type=LCBCodeExecutionEvaluator,
99
+ ),
100
+ pred_role='BOT',
101
+ )
102
+
103
+ LCBCodeExecution_dataset = dict(
104
+ type=LCBCodeExecutionDataset,
105
+ abbr='lcb_code_execution',
106
+ path='opencompass/execution-v2',
107
+ reader_cfg=lcb_code_execution_reader_cfg,
108
+ infer_cfg=lcb_code_execution_infer_cfg,
109
+ eval_cfg=lcb_code_execution_eval_cfg,
110
+ )
111
+
112
+ # TestOuputput Dataset
113
+ lcb_test_output_reader_cfg = dict(
114
+ input_columns=[
115
+ 'prompt',
116
+ ],
117
+ output_column='evaluation_sample',
118
+ )
119
+
120
+ system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
121
+
122
+ lcb_test_output_infer_cfg = dict(
123
+ prompt_template=dict(
124
+ type=PromptTemplate,
125
+ template=dict(
126
+ # begin=[
127
+ # dict(
128
+ # role='SYSTEM',
129
+ # prompt=system_prompt
130
+ # ),
131
+ # ],
132
+ round=[
133
+ dict(
134
+ role='HUMAN',
135
+ prompt='{prompt}'
136
+ )
137
+ ]
138
+ )
139
+ ),
140
+ retriever=dict(type=ZeroRetriever),
141
+ inferencer=dict(type=GenInferencer, max_out_len=1024)
142
+ )
143
+
144
+ lcb_test_output_eval_cfg = dict(
145
+ evaluator=dict(
146
+ type=LCBTestOutputEvaluator,
147
+ ),
148
+ pred_role='BOT',
149
+ )
150
+
151
+ LCBTestOutput_dataset = dict(
152
+ type=LCBTestOutputPredictionDataset,
153
+ abbr='lcb_test_output',
154
+ path='opencompass/test_generation',
155
+ reader_cfg=lcb_test_output_reader_cfg,
156
+ infer_cfg=lcb_test_output_infer_cfg,
157
+ eval_cfg=lcb_test_output_eval_cfg,
158
+ )
159
+
160
+ LCB_datasets = [
161
+ LCBCodeGeneration_dataset,
162
+ # LCBCodeExecution_dataset,
163
+ # LCBTestOutput_dataset,
164
+ ]
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import (
5
+ LCBCodeGenerationDataset,
6
+ LCBCodeExecutionDataset,
7
+ LCBTestOutputPredictionDataset,
8
+ LCBCodeGenerationEvaluator,
9
+ LCBCodeExecutionEvaluator,
10
+ LCBTestOutputEvaluator
11
+ )
12
+ from opencompass.datasets.livecodebench import TestOutputPromptConstants
13
+
14
+
15
+ lcb_code_generation_reader_cfg = dict(
16
+ input_columns=[
17
+ 'question_content',
18
+ 'format_prompt',
19
+ ],
20
+ # output_column='evaluation_sample',
21
+ output_column='question_id',
22
+ )
23
+
24
+ SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
25
+
26
+ prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
27
+ '### Answer: (use the provided format with backticks)\n\n'
28
+
29
+
30
+ # Code Generation Tasks
31
+ lcb_code_generation_infer_cfg = dict(
32
+ prompt_template=dict(
33
+ type=PromptTemplate,
34
+ template=dict(
35
+ round=[
36
+ dict(
37
+ role='HUMAN',
38
+ prompt=prompt_template
39
+ )
40
+ ]
41
+ )
42
+ ),
43
+ retriever=dict(type=ZeroRetriever),
44
+ inferencer=dict(type=GenInferencer)
45
+ )
46
+
47
+ lcb_code_generation_eval_cfg = dict(
48
+ evaluator=dict(
49
+ type=LCBCodeGenerationEvaluator,
50
+ release_version='v6',
51
+ extractor_version='v2',
52
+ num_process_evaluate=4,
53
+ timeout=6,
54
+ ),
55
+ pred_role='BOT',
56
+ )
57
+
58
+ LCBCodeGeneration_dataset = dict(
59
+ type=LCBCodeGenerationDataset,
60
+ abbr='lcb_code_generation_repeat_6',
61
+ path='opencompass/code_generation_lite',
62
+ release_version='v6',
63
+ reader_cfg=lcb_code_generation_reader_cfg,
64
+ infer_cfg=lcb_code_generation_infer_cfg,
65
+ eval_cfg=lcb_code_generation_eval_cfg,
66
+ n=6,
67
+ )
68
+
69
+ # Code Execution Dataset
70
+ lcb_code_execution_reader_cfg = dict(
71
+ input_columns=[
72
+ 'prompt',
73
+ ],
74
+ output_column='evaluation_sample',
75
+ )
76
+
77
+ lcb_code_execution_infer_cfg = dict(
78
+ prompt_template=dict(
79
+ type=PromptTemplate,
80
+ template=dict(
81
+ begin=[
82
+ dict(
83
+ role='SYSTEM',
84
+ fallback_role='HUMAN',
85
+ prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
86
+ ),
87
+ ],
88
+ round=[
89
+ dict(
90
+ role='HUMAN',
91
+ prompt='{prompt}'
92
+ )
93
+ ]
94
+ )
95
+ ),
96
+ retriever=dict(type=ZeroRetriever),
97
+ inferencer=dict(type=GenInferencer)
98
+ )
99
+
100
+ lcb_code_execution_eval_cfg = dict(
101
+ evaluator=dict(
102
+ type=LCBCodeExecutionEvaluator,
103
+ ),
104
+ pred_role='BOT',
105
+ )
106
+
107
+ LCBCodeExecution_dataset = dict(
108
+ type=LCBCodeExecutionDataset,
109
+ abbr='lcb_code_execution',
110
+ path='opencompass/execution-v2',
111
+ reader_cfg=lcb_code_execution_reader_cfg,
112
+ infer_cfg=lcb_code_execution_infer_cfg,
113
+ eval_cfg=lcb_code_execution_eval_cfg,
114
+ )
115
+
116
+ # TestOuputput Dataset
117
+ lcb_test_output_reader_cfg = dict(
118
+ input_columns=[
119
+ 'prompt',
120
+ ],
121
+ output_column='evaluation_sample',
122
+ )
123
+
124
+ system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
125
+
126
+ lcb_test_output_infer_cfg = dict(
127
+ prompt_template=dict(
128
+ type=PromptTemplate,
129
+ template=dict(
130
+ # begin=[
131
+ # dict(
132
+ # role='SYSTEM',
133
+ # prompt=system_prompt
134
+ # ),
135
+ # ],
136
+ round=[
137
+ dict(
138
+ role='HUMAN',
139
+ prompt='{prompt}'
140
+ )
141
+ ]
142
+ )
143
+ ),
144
+ retriever=dict(type=ZeroRetriever),
145
+ inferencer=dict(type=GenInferencer)
146
+ )
147
+
148
+ lcb_test_output_eval_cfg = dict(
149
+ evaluator=dict(
150
+ type=LCBTestOutputEvaluator,
151
+ ),
152
+ pred_role='BOT',
153
+ )
154
+
155
+ LCBTestOutput_dataset = dict(
156
+ type=LCBTestOutputPredictionDataset,
157
+ abbr='lcb_test_output',
158
+ path='opencompass/test_generation',
159
+ reader_cfg=lcb_test_output_reader_cfg,
160
+ infer_cfg=lcb_test_output_infer_cfg,
161
+ eval_cfg=lcb_test_output_eval_cfg,
162
+ )
163
+
164
+ LCB_datasets = [
165
+ LCBCodeGeneration_dataset,
166
+ LCBCodeExecution_dataset,
167
+ LCBTestOutput_dataset,
168
+ ]
build/lib/opencompass/configs/datasets/livemathbench/README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LiveMathBench
2
+
3
+ ## v202412
4
+
5
+ ### Details of Datsets
6
+
7
+ | dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving |
8
+ | -- | -- | -- | -- | -- | -- |
9
+ | AMC | cn | 0 | 0 | 0 | 46 |
10
+ | AMC | en | 0 | 0 | 0 | 46 |
11
+ | CCEE | cn | 0 | 0 | 13 | 31 |
12
+ | CCEE | en | 0 | 0 | 13 | 31 |
13
+ | CNMO | cn | 0 | 0 | 0 | 18 |
14
+ | CNMO | en | 0 | 0 | 0 | 18 |
15
+ | WLPMC | cn | 0 | 0 | 0 | 11 |
16
+ | WLPMC | en | 0 | 0 | 0 | 11 |
17
+
18
+
19
+ ### How to use
20
+
21
+ #### G-Pass@k
22
+ ```python
23
+ from mmengine.config import read_base
24
+
25
+ with read_base():
26
+ from opencompass.datasets.livemathbench_gen import livemathbench_datasets
27
+
28
+ livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
29
+ {
30
+ 'model_name': 'Qwen/Qwen2.5-72B-Instruct',
31
+ 'url': [
32
+ 'http://0.0.0.0:23333/v1',
33
+ '...'
34
+ ] # set url of evaluation models
35
+ }
36
+ )
37
+ livemathbench_dataset['infer_cfg']['inferencer'].update(dict(
38
+ max_out_len=32768 # for o1-like models you need to update max_out_len
39
+ ))
40
+
41
+ ```
42
+
43
+ #### Greedy
44
+ ```python
45
+ from mmengine.config import read_base
46
+
47
+ with read_base():
48
+ from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets
49
+
50
+ livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
51
+ {
52
+ 'model_name': 'Qwen/Qwen2.5-72B-Instruct',
53
+ 'url': [
54
+ 'http://0.0.0.0:23333/v1',
55
+ '...'
56
+ ] # set url of evaluation models
57
+ }
58
+ )
59
+ livemathbench_dataset['infer_cfg']['inferencer'].update(dict(
60
+ max_out_len=32768 # for o1-like models you need to update max_out_len
61
+ ))
62
+
63
+ ```
64
+
65
+ ### Output Samples
66
+
67
+ | dataset | version | metric | mode | Qwen2.5-72B-Instruct |
68
+ |----- | ----- | ----- | ----- | -----|
69
+ | LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx |
70
+ | LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx |
71
+ | LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx |
72
+ | LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx |
73
+ | LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx |
74
+
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .livemathbench_gen_9befbf import livemathbench_datasets # noqa: F401, F403
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_reader_cfg = dict(
9
+ input_columns=['prompt'],
10
+ output_column='answer'
11
+ )
12
+
13
+ livemathbench_infer_cfg = dict(
14
+ prompt_template=dict(
15
+ type=PromptTemplate,
16
+ template=dict(
17
+ round=[
18
+ dict(role='HUMAN', prompt='{prompt}'),
19
+ ]
20
+ )
21
+ ),
22
+ retriever=dict(type=ZeroRetriever),
23
+ inferencer=dict(
24
+ type=GenInferencer,
25
+ max_out_len=16384,
26
+ temperature=1.0
27
+ )
28
+ )
29
+
30
+ livemathbench_eval_cfg = dict(
31
+ evaluator=dict(
32
+ type=LiveMathBenchEvaluator,
33
+ model_name='Qwen/Qwen2.5-72B-Instruct',
34
+ url=['http://172.30.40.154:23333/v1/'] #'https://api.openai.com/v1/'
35
+ )
36
+ )
37
+
38
+ livemathbench_datasets = [
39
+ dict(
40
+ type=LiveMathBenchDataset,
41
+ abbr='LiveMathBench-k1-n1',
42
+ path='opencompass/LiveMathBench202412',
43
+ k=1, # K@Pass
44
+ n=1, # Run times
45
+ reader_cfg=livemathbench_reader_cfg,
46
+ infer_cfg=livemathbench_infer_cfg,
47
+ eval_cfg=livemathbench_eval_cfg
48
+ )
49
+ ]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_dataset = dict(
9
+ type=LiveMathBenchDataset,
10
+ path='',
11
+ k=16,
12
+ n=48,
13
+ dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
14
+ dataset_languages=['cn', 'en'],
15
+ cot=True,
16
+ version='202412',
17
+ abbr='LiveMathBench-v202412',
18
+ reader_cfg=dict(
19
+ input_columns=['prompt'],
20
+ output_column='answer'
21
+ ),
22
+ infer_cfg=dict(
23
+ prompt_template=dict(
24
+ type=PromptTemplate,
25
+ template=dict(
26
+ round=[
27
+ dict(role='HUMAN', prompt='{prompt}'),
28
+ ]
29
+ )
30
+ ),
31
+ retriever=dict(type=ZeroRetriever),
32
+ inferencer=dict(
33
+ type=GenInferencer,
34
+ max_out_len=8192
35
+ ),
36
+ ),
37
+ eval_cfg=dict(
38
+ evaluator=dict(
39
+ type=LiveMathBenchEvaluator,
40
+ model_name='',
41
+ url=[]
42
+ )
43
+ )
44
+ )
45
+ livemathbench_datasets = [livemathbench_dataset]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_reader_cfg = dict(
9
+ input_columns=['prompt'],
10
+ output_column='answer'
11
+ )
12
+
13
+ livemathbench_infer_cfg = dict(
14
+ prompt_template=dict(
15
+ type=PromptTemplate,
16
+ template=dict(
17
+ round=[
18
+ dict(role='HUMAN', prompt='{prompt}'),
19
+ ]
20
+ )
21
+ ),
22
+ retriever=dict(type=ZeroRetriever),
23
+ inferencer=dict(
24
+ type=GenInferencer,
25
+ max_out_len=2048,
26
+ temperature=1.0
27
+ )
28
+ )
29
+
30
+ livemathbench_eval_cfg = dict(
31
+ evaluator=dict(
32
+ type=LiveMathBenchEvaluator,
33
+ model_name='Qwen/Qwen2.5-72B-Instruct',
34
+ url=[]
35
+ )
36
+ )
37
+
38
+ livemathbench_datasets = [
39
+ dict(
40
+ type=LiveMathBenchDataset,
41
+ abbr='LiveMathBench',
42
+ path='',
43
+ k=32,
44
+ n=5,
45
+ reader_cfg=livemathbench_reader_cfg,
46
+ infer_cfg=livemathbench_infer_cfg,
47
+ eval_cfg=livemathbench_eval_cfg
48
+ )
49
+ ]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .livemathbench_greedy_gen_9befbf import livemathbench_datasets # noqa: F401, F403
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_dataset = dict(
9
+ type=LiveMathBenchDataset,
10
+ path='',
11
+ k=1,
12
+ n=1,
13
+ dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
14
+ dataset_languages=['cn', 'en'],
15
+ cot=True,
16
+ version='202412',
17
+ abbr='LiveMathBench-v202412',
18
+ reader_cfg=dict(
19
+ input_columns=['prompt'],
20
+ output_column='answer'
21
+ ),
22
+ infer_cfg=dict(
23
+ prompt_template=dict(
24
+ type=PromptTemplate,
25
+ template=dict(
26
+ round=[
27
+ dict(role='HUMAN', prompt='{prompt}'),
28
+ ]
29
+ )
30
+ ),
31
+ retriever=dict(type=ZeroRetriever),
32
+ inferencer=dict(
33
+ type=GenInferencer,
34
+ max_out_len=8192
35
+ ),
36
+ ),
37
+ eval_cfg=dict(
38
+ evaluator=dict(
39
+ type=LiveMathBenchEvaluator,
40
+ model_name='',
41
+ url=[]
42
+ )
43
+ )
44
+ )
45
+ livemathbench_datasets = [livemathbench_dataset]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Summary: A config for LiveMathBench-Hard-202412 Dataset Evaluation.
3
+ Setting:
4
+ Shot: 0-shot
5
+ Evaluator:
6
+ - CascadeEvaluator
7
+ - MATHVerifyEvaluator
8
+ - GenericLLMEvaluator
9
+ Repeat: 32
10
+ Available Models:
11
+ - Instruct/Chat Models
12
+ """
13
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
14
+ from opencompass.openicl.icl_retriever import ZeroRetriever
15
+ from opencompass.openicl.icl_inferencer import GenInferencer
16
+ from opencompass.evaluator import GenericLLMEvaluator
17
+ from opencompass.datasets import CustomDataset
18
+ from opencompass.datasets import generic_llmjudge_postprocess
19
+ from opencompass.evaluator import (
20
+ CascadeEvaluator,
21
+ GenericLLMEvaluator,
22
+ MATHVerifyEvaluator,
23
+ )
24
+
25
+ livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
26
+
27
+
28
+ # Inference configuration
29
+ livemathbench_infer_cfg = dict(
30
+ prompt_template=dict(
31
+ type=PromptTemplate,
32
+ template=dict(
33
+ round=[
34
+ dict(
35
+ role='HUMAN',
36
+ prompt='{question}\nRemember to put your final answer within \\boxed{}.',
37
+ ),
38
+ ]
39
+ ),
40
+ ),
41
+ retriever=dict(type=ZeroRetriever),
42
+ inferencer=dict(type=GenInferencer),
43
+ )
44
+
45
+
46
+ # Template for the LLM judge
47
+ GRADER_TEMPLATE = """
48
+ Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
49
+
50
+ Here are some evaluation criteria:
51
+ 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
52
+ 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
53
+ 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
54
+ 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
55
+ 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
56
+ Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
57
+ A: CORRECT
58
+ B: INCORRECT
59
+ Just return the letters "A" or "B", with no text around it.
60
+ Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
61
+ <Original Question Begin>: \n{question}\n<Original Question End>\n\n
62
+ <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
63
+ <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
64
+
65
+ Judging the correctness of candidates' answers:
66
+ """.strip()
67
+
68
+
69
+
70
+ splits = ['hard_cn', 'hard_en']
71
+ # Dataset configuration
72
+ livemathbench_datasets = [
73
+ dict(
74
+ type=CustomDataset,
75
+ abbr=f'livemathbench_hard_custom_{split}',
76
+ path='data/LiveMathBench',
77
+ local_mode=True,
78
+ file_name=f'202412/{split}.jsonl',
79
+ reader_cfg=livemathbench_reader_cfg,
80
+ infer_cfg=livemathbench_infer_cfg,
81
+ eval_cfg=dict(
82
+ # Evaluation configuration using LLM as judge
83
+ evaluator=dict(
84
+ type=CascadeEvaluator,
85
+ rule_evaluator=dict(
86
+ type=MATHVerifyEvaluator,
87
+ ),
88
+ llm_evaluator=dict(
89
+ type=GenericLLMEvaluator,
90
+ prompt_template=dict(
91
+ type=PromptTemplate,
92
+ template=dict(
93
+ begin=[
94
+ dict(
95
+ role='SYSTEM',
96
+ fallback_role='HUMAN',
97
+ prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
98
+ )
99
+ ],
100
+ round=[
101
+ dict(role='HUMAN', prompt=GRADER_TEMPLATE),
102
+ ],
103
+ ),
104
+ ),
105
+ dataset_cfg=dict(
106
+ type=CustomDataset,
107
+ path='data/LiveMathBench',
108
+ local_mode=True,
109
+ file_name=f'202412/{split}.jsonl',
110
+ reader_cfg=livemathbench_reader_cfg,
111
+ ),
112
+ judge_cfg={},
113
+ dict_postprocessor=dict(type=generic_llmjudge_postprocess),
114
+ ),
115
+ parallel=False
116
+ ),
117
+ ),
118
+ n=1, # repeat n times
119
+ ) for split in splits
120
+ ]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.evaluator import GenericLLMEvaluator
5
+ from opencompass.datasets import CustomDataset
6
+ from opencompass.datasets import generic_llmjudge_postprocess
7
+ from itertools import product
8
+
9
+ livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
10
+
11
+
12
+ # Inference configuration
13
+ livemathbench_infer_cfg = dict(
14
+ prompt_template=dict(
15
+ type=PromptTemplate,
16
+ template=dict(
17
+ round=[
18
+ dict(
19
+ role='HUMAN',
20
+ prompt='{question}\n',
21
+ ),
22
+ ]
23
+ ),
24
+ ),
25
+ retriever=dict(type=ZeroRetriever),
26
+ inferencer=dict(type=GenInferencer),
27
+ )
28
+
29
+
30
+ # Template for the LLM judge
31
+ GRADER_TEMPLATE = """
32
+ Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
33
+
34
+ Here are some evaluation criteria:
35
+ 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
36
+ 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
37
+ 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
38
+ 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
39
+ 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
40
+ Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
41
+ A: CORRECT
42
+ B: INCORRECT
43
+ Just return the letters "A" or "B", with no text around it.
44
+ Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
45
+ <Original Question Begin>: \n{question}\n<Original Question End>\n\n
46
+ <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
47
+ <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
48
+
49
+ Judging the correctness of candidates' answers:
50
+ """.strip()
51
+
52
+
53
+
54
+ splits = ['hard_cn', 'hard_en']
55
+ # Dataset configuration
56
+ livemathbench_datasets = [
57
+ dict(
58
+ type=CustomDataset,
59
+ abbr=f'livemathbench_hard_custom_{split}_run{run_idx}',
60
+ path='data/LiveMathBench',
61
+ local_mode=True,
62
+ file_name=f'202412/{split}.jsonl',
63
+ reader_cfg=livemathbench_reader_cfg,
64
+ infer_cfg=livemathbench_infer_cfg,
65
+ eval_cfg=dict(
66
+ # # Evaluation configuration using LLM as judge
67
+ evaluator=dict(
68
+ type=GenericLLMEvaluator,
69
+ prompt_template=dict(
70
+ type=PromptTemplate,
71
+ template=dict(
72
+ begin=[
73
+ dict(
74
+ role='SYSTEM',
75
+ fallback_role='HUMAN',
76
+ prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
77
+ )
78
+ ],
79
+ round=[
80
+ dict(role='HUMAN', prompt=GRADER_TEMPLATE),
81
+ ],
82
+ ),
83
+ ),
84
+ dataset_cfg=dict(
85
+ type=CustomDataset,
86
+ path='data/LiveMathBench',
87
+ local_mode=True,
88
+ file_name=f'202412/{split}.jsonl',
89
+ reader_cfg=livemathbench_reader_cfg,
90
+ ),
91
+ judge_cfg={},
92
+ dict_postprocessor=dict(type=generic_llmjudge_postprocess),
93
+ ),
94
+ ),
95
+ ) for split, run_idx in product(splits, range(8))
96
+ ]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_dataset = dict(
9
+ type=LiveMathBenchDataset,
10
+ path='',
11
+ k=16,
12
+ n=48,
13
+ dataset_splits=['hard'],
14
+ dataset_languages=['cn', 'en'],
15
+ cot=True,
16
+ version='202412',
17
+ abbr='LiveMathBench-v202412-Hard',
18
+ reader_cfg=dict(
19
+ input_columns=['prompt'],
20
+ output_column='answer'
21
+ ),
22
+ infer_cfg=dict(
23
+ prompt_template=dict(
24
+ type=PromptTemplate,
25
+ template=dict(
26
+ round=[
27
+ dict(role='HUMAN', prompt='{prompt}'),
28
+ ]
29
+ )
30
+ ),
31
+ retriever=dict(type=ZeroRetriever),
32
+ inferencer=dict(
33
+ type=GenInferencer
34
+ ),
35
+ ),
36
+ eval_cfg=dict(
37
+ evaluator=dict(
38
+ type=LiveMathBenchEvaluator,
39
+ model_name='',
40
+ url=[]
41
+ )
42
+ )
43
+ )
44
+ livemathbench_datasets = [livemathbench_dataset]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_dataset = dict(
9
+ type=LiveMathBenchDataset,
10
+ path='',
11
+ k=1,
12
+ n=1,
13
+ dataset_splits=['hard'],
14
+ dataset_languages=['cn', 'en'],
15
+ cot=True,
16
+ version='202412',
17
+ abbr='LiveMathBench-v202412-Hard',
18
+ reader_cfg=dict(
19
+ input_columns=['prompt'],
20
+ output_column='answer'
21
+ ),
22
+ infer_cfg=dict(
23
+ prompt_template=dict(
24
+ type=PromptTemplate,
25
+ template=dict(
26
+ round=[
27
+ dict(role='HUMAN', prompt='{prompt}'),
28
+ ]
29
+ )
30
+ ),
31
+ retriever=dict(type=ZeroRetriever),
32
+ inferencer=dict(
33
+ type=GenInferencer
34
+ ),
35
+ ),
36
+ eval_cfg=dict(
37
+ evaluator=dict(
38
+ type=LiveMathBenchEvaluator,
39
+ model_name='',
40
+ url=[]
41
+ )
42
+ )
43
+ )
44
+ livemathbench_datasets = [livemathbench_dataset]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.evaluator import GenericLLMEvaluator
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset
6
+ from opencompass.datasets import generic_llmjudge_postprocess
7
+
8
+ livemathbench_reader_cfg = dict(
9
+ input_columns=['question'], output_column='answer'
10
+ )
11
+
12
+
13
+ # Inference configuration
14
+ livemathbench_infer_cfg = dict(
15
+ prompt_template=dict(
16
+ type=PromptTemplate,
17
+ template=dict(
18
+ round=[
19
+ dict(
20
+ role='HUMAN',
21
+ prompt='{question}\n',
22
+ ),
23
+ ]
24
+ ),
25
+ ),
26
+ retriever=dict(type=ZeroRetriever),
27
+ inferencer=dict(type=GenInferencer),
28
+ )
29
+
30
+
31
+ # Template for the LLM judge
32
+ GRADER_TEMPLATE = """
33
+ Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
34
+
35
+ Here are some evaluation criteria:
36
+ 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
37
+ 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
38
+ 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
39
+ 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
40
+ 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
41
+ Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
42
+ A: CORRECT
43
+ B: INCORRECT
44
+ Just return the letters "A" or "B", with no text around it.
45
+ Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
46
+ <Original Question Begin>: \n{question}\n<Original Question End>\n\n
47
+ <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
48
+ <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
49
+
50
+ Judging the correctness of candidates' answers:
51
+ """.strip()
52
+
53
+
54
+ splits = ['hard']
55
+ livemathbench_datasets = []
56
+ for split in splits:
57
+ # Dataset configuration
58
+ livemathbench_datasets.append(
59
+ dict(
60
+ type=LiveMathBenchDataset,
61
+ abbr=f'livemathbench_{split}',
62
+ path='opencompass/LiveMathBench',
63
+ dataset_splits = [split],
64
+ dataset_languages= ['cn', 'en'],
65
+ reader_cfg=livemathbench_reader_cfg,
66
+ infer_cfg=livemathbench_infer_cfg,
67
+ eval_cfg=dict(
68
+ # # Evaluation configuration using LLM as judge
69
+ evaluator=dict(
70
+ type=GenericLLMEvaluator,
71
+ prompt_template=dict(
72
+ type=PromptTemplate,
73
+ template=dict(
74
+ begin=[
75
+ dict(
76
+ role='SYSTEM',
77
+ fallback_role='HUMAN',
78
+ prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
79
+ )
80
+ ],
81
+ round=[
82
+ dict(role='HUMAN', prompt=GRADER_TEMPLATE),
83
+ ],
84
+ ),
85
+ ),
86
+ dataset_cfg=dict(
87
+ type=LiveMathBenchDataset,
88
+ path='opencompass/LiveMathBench202412',
89
+ dataset_splits = [split],
90
+ reader_cfg=livemathbench_reader_cfg,
91
+ ),
92
+ judge_cfg={},
93
+ dict_postprocessor=dict(type=generic_llmjudge_postprocess),
94
+ ),
95
+ ),
96
+ )
97
+ )
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_dataset = dict(
9
+ type=LiveMathBenchDataset,
10
+ path='opencompass/LiveMathBench',
11
+ k=16,
12
+ n=48,
13
+ dataset_splits=['all'],
14
+ dataset_languages=['en'],
15
+ cot=True,
16
+ version='202505',
17
+ abbr='LiveMathBench-v202505',
18
+ reader_cfg=dict(
19
+ input_columns=['prompt'],
20
+ output_column='answer'
21
+ ),
22
+ infer_cfg=dict(
23
+ prompt_template=dict(
24
+ type=PromptTemplate,
25
+ template=dict(
26
+ round=[
27
+ dict(role='HUMAN', prompt='{prompt}'),
28
+ ]
29
+ )
30
+ ),
31
+ retriever=dict(type=ZeroRetriever),
32
+ inferencer=dict(
33
+ type=GenInferencer,
34
+ max_out_len=8192
35
+ ),
36
+ ),
37
+ eval_cfg=dict(
38
+ evaluator=dict(
39
+ type=LiveMathBenchEvaluator,
40
+ model_name='',
41
+ url=[]
42
+ )
43
+ )
44
+ )
45
+ livemathbench_datasets = [livemathbench_dataset]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_dataset = dict(
9
+ type=LiveMathBenchDataset,
10
+ path='opencompass/LiveMathBench',
11
+ k=1,
12
+ n=1,
13
+ dataset_splits=['all'],
14
+ dataset_languages=['en'],
15
+ cot=True,
16
+ version='202505',
17
+ abbr='LiveMathBench-v202505',
18
+ reader_cfg=dict(
19
+ input_columns=['prompt'],
20
+ output_column='answer'
21
+ ),
22
+ infer_cfg=dict(
23
+ prompt_template=dict(
24
+ type=PromptTemplate,
25
+ template=dict(
26
+ round=[
27
+ dict(role='HUMAN', prompt='{prompt}'),
28
+ ]
29
+ )
30
+ ),
31
+ retriever=dict(type=ZeroRetriever),
32
+ inferencer=dict(
33
+ type=GenInferencer,
34
+ max_out_len=8192
35
+ ),
36
+ ),
37
+ eval_cfg=dict(
38
+ evaluator=dict(
39
+ type=LiveMathBenchEvaluator,
40
+ model_name='',
41
+ url=[]
42
+ )
43
+ )
44
+ )
45
+ livemathbench_datasets = [livemathbench_dataset]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+
8
+ livemathbench_dataset = dict(
9
+ type=LiveMathBenchDataset,
10
+ path='opencompass/LiveMathBench',
11
+ k=16,
12
+ n=48,
13
+ dataset_splits=['hard'],
14
+ dataset_languages=['en'],
15
+ cot=True,
16
+ version='202505',
17
+ abbr='LiveMathBench-v202505-Hard',
18
+ reader_cfg=dict(
19
+ input_columns=['prompt'],
20
+ output_column='answer'
21
+ ),
22
+ infer_cfg=dict(
23
+ prompt_template=dict(
24
+ type=PromptTemplate,
25
+ template=dict(
26
+ round=[
27
+ dict(role='HUMAN', prompt='{prompt}'),
28
+ ]
29
+ )
30
+ ),
31
+ retriever=dict(type=ZeroRetriever),
32
+ inferencer=dict(
33
+ type=GenInferencer
34
+ ),
35
+ ),
36
+ eval_cfg=dict(
37
+ evaluator=dict(
38
+ type=LiveMathBenchEvaluator,
39
+ model_name='',
40
+ url=[]
41
+ )
42
+ )
43
+ )
44
+ livemathbench_datasets = [livemathbench_dataset]
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+
5
+ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator
6
+
7
+ livemathbench_dataset = dict(
8
+ type=LiveMathBenchDataset,
9
+ path='opencompass/LiveMathBench',
10
+ k=1,
11
+ n=1,
12
+ dataset_splits=['hard'],
13
+ dataset_languages=['en'],
14
+ cot=True,
15
+ version='202505',
16
+ abbr='LiveMathBench-v202505-Hard',
17
+ reader_cfg=dict(
18
+ input_columns=['prompt'],
19
+ output_column='answer'
20
+ ),
21
+ infer_cfg=dict(
22
+ prompt_template=dict(
23
+ type=PromptTemplate,
24
+ template=dict(
25
+ round=[
26
+ dict(role='HUMAN', prompt='{prompt}'),
27
+ ]
28
+ )
29
+ ),
30
+ retriever=dict(type=ZeroRetriever),
31
+ inferencer=dict(
32
+ type=GenInferencer
33
+ ),
34
+ ),
35
+ eval_cfg=dict(
36
+ evaluator=dict(
37
+ type=LiveMathBenchEvaluator,
38
+ model_name='',
39
+ url=[]
40
+ )
41
+ )
42
+ )
43
+ livemathbench_datasets = [livemathbench_dataset]
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .livereasonbench_gen_0283c3 import simpleqa_datasets # noqa: F401, F403
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
# from opencompass.datasets import SimpleQADataset, simpleqa_postprocess
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess


# SimpleQA-style judge prompt: the grader LLM assigns A/B/C
# (CORRECT / INCORRECT / NOT_ATTEMPTED) to each prediction.
# NOTE(review): reproduced verbatim, including typos inherited from the
# upstream SimpleQA grader (missing quote before 115k, duplicated
# "Predicted answer 4"), so judge behaviour is unchanged.
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.


The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.


The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.


The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
- No statements in the answer contradict the gold target.


Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
    - Predicted answers "120k", "124k", and 115k" are all CORRECT.
    - Predicted answers "100k" and "113k" are INCORRECT.
    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".

Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.

Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {question}
Gold target: {answer}
Predicted answer: {prediction}
```
""".strip()

# The dataset supplies a question; `answer` is the gold reference column.
livereasonbench_reader_cfg = {
    'input_columns': ['question'],
    'output_column': 'answer',
}

# Zero-shot generation: send the raw question as a single HUMAN turn.
livereasonbench_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {'role': 'HUMAN', 'prompt': 'Question: {question}\n'},
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 16384},
}

# LLM-as-judge evaluation via LMEvaluator; the postprocessor maps the judge's
# A/B/C verdicts to scores.
livereasonbench_eval_cfg = {
    'evaluator': {
        'type': LMEvaluator,
        'prompt_template': {
            'type': PromptTemplate,
            'template': {
                'begin': [
                    {
                        'role': 'SYSTEM',
                        'fallback_role': 'HUMAN',
                        'prompt': "You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    },
                ],
                'round': [
                    {'role': 'HUMAN', 'prompt': GRADER_TEMPLATE},
                ],
            },
        },
        'dict_postprocessor': {'type': livereasonbench_postprocess},
    },
    'pred_role': 'BOT',
}

livereasonbench_datasets = [
    {
        'abbr': 'LiveReasonBench-20241202',
        'type': LiveReasonBenchDataset,
        'path': 'opencompass/LiveReasonBench',
        'reader_cfg': livereasonbench_reader_cfg,
        'infer_cfg': livereasonbench_infer_cfg,
        'eval_cfg': livereasonbench_eval_cfg,
        'version': 'livereasonbench-20241202',
        'mode': 'singlescore',
    },
]
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess


# SimpleQA-style judge prompt: the grader LLM assigns A/B/C
# (CORRECT / INCORRECT / NOT_ATTEMPTED) to each prediction.
# NOTE(review): reproduced verbatim, including typos inherited from the
# upstream SimpleQA grader (missing quote before 115k, duplicated
# "Predicted answer 4"), so judge behaviour is unchanged.
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.


The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.


The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.


The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
- No statements in the answer contradict the gold target.


Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
    - Predicted answers "120k", "124k", and 115k" are all CORRECT.
    - Predicted answers "100k" and "113k" are INCORRECT.
    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".

Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.

Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {question}
Gold target: {answer}
Predicted answer: {prediction}
```
""".strip()

# The dataset supplies a question; `answer` is the gold reference column.
livereasonbench_reader_cfg = {
    'input_columns': ['question'],
    'output_column': 'answer',
}

# Zero-shot generation: send the raw question as a single HUMAN turn.
livereasonbench_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {'role': 'HUMAN', 'prompt': 'Question: {question}\n'},
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 16384},
}

# GenericLLMEvaluator variant: the evaluator reloads the dataset itself via
# `dataset_cfg`; `judge_cfg` is filled in by the runner at launch time.
livereasonbench_eval_cfg = {
    'evaluator': {
        'type': GenericLLMEvaluator,
        'prompt_template': {
            'type': PromptTemplate,
            'template': {
                'begin': [
                    {
                        'role': 'SYSTEM',
                        'fallback_role': 'HUMAN',
                        'prompt': "You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    },
                ],
                'round': [
                    {'role': 'HUMAN', 'prompt': GRADER_TEMPLATE},
                ],
            },
        },
        'dataset_cfg': {
            'type': LiveReasonBenchDataset,
            'path': 'opencompass/LiveReasonBench',
            'reader_cfg': livereasonbench_reader_cfg,
        },
        'judge_cfg': {},
        'dict_postprocessor': {'type': livereasonbench_postprocess},
    },
    'pred_role': 'BOT',
}

livereasonbench_datasets = [
    {
        'abbr': 'LiveReasonBench-20241202',
        'type': LiveReasonBenchDataset,
        'path': 'opencompass/LiveReasonBench',
        'reader_cfg': livereasonbench_reader_cfg,
        'infer_cfg': livereasonbench_infer_cfg,
        'eval_cfg': livereasonbench_eval_cfg,
        'version': 'livereasonbench-20241202',
        'mode': 'singlescore',
    },
]
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess


# SimpleQA-style judge prompt: the grader LLM assigns A/B/C
# (CORRECT / INCORRECT / NOT_ATTEMPTED) to each prediction.
# NOTE(review): reproduced verbatim, including typos inherited from the
# upstream SimpleQA grader (missing quote before 115k, duplicated
# "Predicted answer 4"), so judge behaviour is unchanged.
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.


The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.


The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.


The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
- No statements in the answer contradict the gold target.


Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
    - Predicted answers "120k", "124k", and 115k" are all CORRECT.
    - Predicted answers "100k" and "113k" are INCORRECT.
    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".

Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.

Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {question}
Gold target: {answer}
Predicted answer: {prediction}
```
""".strip()

# The dataset supplies a question; `answer` is the gold reference column.
livereasonbench_reader_cfg = {
    'input_columns': ['question'],
    'output_column': 'answer',
}

# Zero-shot generation: send the raw question as a single HUMAN turn.
# (This variant sets no explicit max_out_len — the model default applies.)
livereasonbench_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {'role': 'HUMAN', 'prompt': 'Question: {question}\n'},
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer},
}

# GenericLLMEvaluator over the 20250428 snapshot; `judge_cfg` is filled in by
# the runner at launch time.
livereasonbench_eval_cfg = {
    'evaluator': {
        'type': GenericLLMEvaluator,
        'prompt_template': {
            'type': PromptTemplate,
            'template': {
                'begin': [
                    {
                        'role': 'SYSTEM',
                        'fallback_role': 'HUMAN',
                        'prompt': "You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    },
                ],
                'round': [
                    {'role': 'HUMAN', 'prompt': GRADER_TEMPLATE},
                ],
            },
        },
        'dataset_cfg': {
            'type': LiveReasonBenchDataset,
            'path': 'opencompass/LiveReasonBench',
            'reader_cfg': livereasonbench_reader_cfg,
            'version': 'livereasonbench-20250428',
        },
        'judge_cfg': {},
        'dict_postprocessor': {'type': livereasonbench_postprocess},
    },
}

livereasonbench_datasets = [
    {
        'abbr': 'LiveReasonBench-20250428',
        'type': LiveReasonBenchDataset,
        'path': 'opencompass/LiveReasonBench',
        'reader_cfg': livereasonbench_reader_cfg,
        'infer_cfg': livereasonbench_infer_cfg,
        'eval_cfg': livereasonbench_eval_cfg,
        'version': 'livereasonbench-20250428',
        'n': 1,
    },
]
build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess


# SimpleQA-style judge prompt: the grader LLM assigns A/B/C
# (CORRECT / INCORRECT / NOT_ATTEMPTED) to each prediction.
# NOTE(review): reproduced verbatim, including typos inherited from the
# upstream SimpleQA grader (missing quote before 115k, duplicated
# "Predicted answer 4"), so judge behaviour is unchanged.
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.


The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.


The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.


The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
- No statements in the answer contradict the gold target.


Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
    - Predicted answers "120k", "124k", and 115k" are all CORRECT.
    - Predicted answers "100k" and "113k" are INCORRECT.
    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".

Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.

Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {question}
Gold target: {answer}
Predicted answer: {prediction}
```
""".strip()

# Display name -> dataset version string understood by LiveStemBenchDataset.
livereasonbench_subsets = {
    'biology': 'livestembench_bio',
    'chemistry': 'livestembench_che',
    'physics': 'livestembench_phy',
}

livestembench_datasets = []

# Build one dataset config per STEM subset. The cfg dicts are deliberately
# constructed inside the loop so each entry owns independent copies.
for name, subset in livereasonbench_subsets.items():
    livereasonbench_reader_cfg = {
        'input_columns': ['question'],
        'output_column': 'answer',
    }

    # Zero-shot, non-CoT: a single Chinese HUMAN turn carrying the question.
    livereasonbench_infer_cfg = {
        'prompt_template': {
            'type': PromptTemplate,
            'template': {
                'round': [
                    {'role': 'HUMAN', 'prompt': '问题: {question}\n 请回答这道问题'},
                ],
            },
        },
        'retriever': {'type': ZeroRetriever},
        'inferencer': {'type': GenInferencer, 'max_out_len': 8192},
    }

    # GenericLLMEvaluator with the SimpleQA-style grader; `judge_cfg` is
    # filled in by the runner at launch time.
    livereasonbench_eval_cfg = {
        'evaluator': {
            'type': GenericLLMEvaluator,
            'prompt_template': {
                'type': PromptTemplate,
                'template': {
                    'begin': [
                        {
                            'role': 'SYSTEM',
                            'fallback_role': 'HUMAN',
                            'prompt': "You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                        },
                    ],
                    'round': [
                        {'role': 'HUMAN', 'prompt': GRADER_TEMPLATE},
                    ],
                },
            },
            'dataset_cfg': {
                'type': LiveStemBenchDataset,
                'path': 'opencompass/livestembench',
                'reader_cfg': livereasonbench_reader_cfg,
                'version': subset,
            },
            'judge_cfg': {},
            'dict_postprocessor': {'type': livereasonbench_postprocess},
        },
        'pred_role': 'BOT',
    }

    livestembench_datasets.append(
        {
            'abbr': f'LiveStemBench-{name}',
            'type': LiveStemBenchDataset,
            'path': 'opencompass/livestembench',
            'reader_cfg': livereasonbench_reader_cfg,
            'infer_cfg': livereasonbench_infer_cfg,
            'eval_cfg': livereasonbench_eval_cfg,
            'version': subset,
            'mode': 'singlescore',
        }
    )
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LiveStemBench 0-shot, non-CoT evaluation config (XML-tag variant).
# Builds one dataset entry per STEM subset (biology / chemistry / physics);
# answers are judged by an LLM grader (GenericLLMEvaluator) using the
# SimpleQA-style CORRECT/INCORRECT/NOT_ATTEMPTED rubric below. Predictions
# are first reduced to the content of a `<conclude>` XML tag before grading.
# NOTE(review): mmengine configs collect every module-level name, and sibling
# configs import `livestembench_datasets` via `read_base` — keep names stable.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess
from opencompass.utils import xml_tag_postprocessor


# Grading rubric shown to the judge model. `{question}`, `{answer}` and
# `{prediction}` are filled in by the evaluator at run time.
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.


The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
    - They fully contain the important information in the gold target.
    - They do not contain any information that contradicts the gold target.
    - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
    - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.


The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
    - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.


The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
    - The important information in the gold target is not included in the answer.
    - No statements in the answer contradict the gold target.


Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
    - Predicted answers "120k", "124k", and 115k" are all CORRECT.
    - Predicted answers "100k" and "113k" are INCORRECT.
    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".

Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.

Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {question}
Gold target: {answer}
Predicted answer: {prediction}
```
""".strip()

# Map from human-readable subset name to the dataset version string
# understood by LiveStemBenchDataset.
livereasonbench_subsets = {
    'biology': 'livestembench_bio',
    'chemistry': 'livestembench_che',
    'physics': 'livestembench_phy',
}

livestembench_datasets = []

for name, subset in livereasonbench_subsets.items():
    # Dataset exposes a single `question` input column; `answer` is the gold.
    livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')

    # Inference: zero-shot, direct-answer prompt (no CoT instruction),
    # generation capped at 8192 tokens.
    livereasonbench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt='问题: {question}\n 请回答这道问题'),
                ],
            )),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))

    # Evaluation: LLM-as-judge with the rubric above. The judge model itself
    # is supplied externally via `judge_cfg` (left empty here on purpose).
    livereasonbench_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt = GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=LiveStemBenchDataset,
                path='opencompass/livestembench',
                reader_cfg=livereasonbench_reader_cfg,
                version=subset,
            ),
            judge_cfg=dict(),
            # Converts the judge's A/B/C letter outputs into final scores.
            dict_postprocessor=dict(type=livereasonbench_postprocess),
            # Extract the model's final answer from a <conclude> XML tag
            # before it is shown to the judge.
            pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),

        ),
        pred_role='BOT',
    )

    livestembench_datasets.append(
        dict(
            abbr=f'LiveStemBench-{name}',
            type=LiveStemBenchDataset,
            path='opencompass/livestembench',
            reader_cfg=livereasonbench_reader_cfg,
            infer_cfg=livereasonbench_infer_cfg,
            eval_cfg=livereasonbench_eval_cfg,
            version=subset,
            mode='singlescore',
        )
    )
build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default entry point for the LiveStemBench configs: re-exports the dataset
# list from the pinned variant. `read_base` (mmengine) executes the target
# config and merges its module-level variables into this module.
from mmengine.config import read_base

with read_base():
    from .livestembench_gen_3e3c50 import livestembench_datasets # noqa: F401, F403
build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LiveStemBench 0-shot chain-of-thought evaluation config.
# Builds one dataset entry per STEM subset (biology / chemistry / physics);
# the model is asked to reason step by step and box its final answer, and a
# judge LLM (GenericLLMEvaluator) grades it with the SimpleQA-style
# CORRECT/INCORRECT/NOT_ATTEMPTED rubric below.
# NOTE(review): mmengine configs collect every module-level name, and
# `livestembench_gen.py` imports `livestembench_datasets` from this file via
# `read_base` — keep names stable.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess


# Grading rubric shown to the judge model. `{question}`, `{answer}` and
# `{prediction}` are filled in by the evaluator at run time.
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.


The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
    - They fully contain the important information in the gold target.
    - They do not contain any information that contradicts the gold target.
    - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
    - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.


The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
    - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.


The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
    - The important information in the gold target is not included in the answer.
    - No statements in the answer contradict the gold target.


Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
    - Predicted answers "120k", "124k", and 115k" are all CORRECT.
    - Predicted answers "100k" and "113k" are INCORRECT.
    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".

Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.

Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {question}
Gold target: {answer}
Predicted answer: {prediction}
```
""".strip()

# Map from human-readable subset name to the dataset version string
# understood by LiveStemBenchDataset.
livereasonbench_subsets = {
    'biology': 'livestembench_bio',
    'chemistry': 'livestembench_che',
    'physics': 'livestembench_phy',
}

livestembench_datasets = []

for name, subset in livereasonbench_subsets.items():
    # Dataset exposes a single `question` input column; `answer` is the gold.
    livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')

    # Inference: zero-shot CoT prompt asking for a \boxed{} final answer,
    # generation capped at 8192 tokens.
    livereasonbench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt='问题: {question}\n请逐步思考,并给出最终答案,答案放在 \\boxed{{}} 中。'),
                ],
            )),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192))

    # Evaluation: LLM-as-judge with the rubric above. The judge model itself
    # is supplied externally via `judge_cfg` (left empty here on purpose).
    livereasonbench_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt = GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=LiveStemBenchDataset,
                path='opencompass/livestembench',
                reader_cfg=livereasonbench_reader_cfg,
                version=subset,
            ),
            judge_cfg=dict(),
            # Converts the judge's A/B/C letter outputs into final scores.
            dict_postprocessor=dict(type=livereasonbench_postprocess),
        ),
        pred_role='BOT',
    )

    livestembench_datasets.append(
        dict(
            abbr=f'LiveStemBench-{name}',
            type=LiveStemBenchDataset,
            path='opencompass/livestembench',
            reader_cfg=livereasonbench_reader_cfg,
            infer_cfg=livereasonbench_infer_cfg,
            eval_cfg=livereasonbench_eval_cfg,
            version=subset,
            mode='singlescore',
        )
    )
build/lib/opencompass/configs/datasets/llm_compression/README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LLM Compression
2
+
3
+ ## Introduction
4
+
5
+ The following introduction comes from the abstract of [Compression Represents Intelligence Linearly](https://arxiv.org/abs/2404.09937):
6
+
7
+ >There is a belief that learning to compress well will lead to intelligence. Recently, language modeling has been shown to be equivalent to compression, which offers a compelling rationale for the success of large language models (LLMs): the development of more advanced language models is essentially enhancing compression which facilitates intelligence. ...our findings suggest that compression efficiency, as an unsupervised metric derived from raw text corpora, serves as a reliable evaluation measure that is linearly associated with the model capabilities. We open-source our compression datasets as well as our data collection pipelines to facilitate future researchers to assess compression properly.
8
+
9
+
10
+ ## Official Links
11
+
12
+ - Paper: [Compression Represents Intelligence Linearly](https://arxiv.org/abs/2404.09937)
13
+ - GitHub Repository: [llm-compression-intelligence](https://github.com/hkust-nlp/llm-compression-intelligence)
14
+
15
+
16
+ ## Overview and Usage
17
+
18
+ ### Dataset
19
+ The dataset, which consists of three external corpora, can be downloaded using the following python script:
20
+
21
+ ```python
22
+ import os.path as osp
23
+ from datasets import load_dataset
24
+
25
+ data_path = "data/llm-compression"
26
+
27
+ subset_mapping = {
28
+ 'arxiv_math': ['arxiv_math'],
29
+ 'commoncraw': ['cc'],
30
+ 'python': ['python'],
31
+ }
32
+
33
+ for key, value in subset_mapping.items():
34
+ llmc_dataset = load_dataset(r"hkust-nlp/llm-compression", name=value)
35
+ llmc_dataset["test"].to_json(osp.join(data_path, f"{key}.jsonl"))
36
+ ```
37
+
38
+ Note: Refer to the original [repository](https://github.com/hkust-nlp/llm-compression-intelligence) for more details on data collection and design.
39
+
40
+
41
+ ### Inference
42
+
43
+ The inference stage (`SWCELossInferencer`) consists of the following key steps:
44
+
45
+ 1. For each candidate model, we obtain the encodings of each sample of the dataset using its tokenizer.
46
+ 2. Concatenate the encodings of all samples into a single array and construct a PyTorch Dataset. Each item of `__getitem__` is a chunk of the array based on a sliding window. To reproduce results from the original paper, set `block_size=1900` and `stride=512`.
47
+ 3. For each batch, calculate the cross entropy loss based on model logits and targets. The losses within each batch is reduced to a single loss by summation.
48
+ 4. Output the losses and `total_chr_num` to `BPCEvaluator` for evaluation.
49
+
50
+
51
+ ### Evaluation
52
+
53
+ `BPCEvaluator`: Using the total loss for each batch and the total number of characters in the original dataset from the inference stage, calculate the Bits per Character (BPC) metric for each model:
54
+
55
+ $$ BPC = \frac{TotalCrossEntropyLoss}{TotalCharacterNumber*log(2)} $$
56
+
57
+
58
+ ### Summarization
59
+
60
+
61
+
62
+ ### Config Files
63
+
64
+ 1. Dataset config: `configs/datasets/llm-compression.py`
65
+ 2. Evaluation config: `examples/eval_llm_compression.py`
66
+
67
+ ## Evaluation Results
68
+ ```
69
+ metric version model commoncraw python arxiv_math average
70
+ 0 bpc af04af qwen1.5-32b-hf 0.5910 0.2584 0.4080 0.4191
71
+ 1 bpc af04af qwen1.5-14b-hf 0.6459 0.2766 0.4310 0.4512
72
+ 2 bpc af04af qwen-14b-hf 0.6197 0.2849 0.4498 0.4515
73
+ 3 bpc af04af llama-30b-hf 0.5773 0.3212 0.4562 0.4516
74
+ 4 bpc af04af llama-2-13b-hf 0.5807 0.3336 0.4752 0.4632
75
+ 5 bpc af04af qwen1.5-7b-hf 0.6658 0.2935 0.4500 0.4698
76
+ 6 bpc af04af qwen-7b-hf 0.6453 0.3088 0.4830 0.4790
77
+ 7 bpc af04af llama-13b-hf 0.6083 0.3555 0.4865 0.4834
78
+ 8 bpc af04af llama-2-7b-hf 0.6117 0.3536 0.4995 0.4883
79
+ 9 bpc af04af llama-7b-hf 0.6285 0.3794 0.5096 0.5058
80
+ 10 bpc af04af qwen1.5-1.8b-hf 0.7448 0.4029 0.5625 0.5701
81
+ 11 bpc af04af qwen-1.8b-hf 0.7542 0.4175 0.5842 0.5853
82
+ 12 bpc af04af qwen1.5-0.5b-hf 0.8102 0.4520 0.6181 0.6268
83
+ ```
84
+
85
+
86
+ ## FAQ
87
+
88
+ ### I am getting this warning during inference, should I truncate long samples to `max_seq_len` to avoid further errors?
89
+ ```
90
+ Token indices sequence length is longer than the specified maximum sequence length for this model. Running this sequence through the model will result in indexing errors
91
+ ```
92
+ >A: This warning comes from the tokenizer indicating that the input sequence length exceeds the model's input length, but it does not affect the operation of the tokenizer. For loss calculation, as long as we set a `block_size` of the sliding window less than `max_seq_len`, we can safely ignore this warning.
93
+
94
+
95
+ ## Reference
96
+ ```
97
+ @misc{huang2024compression,
98
+ title={Compression Represents Intelligence Linearly},
99
+ author={Yuzhen Huang and Jinghan Zhang and Zifei Shan and Junxian He},
100
+ year={2024},
101
+ eprint={2404.09937},
102
+ archivePrefix={arXiv},
103
+ primaryClass={cs.CL}
104
+ }
105
+ ```
build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LLM-Compression dataset config: measures language-modeling compression
# (Bits per Character) on three raw-text corpora, following
# "Compression Represents Intelligence Linearly" (arXiv:2404.09937).
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import SWCELossInferencer
from opencompass.openicl.icl_evaluator import BPCEvaluator
from opencompass.datasets import LLMCompressionDataset


# The three corpora for llm_compression used in the original paper
# See configs/datasets/llm_compression/README.md for more details
subset_mapping = {
    'arxiv_math': ['arxiv_math'],
    'commoncraw': ['cc'],
    'python': ['python'],
}


# Build LLM Compression datasets
llm_compression_datasets = []
for _name in subset_mapping.keys():
    llm_cmp_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            # Raw text is fed through verbatim — no instruction wrapper.
            template='{content}',
        ),
        # No in-context example, using ZeroRetriever
        retriever=dict(type=ZeroRetriever),
        # Calculates cross entropy loss for each batch based on a sliding context window
        # Setting block_size=1900 and stride=512 according to the original paper
        inferencer=dict(type=SWCELossInferencer, block_size=1900, stride=512),
    )

    # Calculates Bits per Character (BPC) based on the CE loss from the inference stage
    llm_cmp_eval_cfg = dict(evaluator=dict(type=BPCEvaluator))

    llm_compression_datasets.append(
        dict(
            abbr=f'llm_compression-{_name}',
            type=LLMCompressionDataset,
            path='./data/llm-compression',
            name=_name,
            samples=None, # Set small samples for testing
            reader_cfg=dict(
                input_columns=['content'],
                # No gold label — the metric is computed from model loss only.
                output_column=None,
            ),
            infer_cfg=llm_cmp_infer_cfg,
            eval_cfg=llm_cmp_eval_cfg,
        ))

# Drop the loop variable so it is not picked up as a module-level config entry.
del _name
build/lib/opencompass/configs/datasets/longbench/longbench.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Aggregate config for LongBench: imports every subtask's dataset list via
# `read_base` and flattens them into a single `longbench_datasets` list.
from mmengine.config import read_base

with read_base():
    from .longbench2wikimqa.longbench_2wikimqa_gen import LongBench_2wikimqa_datasets
    from .longbenchhotpotqa.longbench_hotpotqa_gen import LongBench_hotpotqa_datasets
    from .longbenchmusique.longbench_musique_gen import LongBench_musique_datasets
    from .longbenchmultifieldqa_en.longbench_multifieldqa_en_gen import LongBench_multifieldqa_en_datasets
    from .longbenchmultifieldqa_zh.longbench_multifieldqa_zh_gen import LongBench_multifieldqa_zh_datasets
    from .longbenchnarrativeqa.longbench_narrativeqa_gen import LongBench_narrativeqa_datasets
    from .longbenchqasper.longbench_qasper_gen import LongBench_qasper_datasets
    from .longbenchtriviaqa.longbench_triviaqa_gen import LongBench_triviaqa_datasets
    from .longbenchgov_report.longbench_gov_report_gen import LongBench_gov_report_datasets
    from .longbenchqmsum.longbench_qmsum_gen import LongBench_qmsum_datasets
    from .longbenchvcsum.longbench_vcsum_gen import LongBench_vcsum_datasets
    from .longbenchdureader.longbench_dureader_gen import LongBench_dureader_datasets
    from .longbenchlcc.longbench_lcc_gen import LongBench_lcc_datasets
    from .longbenchrepobench.longbench_repobench_gen import LongBench_repobench_datasets
    from .longbenchpassage_retrieval_en.longbench_passage_retrieval_en_gen import LongBench_passage_retrieval_en_datasets
    from .longbenchpassage_retrieval_zh.longbench_passage_retrieval_zh_gen import LongBench_passage_retrieval_zh_datasets
    from .longbenchpassage_count.longbench_passage_count_gen import LongBench_passage_count_datasets
    from .longbenchtrec.longbench_trec_gen import LongBench_trec_datasets
    from .longbenchlsht.longbench_lsht_gen import LongBench_lsht_datasets
    from .longbenchmulti_news.longbench_multi_news_gen import LongBench_multi_news_datasets
    from .longbenchsamsum.longbench_samsum_gen import LongBench_samsum_datasets

# Concatenate every imported `*_datasets` list found in this module's scope.
longbench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default entry point for LongBench v2: re-exports the dataset list from the
# pinned variant config via mmengine's `read_base`.
from mmengine.config import read_base

with read_base():
    from .longbenchv2_gen_75fbba import LongBenchv2_datasets
build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LongBench v2 config: long-context multiple-choice QA (A/B/C/D) with a
# step-by-step prompt; the predicted option letter is extracted from the
# generation and scored by LongBenchv2Evaluator.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LongBenchv2Dataset, LongBenchv2Evaluator
from opencompass.utils.text_postprocessors import first_option_postprocess

# `difficulty` and `length` are passed through as metadata columns alongside
# the context, question and the four answer choices.
LongBenchv2_reader_cfg = dict(
    input_columns=['context', 'question', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'difficulty', 'length'],
    output_column='answer',
)

LongBenchv2_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Please read the following text and answer the questions below.\n <text> \n {context} \n </text> \n \n What is the correct answer to this question: {question} \n \n Choices: \n (A) {choice_A} \n (B) {choice_B} \n (C) {choice_C} \n (D) {choice_D} \n Let’s think step by step. Based on the above, what is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

LongBenchv2_eval_cfg = dict(
    evaluator=dict(type=LongBenchv2Evaluator),
    pred_role='BOT',
    # Reduce the free-form generation to the first A/B/C/D option mentioned.
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
)

LongBenchv2_datasets = [
    dict(
        type=LongBenchv2Dataset,
        abbr='LongBenchv2',
        path='opencompass/longbenchv2',
        reader_cfg=LongBenchv2_reader_cfg,
        infer_cfg=LongBenchv2_infer_cfg,
        eval_cfg=LongBenchv2_eval_cfg,
    )
]
build/lib/opencompass/configs/datasets/lveval/lveval.md ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LVEval
2
+ ## Introduction
3
+ The following introduction comes from the introduction in [LVEval](https://github.com/infinigence/LVEval)
4
+
5
+ ```
6
+ LV-Eval是一个具备5个长度等级(16k、32k、64k、128k和256k)、最大文本测试长度达到256k的长文本评测基准。LV-Eval的平均文本长度达到102,380字,最小/最大文本长度为11,896/387,406字。LV-Eval主要有两类评测任务——单跳QA和多跳QA,共包含11个涵盖中英文的评测数据子集。LV-Eval设计时引入3个关键技术:干扰事实插入(Confusing Facts Insertion,CFI)提高挑战性,关键词和短语替换(Keyword and Phrase Replacement,KPR)减少信息泄漏,以及基于关键词召回的评测指标(Answer Keywords,AK,指代结合答案关键词和字词黑名单的评价指标)提高评测数值客观性。我们希望LV-Eval为未来长文本大语言模型的研究发展提供有价值的性能参考。
7
+ LV-Eval is a challenging long-context benchmark with five length levels (16k, 32k, 64k, 128k, and 256k) reaching up to 256k words. The average number of words is 102,380, and the Min/Max number of words is 11,896/387,406. LV-Eval features two main tasks, single-hop QA and multi-hop QA, comprising 11 bilingual datasets. The design of LV-Eval has incorporated three key techniques, namely confusing facts insertion (CFI), keyword and phrase replacement (KPR), and keyword-recall-based metrics (AK, short for metrics with Answer Keywords and word blacklist) design, which jointly provide a challenging, mitigated-knowledge-leakage, and more accurate evaluation of the long-context capability of LLMs. We anticipate that LV-Eval will serve as a valuable resource for supporting future research on long-context LLMs.
8
+ ```
9
+
10
+ ## Official link
11
+
12
+ ### Paper
13
+
14
+ [_LV_-Eval: A Balanced Long-Context Benchmark with 5 Length Levels Up to 256K](https://arxiv.org/abs/2402.05136)
15
+
16
+ ### Repository
17
+
18
+ [LVEval](https://github.com/infinigence/LVEval)
19
+
20
+ ## Use cases
21
+
22
+ In evaluation scripts, add LVEval dataset as other datasets by using
23
+ ```
24
+ from .datasets.lveval.lveval import LVEval_datasets as datasets
25
+ ```
26
+
27
+ ## Examples
28
+ Input example I (from lic_mixup datasets):
29
+ ```
30
+ 请根据下面给定的文章回答问题,问题和答案只与其中一篇文章有关。
31
+
32
+ 文章:......文章 9\n\n标题:腐质酸\n内容:腐植酸是自然界中广泛存在的大分子有机物质,广泛应用于农林牧、石油、化工、建材、医药卫生、环保等各个领域。横跨几十个行业。特别是眼下提倡生态农业建设、无公害农业生产、绿色食品、无污染环保产品等,更使\"腐植酸\"备受推崇,事实证明,人类的生活和生存离不开腐植酸,它的确是一个发展中的有希望的朝阳产业,属于一个新型的特殊行业......
33
+
34
+ 请现在基于上述文章回答下面的问题,问题和答案只与其中一篇文章有关。
35
+
36
+ 问题:中国的文学受到印度哪些方面的影响?
37
+ 回答:
38
+ ```
39
+ Output example I (from chatglm3-6b-32k):
40
+ ```
41
+ 中国文学自印度文学大量吸收营养,在佛教东流之后,从语汇到修辞,从题材到体裁,即便审美取向也深受佛教与印度文学的感染。
42
+ ```
43
+ Input example II (from factrecall_zh datasets):
44
+ ```
45
+ 请基于给定的文章回答下述问题。
46
+
47
+ 文章:......庚子年间,贝多芬,乃一德裔美籍学士,研究于物理理学。彼其良图,探求相对论、量子力学,尤有大进。质能等价公式 E=mc²,千古独步,声名于当世。诺贝尔物理学奖、以资尊荣,兹矣荣耀之大典。论其学术,涉时空能量,影响深远,以其义非常人,广为当世所知,声名播于天下,实乃现代物理学之奠基者......
48
+
49
+ 现在请基于上述文章回答下面的问题。
50
+
51
+ 问题:被世人广泛推崇为现代物理学奠基人的科学家叫什么名字?
52
+ 回答:
53
+ ```
54
+ Output example II (from chatglm3-6b-32k):
55
+ ```
56
+ 贝多芬
57
+ ```
58
+ ## Evaluation results
59
+
60
+ ```
61
+ dataset version metric mode bluelm-7b-chat-32k-hf
62
+ ----------------------------------------- --------- ------------- ------ -----------------------
63
+ ---------------------------------------- - - - -
64
+ --------- LVEval All --------- - - - -
65
+ ---------------------------------------- - - - -
66
+ LVEval_qa - naive_average gen 12.00
67
+ ---------------------------------------- - - - -
68
+ --------- LVEval Tasks All --------- - - - -
69
+ ---------------------------------------- - - - -
70
+ LVEval_single_hop_qa - naive_average gen 15.11
71
+ LVEval_single_hop_cqa - naive_average gen 9.21
72
+ LVEval_multi_hop_qa - naive_average gen 6.99
73
+ LVEval_multi_hop_cqa - naive_average gen 9.90
74
+ LVEval_factrecall_cqa - naive_average gen 21.28
75
+ ---------------------------------------- - - - -
76
+ --------- LVEval Datasets All --------- - - - -
77
+ ---------------------------------------- - - - -
78
+ LVEval_loogle_SD_mixup - naive_average gen 12.81
79
+ LVEval_cmrc_mixup - naive_average gen 17.41
80
+ LVEval_multifieldqa_en_mixup - naive_average gen 7.10
81
+ LVEval_multifieldqa_zh_mixup - naive_average gen 11.31
82
+ LVEval_dureader_mixup - naive_average gen 13.19
83
+ LVEval_loogle_CR_mixup - naive_average gen 5.17
84
+ LVEval_loogle_MIR_mixup - naive_average gen 2.60
85
+ LVEval_hotpotwikiqa_mixup - naive_average gen 10.20
86
+ LVEval_lic_mixup - naive_average gen 9.60
87
+ LVEval_factrecall_en - naive_average gen 23.67
88
+ LVEval_factrecall_zh - naive_average gen 18.90
89
+ ---------------------------------------- - - - -
90
+ --------- LVEval Single_Hop QA --------- - - - -
91
+ ---------------------------------------- - - - -
92
+ LVEval_loogle_SD_mixup_16k 83bc25 LVEval_f1 gen 35.05
93
+ LVEval_loogle_SD_mixup_32k 83bc25 LVEval_f1 gen 13.37
94
+ LVEval_loogle_SD_mixup_64k 83bc25 LVEval_f1 gen 6.32
95
+ LVEval_loogle_SD_mixup_128k 83bc25 LVEval_f1 gen 5.28
96
+ LVEval_loogle_SD_mixup_256k 83bc25 LVEval_f1 gen 4.00
97
+ ---------------------------------------- - - - -
98
+ LVEval_cmrc_mixup_16k 8bac4e LVEval_f1 gen 46.45
99
+ LVEval_cmrc_mixup_32k 8bac4e LVEval_f1 gen 19.41
100
+ LVEval_cmrc_mixup_64k 8bac4e LVEval_f1 gen 11.10
101
+ LVEval_cmrc_mixup_128k 8bac4e LVEval_f1 gen 5.89
102
+ LVEval_cmrc_mixup_256k 8bac4e LVEval_f1 gen 4.22
103
+ ---------------------------------------- - - - -
104
+ --------- LVEval Single_Hop CQA --------- - - - -
105
+ ---------------------------------------- - - - -
106
+ LVEval_multifieldqa_en_mixup_16k 83bc25 LVEval_f1 gen 12.28
107
+ LVEval_multifieldqa_en_mixup_32k 83bc25 LVEval_f1 gen 4.64
108
+ LVEval_multifieldqa_en_mixup_64k 83bc25 LVEval_f1 gen 8.30
109
+ LVEval_multifieldqa_en_mixup_128k 83bc25 LVEval_f1 gen 5.63
110
+ LVEval_multifieldqa_en_mixup_256k 83bc25 LVEval_f1 gen 4.64
111
+ ---------------------------------------- - - - -
112
+ LVEval_multifieldqa_zh_mixup_16k ac4a0d LVEval_f1 gen 22.30
113
+ LVEval_multifieldqa_zh_mixup_32k ac4a0d LVEval_f1 gen 17.46
114
+ LVEval_multifieldqa_zh_mixup_64k ac4a0d LVEval_f1 gen 6.27
115
+ LVEval_multifieldqa_zh_mixup_128k ac4a0d LVEval_f1 gen 5.84
116
+ LVEval_multifieldqa_zh_mixup_256k ac4a0d LVEval_f1 gen 4.71
117
+ ---------------------------------------- - - - -
118
+ --------- LVEval Multi_Hop QA --------- - - - -
119
+ ---------------------------------------- - - - -
120
+ LVEval_dureader_mixup_16k 8bac4e LVEval_rouge gen 18.04
121
+ LVEval_dureader_mixup_32k 8bac4e LVEval_rouge gen 18.33
122
+ LVEval_dureader_mixup_64k 8bac4e LVEval_rouge gen 12.56
123
+ LVEval_dureader_mixup_128k 8bac4e LVEval_rouge gen 10.33
124
+ LVEval_dureader_mixup_256k 8bac4e LVEval_rouge gen 6.69
125
+ ---------------------------------------- - - - -
126
+ LVEval_loogle_CR_mixup_16k 83bc25 LVEval_f1 gen 9.35
127
+ LVEval_loogle_CR_mixup_32k 83bc25 LVEval_f1 gen 7.42
128
+ LVEval_loogle_CR_mixup_64k 83bc25 LVEval_f1 gen 3.18
129
+ LVEval_loogle_CR_mixup_128k 83bc25 LVEval_f1 gen 2.65
130
+ LVEval_loogle_CR_mixup_256k 83bc25 LVEval_f1 gen 3.27
131
+ ---------------------------------------- - - - -
132
+ LVEval_loogle_MIR_mixup_16k 83bc25 LVEval_f1 gen 4.50
133
+ LVEval_loogle_MIR_mixup_32k 83bc25 LVEval_f1 gen 3.19
134
+ LVEval_loogle_MIR_mixup_64k 83bc25 LVEval_f1 gen 2.34
135
+ LVEval_loogle_MIR_mixup_128k 83bc25 LVEval_f1 gen 1.76
136
+ LVEval_loogle_MIR_mixup_256k 83bc25 LVEval_f1 gen 1.20
137
+ ---------------------------------------- - - - -
138
+ --------- LVEval Multi_Hop CQA --------- - - - -
139
+ ---------------------------------------- - - - -
140
+ LVEval_hotpotwikiqa_mixup_16k e3c368 LVEval_f1 gen 19.80
141
+ LVEval_hotpotwikiqa_mixup_32k e3c368 LVEval_f1 gen 12.59
142
+ LVEval_hotpotwikiqa_mixup_64k e3c368 LVEval_f1 gen 7.33
143
+ LVEval_hotpotwikiqa_mixup_128k e3c368 LVEval_f1 gen 7.85
144
+ LVEval_hotpotwikiqa_mixup_256k e3c368 LVEval_f1 gen 3.42
145
+ ---------------------------------------- - - - -
146
+ LVEval_lic_mixup_16k fdd540 LVEval_f1 gen 21.36
147
+ LVEval_lic_mixup_32k fdd540 LVEval_f1 gen 12.92
148
+ LVEval_lic_mixup_64k fdd540 LVEval_f1 gen 4.62
149
+ LVEval_lic_mixup_128k fdd540 LVEval_f1 gen 4.25
150
+ LVEval_lic_mixup_256k fdd540 LVEval_f1 gen 4.85
151
+ ---------------------------------------- - - - -
152
+ --------- LVEval Factrecall CQA --------- - - - -
153
+ ---------------------------------------- - - - -
154
+ LVEval_factrecall_en_16k fba966 f1 gen 58.33
155
+ LVEval_factrecall_en_32k fba966 f1 gen 32.17
156
+ LVEval_factrecall_en_64k fba966 f1 gen 15.33
157
+ LVEval_factrecall_en_128k fba966 f1 gen 8.50
158
+ LVEval_factrecall_en_256k fba966 f1 gen 4.00
159
+ ---------------------------------------- - - - -
160
+ LVEval_factrecall_zh_16k ef3320 f1 gen 20.00
161
+ LVEval_factrecall_zh_32k ef3320 f1 gen 38.00
162
+ LVEval_factrecall_zh_64k ef3320 f1 gen 20.50
163
+ LVEval_factrecall_zh_128k ef3320 f1 gen 11.00
164
+ LVEval_factrecall_zh_256k ef3320 f1 gen 5.00
165
+ ```
build/lib/opencompass/configs/datasets/lveval/lveval.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from mmengine.config import read_base

# Aggregate every LV-Eval sub-benchmark (single-hop QA, multi-hop QA and
# fact-recall variants, in Chinese and English) into one flat dataset list.
with read_base():
    from .lvevalcmrc_mixup.lveval_cmrc_mixup_gen import LVEval_cmrc_mixup_datasets
    from .lvevaldureader_mixup.lveval_dureader_mixup_gen import LVEval_dureader_mixup_datasets
    from .lvevalfactrecall_en.lveval_factrecall_en_gen import LVEval_factrecall_en_datasets
    from .lvevalfactrecall_zh.lveval_factrecall_zh_gen import LVEval_factrecall_zh_datasets
    from .lvevalhotpotwikiqa_mixup.lveval_hotpotwikiqa_mixup_gen import LVEval_hotpotwikiqa_mixup_datasets
    from .lvevallic_mixup.lveval_lic_mixup_gen import LVEval_lic_mixup_datasets
    from .lvevalloogle_CR_mixup.lveval_loogle_CR_mixup_gen import LVEval_loogle_CR_mixup_datasets
    from .lvevalloogle_MIR_mixup.lveval_loogle_MIR_mixup_gen import LVEval_loogle_MIR_mixup_datasets
    from .lvevalloogle_SD_mixup.lveval_loogle_SD_mixup_gen import LVEval_loogle_SD_mixup_datasets
    from .lvevalmultifieldqa_en_mixup.lveval_multifieldqa_en_mixup_gen import LVEval_multifieldqa_en_mixup_datasets
    from .lvevalmultifieldqa_zh_mixup.lveval_multifieldqa_zh_mixup_gen import LVEval_multifieldqa_zh_mixup_datasets

# Concatenate every `*_datasets` list imported above.
LVEval_datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
from mmengine.config import read_base

with read_base():
    # Default entry point: re-export the `be6318` variant of the
    # mastermath2024v1 dataset config as `mastermath2024v1_datasets`.
    from .mastermath2024v1_gen_be6318 import mastermath2024v1_datasets
build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MastermathDatasetv1, MastermathDatasetv1Evaluator
from opencompass.utils import first_option_postprocess

# Four-option multiple-choice prompt (Chinese). The model is instructed to
# answer in the form "正确答案是 (X)" so the option letter can be extracted.
_PROMPT = ('{question}\n选项:\n'
           '(A){A}\n'
           '(B){B}\n'
           '(C){C}\n'
           '(D){D}\n'
           '你的回答格式如下: "正确答案是 (在这里插入你的答案)"')

mastermath2024v1_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer',
)

# Zero-shot generation with a single HUMAN turn.
mastermath2024v1_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt=_PROMPT)]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Reduce the generation to the first option letter (A-D) before scoring.
mastermath2024v1_eval_cfg = dict(
    evaluator=dict(type=MastermathDatasetv1Evaluator),
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

mastermath2024v1_datasets = [
    dict(
        abbr='Mastermath2024v1',
        type=MastermathDatasetv1,
        path='./data/mastermath2024v1/',
        name='kaoyan_math_1_mcq_Sheet1.csv',
        reader_cfg=mastermath2024v1_reader_cfg,
        infer_cfg=mastermath2024v1_infer_cfg,
        eval_cfg=mastermath2024v1_eval_cfg,
    )
]
build/lib/opencompass/configs/datasets/matbench/matbench_gen.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
from mmengine.config import read_base

with read_base():
    # Default entry point: use the LLM-judge variant of the matbench config.
    # The regex-judge variant is kept below as a commented-out alternative.
    # from .matbench_gen_regex_judge import matbench_datasets # noqa: F401, F403
    from .matbench_llm_judge_gen_0e9276 import matbench_datasets # noqa: F401, F403
build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets.matbench.matbench import MatbenchDataset, MatbenchEvaluator_regression, MatbenchEvaluator_classification

matbench_reader_cfg = dict(
    input_columns=['problem'], output_column='answer')

# Answer format expected by each task: yes/no classification vs. a single
# float regression target.
_CLASSIFICATION_TASKS = ('matbench_expt_is_metal', 'matbench_glass')
_REGRESSION_TASKS = ('matbench_steels', 'matbench_expt_gap')

matbench_tasks = ['matbench_steels', 'matbench_expt_gap', 'matbench_expt_is_metal', 'matbench_glass']

matbench_datasets = []

for task in matbench_tasks:
    # The prompts are plain format templates ('{problem}' is filled in by the
    # prompt template machinery); the original f-string prefixes only forced
    # confusing '{{problem}}' double-brace escaping for an identical result.
    if task in _CLASSIFICATION_TASKS:
        _prompt = '{problem} Please present your answer by yes or no, do not output anything else.'
        _evaluator_cls = MatbenchEvaluator_classification
    elif task in _REGRESSION_TASKS:
        _prompt = '{problem} Please present your answer by one float number, do not output anything else.'
        _evaluator_cls = MatbenchEvaluator_regression
    else:
        # Previously an unrecognized task silently reused the cfgs built for
        # the preceding task; fail loudly instead.
        raise ValueError(f'Unknown matbench task: {task!r}')

    # Zero-shot generation with a single HUMAN turn.
    matbench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[dict(role='HUMAN', prompt=_prompt)])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))

    matbench_eval_cfg = dict(
        evaluator=dict(type=_evaluator_cls),
        pred_role='BOT')

    matbench_datasets.append(
        dict(
            type=MatbenchDataset,
            path='opencompass/Matbench',  # was an f-string with no placeholders
            task=task,
            abbr=task,
            reader_cfg=matbench_reader_cfg,
            infer_cfg=matbench_infer_cfg,
            eval_cfg=matbench_eval_cfg))