Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- build/lib/opencompass/configs/datasets/korbench/korbench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py +60 -0
- build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py +116 -0
- build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py +54 -0
- build/lib/opencompass/configs/datasets/korbench/readme.md +71 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py +166 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py +164 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py +164 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py +163 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py +165 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py +165 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py +132 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py +164 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py +168 -0
- build/lib/opencompass/configs/datasets/livemathbench/README.md +74 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py +49 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py +45 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py +49 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py +45 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py +120 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py +96 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py +44 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py +44 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py +97 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py +45 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py +45 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py +44 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py +43 -0
- build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py +136 -0
- build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py +142 -0
- build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py +142 -0
- build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py +152 -0
- build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_xml_gen_2e6d10.py +155 -0
- build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py +152 -0
- build/lib/opencompass/configs/datasets/llm_compression/README.md +105 -0
- build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py +50 -0
- build/lib/opencompass/configs/datasets/longbench/longbench.py +26 -0
- build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py +43 -0
- build/lib/opencompass/configs/datasets/lveval/lveval.md +165 -0
- build/lib/opencompass/configs/datasets/lveval/lveval.py +38 -0
- build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py +4 -0
- build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py +36 -0
- build/lib/opencompass/configs/datasets/matbench/matbench_gen.py +5 -0
- build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py +55 -0
build/lib/opencompass/configs/datasets/korbench/korbench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .korbench_single_0_shot_gen import korbench_0shot_single_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
|
| 2 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 3 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 4 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 5 |
+
|
| 6 |
+
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
|
| 7 |
+
|
| 8 |
+
korbench_0shot_single_datasets = []
|
| 9 |
+
|
| 10 |
+
for category in categories:
|
| 11 |
+
# Prompt template
|
| 12 |
+
prompt_template = dict(
|
| 13 |
+
type=PromptTemplate,
|
| 14 |
+
template=dict(
|
| 15 |
+
begin=[
|
| 16 |
+
dict(
|
| 17 |
+
role='HUMAN',
|
| 18 |
+
prompt=''
|
| 19 |
+
)
|
| 20 |
+
],
|
| 21 |
+
round=[
|
| 22 |
+
dict(
|
| 23 |
+
role='HUMAN',
|
| 24 |
+
prompt='{prompt}' # f-string
|
| 25 |
+
)
|
| 26 |
+
]
|
| 27 |
+
)
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# Reader configuration
|
| 31 |
+
reader_cfg = dict(
|
| 32 |
+
input_columns=['prompt'],
|
| 33 |
+
output_column='answer',
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# Inference configuration
|
| 37 |
+
infer_cfg = dict(
|
| 38 |
+
prompt_template=prompt_template,
|
| 39 |
+
retriever=dict(type=ZeroRetriever),
|
| 40 |
+
inferencer=dict(type=GenInferencer),
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
# Evaluation configuration
|
| 44 |
+
eval_cfg = dict(
|
| 45 |
+
evaluator=dict(type=korbenchEvaluator),
|
| 46 |
+
pred_role='BOT',
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
korbench_dataset = dict(
|
| 50 |
+
type=korbenchDataset,
|
| 51 |
+
abbr=f'korbench_{category}',
|
| 52 |
+
path='opencompass/korbench',
|
| 53 |
+
prompt_mode='0_shot',
|
| 54 |
+
category=category,
|
| 55 |
+
reader_cfg=reader_cfg,
|
| 56 |
+
infer_cfg=infer_cfg,
|
| 57 |
+
eval_cfg=eval_cfg,
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
korbench_0shot_single_datasets.append(korbench_dataset)
|
build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
|
| 2 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 3 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 4 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import generic_llmjudge_postprocess
|
| 7 |
+
|
| 8 |
+
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
|
| 9 |
+
|
| 10 |
+
GRADER_TEMPLATE = """
|
| 11 |
+
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
| 12 |
+
|
| 13 |
+
Here are some evaluation criteria:
|
| 14 |
+
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
| 15 |
+
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
| 16 |
+
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
| 17 |
+
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
| 18 |
+
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
| 19 |
+
|
| 20 |
+
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
| 21 |
+
A: CORRECT
|
| 22 |
+
B: INCORRECT
|
| 23 |
+
Just return the letters "A" or "B", with no text around it.
|
| 24 |
+
|
| 25 |
+
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
<Original Question Begin>: \n{prompt}\n<Original Question End>\n\n
|
| 29 |
+
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
| 30 |
+
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
| 31 |
+
|
| 32 |
+
Judging the correctness of candidates' answers:
|
| 33 |
+
""".strip()
|
| 34 |
+
|
| 35 |
+
korbench_0shot_single_datasets = []
|
| 36 |
+
|
| 37 |
+
for category in categories:
|
| 38 |
+
# Prompt template
|
| 39 |
+
prompt_template = dict(
|
| 40 |
+
type=PromptTemplate,
|
| 41 |
+
template=dict(
|
| 42 |
+
begin=[
|
| 43 |
+
dict(
|
| 44 |
+
role='HUMAN',
|
| 45 |
+
prompt=''
|
| 46 |
+
)
|
| 47 |
+
],
|
| 48 |
+
round=[
|
| 49 |
+
dict(
|
| 50 |
+
role='HUMAN',
|
| 51 |
+
prompt='{prompt}' # f-string
|
| 52 |
+
)
|
| 53 |
+
]
|
| 54 |
+
)
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# Reader configuration
|
| 58 |
+
reader_cfg = dict(
|
| 59 |
+
input_columns=['prompt'],
|
| 60 |
+
output_column='answer',
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
# Inference configuration
|
| 64 |
+
infer_cfg = dict(
|
| 65 |
+
prompt_template=prompt_template,
|
| 66 |
+
retriever=dict(type=ZeroRetriever),
|
| 67 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024),
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
# Evaluation configuration
|
| 71 |
+
eval_cfg = dict(
|
| 72 |
+
evaluator=dict(
|
| 73 |
+
type=GenericLLMEvaluator,
|
| 74 |
+
prompt_template=dict(
|
| 75 |
+
type=PromptTemplate,
|
| 76 |
+
template=dict(
|
| 77 |
+
begin=[
|
| 78 |
+
dict(
|
| 79 |
+
role='SYSTEM',
|
| 80 |
+
fallback_role='HUMAN',
|
| 81 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 82 |
+
],
|
| 83 |
+
round=[
|
| 84 |
+
dict(
|
| 85 |
+
role='HUMAN',
|
| 86 |
+
prompt=GRADER_TEMPLATE
|
| 87 |
+
),
|
| 88 |
+
]),
|
| 89 |
+
),
|
| 90 |
+
dataset_cfg=dict(
|
| 91 |
+
type=korbenchDataset,
|
| 92 |
+
path='opencompass/korbench',
|
| 93 |
+
prompt_mode='0_shot',
|
| 94 |
+
category=category,
|
| 95 |
+
reader_cfg=reader_cfg,
|
| 96 |
+
),
|
| 97 |
+
judge_cfg=dict(),
|
| 98 |
+
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
| 99 |
+
),
|
| 100 |
+
pred_role='BOT',
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
# Dataset
|
| 104 |
+
korbench_dataset = dict(
|
| 105 |
+
type=korbenchDataset,
|
| 106 |
+
abbr=f'korbench_{category}',
|
| 107 |
+
path='opencompass/korbench',
|
| 108 |
+
prompt_mode='0_shot',
|
| 109 |
+
category=category,
|
| 110 |
+
reader_cfg=reader_cfg,
|
| 111 |
+
infer_cfg=infer_cfg,
|
| 112 |
+
eval_cfg=eval_cfg,
|
| 113 |
+
mode='singlescore',
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
korbench_0shot_single_datasets.append(korbench_dataset)
|
build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.datasets.korbench.korbench import (
|
| 2 |
+
korbenchDataset,
|
| 3 |
+
korbenchEvaluator,
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 7 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 8 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 9 |
+
|
| 10 |
+
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
|
| 11 |
+
|
| 12 |
+
korbench_3shot_single_datasets = []
|
| 13 |
+
|
| 14 |
+
for category in categories:
|
| 15 |
+
# Prompt template
|
| 16 |
+
prompt_template = dict(
|
| 17 |
+
type=PromptTemplate,
|
| 18 |
+
template=dict(
|
| 19 |
+
begin=[dict(role='HUMAN', prompt='')],
|
| 20 |
+
round=[dict(role='HUMAN', prompt='{prompt}')], # f-string
|
| 21 |
+
),
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# Reader configuration
|
| 25 |
+
reader_cfg = dict(
|
| 26 |
+
input_columns=['prompt'],
|
| 27 |
+
output_column='answer',
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# Inference configuration
|
| 31 |
+
infer_cfg = dict(
|
| 32 |
+
prompt_template=prompt_template,
|
| 33 |
+
retriever=dict(type=ZeroRetriever),
|
| 34 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024),
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# Evaluation configuration
|
| 38 |
+
eval_cfg = dict(
|
| 39 |
+
evaluator=dict(type=korbenchEvaluator),
|
| 40 |
+
pred_role='BOT',
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
korbench_dataset = dict(
|
| 44 |
+
type=korbenchDataset,
|
| 45 |
+
abbr=f'korbench_{category}',
|
| 46 |
+
path='opencompass/korbench',
|
| 47 |
+
prompt_mode='3_shot',
|
| 48 |
+
category=category,
|
| 49 |
+
reader_cfg=reader_cfg,
|
| 50 |
+
infer_cfg=infer_cfg,
|
| 51 |
+
eval_cfg=eval_cfg,
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
korbench_3shot_single_datasets.append(korbench_dataset)
|
build/lib/opencompass/configs/datasets/korbench/readme.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks
|
| 2 |
+
|
| 3 |
+
KOR-Bench is a dataset designed to evaluate large language models (LLMs) on tasks that require reasoning independent of prior knowledge. Created to assess reasoning and planning abilities, KOR-Bench introduces rule-based tasks that minimize the influence of pretrained knowledge, enabling a focused evaluation of intrinsic model capabilities.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
### Purpose
|
| 8 |
+
|
| 9 |
+
Large language models, such as GPT-4 and Claude, excel in knowledge-based tasks but face challenges in applying reasoning skills to unfamiliar scenarios. KOR-Bench is built to evaluate such reasoning capabilities across five categories:
|
| 10 |
+
- **Operation**: Arithmetic and logical operations.
|
| 11 |
+
- **Logic**: Complex deductive and inductive reasoning.
|
| 12 |
+
- **Cipher**: Code-breaking and pattern discovery.
|
| 13 |
+
- **Puzzle**: Problem-solving with creative and logical reasoning.
|
| 14 |
+
- **Counterfactual**: Hypothetical reasoning in alternate scenarios.
|
| 15 |
+
|
| 16 |
+
### Dataset Construction
|
| 17 |
+
|
| 18 |
+
KOR-Bench tasks are designed with novel rules and configurations, ensuring no reliance on pretrained knowledge. Each task includes:
|
| 19 |
+
- **Rules**: Custom rule sets to guide reasoning.
|
| 20 |
+
- **Questions**: Carefully crafted problems that require the application of rules.
|
| 21 |
+
- **Evaluation Scenarios**: Zero-shot, three-shot, and subquestion-specific configurations.
|
| 22 |
+
|
| 23 |
+
The dataset is structured to assess multistep reasoning, pattern recognition, and adaptability to new rules.
|
| 24 |
+
|
| 25 |
+
### Dataset Access
|
| 26 |
+
|
| 27 |
+
KOR-Bench is publicly available with detailed usage instructions in the [GitHub Repository](https://github.com/KOR-Bench/KOR-Bench). Download the dataset and leverage predefined evaluation scripts or customize your own.
|
| 28 |
+
|
| 29 |
+
### Evaluation
|
| 30 |
+
|
| 31 |
+
1. Install dependencies and configure your environment.
|
| 32 |
+
2. Run evaluations using `opencompass examples/eval_korbench.py` to assess LLM performance.
|
| 33 |
+
3. Analyze model performance across various reasoning tasks.
|
| 34 |
+
|
| 35 |
+
### Example Command
|
| 36 |
+
```bash
|
| 37 |
+
opencompass examples/eval_korbench.py
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## Baselines and Results
|
| 41 |
+
KOR-Bench includes baseline results for leading LLMs evaluated across various configurations, including zero-shot (gen) and few-shot modes. Below is a summary of the results.
|
| 42 |
+
| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | internlm2_5-1_8b-chat-turbomind | llama-3_1-8b-instruct-turbomind | glm-4-9b-chat-turbomind | gemma-2-9b-it-turbomind |
|
| 43 |
+
|---------|---------|--------|------|--------------------------------|---------------------------------|---------------------------------|--------------------------|--------------------------|
|
| 44 |
+
| korbench_mixed_Multi-Q | 21f998 | accuracy | gen | 0.60 | 0.20 | 9.60 | 8.70 | 7.80 |
|
| 45 |
+
| korbench_mixed_Multi-R | 21f998 | accuracy | gen | 1.70 | 0.10 | 8.80 | 12.10 | 9.80 |
|
| 46 |
+
| korbench_mixed_Multi-RQ | 21f998 | accuracy | gen | 1.50 | 0.10 | 6.40 | 8.60 | 6.00 |
|
| 47 |
+
| korbench_cipher | 21f998 | accuracy | gen | 8.80 | 0.80 | 14.00 | 6.80 | 6.40 |
|
| 48 |
+
| korbench_counterfactual | 21f998 | accuracy | gen | 83.60 | 17.20 | 88.80 | 90.40 | 87.60 |
|
| 49 |
+
| korbench_logic | 21f998 | accuracy | gen | 8.40 | 3.60 | 37.60 | 38.80 | 40.80 |
|
| 50 |
+
| korbench_operation | 21f998 | accuracy | gen | 56.00 | 25.20 | 68.40 | 63.60 | 67.60 |
|
| 51 |
+
| korbench_puzzle | 21f998 | accuracy | gen | 3.60 | 0.00 | 3.20 | 3.20 | 5.60 |
|
| 52 |
+
| korbench_cipher | 21f998 | accuracy | fewshot | 8.40 | 3.20 | 9.60 | 9.20 | 9.60 |
|
| 53 |
+
| korbench_counterfactual | 21f998 | accuracy | fewshot | 87.60 | 58.00 | 23.60 | 89.60 | 84.40 |
|
| 54 |
+
| korbench_logic | 21f998 | accuracy | fewshot | 45.20 | 19.60 | 24.40 | 38.40 | 54.00 |
|
| 55 |
+
| korbench_operation | 21f998 | accuracy | fewshot | 24.80 | 11.20 | 73.20 | 67.20 | 23.20 |
|
| 56 |
+
| korbench_puzzle | 21f998 | accuracy | fewshot | 4.80 | 2.40 | 1.60 | 3.60 | 6.80 |
|
| 57 |
+
|
| 58 |
+
### Citation
|
| 59 |
+
|
| 60 |
+
**BibTeX:**
|
| 61 |
+
```bibtex
|
| 62 |
+
@misc{ma2024korbenchbenchmarkinglanguagemodels,
|
| 63 |
+
title={KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks},
|
| 64 |
+
author={Kaijing Ma and Xinrun Du and Yunran Wang and Haoran Zhang and Zhoufutu Wen and Xingwei Qu and Jian Yang and Jiaheng Liu and Minghao Liu and Xiang Yue and Wenhao Huang and Ge Zhang},
|
| 65 |
+
year={2024},
|
| 66 |
+
eprint={2410.06526},
|
| 67 |
+
archivePrefix={arXiv},
|
| 68 |
+
primaryClass={cs.DB},
|
| 69 |
+
url={https://arxiv.org/abs/2410.06526},
|
| 70 |
+
}
|
| 71 |
+
```
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
),
|
| 53 |
+
pred_role='BOT',
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
LCBCodeGeneration_dataset = dict(
|
| 57 |
+
type=LCBCodeGenerationDataset,
|
| 58 |
+
abbr='lcb_code_generation',
|
| 59 |
+
path='opencompass/code_generation_lite',
|
| 60 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 61 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 62 |
+
eval_cfg=lcb_code_generation_eval_cfg,
|
| 63 |
+
n=5,
|
| 64 |
+
k=3
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Code Execution Dataset
|
| 68 |
+
lcb_code_execution_reader_cfg = dict(
|
| 69 |
+
input_columns=[
|
| 70 |
+
'prompt',
|
| 71 |
+
],
|
| 72 |
+
output_column='evaluation_sample',
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
lcb_code_execution_infer_cfg = dict(
|
| 76 |
+
prompt_template=dict(
|
| 77 |
+
type=PromptTemplate,
|
| 78 |
+
template=dict(
|
| 79 |
+
begin=[
|
| 80 |
+
dict(
|
| 81 |
+
role='SYSTEM',
|
| 82 |
+
fallback_role='HUMAN',
|
| 83 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 84 |
+
),
|
| 85 |
+
],
|
| 86 |
+
round=[
|
| 87 |
+
dict(
|
| 88 |
+
role='HUMAN',
|
| 89 |
+
prompt='{prompt}'
|
| 90 |
+
)
|
| 91 |
+
]
|
| 92 |
+
)
|
| 93 |
+
),
|
| 94 |
+
retriever=dict(type=ZeroRetriever),
|
| 95 |
+
inferencer=dict(type=GenInferencer)
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
lcb_code_execution_eval_cfg = dict(
|
| 99 |
+
evaluator=dict(
|
| 100 |
+
type=LCBCodeExecutionEvaluator,
|
| 101 |
+
),
|
| 102 |
+
pred_role='BOT',
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
LCBCodeExecution_dataset = dict(
|
| 106 |
+
type=LCBCodeExecutionDataset,
|
| 107 |
+
abbr='lcb_code_execution',
|
| 108 |
+
path='opencompass/execution-v2',
|
| 109 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 110 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 111 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
# TestOuputput Dataset
|
| 115 |
+
lcb_test_output_reader_cfg = dict(
|
| 116 |
+
input_columns=[
|
| 117 |
+
'prompt',
|
| 118 |
+
],
|
| 119 |
+
output_column='evaluation_sample',
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 123 |
+
|
| 124 |
+
lcb_test_output_infer_cfg = dict(
|
| 125 |
+
prompt_template=dict(
|
| 126 |
+
type=PromptTemplate,
|
| 127 |
+
template=dict(
|
| 128 |
+
# begin=[
|
| 129 |
+
# dict(
|
| 130 |
+
# role='SYSTEM',
|
| 131 |
+
# prompt=system_prompt
|
| 132 |
+
# ),
|
| 133 |
+
# ],
|
| 134 |
+
round=[
|
| 135 |
+
dict(
|
| 136 |
+
role='HUMAN',
|
| 137 |
+
prompt='{prompt}'
|
| 138 |
+
)
|
| 139 |
+
]
|
| 140 |
+
)
|
| 141 |
+
),
|
| 142 |
+
retriever=dict(type=ZeroRetriever),
|
| 143 |
+
inferencer=dict(type=GenInferencer)
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
lcb_test_output_eval_cfg = dict(
|
| 147 |
+
evaluator=dict(
|
| 148 |
+
type=LCBTestOutputEvaluator,
|
| 149 |
+
),
|
| 150 |
+
pred_role='BOT',
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
LCBTestOutput_dataset = dict(
|
| 154 |
+
type=LCBTestOutputPredictionDataset,
|
| 155 |
+
abbr='lcb_test_output',
|
| 156 |
+
path='opencompass/test_generation',
|
| 157 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 158 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 159 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
LCB_datasets = [
|
| 163 |
+
LCBCodeGeneration_dataset,
|
| 164 |
+
# LCBCodeExecution_dataset,
|
| 165 |
+
# LCBTestOutput_dataset,
|
| 166 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .livecodebench_gen_a4f90b import LCB_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
),
|
| 53 |
+
pred_role='BOT',
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
LCBCodeGeneration_dataset = dict(
|
| 57 |
+
type=LCBCodeGenerationDataset,
|
| 58 |
+
abbr='lcb_code_generation',
|
| 59 |
+
path='opencompass/code_generation_lite',
|
| 60 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 61 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 62 |
+
eval_cfg=lcb_code_generation_eval_cfg
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Code Execution Dataset
|
| 66 |
+
lcb_code_execution_reader_cfg = dict(
|
| 67 |
+
input_columns=[
|
| 68 |
+
'prompt',
|
| 69 |
+
],
|
| 70 |
+
output_column='evaluation_sample',
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
lcb_code_execution_infer_cfg = dict(
|
| 74 |
+
prompt_template=dict(
|
| 75 |
+
type=PromptTemplate,
|
| 76 |
+
template=dict(
|
| 77 |
+
begin=[
|
| 78 |
+
dict(
|
| 79 |
+
role='SYSTEM',
|
| 80 |
+
fallback_role='HUMAN',
|
| 81 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 82 |
+
),
|
| 83 |
+
],
|
| 84 |
+
round=[
|
| 85 |
+
dict(
|
| 86 |
+
role='HUMAN',
|
| 87 |
+
prompt='{prompt}'
|
| 88 |
+
)
|
| 89 |
+
]
|
| 90 |
+
)
|
| 91 |
+
),
|
| 92 |
+
retriever=dict(type=ZeroRetriever),
|
| 93 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
lcb_code_execution_eval_cfg = dict(
|
| 97 |
+
evaluator=dict(
|
| 98 |
+
type=LCBCodeExecutionEvaluator,
|
| 99 |
+
),
|
| 100 |
+
pred_role='BOT',
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
LCBCodeExecution_dataset = dict(
|
| 104 |
+
type=LCBCodeExecutionDataset,
|
| 105 |
+
abbr='lcb_code_execution',
|
| 106 |
+
path='opencompass/execution-v2',
|
| 107 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 108 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 109 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
# TestOuputput Dataset
|
| 113 |
+
lcb_test_output_reader_cfg = dict(
|
| 114 |
+
input_columns=[
|
| 115 |
+
'prompt',
|
| 116 |
+
],
|
| 117 |
+
output_column='evaluation_sample',
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 121 |
+
|
| 122 |
+
lcb_test_output_infer_cfg = dict(
|
| 123 |
+
prompt_template=dict(
|
| 124 |
+
type=PromptTemplate,
|
| 125 |
+
template=dict(
|
| 126 |
+
# begin=[
|
| 127 |
+
# dict(
|
| 128 |
+
# role='SYSTEM',
|
| 129 |
+
# prompt=system_prompt
|
| 130 |
+
# ),
|
| 131 |
+
# ],
|
| 132 |
+
round=[
|
| 133 |
+
dict(
|
| 134 |
+
role='HUMAN',
|
| 135 |
+
prompt='{prompt}'
|
| 136 |
+
)
|
| 137 |
+
]
|
| 138 |
+
)
|
| 139 |
+
),
|
| 140 |
+
retriever=dict(type=ZeroRetriever),
|
| 141 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
lcb_test_output_eval_cfg = dict(
|
| 145 |
+
evaluator=dict(
|
| 146 |
+
type=LCBTestOutputEvaluator,
|
| 147 |
+
),
|
| 148 |
+
pred_role='BOT',
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
LCBTestOutput_dataset = dict(
|
| 152 |
+
type=LCBTestOutputPredictionDataset,
|
| 153 |
+
abbr='lcb_test_output',
|
| 154 |
+
path='opencompass/test_generation',
|
| 155 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 156 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 157 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
LCB_datasets = [
|
| 161 |
+
LCBCodeGeneration_dataset,
|
| 162 |
+
LCBCodeExecution_dataset,
|
| 163 |
+
LCBTestOutput_dataset,
|
| 164 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
),
|
| 53 |
+
pred_role='BOT',
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
LCBCodeGeneration_dataset = dict(
|
| 57 |
+
type=LCBCodeGenerationDataset,
|
| 58 |
+
abbr='lcb_code_generation',
|
| 59 |
+
path='opencompass/code_generation_lite',
|
| 60 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 61 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 62 |
+
eval_cfg=lcb_code_generation_eval_cfg
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Code Execution Dataset
|
| 66 |
+
lcb_code_execution_reader_cfg = dict(
|
| 67 |
+
input_columns=[
|
| 68 |
+
'prompt',
|
| 69 |
+
],
|
| 70 |
+
output_column='evaluation_sample',
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
lcb_code_execution_infer_cfg = dict(
|
| 74 |
+
prompt_template=dict(
|
| 75 |
+
type=PromptTemplate,
|
| 76 |
+
template=dict(
|
| 77 |
+
begin=[
|
| 78 |
+
dict(
|
| 79 |
+
role='SYSTEM',
|
| 80 |
+
fallback_role='HUMAN',
|
| 81 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 82 |
+
),
|
| 83 |
+
],
|
| 84 |
+
round=[
|
| 85 |
+
dict(
|
| 86 |
+
role='HUMAN',
|
| 87 |
+
prompt='{prompt}'
|
| 88 |
+
)
|
| 89 |
+
]
|
| 90 |
+
)
|
| 91 |
+
),
|
| 92 |
+
retriever=dict(type=ZeroRetriever),
|
| 93 |
+
inferencer=dict(type=GenInferencer)
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
lcb_code_execution_eval_cfg = dict(
|
| 97 |
+
evaluator=dict(
|
| 98 |
+
type=LCBCodeExecutionEvaluator,
|
| 99 |
+
),
|
| 100 |
+
pred_role='BOT',
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
LCBCodeExecution_dataset = dict(
|
| 104 |
+
type=LCBCodeExecutionDataset,
|
| 105 |
+
abbr='lcb_code_execution',
|
| 106 |
+
path='opencompass/execution-v2',
|
| 107 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 108 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 109 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
# TestOuputput Dataset
|
| 113 |
+
lcb_test_output_reader_cfg = dict(
|
| 114 |
+
input_columns=[
|
| 115 |
+
'prompt',
|
| 116 |
+
],
|
| 117 |
+
output_column='evaluation_sample',
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 121 |
+
|
| 122 |
+
lcb_test_output_infer_cfg = dict(
|
| 123 |
+
prompt_template=dict(
|
| 124 |
+
type=PromptTemplate,
|
| 125 |
+
template=dict(
|
| 126 |
+
# begin=[
|
| 127 |
+
# dict(
|
| 128 |
+
# role='SYSTEM',
|
| 129 |
+
# prompt=system_prompt
|
| 130 |
+
# ),
|
| 131 |
+
# ],
|
| 132 |
+
round=[
|
| 133 |
+
dict(
|
| 134 |
+
role='HUMAN',
|
| 135 |
+
prompt='{prompt}'
|
| 136 |
+
)
|
| 137 |
+
]
|
| 138 |
+
)
|
| 139 |
+
),
|
| 140 |
+
retriever=dict(type=ZeroRetriever),
|
| 141 |
+
inferencer=dict(type=GenInferencer)
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
lcb_test_output_eval_cfg = dict(
|
| 145 |
+
evaluator=dict(
|
| 146 |
+
type=LCBTestOutputEvaluator,
|
| 147 |
+
),
|
| 148 |
+
pred_role='BOT',
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
LCBTestOutput_dataset = dict(
|
| 152 |
+
type=LCBTestOutputPredictionDataset,
|
| 153 |
+
abbr='lcb_test_output',
|
| 154 |
+
path='opencompass/test_generation',
|
| 155 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 156 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 157 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
LCB_datasets = [
|
| 161 |
+
LCBCodeGeneration_dataset,
|
| 162 |
+
LCBCodeExecution_dataset,
|
| 163 |
+
LCBTestOutput_dataset,
|
| 164 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
),
|
| 53 |
+
pred_role='BOT',
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
LCBCodeGeneration_dataset = dict(
|
| 57 |
+
type=LCBCodeGenerationDataset,
|
| 58 |
+
abbr='lcb_code_generation',
|
| 59 |
+
path='opencompass/code_generation_lite',
|
| 60 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 61 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 62 |
+
eval_cfg=lcb_code_generation_eval_cfg
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Code Execution Dataset
|
| 66 |
+
lcb_code_execution_reader_cfg = dict(
|
| 67 |
+
input_columns=[
|
| 68 |
+
'prompt',
|
| 69 |
+
],
|
| 70 |
+
output_column='evaluation_sample',
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
lcb_code_execution_infer_cfg = dict(
|
| 74 |
+
prompt_template=dict(
|
| 75 |
+
type=PromptTemplate,
|
| 76 |
+
template=dict(
|
| 77 |
+
begin=[
|
| 78 |
+
dict(
|
| 79 |
+
role='SYSTEM',
|
| 80 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 81 |
+
),
|
| 82 |
+
],
|
| 83 |
+
round=[
|
| 84 |
+
dict(
|
| 85 |
+
role='HUMAN',
|
| 86 |
+
prompt='{prompt}'
|
| 87 |
+
)
|
| 88 |
+
]
|
| 89 |
+
)
|
| 90 |
+
),
|
| 91 |
+
retriever=dict(type=ZeroRetriever),
|
| 92 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
lcb_code_execution_eval_cfg = dict(
|
| 96 |
+
evaluator=dict(
|
| 97 |
+
type=LCBCodeExecutionEvaluator,
|
| 98 |
+
),
|
| 99 |
+
pred_role='BOT',
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
LCBCodeExecution_dataset = dict(
|
| 103 |
+
type=LCBCodeExecutionDataset,
|
| 104 |
+
abbr='lcb_code_execution',
|
| 105 |
+
path='opencompass/execution-v2',
|
| 106 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 107 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 108 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
# TestOuputput Dataset
|
| 112 |
+
lcb_test_output_reader_cfg = dict(
|
| 113 |
+
input_columns=[
|
| 114 |
+
'prompt',
|
| 115 |
+
],
|
| 116 |
+
output_column='evaluation_sample',
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 120 |
+
|
| 121 |
+
lcb_test_output_infer_cfg = dict(
|
| 122 |
+
prompt_template=dict(
|
| 123 |
+
type=PromptTemplate,
|
| 124 |
+
template=dict(
|
| 125 |
+
# begin=[
|
| 126 |
+
# dict(
|
| 127 |
+
# role='SYSTEM',
|
| 128 |
+
# prompt=system_prompt
|
| 129 |
+
# ),
|
| 130 |
+
# ],
|
| 131 |
+
round=[
|
| 132 |
+
dict(
|
| 133 |
+
role='HUMAN',
|
| 134 |
+
prompt='{prompt}'
|
| 135 |
+
)
|
| 136 |
+
]
|
| 137 |
+
)
|
| 138 |
+
),
|
| 139 |
+
retriever=dict(type=ZeroRetriever),
|
| 140 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
lcb_test_output_eval_cfg = dict(
|
| 144 |
+
evaluator=dict(
|
| 145 |
+
type=LCBTestOutputEvaluator,
|
| 146 |
+
),
|
| 147 |
+
pred_role='BOT',
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
LCBTestOutput_dataset = dict(
|
| 151 |
+
type=LCBTestOutputPredictionDataset,
|
| 152 |
+
abbr='lcb_test_output',
|
| 153 |
+
path='opencompass/test_generation',
|
| 154 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 155 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 156 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
LCB_datasets = [
|
| 160 |
+
LCBCodeGeneration_dataset,
|
| 161 |
+
LCBCodeExecution_dataset,
|
| 162 |
+
LCBTestOutput_dataset,
|
| 163 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
release_version='release_v4',
|
| 53 |
+
),
|
| 54 |
+
pred_role='BOT',
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
LCBCodeGeneration_dataset = dict(
|
| 58 |
+
type=LCBCodeGenerationDataset,
|
| 59 |
+
abbr='lcb_code_generation_v4',
|
| 60 |
+
path='opencompass/code_generation_lite',
|
| 61 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 62 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 63 |
+
eval_cfg=lcb_code_generation_eval_cfg,
|
| 64 |
+
release_version='release_v4',
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Code Execution Dataset
|
| 68 |
+
lcb_code_execution_reader_cfg = dict(
|
| 69 |
+
input_columns=[
|
| 70 |
+
'prompt',
|
| 71 |
+
],
|
| 72 |
+
output_column='evaluation_sample',
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
lcb_code_execution_infer_cfg = dict(
|
| 76 |
+
prompt_template=dict(
|
| 77 |
+
type=PromptTemplate,
|
| 78 |
+
template=dict(
|
| 79 |
+
begin=[
|
| 80 |
+
dict(
|
| 81 |
+
role='SYSTEM',
|
| 82 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 83 |
+
),
|
| 84 |
+
],
|
| 85 |
+
round=[
|
| 86 |
+
dict(
|
| 87 |
+
role='HUMAN',
|
| 88 |
+
prompt='{prompt}'
|
| 89 |
+
)
|
| 90 |
+
]
|
| 91 |
+
)
|
| 92 |
+
),
|
| 93 |
+
retriever=dict(type=ZeroRetriever),
|
| 94 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
lcb_code_execution_eval_cfg = dict(
|
| 98 |
+
evaluator=dict(
|
| 99 |
+
type=LCBCodeExecutionEvaluator,
|
| 100 |
+
),
|
| 101 |
+
pred_role='BOT',
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
LCBCodeExecution_dataset = dict(
|
| 105 |
+
type=LCBCodeExecutionDataset,
|
| 106 |
+
abbr='lcb_code_execution',
|
| 107 |
+
path='opencompass/execution-v2',
|
| 108 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 109 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 110 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# TestOuputput Dataset
|
| 114 |
+
lcb_test_output_reader_cfg = dict(
|
| 115 |
+
input_columns=[
|
| 116 |
+
'prompt',
|
| 117 |
+
],
|
| 118 |
+
output_column='evaluation_sample',
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 122 |
+
|
| 123 |
+
lcb_test_output_infer_cfg = dict(
|
| 124 |
+
prompt_template=dict(
|
| 125 |
+
type=PromptTemplate,
|
| 126 |
+
template=dict(
|
| 127 |
+
# begin=[
|
| 128 |
+
# dict(
|
| 129 |
+
# role='SYSTEM',
|
| 130 |
+
# prompt=system_prompt
|
| 131 |
+
# ),
|
| 132 |
+
# ],
|
| 133 |
+
round=[
|
| 134 |
+
dict(
|
| 135 |
+
role='HUMAN',
|
| 136 |
+
prompt='{prompt}'
|
| 137 |
+
)
|
| 138 |
+
]
|
| 139 |
+
)
|
| 140 |
+
),
|
| 141 |
+
retriever=dict(type=ZeroRetriever),
|
| 142 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
lcb_test_output_eval_cfg = dict(
|
| 146 |
+
evaluator=dict(
|
| 147 |
+
type=LCBTestOutputEvaluator,
|
| 148 |
+
),
|
| 149 |
+
pred_role='BOT',
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
LCBTestOutput_dataset = dict(
|
| 153 |
+
type=LCBTestOutputPredictionDataset,
|
| 154 |
+
abbr='lcb_test_output',
|
| 155 |
+
path='opencompass/test_generation',
|
| 156 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 157 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 158 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
LCB_datasets = [
|
| 162 |
+
LCBCodeGeneration_dataset,
|
| 163 |
+
# LCBCodeExecution_dataset,
|
| 164 |
+
# LCBTestOutput_dataset,
|
| 165 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
release_version='release_split_v4',
|
| 53 |
+
),
|
| 54 |
+
pred_role='BOT',
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
LCBCodeGeneration_dataset = dict(
|
| 58 |
+
type=LCBCodeGenerationDataset,
|
| 59 |
+
abbr='lcb_code_generation_split_v4',
|
| 60 |
+
path='opencompass/code_generation_lite',
|
| 61 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 62 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 63 |
+
eval_cfg=lcb_code_generation_eval_cfg,
|
| 64 |
+
release_version='release_split_v4',
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Code Execution Dataset
|
| 68 |
+
lcb_code_execution_reader_cfg = dict(
|
| 69 |
+
input_columns=[
|
| 70 |
+
'prompt',
|
| 71 |
+
],
|
| 72 |
+
output_column='evaluation_sample',
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
lcb_code_execution_infer_cfg = dict(
|
| 76 |
+
prompt_template=dict(
|
| 77 |
+
type=PromptTemplate,
|
| 78 |
+
template=dict(
|
| 79 |
+
begin=[
|
| 80 |
+
dict(
|
| 81 |
+
role='SYSTEM',
|
| 82 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 83 |
+
),
|
| 84 |
+
],
|
| 85 |
+
round=[
|
| 86 |
+
dict(
|
| 87 |
+
role='HUMAN',
|
| 88 |
+
prompt='{prompt}'
|
| 89 |
+
)
|
| 90 |
+
]
|
| 91 |
+
)
|
| 92 |
+
),
|
| 93 |
+
retriever=dict(type=ZeroRetriever),
|
| 94 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
lcb_code_execution_eval_cfg = dict(
|
| 98 |
+
evaluator=dict(
|
| 99 |
+
type=LCBCodeExecutionEvaluator,
|
| 100 |
+
),
|
| 101 |
+
pred_role='BOT',
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
LCBCodeExecution_dataset = dict(
|
| 105 |
+
type=LCBCodeExecutionDataset,
|
| 106 |
+
abbr='lcb_code_execution',
|
| 107 |
+
path='opencompass/execution-v2',
|
| 108 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 109 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 110 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# TestOuputput Dataset
|
| 114 |
+
lcb_test_output_reader_cfg = dict(
|
| 115 |
+
input_columns=[
|
| 116 |
+
'prompt',
|
| 117 |
+
],
|
| 118 |
+
output_column='evaluation_sample',
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 122 |
+
|
| 123 |
+
lcb_test_output_infer_cfg = dict(
|
| 124 |
+
prompt_template=dict(
|
| 125 |
+
type=PromptTemplate,
|
| 126 |
+
template=dict(
|
| 127 |
+
# begin=[
|
| 128 |
+
# dict(
|
| 129 |
+
# role='SYSTEM',
|
| 130 |
+
# prompt=system_prompt
|
| 131 |
+
# ),
|
| 132 |
+
# ],
|
| 133 |
+
round=[
|
| 134 |
+
dict(
|
| 135 |
+
role='HUMAN',
|
| 136 |
+
prompt='{prompt}'
|
| 137 |
+
)
|
| 138 |
+
]
|
| 139 |
+
)
|
| 140 |
+
),
|
| 141 |
+
retriever=dict(type=ZeroRetriever),
|
| 142 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
lcb_test_output_eval_cfg = dict(
|
| 146 |
+
evaluator=dict(
|
| 147 |
+
type=LCBTestOutputEvaluator,
|
| 148 |
+
),
|
| 149 |
+
pred_role='BOT',
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
LCBTestOutput_dataset = dict(
|
| 153 |
+
type=LCBTestOutputPredictionDataset,
|
| 154 |
+
abbr='lcb_test_output',
|
| 155 |
+
path='opencompass/test_generation',
|
| 156 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 157 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 158 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
LCB_datasets = [
|
| 162 |
+
LCBCodeGeneration_dataset,
|
| 163 |
+
# LCBCodeExecution_dataset,
|
| 164 |
+
# LCBTestOutput_dataset,
|
| 165 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (LCBCodeGenerationDataset,
                                  LCBCodeExecutionDataset,
                                  LCBTestOutputPredictionDataset,
                                  LCBCodeGenerationEvaluator,
                                  LCBCodeExecutionEvaluator,
                                  LCBTestOutputEvaluator)

# --------------------------------------------------------------------------
# Code generation task (LiveCodeBench release_v5, 2024-08-01 .. 2025-02-01)
# --------------------------------------------------------------------------
# The model sees the problem statement plus formatting instructions;
# `question_id` is carried through so the evaluator can find the test cases.
lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501

prompt_template = ('### Question:\n{question_content}\n\n{format_prompt}'
                   '### Answer: (use the provided format with backticks)\n\n')

lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt=prompt_template)]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
        release_version='release_v5',
        start_date='2024-08-01',
        end_date='2025-02-01',
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    release_version='release_v5',
)

# --------------------------------------------------------------------------
# Code execution task
# --------------------------------------------------------------------------
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    # fallback keeps models without a system role working
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.', # noqa: E501
                ),
            ],
            round=[dict(role='HUMAN', prompt='{prompt}')],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(type=LCBCodeExecutionEvaluator, ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# --------------------------------------------------------------------------
# Test-output prediction task
# --------------------------------------------------------------------------
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

# Kept for reference; the system turn below is currently commented out.
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[dict(role='HUMAN', prompt='{prompt}')],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(type=LCBTestOutputEvaluator, ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


# --------------------------------------------------------------------------
# Code generation task (LiveCodeBench release_v1, o1-style prompting)
# --------------------------------------------------------------------------
# `question_id` is carried through so the evaluator can look up test cases.
lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

# Fix: this was an f-string with no placeholders (Ruff F541); a plain
# string literal produces the identical value.
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
                  '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation_v1',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    release_version='release_v1',
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'  # noqa: E501
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test-output prediction dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

# Kept for reference; the system turn below is currently commented out.
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

# Only the generation split is enabled for this v1 config.
LCB_datasets = [
    LCBCodeGeneration_dataset,
    # LCBCodeExecution_dataset,
    # LCBTestOutput_dataset,
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


# --------------------------------------------------------------------------
# Code generation task (LiveCodeBench v6, academic setting, n=6 repeats)
# --------------------------------------------------------------------------
# `question_id` is carried through so the evaluator can look up test cases.
lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

# Fix: this was an f-string with no placeholders (Ruff F541); a plain
# string literal produces the identical value.
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
                  '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        release_version='v6',
        extractor_version='v2',
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation_repeat_6',
    path='opencompass/code_generation_lite',
    release_version='v6',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    n=6,  # sample each problem 6 times (repeat evaluation)
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    # fallback keeps models without a system role working
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'  # noqa: E501
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test-output prediction dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

# Kept for reference; the system turn below is currently commented out.
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
|
build/lib/opencompass/configs/datasets/livemathbench/README.md
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LiveMathBench
|
| 2 |
+
|
| 3 |
+
## v202412
|
| 4 |
+
|
| 5 |
+
### Details of Datasets
|
| 6 |
+
|
| 7 |
+
| dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving |
|
| 8 |
+
| -- | -- | -- | -- | -- | -- |
|
| 9 |
+
| AMC | cn | 0 | 0 | 0 | 46 |
|
| 10 |
+
| AMC | en | 0 | 0 | 0 | 46 |
|
| 11 |
+
| CCEE | cn | 0 | 0 | 13 | 31 |
|
| 12 |
+
| CCEE | en | 0 | 0 | 13 | 31 |
|
| 13 |
+
| CNMO | cn | 0 | 0 | 0 | 18 |
|
| 14 |
+
| CNMO | en | 0 | 0 | 0 | 18 |
|
| 15 |
+
| WLPMC | cn | 0 | 0 | 0 | 11 |
|
| 16 |
+
| WLPMC | en | 0 | 0 | 0 | 11 |
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
### How to use
|
| 20 |
+
|
| 21 |
+
#### G-Pass@k
|
| 22 |
+
```python
|
| 23 |
+
from mmengine.config import read_base
|
| 24 |
+
|
| 25 |
+
with read_base():
|
| 26 |
+
from opencompass.datasets.livemathbench_gen import livemathbench_datasets
|
| 27 |
+
|
| 28 |
+
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
|
| 29 |
+
{
|
| 30 |
+
'model_name': 'Qwen/Qwen2.5-72B-Instruct',
|
| 31 |
+
'url': [
|
| 32 |
+
'http://0.0.0.0:23333/v1',
|
| 33 |
+
'...'
|
| 34 |
+
] # set url of evaluation models
|
| 35 |
+
}
|
| 36 |
+
)
|
| 37 |
+
livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict(
|
| 38 |
+
max_out_len=32768 # for o1-like models you need to update max_out_len
|
| 39 |
+
))
|
| 40 |
+
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
#### Greedy
|
| 44 |
+
```python
|
| 45 |
+
from mmengine.config import read_base
|
| 46 |
+
|
| 47 |
+
with read_base():
|
| 48 |
+
from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets
|
| 49 |
+
|
| 50 |
+
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
|
| 51 |
+
{
|
| 52 |
+
'model_name': 'Qwen/Qwen2.5-72B-Instruct',
|
| 53 |
+
'url': [
|
| 54 |
+
'http://0.0.0.0:23333/v1',
|
| 55 |
+
'...'
|
| 56 |
+
] # set url of evaluation models
|
| 57 |
+
}
|
| 58 |
+
)
|
| 59 |
+
livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict(
|
| 60 |
+
max_out_len=32768 # for o1-like models you need to update max_out_len
|
| 61 |
+
))
|
| 62 |
+
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### Output Samples
|
| 66 |
+
|
| 67 |
+
| dataset | version | metric | mode | Qwen2.5-72B-Instruct |
|
| 68 |
+
|----- | ----- | ----- | ----- | -----|
|
| 69 |
+
| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx |
|
| 70 |
+
| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx |
|
| 71 |
+
| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx |
|
| 72 |
+
| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx |
|
| 73 |
+
| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx |
|
| 74 |
+
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base

# Default entry point for LiveMathBench: re-export the pinned `9befbf`
# variant so `from ...livemathbench_gen import livemathbench_datasets`
# always resolves to the current recommended configuration.
with read_base():
    from .livemathbench_gen_9befbf import livemathbench_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator


# The dataset supplies the fully rendered question in `prompt`; the gold
# answer lives in `answer`.
livemathbench_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

# Single-turn zero-shot prompting with sampling (temperature 1.0).
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=16384, temperature=1.0),
)

# NOTE(review): the judge endpoint below is a hard-coded internal address
# of an OpenAI-compatible `/v1` server; override `url` (e.g. with
# 'https://api.openai.com/v1/') before running outside that environment.
livemathbench_eval_cfg = dict(
    evaluator=dict(
        type=LiveMathBenchEvaluator,
        model_name='Qwen/Qwen2.5-72B-Instruct',
        url=['http://172.30.40.154:23333/v1/'],
    ),
)

livemathbench_datasets = [
    dict(
        type=LiveMathBenchDataset,
        abbr='LiveMathBench-k1-n1',
        path='opencompass/LiveMathBench202412',
        k=1,  # k of Pass@k
        n=1,  # number of sampled runs per problem
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=livemathbench_eval_cfg,
    ),
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator


# G-Pass@16 configuration over all four LiveMathBench-202412 splits in both
# languages; 48 samples per problem feed the k=16 stability metric.
# `path`, judge `model_name` and `url` are intentionally blank — callers
# fill them in (see the dataset README).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='',
    k=16,
    n=48,
    dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
    dataset_languages=['cn', 'en'],
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412',
    reader_cfg=dict(input_columns=['prompt'], output_column='answer'),
    infer_cfg=dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192),
    ),
    eval_cfg=dict(
        evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
    ),
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator


# The dataset supplies the fully rendered question in `prompt`; the gold
# answer lives in `answer`.
livemathbench_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

# Single-turn zero-shot prompting with sampling (temperature 1.0).
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048, temperature=1.0),
)

# Judge `url` is intentionally blank — callers fill it in (see README).
livemathbench_eval_cfg = dict(
    evaluator=dict(
        type=LiveMathBenchEvaluator,
        model_name='Qwen/Qwen2.5-72B-Instruct',
        url=[],
    ),
)

livemathbench_datasets = [
    dict(
        type=LiveMathBenchDataset,
        abbr='LiveMathBench',
        path='',
        k=32,
        n=5,
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=livemathbench_eval_cfg,
    ),
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base

# Greedy-decoding entry point for LiveMathBench: re-export the pinned
# `9befbf` greedy variant (k=1, n=1) so importing this module yields the
# current recommended greedy configuration.
with read_base():
    from .livemathbench_greedy_gen_9befbf import livemathbench_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator


# Greedy (k=1, n=1) configuration over all four LiveMathBench-202412 splits
# in both languages. `path`, judge `model_name` and `url` are intentionally
# blank — callers fill them in (see the dataset README).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='',
    k=1,
    n=1,
    dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
    dataset_languages=['cn', 'en'],
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412',
    reader_cfg=dict(input_columns=['prompt'], output_column='answer'),
    infer_cfg=dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192),
    ),
    eval_cfg=dict(
        evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
    ),
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Summary: A config for LiveMathBench-Hard-202412 Dataset Evaluation.
Setting:
    Shot: 0-shot
    Evaluator:
        - CascadeEvaluator
            - MATHVerifyEvaluator (rule-based first pass)
            - GenericLLMEvaluator (LLM-judge fallback)
    Repeat: controlled by ``n`` below (currently 1; raise to e.g. 32 for
        repeated sampling)
Available Models:
    - Instruct/Chat Models
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CustomDataset, generic_llmjudge_postprocess
# Single grouped import; the original additionally imported
# GenericLLMEvaluator standalone, which was redundant.
from opencompass.evaluator import (
    CascadeEvaluator,
    GenericLLMEvaluator,
    MATHVerifyEvaluator,
)

# Reader: the model sees `question`; `answer` is the gold target.
livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')


# Inference configuration: zero-shot, ask the model to box its final answer.
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\nRemember to put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()


splits = ['hard_cn', 'hard_en']
# Dataset configuration: one dataset entry per split.
livemathbench_datasets = [
    dict(
        type=CustomDataset,
        abbr=f'livemathbench_hard_custom_{split}',
        path='data/LiveMathBench',
        local_mode=True,
        file_name=f'202412/{split}.jsonl',
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=dict(
            # Cascade: try the cheap rule-based verifier first, then fall
            # back to the LLM judge for samples the rules mark incorrect.
            evaluator=dict(
                type=CascadeEvaluator,
                rule_evaluator=dict(
                    type=MATHVerifyEvaluator,
                ),
                llm_evaluator=dict(
                    type=GenericLLMEvaluator,
                    prompt_template=dict(
                        type=PromptTemplate,
                        template=dict(
                            begin=[
                                dict(
                                    role='SYSTEM',
                                    fallback_role='HUMAN',
                                    prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                                )
                            ],
                            round=[
                                dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                            ],
                        ),
                    ),
                    # The judge re-reads the source dataset to recover the
                    # original question and gold answer.
                    dataset_cfg=dict(
                        type=CustomDataset,
                        path='data/LiveMathBench',
                        local_mode=True,
                        file_name=f'202412/{split}.jsonl',
                        reader_cfg=livemathbench_reader_cfg,
                    ),
                    judge_cfg={},
                    dict_postprocessor=dict(type=generic_llmjudge_postprocess),
                ),
                parallel=False,
            ),
        ),
        n=1,  # repeat n times
    ) for split in splits
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import CustomDataset
from opencompass.datasets import generic_llmjudge_postprocess
from itertools import product

# Reader: model input is `question`; gold target lives in `answer`.
livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')


# Inference configuration
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\n',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()


splits = ['hard_cn', 'hard_en']


def _make_dataset(split, run_idx):
    """Build one dataset entry for the given (split, run) pair.

    Each run repeats the same split under a distinct abbr so that multiple
    independent samples can be collected and aggregated downstream.
    """
    return dict(
        type=CustomDataset,
        abbr=f'livemathbench_hard_custom_{split}_run{run_idx}',
        path='data/LiveMathBench',
        local_mode=True,
        file_name=f'202412/{split}.jsonl',
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=dict(
            # Evaluation configuration using LLM as judge
            evaluator=dict(
                type=GenericLLMEvaluator,
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(
                        begin=[
                            dict(
                                role='SYSTEM',
                                fallback_role='HUMAN',
                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                            )
                        ],
                        round=[
                            dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                        ],
                    ),
                ),
                # The judge reloads the dataset to see question + gold answer.
                dataset_cfg=dict(
                    type=CustomDataset,
                    path='data/LiveMathBench',
                    local_mode=True,
                    file_name=f'202412/{split}.jsonl',
                    reader_cfg=livemathbench_reader_cfg,
                ),
                judge_cfg={},
                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
            ),
        ),
    )


# Dataset configuration: 8 independent runs per split.
livemathbench_datasets = [
    _make_dataset(split, run_idx)
    for split, run_idx in product(splits, range(8))
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench-Hard (version 202412), cn+en, CoT prompting,
# pass@k with k=16 over n=48 generations.
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='',
    k=16,
    n=48,
    dataset_splits=['hard'],
    dataset_languages=['cn', 'en'],
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412-Hard',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench-Hard (version 202412), cn+en, CoT prompting,
# greedy decoding: a single sample (k=1, n=1).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='',
    k=1,
    n=1,
    dataset_splits=['hard'],
    dataset_languages=['cn', 'en'],
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412-Hard',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.livemathbench import LiveMathBenchDataset
from opencompass.datasets import generic_llmjudge_postprocess

# Reader: model input is `question`; gold target lives in `answer`.
livemathbench_reader_cfg = dict(
    input_columns=['question'], output_column='answer'
)


# Inference configuration
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\n',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()


splits = ['hard']
# Dataset configuration: one entry per split, graded by an LLM judge.
# NOTE(review): the inference dataset uses path 'opencompass/LiveMathBench'
# while the judge's dataset_cfg uses 'opencompass/LiveMathBench202412' —
# confirm this asymmetry is intentional.
livemathbench_datasets = [
    dict(
        type=LiveMathBenchDataset,
        abbr=f'livemathbench_{split}',
        path='opencompass/LiveMathBench',
        dataset_splits=[split],
        dataset_languages=['cn', 'en'],
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=dict(
            # Evaluation configuration using LLM as judge
            evaluator=dict(
                type=GenericLLMEvaluator,
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(
                        begin=[
                            dict(
                                role='SYSTEM',
                                fallback_role='HUMAN',
                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                            )
                        ],
                        round=[
                            dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                        ],
                    ),
                ),
                dataset_cfg=dict(
                    type=LiveMathBenchDataset,
                    path='opencompass/LiveMathBench202412',
                    dataset_splits=[split],
                    reader_cfg=livemathbench_reader_cfg,
                ),
                judge_cfg={},
                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
            ),
        ),
    )
    for split in splits
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=8192),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench (version 202505), English, all splits, CoT prompting,
# pass@k with k=16 over n=48 generations.
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    k=16,
    n=48,
    dataset_splits=['all'],
    dataset_languages=['en'],
    cot=True,
    version='202505',
    abbr='LiveMathBench-v202505',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=8192),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench (version 202505), English, all splits, CoT prompting,
# greedy decoding: a single sample (k=1, n=1).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    k=1,
    n=1,
    dataset_splits=['all'],
    dataset_languages=['en'],
    cot=True,
    version='202505',
    abbr='LiveMathBench-v202505',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench-Hard (version 202505), English, CoT prompting,
# pass@k with k=16 over n=48 generations.
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    k=16,
    n=48,
    dataset_splits=['hard'],
    dataset_languages=['en'],
    cot=True,
    version='202505',
    abbr='LiveMathBench-v202505-Hard',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench-Hard (version 202505), English, CoT prompting,
# greedy decoding: a single sample (k=1, n=1).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    k=1,
    n=1,
    dataset_splits=['hard'],
    dataset_languages=['en'],
    cot=True,
    version='202505',
    abbr='LiveMathBench-v202505-Hard',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Indirection shim: re-export the datasets from the pinned config variant so
# downstream configs can import the version-agnostic module name.
from mmengine.config import read_base

with read_base():
    # NOTE(review): the re-exported symbol is `simpleqa_datasets`, not a
    # livereasonbench-named list — confirm this matches what consumers expect.
    from .livereasonbench_gen_0283c3 import simpleqa_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 5 |
+
# from opencompass.datasets import SimpleQADataset, simpleqa_postprocess
|
| 6 |
+
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 89 |
+
|
| 90 |
+
livereasonbench_infer_cfg = dict(
|
| 91 |
+
prompt_template=dict(
|
| 92 |
+
type=PromptTemplate,
|
| 93 |
+
template=dict(
|
| 94 |
+
round=[
|
| 95 |
+
dict(role='HUMAN', prompt='Question: {question}\n'),
|
| 96 |
+
],
|
| 97 |
+
)),
|
| 98 |
+
retriever=dict(type=ZeroRetriever),
|
| 99 |
+
inferencer=dict(type=GenInferencer, max_out_len=16384))
|
| 100 |
+
|
| 101 |
+
livereasonbench_eval_cfg = dict(
|
| 102 |
+
evaluator=dict(
|
| 103 |
+
type=LMEvaluator,
|
| 104 |
+
prompt_template=dict(
|
| 105 |
+
type=PromptTemplate,
|
| 106 |
+
template=dict(
|
| 107 |
+
begin=[
|
| 108 |
+
dict(
|
| 109 |
+
role='SYSTEM',
|
| 110 |
+
fallback_role='HUMAN',
|
| 111 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 112 |
+
],
|
| 113 |
+
round=[
|
| 114 |
+
dict(
|
| 115 |
+
role='HUMAN',
|
| 116 |
+
prompt = GRADER_TEMPLATE
|
| 117 |
+
),
|
| 118 |
+
]),
|
| 119 |
+
),
|
| 120 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 121 |
+
),
|
| 122 |
+
pred_role='BOT',
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
livereasonbench_datasets = [
|
| 126 |
+
dict(
|
| 127 |
+
abbr='LiveReasonBench-20241202',
|
| 128 |
+
type=LiveReasonBenchDataset,
|
| 129 |
+
path='opencompass/LiveReasonBench',
|
| 130 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 131 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 132 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 133 |
+
version='livereasonbench-20241202',
|
| 134 |
+
mode='singlescore',
|
| 135 |
+
)
|
| 136 |
+
]
|
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 89 |
+
|
| 90 |
+
livereasonbench_infer_cfg = dict(
|
| 91 |
+
prompt_template=dict(
|
| 92 |
+
type=PromptTemplate,
|
| 93 |
+
template=dict(
|
| 94 |
+
round=[
|
| 95 |
+
dict(role='HUMAN', prompt='Question: {question}\n'),
|
| 96 |
+
],
|
| 97 |
+
)),
|
| 98 |
+
retriever=dict(type=ZeroRetriever),
|
| 99 |
+
inferencer=dict(type=GenInferencer, max_out_len=16384))
|
| 100 |
+
|
| 101 |
+
livereasonbench_eval_cfg = dict(
|
| 102 |
+
evaluator=dict(
|
| 103 |
+
type=GenericLLMEvaluator,
|
| 104 |
+
prompt_template=dict(
|
| 105 |
+
type=PromptTemplate,
|
| 106 |
+
template=dict(
|
| 107 |
+
begin=[
|
| 108 |
+
dict(
|
| 109 |
+
role='SYSTEM',
|
| 110 |
+
fallback_role='HUMAN',
|
| 111 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 112 |
+
],
|
| 113 |
+
round=[
|
| 114 |
+
dict(
|
| 115 |
+
role='HUMAN',
|
| 116 |
+
prompt = GRADER_TEMPLATE
|
| 117 |
+
),
|
| 118 |
+
]),
|
| 119 |
+
),
|
| 120 |
+
dataset_cfg=dict(
|
| 121 |
+
type=LiveReasonBenchDataset,
|
| 122 |
+
path='opencompass/LiveReasonBench',
|
| 123 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 124 |
+
),
|
| 125 |
+
judge_cfg=dict(),
|
| 126 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 127 |
+
),
|
| 128 |
+
pred_role='BOT',
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
livereasonbench_datasets = [
|
| 132 |
+
dict(
|
| 133 |
+
abbr='LiveReasonBench-20241202',
|
| 134 |
+
type=LiveReasonBenchDataset,
|
| 135 |
+
path='opencompass/LiveReasonBench',
|
| 136 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 137 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 138 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 139 |
+
version='livereasonbench-20241202',
|
| 140 |
+
mode='singlescore',
|
| 141 |
+
)
|
| 142 |
+
]
|
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 89 |
+
|
| 90 |
+
livereasonbench_infer_cfg = dict(
|
| 91 |
+
prompt_template=dict(
|
| 92 |
+
type=PromptTemplate,
|
| 93 |
+
template=dict(
|
| 94 |
+
round=[
|
| 95 |
+
dict(role='HUMAN', prompt='Question: {question}\n'),
|
| 96 |
+
],
|
| 97 |
+
)),
|
| 98 |
+
retriever=dict(type=ZeroRetriever),
|
| 99 |
+
inferencer=dict(type=GenInferencer))
|
| 100 |
+
|
| 101 |
+
livereasonbench_eval_cfg = dict(
|
| 102 |
+
evaluator=dict(
|
| 103 |
+
type=GenericLLMEvaluator,
|
| 104 |
+
prompt_template=dict(
|
| 105 |
+
type=PromptTemplate,
|
| 106 |
+
template=dict(
|
| 107 |
+
begin=[
|
| 108 |
+
dict(
|
| 109 |
+
role='SYSTEM',
|
| 110 |
+
fallback_role='HUMAN',
|
| 111 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 112 |
+
],
|
| 113 |
+
round=[
|
| 114 |
+
dict(
|
| 115 |
+
role='HUMAN',
|
| 116 |
+
prompt = GRADER_TEMPLATE
|
| 117 |
+
),
|
| 118 |
+
]),
|
| 119 |
+
),
|
| 120 |
+
dataset_cfg=dict(
|
| 121 |
+
type=LiveReasonBenchDataset,
|
| 122 |
+
path='opencompass/LiveReasonBench',
|
| 123 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 124 |
+
version='livereasonbench-20250428',
|
| 125 |
+
),
|
| 126 |
+
judge_cfg=dict(),
|
| 127 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 128 |
+
),
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
livereasonbench_datasets = [
|
| 132 |
+
dict(
|
| 133 |
+
abbr='LiveReasonBench-20250428',
|
| 134 |
+
type=LiveReasonBenchDataset,
|
| 135 |
+
path='opencompass/LiveReasonBench',
|
| 136 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 137 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 138 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 139 |
+
version='livereasonbench-20250428',
|
| 140 |
+
n=1
|
| 141 |
+
)
|
| 142 |
+
]
|
build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_subsets = {
|
| 89 |
+
'biology': 'livestembench_bio',
|
| 90 |
+
'chemistry': 'livestembench_che',
|
| 91 |
+
'physics': 'livestembench_phy',
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
livestembench_datasets = []
|
| 95 |
+
|
| 96 |
+
for name, subset in livereasonbench_subsets.items():
|
| 97 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 98 |
+
|
| 99 |
+
livereasonbench_infer_cfg = dict(
|
| 100 |
+
prompt_template=dict(
|
| 101 |
+
type=PromptTemplate,
|
| 102 |
+
template=dict(
|
| 103 |
+
round=[
|
| 104 |
+
dict(role='HUMAN', prompt='问题: {question}\n 请回答这道问题'),
|
| 105 |
+
],
|
| 106 |
+
)),
|
| 107 |
+
retriever=dict(type=ZeroRetriever),
|
| 108 |
+
inferencer=dict(type=GenInferencer, max_out_len=8192))
|
| 109 |
+
|
| 110 |
+
livereasonbench_eval_cfg = dict(
|
| 111 |
+
evaluator=dict(
|
| 112 |
+
type=GenericLLMEvaluator,
|
| 113 |
+
prompt_template=dict(
|
| 114 |
+
type=PromptTemplate,
|
| 115 |
+
template=dict(
|
| 116 |
+
begin=[
|
| 117 |
+
dict(
|
| 118 |
+
role='SYSTEM',
|
| 119 |
+
fallback_role='HUMAN',
|
| 120 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 121 |
+
],
|
| 122 |
+
round=[
|
| 123 |
+
dict(
|
| 124 |
+
role='HUMAN',
|
| 125 |
+
prompt = GRADER_TEMPLATE
|
| 126 |
+
),
|
| 127 |
+
]),
|
| 128 |
+
),
|
| 129 |
+
dataset_cfg=dict(
|
| 130 |
+
type=LiveStemBenchDataset,
|
| 131 |
+
path='opencompass/livestembench',
|
| 132 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 133 |
+
version=subset,
|
| 134 |
+
),
|
| 135 |
+
judge_cfg=dict(),
|
| 136 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 137 |
+
),
|
| 138 |
+
pred_role='BOT',
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
livestembench_datasets.append(
|
| 142 |
+
dict(
|
| 143 |
+
abbr=f'LiveStemBench-{name}',
|
| 144 |
+
type=LiveStemBenchDataset,
|
| 145 |
+
path='opencompass/livestembench',
|
| 146 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 147 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 148 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 149 |
+
version=subset,
|
| 150 |
+
mode='singlescore',
|
| 151 |
+
)
|
| 152 |
+
)
|
build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_xml_gen_2e6d10.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess
|
| 7 |
+
from opencompass.utils import xml_tag_postprocessor
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
GRADER_TEMPLATE = """
|
| 11 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 12 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
The following are examples of CORRECT predicted answers.
|
| 16 |
+
```
|
| 17 |
+
Question: What are the names of Barack Obama's children?
|
| 18 |
+
Gold target: Malia Obama and Sasha Obama
|
| 19 |
+
Predicted answer 1: sasha and malia obama
|
| 20 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 21 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 22 |
+
```
|
| 23 |
+
These predicted answers are all CORRECT because:
|
| 24 |
+
- They fully contain the important information in the gold target.
|
| 25 |
+
- They do not contain any information that contradicts the gold target.
|
| 26 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 27 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
The following are examples of INCORRECT predicted answers.
|
| 31 |
+
```
|
| 32 |
+
Question: What are the names of Barack Obama's children?
|
| 33 |
+
Gold target: Malia and Sasha
|
| 34 |
+
Predicted answer 1: Malia.
|
| 35 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 36 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 37 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 38 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 39 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 40 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 41 |
+
```
|
| 42 |
+
These predicted answers are all INCORRECT because:
|
| 43 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 47 |
+
```
|
| 48 |
+
Question: What are the names of Barack Obama's children?
|
| 49 |
+
Gold target: Malia and Sasha
|
| 50 |
+
Predicted answer 1: I don't know.
|
| 51 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 52 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 53 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 54 |
+
```
|
| 55 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 56 |
+
- The important information in the gold target is not included in the answer.
|
| 57 |
+
- No statements in the answer contradict the gold target.
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
Also note the following things:
|
| 61 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 62 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 63 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 64 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 65 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 66 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 67 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 68 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 69 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 70 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 71 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 72 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 73 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 74 |
+
|
| 75 |
+
Grade the predicted answer of this new question as one of:
|
| 76 |
+
A: CORRECT
|
| 77 |
+
B: INCORRECT
|
| 78 |
+
C: NOT_ATTEMPTED
|
| 79 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 80 |
+
|
| 81 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 82 |
+
```
|
| 83 |
+
Question: {question}
|
| 84 |
+
Gold target: {answer}
|
| 85 |
+
Predicted answer: {prediction}
|
| 86 |
+
```
|
| 87 |
+
""".strip()
|
| 88 |
+
|
| 89 |
+
livereasonbench_subsets = {
|
| 90 |
+
'biology': 'livestembench_bio',
|
| 91 |
+
'chemistry': 'livestembench_che',
|
| 92 |
+
'physics': 'livestembench_phy',
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
livestembench_datasets = []
|
| 96 |
+
|
| 97 |
+
for name, subset in livereasonbench_subsets.items():
|
| 98 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 99 |
+
|
| 100 |
+
livereasonbench_infer_cfg = dict(
|
| 101 |
+
prompt_template=dict(
|
| 102 |
+
type=PromptTemplate,
|
| 103 |
+
template=dict(
|
| 104 |
+
round=[
|
| 105 |
+
dict(role='HUMAN', prompt='问题: {question}\n 请回答这道问题'),
|
| 106 |
+
],
|
| 107 |
+
)),
|
| 108 |
+
retriever=dict(type=ZeroRetriever),
|
| 109 |
+
inferencer=dict(type=GenInferencer, max_out_len=8192))
|
| 110 |
+
|
| 111 |
+
livereasonbench_eval_cfg = dict(
|
| 112 |
+
evaluator=dict(
|
| 113 |
+
type=GenericLLMEvaluator,
|
| 114 |
+
prompt_template=dict(
|
| 115 |
+
type=PromptTemplate,
|
| 116 |
+
template=dict(
|
| 117 |
+
begin=[
|
| 118 |
+
dict(
|
| 119 |
+
role='SYSTEM',
|
| 120 |
+
fallback_role='HUMAN',
|
| 121 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 122 |
+
],
|
| 123 |
+
round=[
|
| 124 |
+
dict(
|
| 125 |
+
role='HUMAN',
|
| 126 |
+
prompt = GRADER_TEMPLATE
|
| 127 |
+
),
|
| 128 |
+
]),
|
| 129 |
+
),
|
| 130 |
+
dataset_cfg=dict(
|
| 131 |
+
type=LiveStemBenchDataset,
|
| 132 |
+
path='opencompass/livestembench',
|
| 133 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 134 |
+
version=subset,
|
| 135 |
+
),
|
| 136 |
+
judge_cfg=dict(),
|
| 137 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 138 |
+
pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
|
| 139 |
+
|
| 140 |
+
),
|
| 141 |
+
pred_role='BOT',
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
livestembench_datasets.append(
|
| 145 |
+
dict(
|
| 146 |
+
abbr=f'LiveStemBench-{name}',
|
| 147 |
+
type=LiveStemBenchDataset,
|
| 148 |
+
path='opencompass/livestembench',
|
| 149 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 150 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 151 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 152 |
+
version=subset,
|
| 153 |
+
mode='singlescore',
|
| 154 |
+
)
|
| 155 |
+
)
|
build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .livestembench_gen_3e3c50 import livestembench_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_subsets = {
|
| 89 |
+
'biology': 'livestembench_bio',
|
| 90 |
+
'chemistry': 'livestembench_che',
|
| 91 |
+
'physics': 'livestembench_phy',
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
livestembench_datasets = []
|
| 95 |
+
|
| 96 |
+
for name, subset in livereasonbench_subsets.items():
|
| 97 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 98 |
+
|
| 99 |
+
livereasonbench_infer_cfg = dict(
|
| 100 |
+
prompt_template=dict(
|
| 101 |
+
type=PromptTemplate,
|
| 102 |
+
template=dict(
|
| 103 |
+
round=[
|
| 104 |
+
dict(role='HUMAN', prompt='问题: {question}\n请逐步思考,并给出最终答案,答案放在 \\boxed{{}} 中。'),
|
| 105 |
+
],
|
| 106 |
+
)),
|
| 107 |
+
retriever=dict(type=ZeroRetriever),
|
| 108 |
+
inferencer=dict(type=GenInferencer, max_out_len=8192))
|
| 109 |
+
|
| 110 |
+
livereasonbench_eval_cfg = dict(
|
| 111 |
+
evaluator=dict(
|
| 112 |
+
type=GenericLLMEvaluator,
|
| 113 |
+
prompt_template=dict(
|
| 114 |
+
type=PromptTemplate,
|
| 115 |
+
template=dict(
|
| 116 |
+
begin=[
|
| 117 |
+
dict(
|
| 118 |
+
role='SYSTEM',
|
| 119 |
+
fallback_role='HUMAN',
|
| 120 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 121 |
+
],
|
| 122 |
+
round=[
|
| 123 |
+
dict(
|
| 124 |
+
role='HUMAN',
|
| 125 |
+
prompt = GRADER_TEMPLATE
|
| 126 |
+
),
|
| 127 |
+
]),
|
| 128 |
+
),
|
| 129 |
+
dataset_cfg=dict(
|
| 130 |
+
type=LiveStemBenchDataset,
|
| 131 |
+
path='opencompass/livestembench',
|
| 132 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 133 |
+
version=subset,
|
| 134 |
+
),
|
| 135 |
+
judge_cfg=dict(),
|
| 136 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 137 |
+
),
|
| 138 |
+
pred_role='BOT',
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
livestembench_datasets.append(
|
| 142 |
+
dict(
|
| 143 |
+
abbr=f'LiveStemBench-{name}',
|
| 144 |
+
type=LiveStemBenchDataset,
|
| 145 |
+
path='opencompass/livestembench',
|
| 146 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 147 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 148 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 149 |
+
version=subset,
|
| 150 |
+
mode='singlescore',
|
| 151 |
+
)
|
| 152 |
+
)
|
build/lib/opencompass/configs/datasets/llm_compression/README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LLM Compression
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
The following introduction comes from the abstract of [Compression Represents Intelligence Linearly](https://arxiv.org/abs/2404.09937):
|
| 6 |
+
|
| 7 |
+
>There is a belief that learning to compress well will lead to intelligence. Recently, language modeling has been shown to be equivalent to compression, which offers a compelling rationale for the success of large language models (LLMs): the development of more advanced language models is essentially enhancing compression which facilitates intelligence. ...our findings suggest that compression efficiency, as an unsupervised metric derived from raw text corpora, serves as a reliable evaluation measure that is linearly associated with the model capabilities. We open-source our compression datasets as well as our data collection pipelines to facilitate future researchers to assess compression properly.
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
## Official Links
|
| 11 |
+
|
| 12 |
+
- Paper: [Compression Represents Intelligence Linearly](https://arxiv.org/abs/2404.09937)
|
| 13 |
+
- GitHub Repository: [llm-compression-intelligence](https://github.com/hkust-nlp/llm-compression-intelligence)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
## Overview and Usage
|
| 17 |
+
|
| 18 |
+
### Dataset
|
| 19 |
+
The dataset, which consists of three external corpora, can be downloaded using the following python script:
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
from os import os.path as osp
|
| 23 |
+
from datasets import load_dataset
|
| 24 |
+
|
| 25 |
+
data_path = "data/llm-compression"
|
| 26 |
+
|
| 27 |
+
subset_mapping = {
|
| 28 |
+
'arxiv_math': ['arxiv_math'],
|
| 29 |
+
'commoncraw': ['cc'],
|
| 30 |
+
'python': ['python'],
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
for key, value in subset_mapping.items():
|
| 34 |
+
llmc_dataset = load_dataset(r"hkust-nlp/llm-compression", name=value)
|
| 35 |
+
llmc_dataset["test"].to_json(osp.join(data_path, f"{key}.jsonl"))
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
Note: Refer to the original [repository](https://github.com/hkust-nlp/llm-compression-intelligence) for more details on data collection and design.
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
### Inference
|
| 42 |
+
|
| 43 |
+
The inference stage (`SWCELossInferencer`) consists of the following key steps:
|
| 44 |
+
|
| 45 |
+
1. For each candidate model, we obtain the encodings of each sample of the dataset using its tokenizer.
|
| 46 |
+
2. Concatenate the encodings of all samples into a single array and construct a PyTorch Dataset. Each item of `__getitem__` is a chunk of the array based on a sliding window. To reproduce results from the original paper, set `block_size=1900` and `stride=512`.
|
| 47 |
+
3. For each batch, calculate the cross entropy loss based on model logits and targets. The losses within each batch is reduced to a single loss by summation.
|
| 48 |
+
4. Output the losses and `total_chr_num` to `BPCEvaluator` for evaluation.
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
### Evaluation
|
| 52 |
+
|
| 53 |
+
`BPCEvaluator`: Using the total loss for each batch and the total number of characters in the original dataset from the inference stage, calculate the Bits per Character (BPC) metric for each model:
|
| 54 |
+
|
| 55 |
+
$$ BPC = \frac{TotalCrossEntropyLoss}{TotalCharacterNumber*log(2)} $$
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
### Summarization
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
### Config Files
|
| 63 |
+
|
| 64 |
+
1. Dataset config: `configs/datasets/llm-compression.py`
|
| 65 |
+
2. Evaluation config: `examples/eval_llm_compression.py`
|
| 66 |
+
|
| 67 |
+
## Evaluation Results
|
| 68 |
+
```
|
| 69 |
+
metric version model commoncraw python arxiv_math average
|
| 70 |
+
0 bpc af04af qwen1.5-32b-hf 0.5910 0.2584 0.4080 0.4191
|
| 71 |
+
1 bpc af04af qwen1.5-14b-hf 0.6459 0.2766 0.4310 0.4512
|
| 72 |
+
2 bpc af04af qwen-14b-hf 0.6197 0.2849 0.4498 0.4515
|
| 73 |
+
3 bpc af04af llama-30b-hf 0.5773 0.3212 0.4562 0.4516
|
| 74 |
+
4 bpc af04af llama-2-13b-hf 0.5807 0.3336 0.4752 0.4632
|
| 75 |
+
5 bpc af04af qwen1.5-7b-hf 0.6658 0.2935 0.4500 0.4698
|
| 76 |
+
6 bpc af04af qwen-7b-hf 0.6453 0.3088 0.4830 0.4790
|
| 77 |
+
7 bpc af04af llama-13b-hf 0.6083 0.3555 0.4865 0.4834
|
| 78 |
+
8 bpc af04af llama-2-7b-hf 0.6117 0.3536 0.4995 0.4883
|
| 79 |
+
9 bpc af04af llama-7b-hf 0.6285 0.3794 0.5096 0.5058
|
| 80 |
+
10 bpc af04af qwen1.5-1.8b-hf 0.7448 0.4029 0.5625 0.5701
|
| 81 |
+
11 bpc af04af qwen-1.8b-hf 0.7542 0.4175 0.5842 0.5853
|
| 82 |
+
12 bpc af04af qwen1.5-0.5b-hf 0.8102 0.4520 0.6181 0.6268
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
## FAQ
|
| 87 |
+
|
| 88 |
+
### I am getting this warning during inference, should I truncate long samples to `max_seq_len` to avoid further errors?
|
| 89 |
+
```
|
| 90 |
+
Token indices sequence length is longer than the specified maximum sequence length for this model. Running this sequence through the model will result in indexing errors
|
| 91 |
+
```
|
| 92 |
+
>A: This warning comes from the tokenizer indicating that the input sequence length exceeds the model's input length, but it does not affect the operation of the tokenizer. For loss calculation, as long as we set a `block_size` of the sliding window less than `max_seq_len`, we can safely ignore this warning.
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
## Reference
|
| 96 |
+
```
|
| 97 |
+
@misc{huang2024compression,
|
| 98 |
+
title={Compression Represents Intelligence Linearly},
|
| 99 |
+
author={Yuzhen Huang and Jinghan Zhang and Zifei Shan and Junxian He},
|
| 100 |
+
year={2024},
|
| 101 |
+
eprint={2404.09937},
|
| 102 |
+
archivePrefix={arXiv},
|
| 103 |
+
primaryClass={cs.CL}
|
| 104 |
+
}
|
| 105 |
+
```
|
build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import SWCELossInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import BPCEvaluator
|
| 5 |
+
from opencompass.datasets import LLMCompressionDataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# The three corpora for llm_compression used in the original paper
|
| 9 |
+
# See configs/datasets/llm_compression/README.md for more details
|
| 10 |
+
subset_mapping = {
|
| 11 |
+
'arxiv_math': ['arxiv_math'],
|
| 12 |
+
'commoncraw': ['cc'],
|
| 13 |
+
'python': ['python'],
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Build LLM Compression datasets
|
| 18 |
+
llm_compression_datasets = []
|
| 19 |
+
for _name in subset_mapping.keys():
|
| 20 |
+
llm_cmp_infer_cfg = dict(
|
| 21 |
+
prompt_template=dict(
|
| 22 |
+
type=PromptTemplate,
|
| 23 |
+
template='{content}',
|
| 24 |
+
),
|
| 25 |
+
# No in-context example, using ZeroRetriever
|
| 26 |
+
retriever=dict(type=ZeroRetriever),
|
| 27 |
+
# Calculates cross entropy loss for each batch based on a sliding context window
|
| 28 |
+
# Setting block_size=1900 and stride=512 according to the original paper
|
| 29 |
+
inferencer=dict(type=SWCELossInferencer, block_size=1900, stride=512),
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
# Calculates Bits per Character (BPC) based on the CE loss from the inference stage
|
| 33 |
+
llm_cmp_eval_cfg = dict(evaluator=dict(type=BPCEvaluator))
|
| 34 |
+
|
| 35 |
+
llm_compression_datasets.append(
|
| 36 |
+
dict(
|
| 37 |
+
abbr=f'llm_compression-{_name}',
|
| 38 |
+
type=LLMCompressionDataset,
|
| 39 |
+
path='./data/llm-compression',
|
| 40 |
+
name=_name,
|
| 41 |
+
samples=None, # Set small samples for testing
|
| 42 |
+
reader_cfg=dict(
|
| 43 |
+
input_columns=['content'],
|
| 44 |
+
output_column=None,
|
| 45 |
+
),
|
| 46 |
+
infer_cfg=llm_cmp_infer_cfg,
|
| 47 |
+
eval_cfg=llm_cmp_eval_cfg,
|
| 48 |
+
))
|
| 49 |
+
|
| 50 |
+
del _name
|
build/lib/opencompass/configs/datasets/longbench/longbench.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench2wikimqa.longbench_2wikimqa_gen import LongBench_2wikimqa_datasets
|
| 5 |
+
from .longbenchhotpotqa.longbench_hotpotqa_gen import LongBench_hotpotqa_datasets
|
| 6 |
+
from .longbenchmusique.longbench_musique_gen import LongBench_musique_datasets
|
| 7 |
+
from .longbenchmultifieldqa_en.longbench_multifieldqa_en_gen import LongBench_multifieldqa_en_datasets
|
| 8 |
+
from .longbenchmultifieldqa_zh.longbench_multifieldqa_zh_gen import LongBench_multifieldqa_zh_datasets
|
| 9 |
+
from .longbenchnarrativeqa.longbench_narrativeqa_gen import LongBench_narrativeqa_datasets
|
| 10 |
+
from .longbenchqasper.longbench_qasper_gen import LongBench_qasper_datasets
|
| 11 |
+
from .longbenchtriviaqa.longbench_triviaqa_gen import LongBench_triviaqa_datasets
|
| 12 |
+
from .longbenchgov_report.longbench_gov_report_gen import LongBench_gov_report_datasets
|
| 13 |
+
from .longbenchqmsum.longbench_qmsum_gen import LongBench_qmsum_datasets
|
| 14 |
+
from .longbenchvcsum.longbench_vcsum_gen import LongBench_vcsum_datasets
|
| 15 |
+
from .longbenchdureader.longbench_dureader_gen import LongBench_dureader_datasets
|
| 16 |
+
from .longbenchlcc.longbench_lcc_gen import LongBench_lcc_datasets
|
| 17 |
+
from .longbenchrepobench.longbench_repobench_gen import LongBench_repobench_datasets
|
| 18 |
+
from .longbenchpassage_retrieval_en.longbench_passage_retrieval_en_gen import LongBench_passage_retrieval_en_datasets
|
| 19 |
+
from .longbenchpassage_retrieval_zh.longbench_passage_retrieval_zh_gen import LongBench_passage_retrieval_zh_datasets
|
| 20 |
+
from .longbenchpassage_count.longbench_passage_count_gen import LongBench_passage_count_datasets
|
| 21 |
+
from .longbenchtrec.longbench_trec_gen import LongBench_trec_datasets
|
| 22 |
+
from .longbenchlsht.longbench_lsht_gen import LongBench_lsht_datasets
|
| 23 |
+
from .longbenchmulti_news.longbench_multi_news_gen import LongBench_multi_news_datasets
|
| 24 |
+
from .longbenchsamsum.longbench_samsum_gen import LongBench_samsum_datasets
|
| 25 |
+
|
| 26 |
+
longbench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbenchv2_gen_75fbba import LongBenchv2_datasets
|
build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import LongBenchv2Dataset, LongBenchv2Evaluator
|
| 5 |
+
from opencompass.utils.text_postprocessors import first_option_postprocess
|
| 6 |
+
|
| 7 |
+
LongBenchv2_reader_cfg = dict(
|
| 8 |
+
input_columns=['context', 'question', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'difficulty', 'length'],
|
| 9 |
+
output_column='answer',
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
LongBenchv2_infer_cfg = dict(
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
type=PromptTemplate,
|
| 15 |
+
template=dict(
|
| 16 |
+
round=[
|
| 17 |
+
dict(
|
| 18 |
+
role='HUMAN',
|
| 19 |
+
prompt='Please read the following text and answer the questions below.\n <text> \n {context} \n </text> \n \n What is the correct answer to this question: {question} \n \n Choices: \n (A) {choice_A} \n (B) {choice_B} \n (C) {choice_C} \n (D) {choice_D} \n Let’s think step by step. Based on the above, what is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)',
|
| 20 |
+
),
|
| 21 |
+
],
|
| 22 |
+
),
|
| 23 |
+
),
|
| 24 |
+
retriever=dict(type=ZeroRetriever),
|
| 25 |
+
inferencer=dict(type=GenInferencer),
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
LongBenchv2_eval_cfg = dict(
|
| 29 |
+
evaluator=dict(type=LongBenchv2Evaluator),
|
| 30 |
+
pred_role='BOT',
|
| 31 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
LongBenchv2_datasets = [
|
| 35 |
+
dict(
|
| 36 |
+
type=LongBenchv2Dataset,
|
| 37 |
+
abbr='LongBenchv2',
|
| 38 |
+
path='opencompass/longbenchv2',
|
| 39 |
+
reader_cfg=LongBenchv2_reader_cfg,
|
| 40 |
+
infer_cfg=LongBenchv2_infer_cfg,
|
| 41 |
+
eval_cfg=LongBenchv2_eval_cfg,
|
| 42 |
+
)
|
| 43 |
+
]
|
build/lib/opencompass/configs/datasets/lveval/lveval.md
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LVEval
|
| 2 |
+
## Introduction
|
| 3 |
+
The following introduction comes from the introduction in [LVEval](https://github.com/infinigence/LVEval)
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
LV-Eval是一个具备5个长度等级(16k、32k、64k、128k和256k)、最大文本测试长度达到256k的长文本评测基准。LV-Eval的平均文本长度达到102,380字,最小/最大文本长度为11,896/387,406字。LV-Eval主要有两类评测任务——单跳QA和多跳QA,共包含11个涵盖中英文的评测数据子集。LV-Eval设计时引入3个关键技术:干扰事实插入(Confusiong Facts Insertion,CFI)提高挑战性,关键词和短语替换(Keyword and Phrase Replacement,KPR)减少信息泄漏,以及基于关键词召回的评测指标(Answer Keywords,AK,指代结合答案关键词和字词黑名单的评价指标)提高评测数值客观性。我们希望LV-Eval为未来长文本大语言模型的研究发展提供有价值的性能参考。
|
| 7 |
+
LV-Eval is a challenging long-context benchmark with five length levels (16k, 32k, 64k, 128k, and 256k) reaching up to 256k words. The average number of words is 102,380, and the Min/Max number of words is 11,896/387,406. LV-Eval features two main tasks, single-hop QA and multi-hop QA, comprising 11 bilingual datasets. The design of LV-Eval has incorporated three key techniques, namely confusing facts insertion (CFI), keyword and phrase replacement (KPR), and keyword-recall-based metrics (AK, short for metics with Answer Keywords and word blacklist) design, which jointly provide a challenging, mitigated-knowledge-leakege, and more accurate evaluation of the long-context capability of LLMs. We anticipate that LV-Eval will serve as a valuable resource for supporting future research on long-context LLMs.
|
| 8 |
+
```
|
| 9 |
+
|
| 10 |
+
## Official link
|
| 11 |
+
|
| 12 |
+
### Paper
|
| 13 |
+
|
| 14 |
+
[_LV_-Eval: A Balanced Long-Context Benchmark with 5 Length Levels Up to 256K](https://arxiv.org/abs/2402.05136)
|
| 15 |
+
|
| 16 |
+
### Repository
|
| 17 |
+
|
| 18 |
+
[LVEval](https://github.com/infinigence/LVEval)
|
| 19 |
+
|
| 20 |
+
## Use cases
|
| 21 |
+
|
| 22 |
+
In evaluation scripts, add LVEval dataset as other datasets by using
|
| 23 |
+
```
|
| 24 |
+
from .datasets.lveval.lveval import LVEval_datasets as datasets
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
## Examples
|
| 28 |
+
Input example I (from lic_mixup datasets):
|
| 29 |
+
```
|
| 30 |
+
请根据下面给定的文章回答问题,问题和答案只与其中一篇文章有关。
|
| 31 |
+
|
| 32 |
+
文章:......文章 9\n\n标题:腐质酸\n内容:腐植酸是自然界中广泛存在的大分子有机物质,广泛应用于农林牧、石油、化工、建材、医药卫生、环保等各个领域。横跨几十个行业。特别是眼下提倡生态农业建设、无公害农业生产、绿色食品、无污染环保产品等,更使\"腐植酸\"备受推崇,事实证明,人类的生活和生存离不开腐植酸,它的确是一个发展中的有希望的朝阳产业,属于一个新型的特殊行业......
|
| 33 |
+
|
| 34 |
+
请现在基于上述文章回答下面的问题,问题和答案只与其中一篇文章有关。
|
| 35 |
+
|
| 36 |
+
问题:中国的文学受到印度哪些方面的影响?
|
| 37 |
+
回答:
|
| 38 |
+
```
|
| 39 |
+
Output example I (from chatglm3-6b-32k):
|
| 40 |
+
```
|
| 41 |
+
中国文学自印度文学大量吸收营养,在佛教东流之后,从语汇到修辞,从题材到体裁,即便审美取向也深受佛教与印度文学的感染。
|
| 42 |
+
```
|
| 43 |
+
Input example II (from factrecall_zh datasets):
|
| 44 |
+
```
|
| 45 |
+
请基于给定的文章回答下述问题。
|
| 46 |
+
|
| 47 |
+
文章:......庚子年间,贝多芬,乃一德裔美籍学士,研究于物理理学。彼其良图,探求相对论、量子力学,尤有大进。质能等价公式 E=mc²,千古独步,声名于当世。诺贝尔物理学奖、以资尊荣,兹矣荣耀之大典。论其学术,涉时空能量,影响深远,以其义非常人,广为当世所知,声名播于天下,实乃现代物理学之奠基者......
|
| 48 |
+
|
| 49 |
+
现在请基于上述文章回答下面的问题。
|
| 50 |
+
|
| 51 |
+
问题:被世人广泛推崇为现代物理学奠基人的科学家叫什么名字?
|
| 52 |
+
回答:
|
| 53 |
+
```
|
| 54 |
+
Output example II (from chatglm3-6b-32k):
|
| 55 |
+
```
|
| 56 |
+
贝多芬
|
| 57 |
+
```
|
| 58 |
+
## Evaluation results
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
dataset version metric mode bluelm-7b-chat-32k-hf
|
| 62 |
+
----------------------------------------- --------- ------------- ------ -----------------------
|
| 63 |
+
---------------------------------------- - - - -
|
| 64 |
+
--------- LVEval All --------- - - - -
|
| 65 |
+
---------------------------------------- - - - -
|
| 66 |
+
LVEval_qa - naive_average gen 12.00
|
| 67 |
+
---------------------------------------- - - - -
|
| 68 |
+
--------- LVEval Tasks All --------- - - - -
|
| 69 |
+
---------------------------------------- - - - -
|
| 70 |
+
LVEval_single_hop_qa - naive_average gen 15.11
|
| 71 |
+
LVEval_single_hop_cqa - naive_average gen 9.21
|
| 72 |
+
LVEval_multi_hop_qa - naive_average gen 6.99
|
| 73 |
+
LVEval_multi_hop_cqa - naive_average gen 9.90
|
| 74 |
+
LVEval_factrecall_cqa - naive_average gen 21.28
|
| 75 |
+
---------------------------------------- - - - -
|
| 76 |
+
--------- LVEval Datasets All --------- - - - -
|
| 77 |
+
---------------------------------------- - - - -
|
| 78 |
+
LVEval_loogle_SD_mixup - naive_average gen 12.81
|
| 79 |
+
LVEval_cmrc_mixup - naive_average gen 17.41
|
| 80 |
+
LVEval_multifieldqa_en_mixup - naive_average gen 7.10
|
| 81 |
+
LVEval_multifieldqa_zh_mixup - naive_average gen 11.31
|
| 82 |
+
LVEval_dureader_mixup - naive_average gen 13.19
|
| 83 |
+
LVEval_loogle_CR_mixup - naive_average gen 5.17
|
| 84 |
+
LVEval_loogle_MIR_mixup - naive_average gen 2.60
|
| 85 |
+
LVEval_hotpotwikiqa_mixup - naive_average gen 10.20
|
| 86 |
+
LVEval_lic_mixup - naive_average gen 9.60
|
| 87 |
+
LVEval_factrecall_en - naive_average gen 23.67
|
| 88 |
+
LVEval_factrecall_zh - naive_average gen 18.90
|
| 89 |
+
---------------------------------------- - - - -
|
| 90 |
+
--------- LVEval Single_Hop QA --------- - - - -
|
| 91 |
+
---------------------------------------- - - - -
|
| 92 |
+
LVEval_loogle_SD_mixup_16k 83bc25 LVEval_f1 gen 35.05
|
| 93 |
+
LVEval_loogle_SD_mixup_32k 83bc25 LVEval_f1 gen 13.37
|
| 94 |
+
LVEval_loogle_SD_mixup_64k 83bc25 LVEval_f1 gen 6.32
|
| 95 |
+
LVEval_loogle_SD_mixup_128k 83bc25 LVEval_f1 gen 5.28
|
| 96 |
+
LVEval_loogle_SD_mixup_256k 83bc25 LVEval_f1 gen 4.00
|
| 97 |
+
---------------------------------------- - - - -
|
| 98 |
+
LVEval_cmrc_mixup_16k 8bac4e LVEval_f1 gen 46.45
|
| 99 |
+
LVEval_cmrc_mixup_32k 8bac4e LVEval_f1 gen 19.41
|
| 100 |
+
LVEval_cmrc_mixup_64k 8bac4e LVEval_f1 gen 11.10
|
| 101 |
+
LVEval_cmrc_mixup_128k 8bac4e LVEval_f1 gen 5.89
|
| 102 |
+
LVEval_cmrc_mixup_256k 8bac4e LVEval_f1 gen 4.22
|
| 103 |
+
---------------------------------------- - - - -
|
| 104 |
+
--------- LVEval Single_Hop CQA --------- - - - -
|
| 105 |
+
---------------------------------------- - - - -
|
| 106 |
+
LVEval_multifieldqa_en_mixup_16k 83bc25 LVEval_f1 gen 12.28
|
| 107 |
+
LVEval_multifieldqa_en_mixup_32k 83bc25 LVEval_f1 gen 4.64
|
| 108 |
+
LVEval_multifieldqa_en_mixup_64k 83bc25 LVEval_f1 gen 8.30
|
| 109 |
+
LVEval_multifieldqa_en_mixup_128k 83bc25 LVEval_f1 gen 5.63
|
| 110 |
+
LVEval_multifieldqa_en_mixup_256k 83bc25 LVEval_f1 gen 4.64
|
| 111 |
+
---------------------------------------- - - - -
|
| 112 |
+
LVEval_multifieldqa_zh_mixup_16k ac4a0d LVEval_f1 gen 22.30
|
| 113 |
+
LVEval_multifieldqa_zh_mixup_32k ac4a0d LVEval_f1 gen 17.46
|
| 114 |
+
LVEval_multifieldqa_zh_mixup_64k ac4a0d LVEval_f1 gen 6.27
|
| 115 |
+
LVEval_multifieldqa_zh_mixup_128k ac4a0d LVEval_f1 gen 5.84
|
| 116 |
+
LVEval_multifieldqa_zh_mixup_256k ac4a0d LVEval_f1 gen 4.71
|
| 117 |
+
---------------------------------------- - - - -
|
| 118 |
+
--------- LVEval Multi_Hop QA --------- - - - -
|
| 119 |
+
---------------------------------------- - - - -
|
| 120 |
+
LVEval_dureader_mixup_16k 8bac4e LVEval_rouge gen 18.04
|
| 121 |
+
LVEval_dureader_mixup_32k 8bac4e LVEval_rouge gen 18.33
|
| 122 |
+
LVEval_dureader_mixup_64k 8bac4e LVEval_rouge gen 12.56
|
| 123 |
+
LVEval_dureader_mixup_128k 8bac4e LVEval_rouge gen 10.33
|
| 124 |
+
LVEval_dureader_mixup_256k 8bac4e LVEval_rouge gen 6.69
|
| 125 |
+
---------------------------------------- - - - -
|
| 126 |
+
LVEval_loogle_CR_mixup_16k 83bc25 LVEval_f1 gen 9.35
|
| 127 |
+
LVEval_loogle_CR_mixup_32k 83bc25 LVEval_f1 gen 7.42
|
| 128 |
+
LVEval_loogle_CR_mixup_64k 83bc25 LVEval_f1 gen 3.18
|
| 129 |
+
LVEval_loogle_CR_mixup_128k 83bc25 LVEval_f1 gen 2.65
|
| 130 |
+
LVEval_loogle_CR_mixup_256k 83bc25 LVEval_f1 gen 3.27
|
| 131 |
+
---------------------------------------- - - - -
|
| 132 |
+
LVEval_loogle_MIR_mixup_16k 83bc25 LVEval_f1 gen 4.50
|
| 133 |
+
LVEval_loogle_MIR_mixup_32k 83bc25 LVEval_f1 gen 3.19
|
| 134 |
+
LVEval_loogle_MIR_mixup_64k 83bc25 LVEval_f1 gen 2.34
|
| 135 |
+
LVEval_loogle_MIR_mixup_128k 83bc25 LVEval_f1 gen 1.76
|
| 136 |
+
LVEval_loogle_MIR_mixup_256k 83bc25 LVEval_f1 gen 1.20
|
| 137 |
+
---------------------------------------- - - - -
|
| 138 |
+
--------- LVEval Multi_Hop CQA --------- - - - -
|
| 139 |
+
---------------------------------------- - - - -
|
| 140 |
+
LVEval_hotpotwikiqa_mixup_16k e3c368 LVEval_f1 gen 19.80
|
| 141 |
+
LVEval_hotpotwikiqa_mixup_32k e3c368 LVEval_f1 gen 12.59
|
| 142 |
+
LVEval_hotpotwikiqa_mixup_64k e3c368 LVEval_f1 gen 7.33
|
| 143 |
+
LVEval_hotpotwikiqa_mixup_128k e3c368 LVEval_f1 gen 7.85
|
| 144 |
+
LVEval_hotpotwikiqa_mixup_256k e3c368 LVEval_f1 gen 3.42
|
| 145 |
+
---------------------------------------- - - - -
|
| 146 |
+
LVEval_lic_mixup_16k fdd540 LVEval_f1 gen 21.36
|
| 147 |
+
LVEval_lic_mixup_32k fdd540 LVEval_f1 gen 12.92
|
| 148 |
+
LVEval_lic_mixup_64k fdd540 LVEval_f1 gen 4.62
|
| 149 |
+
LVEval_lic_mixup_128k fdd540 LVEval_f1 gen 4.25
|
| 150 |
+
LVEval_lic_mixup_256k fdd540 LVEval_f1 gen 4.85
|
| 151 |
+
---------------------------------------- - - - -
|
| 152 |
+
--------- LVEval Factrecall CQA --------- - - - -
|
| 153 |
+
---------------------------------------- - - - -
|
| 154 |
+
LVEval_factrecall_en_16k fba966 f1 gen 58.33
|
| 155 |
+
LVEval_factrecall_en_32k fba966 f1 gen 32.17
|
| 156 |
+
LVEval_factrecall_en_64k fba966 f1 gen 15.33
|
| 157 |
+
LVEval_factrecall_en_128k fba966 f1 gen 8.50
|
| 158 |
+
LVEval_factrecall_en_256k fba966 f1 gen 4.00
|
| 159 |
+
---------------------------------------- - - - -
|
| 160 |
+
LVEval_factrecall_zh_16k ef3320 f1 gen 20.00
|
| 161 |
+
LVEval_factrecall_zh_32k ef3320 f1 gen 38.00
|
| 162 |
+
LVEval_factrecall_zh_64k ef3320 f1 gen 20.50
|
| 163 |
+
LVEval_factrecall_zh_128k ef3320 f1 gen 11.00
|
| 164 |
+
LVEval_factrecall_zh_256k ef3320 f1 gen 5.00
|
| 165 |
+
```
|
build/lib/opencompass/configs/datasets/lveval/lveval.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .lvevalcmrc_mixup.lveval_cmrc_mixup_gen import (
|
| 5 |
+
LVEval_cmrc_mixup_datasets,
|
| 6 |
+
)
|
| 7 |
+
from .lvevaldureader_mixup.lveval_dureader_mixup_gen import (
|
| 8 |
+
LVEval_dureader_mixup_datasets,
|
| 9 |
+
)
|
| 10 |
+
from .lvevalfactrecall_en.lveval_factrecall_en_gen import (
|
| 11 |
+
LVEval_factrecall_en_datasets,
|
| 12 |
+
)
|
| 13 |
+
from .lvevalfactrecall_zh.lveval_factrecall_zh_gen import (
|
| 14 |
+
LVEval_factrecall_zh_datasets,
|
| 15 |
+
)
|
| 16 |
+
from .lvevalhotpotwikiqa_mixup.lveval_hotpotwikiqa_mixup_gen import (
|
| 17 |
+
LVEval_hotpotwikiqa_mixup_datasets,
|
| 18 |
+
)
|
| 19 |
+
from .lvevallic_mixup.lveval_lic_mixup_gen import LVEval_lic_mixup_datasets
|
| 20 |
+
from .lvevalloogle_CR_mixup.lveval_loogle_CR_mixup_gen import (
|
| 21 |
+
LVEval_loogle_CR_mixup_datasets,
|
| 22 |
+
)
|
| 23 |
+
from .lvevalloogle_MIR_mixup.lveval_loogle_MIR_mixup_gen import (
|
| 24 |
+
LVEval_loogle_MIR_mixup_datasets,
|
| 25 |
+
)
|
| 26 |
+
from .lvevalloogle_SD_mixup.lveval_loogle_SD_mixup_gen import (
|
| 27 |
+
LVEval_loogle_SD_mixup_datasets,
|
| 28 |
+
)
|
| 29 |
+
from .lvevalmultifieldqa_en_mixup.lveval_multifieldqa_en_mixup_gen import (
|
| 30 |
+
LVEval_multifieldqa_en_mixup_datasets,
|
| 31 |
+
)
|
| 32 |
+
from .lvevalmultifieldqa_zh_mixup.lveval_multifieldqa_zh_mixup_gen import (
|
| 33 |
+
LVEval_multifieldqa_zh_mixup_datasets,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
LVEval_datasets = sum(
|
| 37 |
+
(v for k, v in locals().items() if k.endswith('_datasets')), []
|
| 38 |
+
)
|
build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .mastermath2024v1_gen_be6318 import mastermath2024v1_datasets
|
build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import MastermathDatasetv1, MastermathDatasetv1Evaluator
|
| 5 |
+
from opencompass.utils import first_option_postprocess
|
| 6 |
+
|
| 7 |
+
mastermath2024v1_reader_cfg = dict(
|
| 8 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
| 9 |
+
output_column='answer')
|
| 10 |
+
|
| 11 |
+
mastermath2024v1_infer_cfg = dict(
|
| 12 |
+
prompt_template=dict(
|
| 13 |
+
type=PromptTemplate,
|
| 14 |
+
template=dict(
|
| 15 |
+
round=[
|
| 16 |
+
dict(role='HUMAN', prompt='{question}\n选项:\n'
|
| 17 |
+
'(A){A}\n'
|
| 18 |
+
'(B){B}\n'
|
| 19 |
+
'(C){C}\n'
|
| 20 |
+
'(D){D}\n'
|
| 21 |
+
'你的回答格式如下: "正确答案是 (在这里插入你的答案)"'),
|
| 22 |
+
], )),
|
| 23 |
+
retriever=dict(type=ZeroRetriever),
|
| 24 |
+
inferencer=dict(type=GenInferencer))
|
| 25 |
+
|
| 26 |
+
mastermath2024v1_eval_cfg = dict(evaluator=dict(type=MastermathDatasetv1Evaluator),
|
| 27 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
| 28 |
+
|
| 29 |
+
mastermath2024v1_datasets = [dict(
|
| 30 |
+
abbr='Mastermath2024v1',
|
| 31 |
+
type=MastermathDatasetv1,
|
| 32 |
+
path='./data/mastermath2024v1/',
|
| 33 |
+
name='kaoyan_math_1_mcq_Sheet1.csv',
|
| 34 |
+
reader_cfg=mastermath2024v1_reader_cfg,
|
| 35 |
+
infer_cfg=mastermath2024v1_infer_cfg,
|
| 36 |
+
eval_cfg=mastermath2024v1_eval_cfg)]
|
build/lib/opencompass/configs/datasets/matbench/matbench_gen.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
# from .matbench_gen_regex_judge import matbench_datasets # noqa: F401, F403
|
| 5 |
+
from .matbench_llm_judge_gen_0e9276 import matbench_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets.matbench.matbench import MatbenchDataset, MatbenchEvaluator_regression, MatbenchEvaluator_classification
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
matbench_reader_cfg = dict(
|
| 10 |
+
input_columns=['problem'], output_column='answer')
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
matbench_tasks = ['matbench_steels','matbench_expt_gap', 'matbench_expt_is_metal','matbench_glass']
|
| 14 |
+
|
| 15 |
+
matbench_datasets = []
|
| 16 |
+
|
| 17 |
+
for task in matbench_tasks:
|
| 18 |
+
if task in ['matbench_expt_is_metal','matbench_glass']:
|
| 19 |
+
matbench_infer_cfg = dict(
|
| 20 |
+
prompt_template=dict(
|
| 21 |
+
type=PromptTemplate,
|
| 22 |
+
template=dict(
|
| 23 |
+
round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by yes or no, do not output anything else.')])),
|
| 24 |
+
retriever=dict(type=ZeroRetriever),
|
| 25 |
+
inferencer=dict(type=GenInferencer))
|
| 26 |
+
|
| 27 |
+
matbench_eval_cfg = dict(
|
| 28 |
+
evaluator=dict(type=MatbenchEvaluator_classification),
|
| 29 |
+
pred_role='BOT')
|
| 30 |
+
|
| 31 |
+
elif task in ['matbench_steels','matbench_expt_gap']:
|
| 32 |
+
matbench_infer_cfg = dict(
|
| 33 |
+
prompt_template=dict(
|
| 34 |
+
type=PromptTemplate,
|
| 35 |
+
template=dict(
|
| 36 |
+
round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by one float number, do not output anything else.')])),
|
| 37 |
+
retriever=dict(type=ZeroRetriever),
|
| 38 |
+
inferencer=dict(type=GenInferencer))
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
matbench_eval_cfg = dict(
|
| 42 |
+
evaluator=dict(type=MatbenchEvaluator_regression),
|
| 43 |
+
pred_role='BOT')
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
matbench_datasets.append(
|
| 47 |
+
dict(
|
| 48 |
+
type=MatbenchDataset,
|
| 49 |
+
path=f'opencompass/Matbench',
|
| 50 |
+
task=task,
|
| 51 |
+
abbr=task,
|
| 52 |
+
reader_cfg=matbench_reader_cfg,
|
| 53 |
+
infer_cfg=matbench_infer_cfg,
|
| 54 |
+
eval_cfg=matbench_eval_cfg))
|
| 55 |
+
|