Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- build/lib/opencompass/configs/datasets/korbench/korbench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py +60 -0
- build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py +116 -0
- build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py +54 -0
- build/lib/opencompass/configs/datasets/korbench/readme.md +71 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py +166 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py +164 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py +164 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py +163 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py +165 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py +165 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py +132 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py +164 -0
- build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py +168 -0
- build/lib/opencompass/configs/datasets/livemathbench/README.md +74 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py +49 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py +45 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py +49 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py +45 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py +120 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py +96 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py +44 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py +44 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py +97 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py +45 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py +45 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py +44 -0
- build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py +43 -0
- build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py +136 -0
- build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py +142 -0
- build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py +142 -0
- build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py +152 -0
- build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_xml_gen_2e6d10.py +155 -0
- build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py +152 -0
- build/lib/opencompass/configs/datasets/llm_compression/README.md +105 -0
- build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py +50 -0
- build/lib/opencompass/configs/datasets/longbench/longbench.py +26 -0
- build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py +43 -0
- build/lib/opencompass/configs/datasets/lveval/lveval.md +165 -0
- build/lib/opencompass/configs/datasets/lveval/lveval.py +38 -0
- build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py +4 -0
- build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py +36 -0
- build/lib/opencompass/configs/datasets/matbench/matbench_gen.py +5 -0
- build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py +55 -0
build/lib/opencompass/configs/datasets/korbench/korbench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .korbench_single_0_shot_gen import korbench_0shot_single_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
|
| 2 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 3 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 4 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 5 |
+
|
| 6 |
+
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
|
| 7 |
+
|
| 8 |
+
korbench_0shot_single_datasets = []
|
| 9 |
+
|
| 10 |
+
for category in categories:
|
| 11 |
+
# Prompt template
|
| 12 |
+
prompt_template = dict(
|
| 13 |
+
type=PromptTemplate,
|
| 14 |
+
template=dict(
|
| 15 |
+
begin=[
|
| 16 |
+
dict(
|
| 17 |
+
role='HUMAN',
|
| 18 |
+
prompt=''
|
| 19 |
+
)
|
| 20 |
+
],
|
| 21 |
+
round=[
|
| 22 |
+
dict(
|
| 23 |
+
role='HUMAN',
|
| 24 |
+
prompt='{prompt}' # f-string
|
| 25 |
+
)
|
| 26 |
+
]
|
| 27 |
+
)
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# Reader configuration
|
| 31 |
+
reader_cfg = dict(
|
| 32 |
+
input_columns=['prompt'],
|
| 33 |
+
output_column='answer',
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# Inference configuration
|
| 37 |
+
infer_cfg = dict(
|
| 38 |
+
prompt_template=prompt_template,
|
| 39 |
+
retriever=dict(type=ZeroRetriever),
|
| 40 |
+
inferencer=dict(type=GenInferencer),
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
# Evaluation configuration
|
| 44 |
+
eval_cfg = dict(
|
| 45 |
+
evaluator=dict(type=korbenchEvaluator),
|
| 46 |
+
pred_role='BOT',
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
korbench_dataset = dict(
|
| 50 |
+
type=korbenchDataset,
|
| 51 |
+
abbr=f'korbench_{category}',
|
| 52 |
+
path='opencompass/korbench',
|
| 53 |
+
prompt_mode='0_shot',
|
| 54 |
+
category=category,
|
| 55 |
+
reader_cfg=reader_cfg,
|
| 56 |
+
infer_cfg=infer_cfg,
|
| 57 |
+
eval_cfg=eval_cfg,
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
korbench_0shot_single_datasets.append(korbench_dataset)
|
build/lib/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
|
| 2 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 3 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 4 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import generic_llmjudge_postprocess
|
| 7 |
+
|
| 8 |
+
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
|
| 9 |
+
|
| 10 |
+
GRADER_TEMPLATE = """
|
| 11 |
+
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
| 12 |
+
|
| 13 |
+
Here are some evaluation criteria:
|
| 14 |
+
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
| 15 |
+
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
| 16 |
+
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
| 17 |
+
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
| 18 |
+
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
| 19 |
+
|
| 20 |
+
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
| 21 |
+
A: CORRECT
|
| 22 |
+
B: INCORRECT
|
| 23 |
+
Just return the letters "A" or "B", with no text around it.
|
| 24 |
+
|
| 25 |
+
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
<Original Question Begin>: \n{prompt}\n<Original Question End>\n\n
|
| 29 |
+
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
|
| 30 |
+
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
| 31 |
+
|
| 32 |
+
Judging the correctness of candidates' answers:
|
| 33 |
+
""".strip()
|
| 34 |
+
|
| 35 |
+
korbench_0shot_single_datasets = []
|
| 36 |
+
|
| 37 |
+
for category in categories:
|
| 38 |
+
# Prompt template
|
| 39 |
+
prompt_template = dict(
|
| 40 |
+
type=PromptTemplate,
|
| 41 |
+
template=dict(
|
| 42 |
+
begin=[
|
| 43 |
+
dict(
|
| 44 |
+
role='HUMAN',
|
| 45 |
+
prompt=''
|
| 46 |
+
)
|
| 47 |
+
],
|
| 48 |
+
round=[
|
| 49 |
+
dict(
|
| 50 |
+
role='HUMAN',
|
| 51 |
+
prompt='{prompt}' # f-string
|
| 52 |
+
)
|
| 53 |
+
]
|
| 54 |
+
)
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# Reader configuration
|
| 58 |
+
reader_cfg = dict(
|
| 59 |
+
input_columns=['prompt'],
|
| 60 |
+
output_column='answer',
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
# Inference configuration
|
| 64 |
+
infer_cfg = dict(
|
| 65 |
+
prompt_template=prompt_template,
|
| 66 |
+
retriever=dict(type=ZeroRetriever),
|
| 67 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024),
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
# Evaluation configuration
|
| 71 |
+
eval_cfg = dict(
|
| 72 |
+
evaluator=dict(
|
| 73 |
+
type=GenericLLMEvaluator,
|
| 74 |
+
prompt_template=dict(
|
| 75 |
+
type=PromptTemplate,
|
| 76 |
+
template=dict(
|
| 77 |
+
begin=[
|
| 78 |
+
dict(
|
| 79 |
+
role='SYSTEM',
|
| 80 |
+
fallback_role='HUMAN',
|
| 81 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 82 |
+
],
|
| 83 |
+
round=[
|
| 84 |
+
dict(
|
| 85 |
+
role='HUMAN',
|
| 86 |
+
prompt=GRADER_TEMPLATE
|
| 87 |
+
),
|
| 88 |
+
]),
|
| 89 |
+
),
|
| 90 |
+
dataset_cfg=dict(
|
| 91 |
+
type=korbenchDataset,
|
| 92 |
+
path='opencompass/korbench',
|
| 93 |
+
prompt_mode='0_shot',
|
| 94 |
+
category=category,
|
| 95 |
+
reader_cfg=reader_cfg,
|
| 96 |
+
),
|
| 97 |
+
judge_cfg=dict(),
|
| 98 |
+
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
|
| 99 |
+
),
|
| 100 |
+
pred_role='BOT',
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
# Dataset
|
| 104 |
+
korbench_dataset = dict(
|
| 105 |
+
type=korbenchDataset,
|
| 106 |
+
abbr=f'korbench_{category}',
|
| 107 |
+
path='opencompass/korbench',
|
| 108 |
+
prompt_mode='0_shot',
|
| 109 |
+
category=category,
|
| 110 |
+
reader_cfg=reader_cfg,
|
| 111 |
+
infer_cfg=infer_cfg,
|
| 112 |
+
eval_cfg=eval_cfg,
|
| 113 |
+
mode='singlescore',
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
korbench_0shot_single_datasets.append(korbench_dataset)
|
build/lib/opencompass/configs/datasets/korbench/korbench_single_3_shot_gen.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.datasets.korbench.korbench import (
|
| 2 |
+
korbenchDataset,
|
| 3 |
+
korbenchEvaluator,
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 7 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 8 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 9 |
+
|
| 10 |
+
categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle']
|
| 11 |
+
|
| 12 |
+
korbench_3shot_single_datasets = []
|
| 13 |
+
|
| 14 |
+
for category in categories:
|
| 15 |
+
# Prompt template
|
| 16 |
+
prompt_template = dict(
|
| 17 |
+
type=PromptTemplate,
|
| 18 |
+
template=dict(
|
| 19 |
+
begin=[dict(role='HUMAN', prompt='')],
|
| 20 |
+
round=[dict(role='HUMAN', prompt='{prompt}')], # f-string
|
| 21 |
+
),
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# Reader configuration
|
| 25 |
+
reader_cfg = dict(
|
| 26 |
+
input_columns=['prompt'],
|
| 27 |
+
output_column='answer',
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# Inference configuration
|
| 31 |
+
infer_cfg = dict(
|
| 32 |
+
prompt_template=prompt_template,
|
| 33 |
+
retriever=dict(type=ZeroRetriever),
|
| 34 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024),
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# Evaluation configuration
|
| 38 |
+
eval_cfg = dict(
|
| 39 |
+
evaluator=dict(type=korbenchEvaluator),
|
| 40 |
+
pred_role='BOT',
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
korbench_dataset = dict(
|
| 44 |
+
type=korbenchDataset,
|
| 45 |
+
abbr=f'korbench_{category}',
|
| 46 |
+
path='opencompass/korbench',
|
| 47 |
+
prompt_mode='3_shot',
|
| 48 |
+
category=category,
|
| 49 |
+
reader_cfg=reader_cfg,
|
| 50 |
+
infer_cfg=infer_cfg,
|
| 51 |
+
eval_cfg=eval_cfg,
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
korbench_3shot_single_datasets.append(korbench_dataset)
|
build/lib/opencompass/configs/datasets/korbench/readme.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks
|
| 2 |
+
|
| 3 |
+
KOR-Bench is a dataset designed to evaluate large language models (LLMs) on tasks that require reasoning independent of prior knowledge. Created to assess reasoning and planning abilities, KOR-Bench introduces rule-based tasks that minimize the influence of pretrained knowledge, enabling a focused evaluation of intrinsic model capabilities.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
### Purpose
|
| 8 |
+
|
| 9 |
+
Large language models, such as GPT-4 and Claude, excel in knowledge-based tasks but face challenges in applying reasoning skills to unfamiliar scenarios. KOR-Bench is built to evaluate such reasoning capabilities across five categories:
|
| 10 |
+
- **Operation**: Arithmetic and logical operations.
|
| 11 |
+
- **Logic**: Complex deductive and inductive reasoning.
|
| 12 |
+
- **Cipher**: Code-breaking and pattern discovery.
|
| 13 |
+
- **Puzzle**: Problem-solving with creative and logical reasoning.
|
| 14 |
+
- **Counterfactual**: Hypothetical reasoning in alternate scenarios.
|
| 15 |
+
|
| 16 |
+
### Dataset Construction
|
| 17 |
+
|
| 18 |
+
KOR-Bench tasks are designed with novel rules and configurations, ensuring no reliance on pretrained knowledge. Each task includes:
|
| 19 |
+
- **Rules**: Custom rule sets to guide reasoning.
|
| 20 |
+
- **Questions**: Carefully crafted problems that require the application of rules.
|
| 21 |
+
- **Evaluation Scenarios**: Zero-shot, three-shot, and subquestion-specific configurations.
|
| 22 |
+
|
| 23 |
+
The dataset is structured to assess multistep reasoning, pattern recognition, and adaptability to new rules.
|
| 24 |
+
|
| 25 |
+
### Dataset Access
|
| 26 |
+
|
| 27 |
+
KOR-Bench is publicly available with detailed usage instructions in the [GitHub Repository](https://github.com/KOR-Bench/KOR-Bench). Download the dataset and leverage predefined evaluation scripts or customize your own.
|
| 28 |
+
|
| 29 |
+
### Evaluation
|
| 30 |
+
|
| 31 |
+
1. Install dependencies and configure your environment.
|
| 32 |
+
2. Run evaluations using `opencompass examples/eval_korbench.py` to assess LLM performance.
|
| 33 |
+
3. Analyze model performance across various reasoning tasks.
|
| 34 |
+
|
| 35 |
+
### Example Command
|
| 36 |
+
```bash
|
| 37 |
+
opencompass examples/eval_korbench.py
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## Baselines and Results
|
| 41 |
+
KOR-Bench includes baseline results for leading LLMs evaluated across various configurations, including zero-shot (gen) and few-shot modes. Below is a summary of the results.
|
| 42 |
+
| dataset | version | metric | mode | internlm2_5-7b-chat-turbomind | internlm2_5-1_8b-chat-turbomind | llama-3_1-8b-instruct-turbomind | glm-4-9b-chat-turbomind | gemma-2-9b-it-turbomind |
|
| 43 |
+
|---------|---------|--------|------|--------------------------------|---------------------------------|---------------------------------|--------------------------|--------------------------|
|
| 44 |
+
| korbench_mixed_Multi-Q | 21f998 | accuracy | gen | 0.60 | 0.20 | 9.60 | 8.70 | 7.80 |
|
| 45 |
+
| korbench_mixed_Multi-R | 21f998 | accuracy | gen | 1.70 | 0.10 | 8.80 | 12.10 | 9.80 |
|
| 46 |
+
| korbench_mixed_Multi-RQ | 21f998 | accuracy | gen | 1.50 | 0.10 | 6.40 | 8.60 | 6.00 |
|
| 47 |
+
| korbench_cipher | 21f998 | accuracy | gen | 8.80 | 0.80 | 14.00 | 6.80 | 6.40 |
|
| 48 |
+
| korbench_counterfactual | 21f998 | accuracy | gen | 83.60 | 17.20 | 88.80 | 90.40 | 87.60 |
|
| 49 |
+
| korbench_logic | 21f998 | accuracy | gen | 8.40 | 3.60 | 37.60 | 38.80 | 40.80 |
|
| 50 |
+
| korbench_operation | 21f998 | accuracy | gen | 56.00 | 25.20 | 68.40 | 63.60 | 67.60 |
|
| 51 |
+
| korbench_puzzle | 21f998 | accuracy | gen | 3.60 | 0.00 | 3.20 | 3.20 | 5.60 |
|
| 52 |
+
| korbench_cipher | 21f998 | accuracy | fewshot | 8.40 | 3.20 | 9.60 | 9.20 | 9.60 |
|
| 53 |
+
| korbench_counterfactual | 21f998 | accuracy | fewshot | 87.60 | 58.00 | 23.60 | 89.60 | 84.40 |
|
| 54 |
+
| korbench_logic | 21f998 | accuracy | fewshot | 45.20 | 19.60 | 24.40 | 38.40 | 54.00 |
|
| 55 |
+
| korbench_operation | 21f998 | accuracy | fewshot | 24.80 | 11.20 | 73.20 | 67.20 | 23.20 |
|
| 56 |
+
| korbench_puzzle | 21f998 | accuracy | fewshot | 4.80 | 2.40 | 1.60 | 3.60 | 6.80 |
|
| 57 |
+
|
| 58 |
+
### Citation
|
| 59 |
+
|
| 60 |
+
**BibTeX:**
|
| 61 |
+
```bibtex
|
| 62 |
+
@misc{ma2024korbenchbenchmarkinglanguagemodels,
|
| 63 |
+
title={KOR-Bench: Benchmarking Language Models on Knowledge-Orthogonal Reasoning Tasks},
|
| 64 |
+
author={Kaijing Ma and Xinrun Du and Yunran Wang and Haoran Zhang and Zhoufutu Wen and Xingwei Qu and Jian Yang and Jiaheng Liu and Minghao Liu and Xiang Yue and Wenhao Huang and Ge Zhang},
|
| 65 |
+
year={2024},
|
| 66 |
+
eprint={2410.06526},
|
| 67 |
+
archivePrefix={arXiv},
|
| 68 |
+
primaryClass={cs.DB},
|
| 69 |
+
url={https://arxiv.org/abs/2410.06526},
|
| 70 |
+
}
|
| 71 |
+
```
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
),
|
| 53 |
+
pred_role='BOT',
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
LCBCodeGeneration_dataset = dict(
|
| 57 |
+
type=LCBCodeGenerationDataset,
|
| 58 |
+
abbr='lcb_code_generation',
|
| 59 |
+
path='opencompass/code_generation_lite',
|
| 60 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 61 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 62 |
+
eval_cfg=lcb_code_generation_eval_cfg,
|
| 63 |
+
n=5,
|
| 64 |
+
k=3
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Code Execution Dataset
|
| 68 |
+
lcb_code_execution_reader_cfg = dict(
|
| 69 |
+
input_columns=[
|
| 70 |
+
'prompt',
|
| 71 |
+
],
|
| 72 |
+
output_column='evaluation_sample',
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
lcb_code_execution_infer_cfg = dict(
|
| 76 |
+
prompt_template=dict(
|
| 77 |
+
type=PromptTemplate,
|
| 78 |
+
template=dict(
|
| 79 |
+
begin=[
|
| 80 |
+
dict(
|
| 81 |
+
role='SYSTEM',
|
| 82 |
+
fallback_role='HUMAN',
|
| 83 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 84 |
+
),
|
| 85 |
+
],
|
| 86 |
+
round=[
|
| 87 |
+
dict(
|
| 88 |
+
role='HUMAN',
|
| 89 |
+
prompt='{prompt}'
|
| 90 |
+
)
|
| 91 |
+
]
|
| 92 |
+
)
|
| 93 |
+
),
|
| 94 |
+
retriever=dict(type=ZeroRetriever),
|
| 95 |
+
inferencer=dict(type=GenInferencer)
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
lcb_code_execution_eval_cfg = dict(
|
| 99 |
+
evaluator=dict(
|
| 100 |
+
type=LCBCodeExecutionEvaluator,
|
| 101 |
+
),
|
| 102 |
+
pred_role='BOT',
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
LCBCodeExecution_dataset = dict(
|
| 106 |
+
type=LCBCodeExecutionDataset,
|
| 107 |
+
abbr='lcb_code_execution',
|
| 108 |
+
path='opencompass/execution-v2',
|
| 109 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 110 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 111 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
# TestOuputput Dataset
|
| 115 |
+
lcb_test_output_reader_cfg = dict(
|
| 116 |
+
input_columns=[
|
| 117 |
+
'prompt',
|
| 118 |
+
],
|
| 119 |
+
output_column='evaluation_sample',
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 123 |
+
|
| 124 |
+
lcb_test_output_infer_cfg = dict(
|
| 125 |
+
prompt_template=dict(
|
| 126 |
+
type=PromptTemplate,
|
| 127 |
+
template=dict(
|
| 128 |
+
# begin=[
|
| 129 |
+
# dict(
|
| 130 |
+
# role='SYSTEM',
|
| 131 |
+
# prompt=system_prompt
|
| 132 |
+
# ),
|
| 133 |
+
# ],
|
| 134 |
+
round=[
|
| 135 |
+
dict(
|
| 136 |
+
role='HUMAN',
|
| 137 |
+
prompt='{prompt}'
|
| 138 |
+
)
|
| 139 |
+
]
|
| 140 |
+
)
|
| 141 |
+
),
|
| 142 |
+
retriever=dict(type=ZeroRetriever),
|
| 143 |
+
inferencer=dict(type=GenInferencer)
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
lcb_test_output_eval_cfg = dict(
|
| 147 |
+
evaluator=dict(
|
| 148 |
+
type=LCBTestOutputEvaluator,
|
| 149 |
+
),
|
| 150 |
+
pred_role='BOT',
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
LCBTestOutput_dataset = dict(
|
| 154 |
+
type=LCBTestOutputPredictionDataset,
|
| 155 |
+
abbr='lcb_test_output',
|
| 156 |
+
path='opencompass/test_generation',
|
| 157 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 158 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 159 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
LCB_datasets = [
|
| 163 |
+
LCBCodeGeneration_dataset,
|
| 164 |
+
# LCBCodeExecution_dataset,
|
| 165 |
+
# LCBTestOutput_dataset,
|
| 166 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .livecodebench_gen_a4f90b import LCB_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
),
|
| 53 |
+
pred_role='BOT',
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
LCBCodeGeneration_dataset = dict(
|
| 57 |
+
type=LCBCodeGenerationDataset,
|
| 58 |
+
abbr='lcb_code_generation',
|
| 59 |
+
path='opencompass/code_generation_lite',
|
| 60 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 61 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 62 |
+
eval_cfg=lcb_code_generation_eval_cfg
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Code Execution Dataset
|
| 66 |
+
lcb_code_execution_reader_cfg = dict(
|
| 67 |
+
input_columns=[
|
| 68 |
+
'prompt',
|
| 69 |
+
],
|
| 70 |
+
output_column='evaluation_sample',
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
lcb_code_execution_infer_cfg = dict(
|
| 74 |
+
prompt_template=dict(
|
| 75 |
+
type=PromptTemplate,
|
| 76 |
+
template=dict(
|
| 77 |
+
begin=[
|
| 78 |
+
dict(
|
| 79 |
+
role='SYSTEM',
|
| 80 |
+
fallback_role='HUMAN',
|
| 81 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 82 |
+
),
|
| 83 |
+
],
|
| 84 |
+
round=[
|
| 85 |
+
dict(
|
| 86 |
+
role='HUMAN',
|
| 87 |
+
prompt='{prompt}'
|
| 88 |
+
)
|
| 89 |
+
]
|
| 90 |
+
)
|
| 91 |
+
),
|
| 92 |
+
retriever=dict(type=ZeroRetriever),
|
| 93 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
lcb_code_execution_eval_cfg = dict(
|
| 97 |
+
evaluator=dict(
|
| 98 |
+
type=LCBCodeExecutionEvaluator,
|
| 99 |
+
),
|
| 100 |
+
pred_role='BOT',
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
LCBCodeExecution_dataset = dict(
|
| 104 |
+
type=LCBCodeExecutionDataset,
|
| 105 |
+
abbr='lcb_code_execution',
|
| 106 |
+
path='opencompass/execution-v2',
|
| 107 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 108 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 109 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
# TestOuputput Dataset
|
| 113 |
+
lcb_test_output_reader_cfg = dict(
|
| 114 |
+
input_columns=[
|
| 115 |
+
'prompt',
|
| 116 |
+
],
|
| 117 |
+
output_column='evaluation_sample',
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 121 |
+
|
| 122 |
+
lcb_test_output_infer_cfg = dict(
|
| 123 |
+
prompt_template=dict(
|
| 124 |
+
type=PromptTemplate,
|
| 125 |
+
template=dict(
|
| 126 |
+
# begin=[
|
| 127 |
+
# dict(
|
| 128 |
+
# role='SYSTEM',
|
| 129 |
+
# prompt=system_prompt
|
| 130 |
+
# ),
|
| 131 |
+
# ],
|
| 132 |
+
round=[
|
| 133 |
+
dict(
|
| 134 |
+
role='HUMAN',
|
| 135 |
+
prompt='{prompt}'
|
| 136 |
+
)
|
| 137 |
+
]
|
| 138 |
+
)
|
| 139 |
+
),
|
| 140 |
+
retriever=dict(type=ZeroRetriever),
|
| 141 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
lcb_test_output_eval_cfg = dict(
|
| 145 |
+
evaluator=dict(
|
| 146 |
+
type=LCBTestOutputEvaluator,
|
| 147 |
+
),
|
| 148 |
+
pred_role='BOT',
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
LCBTestOutput_dataset = dict(
|
| 152 |
+
type=LCBTestOutputPredictionDataset,
|
| 153 |
+
abbr='lcb_test_output',
|
| 154 |
+
path='opencompass/test_generation',
|
| 155 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 156 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 157 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
LCB_datasets = [
|
| 161 |
+
LCBCodeGeneration_dataset,
|
| 162 |
+
LCBCodeExecution_dataset,
|
| 163 |
+
LCBTestOutput_dataset,
|
| 164 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
),
|
| 53 |
+
pred_role='BOT',
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
LCBCodeGeneration_dataset = dict(
|
| 57 |
+
type=LCBCodeGenerationDataset,
|
| 58 |
+
abbr='lcb_code_generation',
|
| 59 |
+
path='opencompass/code_generation_lite',
|
| 60 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 61 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 62 |
+
eval_cfg=lcb_code_generation_eval_cfg
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Code Execution Dataset
|
| 66 |
+
lcb_code_execution_reader_cfg = dict(
|
| 67 |
+
input_columns=[
|
| 68 |
+
'prompt',
|
| 69 |
+
],
|
| 70 |
+
output_column='evaluation_sample',
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
lcb_code_execution_infer_cfg = dict(
|
| 74 |
+
prompt_template=dict(
|
| 75 |
+
type=PromptTemplate,
|
| 76 |
+
template=dict(
|
| 77 |
+
begin=[
|
| 78 |
+
dict(
|
| 79 |
+
role='SYSTEM',
|
| 80 |
+
fallback_role='HUMAN',
|
| 81 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 82 |
+
),
|
| 83 |
+
],
|
| 84 |
+
round=[
|
| 85 |
+
dict(
|
| 86 |
+
role='HUMAN',
|
| 87 |
+
prompt='{prompt}'
|
| 88 |
+
)
|
| 89 |
+
]
|
| 90 |
+
)
|
| 91 |
+
),
|
| 92 |
+
retriever=dict(type=ZeroRetriever),
|
| 93 |
+
inferencer=dict(type=GenInferencer)
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
lcb_code_execution_eval_cfg = dict(
|
| 97 |
+
evaluator=dict(
|
| 98 |
+
type=LCBCodeExecutionEvaluator,
|
| 99 |
+
),
|
| 100 |
+
pred_role='BOT',
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
LCBCodeExecution_dataset = dict(
|
| 104 |
+
type=LCBCodeExecutionDataset,
|
| 105 |
+
abbr='lcb_code_execution',
|
| 106 |
+
path='opencompass/execution-v2',
|
| 107 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 108 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 109 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
# TestOuputput Dataset
|
| 113 |
+
lcb_test_output_reader_cfg = dict(
|
| 114 |
+
input_columns=[
|
| 115 |
+
'prompt',
|
| 116 |
+
],
|
| 117 |
+
output_column='evaluation_sample',
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 121 |
+
|
| 122 |
+
lcb_test_output_infer_cfg = dict(
|
| 123 |
+
prompt_template=dict(
|
| 124 |
+
type=PromptTemplate,
|
| 125 |
+
template=dict(
|
| 126 |
+
# begin=[
|
| 127 |
+
# dict(
|
| 128 |
+
# role='SYSTEM',
|
| 129 |
+
# prompt=system_prompt
|
| 130 |
+
# ),
|
| 131 |
+
# ],
|
| 132 |
+
round=[
|
| 133 |
+
dict(
|
| 134 |
+
role='HUMAN',
|
| 135 |
+
prompt='{prompt}'
|
| 136 |
+
)
|
| 137 |
+
]
|
| 138 |
+
)
|
| 139 |
+
),
|
| 140 |
+
retriever=dict(type=ZeroRetriever),
|
| 141 |
+
inferencer=dict(type=GenInferencer)
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
lcb_test_output_eval_cfg = dict(
|
| 145 |
+
evaluator=dict(
|
| 146 |
+
type=LCBTestOutputEvaluator,
|
| 147 |
+
),
|
| 148 |
+
pred_role='BOT',
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
LCBTestOutput_dataset = dict(
|
| 152 |
+
type=LCBTestOutputPredictionDataset,
|
| 153 |
+
abbr='lcb_test_output',
|
| 154 |
+
path='opencompass/test_generation',
|
| 155 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 156 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 157 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
LCB_datasets = [
|
| 161 |
+
LCBCodeGeneration_dataset,
|
| 162 |
+
LCBCodeExecution_dataset,
|
| 163 |
+
LCBTestOutput_dataset,
|
| 164 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
),
|
| 53 |
+
pred_role='BOT',
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
LCBCodeGeneration_dataset = dict(
|
| 57 |
+
type=LCBCodeGenerationDataset,
|
| 58 |
+
abbr='lcb_code_generation',
|
| 59 |
+
path='opencompass/code_generation_lite',
|
| 60 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 61 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 62 |
+
eval_cfg=lcb_code_generation_eval_cfg
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Code Execution Dataset
|
| 66 |
+
lcb_code_execution_reader_cfg = dict(
|
| 67 |
+
input_columns=[
|
| 68 |
+
'prompt',
|
| 69 |
+
],
|
| 70 |
+
output_column='evaluation_sample',
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
lcb_code_execution_infer_cfg = dict(
|
| 74 |
+
prompt_template=dict(
|
| 75 |
+
type=PromptTemplate,
|
| 76 |
+
template=dict(
|
| 77 |
+
begin=[
|
| 78 |
+
dict(
|
| 79 |
+
role='SYSTEM',
|
| 80 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 81 |
+
),
|
| 82 |
+
],
|
| 83 |
+
round=[
|
| 84 |
+
dict(
|
| 85 |
+
role='HUMAN',
|
| 86 |
+
prompt='{prompt}'
|
| 87 |
+
)
|
| 88 |
+
]
|
| 89 |
+
)
|
| 90 |
+
),
|
| 91 |
+
retriever=dict(type=ZeroRetriever),
|
| 92 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
lcb_code_execution_eval_cfg = dict(
|
| 96 |
+
evaluator=dict(
|
| 97 |
+
type=LCBCodeExecutionEvaluator,
|
| 98 |
+
),
|
| 99 |
+
pred_role='BOT',
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
LCBCodeExecution_dataset = dict(
|
| 103 |
+
type=LCBCodeExecutionDataset,
|
| 104 |
+
abbr='lcb_code_execution',
|
| 105 |
+
path='opencompass/execution-v2',
|
| 106 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 107 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 108 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
# TestOuputput Dataset
|
| 112 |
+
lcb_test_output_reader_cfg = dict(
|
| 113 |
+
input_columns=[
|
| 114 |
+
'prompt',
|
| 115 |
+
],
|
| 116 |
+
output_column='evaluation_sample',
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 120 |
+
|
| 121 |
+
lcb_test_output_infer_cfg = dict(
|
| 122 |
+
prompt_template=dict(
|
| 123 |
+
type=PromptTemplate,
|
| 124 |
+
template=dict(
|
| 125 |
+
# begin=[
|
| 126 |
+
# dict(
|
| 127 |
+
# role='SYSTEM',
|
| 128 |
+
# prompt=system_prompt
|
| 129 |
+
# ),
|
| 130 |
+
# ],
|
| 131 |
+
round=[
|
| 132 |
+
dict(
|
| 133 |
+
role='HUMAN',
|
| 134 |
+
prompt='{prompt}'
|
| 135 |
+
)
|
| 136 |
+
]
|
| 137 |
+
)
|
| 138 |
+
),
|
| 139 |
+
retriever=dict(type=ZeroRetriever),
|
| 140 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
lcb_test_output_eval_cfg = dict(
|
| 144 |
+
evaluator=dict(
|
| 145 |
+
type=LCBTestOutputEvaluator,
|
| 146 |
+
),
|
| 147 |
+
pred_role='BOT',
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
LCBTestOutput_dataset = dict(
|
| 151 |
+
type=LCBTestOutputPredictionDataset,
|
| 152 |
+
abbr='lcb_test_output',
|
| 153 |
+
path='opencompass/test_generation',
|
| 154 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 155 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 156 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
LCB_datasets = [
|
| 160 |
+
LCBCodeGeneration_dataset,
|
| 161 |
+
LCBCodeExecution_dataset,
|
| 162 |
+
LCBTestOutput_dataset,
|
| 163 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_o1_gen_f0ed6c.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
release_version='release_v4',
|
| 53 |
+
),
|
| 54 |
+
pred_role='BOT',
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
LCBCodeGeneration_dataset = dict(
|
| 58 |
+
type=LCBCodeGenerationDataset,
|
| 59 |
+
abbr='lcb_code_generation_v4',
|
| 60 |
+
path='opencompass/code_generation_lite',
|
| 61 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 62 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 63 |
+
eval_cfg=lcb_code_generation_eval_cfg,
|
| 64 |
+
release_version='release_v4',
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Code Execution Dataset
|
| 68 |
+
lcb_code_execution_reader_cfg = dict(
|
| 69 |
+
input_columns=[
|
| 70 |
+
'prompt',
|
| 71 |
+
],
|
| 72 |
+
output_column='evaluation_sample',
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
lcb_code_execution_infer_cfg = dict(
|
| 76 |
+
prompt_template=dict(
|
| 77 |
+
type=PromptTemplate,
|
| 78 |
+
template=dict(
|
| 79 |
+
begin=[
|
| 80 |
+
dict(
|
| 81 |
+
role='SYSTEM',
|
| 82 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 83 |
+
),
|
| 84 |
+
],
|
| 85 |
+
round=[
|
| 86 |
+
dict(
|
| 87 |
+
role='HUMAN',
|
| 88 |
+
prompt='{prompt}'
|
| 89 |
+
)
|
| 90 |
+
]
|
| 91 |
+
)
|
| 92 |
+
),
|
| 93 |
+
retriever=dict(type=ZeroRetriever),
|
| 94 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
lcb_code_execution_eval_cfg = dict(
|
| 98 |
+
evaluator=dict(
|
| 99 |
+
type=LCBCodeExecutionEvaluator,
|
| 100 |
+
),
|
| 101 |
+
pred_role='BOT',
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
LCBCodeExecution_dataset = dict(
|
| 105 |
+
type=LCBCodeExecutionDataset,
|
| 106 |
+
abbr='lcb_code_execution',
|
| 107 |
+
path='opencompass/execution-v2',
|
| 108 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 109 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 110 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# TestOuputput Dataset
|
| 114 |
+
lcb_test_output_reader_cfg = dict(
|
| 115 |
+
input_columns=[
|
| 116 |
+
'prompt',
|
| 117 |
+
],
|
| 118 |
+
output_column='evaluation_sample',
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 122 |
+
|
| 123 |
+
lcb_test_output_infer_cfg = dict(
|
| 124 |
+
prompt_template=dict(
|
| 125 |
+
type=PromptTemplate,
|
| 126 |
+
template=dict(
|
| 127 |
+
# begin=[
|
| 128 |
+
# dict(
|
| 129 |
+
# role='SYSTEM',
|
| 130 |
+
# prompt=system_prompt
|
| 131 |
+
# ),
|
| 132 |
+
# ],
|
| 133 |
+
round=[
|
| 134 |
+
dict(
|
| 135 |
+
role='HUMAN',
|
| 136 |
+
prompt='{prompt}'
|
| 137 |
+
)
|
| 138 |
+
]
|
| 139 |
+
)
|
| 140 |
+
),
|
| 141 |
+
retriever=dict(type=ZeroRetriever),
|
| 142 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
lcb_test_output_eval_cfg = dict(
|
| 146 |
+
evaluator=dict(
|
| 147 |
+
type=LCBTestOutputEvaluator,
|
| 148 |
+
),
|
| 149 |
+
pred_role='BOT',
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
LCBTestOutput_dataset = dict(
|
| 153 |
+
type=LCBTestOutputPredictionDataset,
|
| 154 |
+
abbr='lcb_test_output',
|
| 155 |
+
path='opencompass/test_generation',
|
| 156 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 157 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 158 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
LCB_datasets = [
|
| 162 |
+
LCBCodeGeneration_dataset,
|
| 163 |
+
# LCBCodeExecution_dataset,
|
| 164 |
+
# LCBTestOutput_dataset,
|
| 165 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_split_v4_o1_gen_f0ed6c.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LCBCodeGenerationDataset,
|
| 6 |
+
LCBCodeExecutionDataset,
|
| 7 |
+
LCBTestOutputPredictionDataset,
|
| 8 |
+
LCBCodeGenerationEvaluator,
|
| 9 |
+
LCBCodeExecutionEvaluator,
|
| 10 |
+
LCBTestOutputEvaluator
|
| 11 |
+
)
|
| 12 |
+
from opencompass.datasets.livecodebench import TestOutputPromptConstants
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
lcb_code_generation_reader_cfg = dict(
|
| 16 |
+
input_columns=[
|
| 17 |
+
'question_content',
|
| 18 |
+
'format_prompt',
|
| 19 |
+
],
|
| 20 |
+
# output_column='evaluation_sample',
|
| 21 |
+
output_column='question_id',
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 25 |
+
|
| 26 |
+
prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
|
| 27 |
+
'### Answer: (use the provided format with backticks)\n\n'
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Code Generation Tasks
|
| 31 |
+
lcb_code_generation_infer_cfg = dict(
|
| 32 |
+
prompt_template=dict(
|
| 33 |
+
type=PromptTemplate,
|
| 34 |
+
template=dict(
|
| 35 |
+
round=[
|
| 36 |
+
dict(
|
| 37 |
+
role='HUMAN',
|
| 38 |
+
prompt=prompt_template
|
| 39 |
+
)
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
),
|
| 43 |
+
retriever=dict(type=ZeroRetriever),
|
| 44 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
lcb_code_generation_eval_cfg = dict(
|
| 48 |
+
evaluator=dict(
|
| 49 |
+
type=LCBCodeGenerationEvaluator,
|
| 50 |
+
num_process_evaluate=4,
|
| 51 |
+
timeout=6,
|
| 52 |
+
release_version='release_split_v4',
|
| 53 |
+
),
|
| 54 |
+
pred_role='BOT',
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
LCBCodeGeneration_dataset = dict(
|
| 58 |
+
type=LCBCodeGenerationDataset,
|
| 59 |
+
abbr='lcb_code_generation_split_v4',
|
| 60 |
+
path='opencompass/code_generation_lite',
|
| 61 |
+
reader_cfg=lcb_code_generation_reader_cfg,
|
| 62 |
+
infer_cfg=lcb_code_generation_infer_cfg,
|
| 63 |
+
eval_cfg=lcb_code_generation_eval_cfg,
|
| 64 |
+
release_version='release_split_v4',
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Code Execution Dataset
|
| 68 |
+
lcb_code_execution_reader_cfg = dict(
|
| 69 |
+
input_columns=[
|
| 70 |
+
'prompt',
|
| 71 |
+
],
|
| 72 |
+
output_column='evaluation_sample',
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
lcb_code_execution_infer_cfg = dict(
|
| 76 |
+
prompt_template=dict(
|
| 77 |
+
type=PromptTemplate,
|
| 78 |
+
template=dict(
|
| 79 |
+
begin=[
|
| 80 |
+
dict(
|
| 81 |
+
role='SYSTEM',
|
| 82 |
+
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
|
| 83 |
+
),
|
| 84 |
+
],
|
| 85 |
+
round=[
|
| 86 |
+
dict(
|
| 87 |
+
role='HUMAN',
|
| 88 |
+
prompt='{prompt}'
|
| 89 |
+
)
|
| 90 |
+
]
|
| 91 |
+
)
|
| 92 |
+
),
|
| 93 |
+
retriever=dict(type=ZeroRetriever),
|
| 94 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
lcb_code_execution_eval_cfg = dict(
|
| 98 |
+
evaluator=dict(
|
| 99 |
+
type=LCBCodeExecutionEvaluator,
|
| 100 |
+
),
|
| 101 |
+
pred_role='BOT',
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
LCBCodeExecution_dataset = dict(
|
| 105 |
+
type=LCBCodeExecutionDataset,
|
| 106 |
+
abbr='lcb_code_execution',
|
| 107 |
+
path='opencompass/execution-v2',
|
| 108 |
+
reader_cfg=lcb_code_execution_reader_cfg,
|
| 109 |
+
infer_cfg=lcb_code_execution_infer_cfg,
|
| 110 |
+
eval_cfg=lcb_code_execution_eval_cfg,
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# TestOuputput Dataset
|
| 114 |
+
lcb_test_output_reader_cfg = dict(
|
| 115 |
+
input_columns=[
|
| 116 |
+
'prompt',
|
| 117 |
+
],
|
| 118 |
+
output_column='evaluation_sample',
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
|
| 122 |
+
|
| 123 |
+
lcb_test_output_infer_cfg = dict(
|
| 124 |
+
prompt_template=dict(
|
| 125 |
+
type=PromptTemplate,
|
| 126 |
+
template=dict(
|
| 127 |
+
# begin=[
|
| 128 |
+
# dict(
|
| 129 |
+
# role='SYSTEM',
|
| 130 |
+
# prompt=system_prompt
|
| 131 |
+
# ),
|
| 132 |
+
# ],
|
| 133 |
+
round=[
|
| 134 |
+
dict(
|
| 135 |
+
role='HUMAN',
|
| 136 |
+
prompt='{prompt}'
|
| 137 |
+
)
|
| 138 |
+
]
|
| 139 |
+
)
|
| 140 |
+
),
|
| 141 |
+
retriever=dict(type=ZeroRetriever),
|
| 142 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024)
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
lcb_test_output_eval_cfg = dict(
|
| 146 |
+
evaluator=dict(
|
| 147 |
+
type=LCBTestOutputEvaluator,
|
| 148 |
+
),
|
| 149 |
+
pred_role='BOT',
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
LCBTestOutput_dataset = dict(
|
| 153 |
+
type=LCBTestOutputPredictionDataset,
|
| 154 |
+
abbr='lcb_test_output',
|
| 155 |
+
path='opencompass/test_generation',
|
| 156 |
+
reader_cfg=lcb_test_output_reader_cfg,
|
| 157 |
+
infer_cfg=lcb_test_output_infer_cfg,
|
| 158 |
+
eval_cfg=lcb_test_output_eval_cfg,
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
LCB_datasets = [
|
| 162 |
+
LCBCodeGeneration_dataset,
|
| 163 |
+
# LCBCodeExecution_dataset,
|
| 164 |
+
# LCBTestOutput_dataset,
|
| 165 |
+
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (LCBCodeGenerationDataset,
                                  LCBCodeExecutionDataset,
                                  LCBTestOutputPredictionDataset,
                                  LCBCodeGenerationEvaluator,
                                  LCBCodeExecutionEvaluator,
                                  LCBTestOutputEvaluator)

# --------------------------------------------------------------------------
# Code generation task (LiveCodeBench release_v5, 2024-08-01 .. 2025-02-01)
# --------------------------------------------------------------------------
# The model sees the problem statement plus formatting instructions;
# `question_id` is carried through so the evaluator can find the test cases.
lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501

prompt_template = ('### Question:\n{question_content}\n\n{format_prompt}'
                   '### Answer: (use the provided format with backticks)\n\n')

lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt=prompt_template)]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
        release_version='release_v5',
        start_date='2024-08-01',
        end_date='2025-02-01',
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    release_version='release_v5',
)

# --------------------------------------------------------------------------
# Code execution task
# --------------------------------------------------------------------------
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    # fallback keeps models without a system role working
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.', # noqa: E501
                ),
            ],
            round=[dict(role='HUMAN', prompt='{prompt}')],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(type=LCBCodeExecutionEvaluator, ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# --------------------------------------------------------------------------
# Test-output prediction task
# --------------------------------------------------------------------------
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

# Kept for reference; the system turn below is currently commented out.
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[dict(role='HUMAN', prompt='{prompt}')],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(type=LCBTestOutputEvaluator, ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v1_o1_gen_f0ed6c.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


# --------------------------------------------------------------------------
# Code generation task (LiveCodeBench release_v1, o1-style prompting)
# --------------------------------------------------------------------------
# `question_id` is carried through so the evaluator can look up test cases.
lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

# Fix: this was an f-string with no placeholders (Ruff F541); a plain
# string literal produces the identical value.
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
                  '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation_v1',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    release_version='release_v1',
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'  # noqa: E501
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test-output prediction dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

# Kept for reference; the system turn below is currently commented out.
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

# Only the generation split is enabled for this v1 config.
LCB_datasets = [
    LCBCodeGeneration_dataset,
    # LCBCodeExecution_dataset,
    # LCBTestOutput_dataset,
]
|
build/lib/opencompass/configs/datasets/livecodebench/livecodebench_v6_academic.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


# --------------------------------------------------------------------------
# Code generation task (LiveCodeBench v6, academic setting, n=6 repeats)
# --------------------------------------------------------------------------
# `question_id` is carried through so the evaluator can look up test cases.
lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

# Fix: this was an f-string with no placeholders (Ruff F541); a plain
# string literal produces the identical value.
SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
                  '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        release_version='v6',
        extractor_version='v2',
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation_repeat_6',
    path='opencompass/code_generation_lite',
    release_version='v6',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg,
    n=6,  # sample each problem 6 times (repeat evaluation)
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    # fallback keeps models without a system role working
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'  # noqa: E501
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test-output prediction dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

# Kept for reference; the system turn below is currently commented out.
system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'  # noqa: E501

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
|
build/lib/opencompass/configs/datasets/livemathbench/README.md
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LiveMathBench
|
| 2 |
+
|
| 3 |
+
## v202412
|
| 4 |
+
|
| 5 |
+
### Details of Datasets
|
| 6 |
+
|
| 7 |
+
| dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving |
|
| 8 |
+
| -- | -- | -- | -- | -- | -- |
|
| 9 |
+
| AMC | cn | 0 | 0 | 0 | 46 |
|
| 10 |
+
| AMC | en | 0 | 0 | 0 | 46 |
|
| 11 |
+
| CCEE | cn | 0 | 0 | 13 | 31 |
|
| 12 |
+
| CCEE | en | 0 | 0 | 13 | 31 |
|
| 13 |
+
| CNMO | cn | 0 | 0 | 0 | 18 |
|
| 14 |
+
| CNMO | en | 0 | 0 | 0 | 18 |
|
| 15 |
+
| WLPMC | cn | 0 | 0 | 0 | 11 |
|
| 16 |
+
| WLPMC | en | 0 | 0 | 0 | 11 |
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
### How to use
|
| 20 |
+
|
| 21 |
+
#### G-Pass@k
|
| 22 |
+
```python
|
| 23 |
+
from mmengine.config import read_base
|
| 24 |
+
|
| 25 |
+
with read_base():
|
| 26 |
+
from opencompass.datasets.livemathbench_gen import livemathbench_datasets
|
| 27 |
+
|
| 28 |
+
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
|
| 29 |
+
{
|
| 30 |
+
'model_name': 'Qwen/Qwen2.5-72B-Instruct',
|
| 31 |
+
'url': [
|
| 32 |
+
'http://0.0.0.0:23333/v1',
|
| 33 |
+
'...'
|
| 34 |
+
] # set url of evaluation models
|
| 35 |
+
}
|
| 36 |
+
)
|
| 37 |
+
livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict(
|
| 38 |
+
max_out_len=32768 # for o1-like models you need to update max_out_len
|
| 39 |
+
))
|
| 40 |
+
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
#### Greedy
|
| 44 |
+
```python
|
| 45 |
+
from mmengine.config import read_base
|
| 46 |
+
|
| 47 |
+
with read_base():
|
| 48 |
+
from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets
|
| 49 |
+
|
| 50 |
+
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
|
| 51 |
+
{
|
| 52 |
+
'model_name': 'Qwen/Qwen2.5-72B-Instruct',
|
| 53 |
+
'url': [
|
| 54 |
+
'http://0.0.0.0:23333/v1',
|
| 55 |
+
'...'
|
| 56 |
+
] # set url of evaluation models
|
| 57 |
+
}
|
| 58 |
+
)
|
| 59 |
+
livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict(
|
| 60 |
+
max_out_len=32768 # for o1-like models you need to update max_out_len
|
| 61 |
+
))
|
| 62 |
+
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### Output Samples
|
| 66 |
+
|
| 67 |
+
| dataset | version | metric | mode | Qwen2.5-72B-Instruct |
|
| 68 |
+
|----- | ----- | ----- | ----- | -----|
|
| 69 |
+
| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx |
|
| 70 |
+
| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx |
|
| 71 |
+
| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx |
|
| 72 |
+
| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx |
|
| 73 |
+
| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx |
|
| 74 |
+
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base

# Default entry point for LiveMathBench: re-export the pinned `9befbf`
# variant so `from ...livemathbench_gen import livemathbench_datasets`
# always resolves to the current recommended configuration.
with read_base():
    from .livemathbench_gen_9befbf import livemathbench_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator


# The dataset supplies the fully rendered question in `prompt`; the gold
# answer lives in `answer`.
livemathbench_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

# Single-turn zero-shot prompting with sampling (temperature 1.0).
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=16384, temperature=1.0),
)

# NOTE(review): the judge endpoint below is a hard-coded internal address
# of an OpenAI-compatible `/v1` server; override `url` (e.g. with
# 'https://api.openai.com/v1/') before running outside that environment.
livemathbench_eval_cfg = dict(
    evaluator=dict(
        type=LiveMathBenchEvaluator,
        model_name='Qwen/Qwen2.5-72B-Instruct',
        url=['http://172.30.40.154:23333/v1/'],
    ),
)

livemathbench_datasets = [
    dict(
        type=LiveMathBenchDataset,
        abbr='LiveMathBench-k1-n1',
        path='opencompass/LiveMathBench202412',
        k=1,  # k of Pass@k
        n=1,  # number of sampled runs per problem
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=livemathbench_eval_cfg,
    ),
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator


# G-Pass@16 configuration over all four LiveMathBench-202412 splits in both
# languages; 48 samples per problem feed the k=16 stability metric.
# `path`, judge `model_name` and `url` are intentionally blank — callers
# fill them in (see the dataset README).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='',
    k=16,
    n=48,
    dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
    dataset_languages=['cn', 'en'],
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412',
    reader_cfg=dict(input_columns=['prompt'], output_column='answer'),
    infer_cfg=dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192),
    ),
    eval_cfg=dict(
        evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
    ),
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator


# The dataset supplies the fully rendered question in `prompt`; the gold
# answer lives in `answer`.
livemathbench_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

# Single-turn zero-shot prompting with sampling (temperature 1.0).
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048, temperature=1.0),
)

# Judge `url` is intentionally blank — callers fill it in (see README).
livemathbench_eval_cfg = dict(
    evaluator=dict(
        type=LiveMathBenchEvaluator,
        model_name='Qwen/Qwen2.5-72B-Instruct',
        url=[],
    ),
)

livemathbench_datasets = [
    dict(
        type=LiveMathBenchDataset,
        abbr='LiveMathBench',
        path='',
        k=32,
        n=5,
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=livemathbench_eval_cfg,
    ),
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base

# Greedy-decoding entry point for LiveMathBench: re-export the pinned
# `9befbf` greedy variant (k=1, n=1) so importing this module yields the
# current recommended greedy configuration.
with read_base():
    from .livemathbench_greedy_gen_9befbf import livemathbench_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator


# Greedy (k=1, n=1) configuration over all four LiveMathBench-202412 splits
# in both languages. `path`, judge `model_name` and `url` are intentionally
# blank — callers fill them in (see the dataset README).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='',
    k=1,
    n=1,
    dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
    dataset_languages=['cn', 'en'],
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412',
    reader_cfg=dict(input_columns=['prompt'], output_column='answer'),
    infer_cfg=dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=8192),
    ),
    eval_cfg=dict(
        evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
    ),
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Summary: A config for LiveMathBench-Hard-202412 Dataset Evaluation.
Setting:
    Shot: 0-shot
    Evaluator:
        - CascadeEvaluator
            - MATHVerifyEvaluator (rule-based first pass)
            - GenericLLMEvaluator (LLM-judge fallback)
    Repeat: controlled by ``n`` below (currently 1; raise to e.g. 32 for
        repeated sampling)
Available Models:
    - Instruct/Chat Models
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CustomDataset, generic_llmjudge_postprocess
# Single grouped import; the original additionally imported
# GenericLLMEvaluator standalone, which was redundant.
from opencompass.evaluator import (
    CascadeEvaluator,
    GenericLLMEvaluator,
    MATHVerifyEvaluator,
)

# Reader: the model sees `question`; `answer` is the gold target.
livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')


# Inference configuration: zero-shot, ask the model to box its final answer.
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\nRemember to put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()


splits = ['hard_cn', 'hard_en']
# Dataset configuration: one dataset entry per split.
livemathbench_datasets = [
    dict(
        type=CustomDataset,
        abbr=f'livemathbench_hard_custom_{split}',
        path='data/LiveMathBench',
        local_mode=True,
        file_name=f'202412/{split}.jsonl',
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=dict(
            # Cascade: try the cheap rule-based verifier first, then fall
            # back to the LLM judge for samples the rules mark incorrect.
            evaluator=dict(
                type=CascadeEvaluator,
                rule_evaluator=dict(
                    type=MATHVerifyEvaluator,
                ),
                llm_evaluator=dict(
                    type=GenericLLMEvaluator,
                    prompt_template=dict(
                        type=PromptTemplate,
                        template=dict(
                            begin=[
                                dict(
                                    role='SYSTEM',
                                    fallback_role='HUMAN',
                                    prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                                )
                            ],
                            round=[
                                dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                            ],
                        ),
                    ),
                    # The judge re-reads the source dataset to recover the
                    # original question and gold answer.
                    dataset_cfg=dict(
                        type=CustomDataset,
                        path='data/LiveMathBench',
                        local_mode=True,
                        file_name=f'202412/{split}.jsonl',
                        reader_cfg=livemathbench_reader_cfg,
                    ),
                    judge_cfg={},
                    dict_postprocessor=dict(type=generic_llmjudge_postprocess),
                ),
                parallel=False,
            ),
        ),
        n=1,  # repeat n times
    ) for split in splits
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import CustomDataset
from opencompass.datasets import generic_llmjudge_postprocess
from itertools import product

# Reader: model input is `question`; gold target lives in `answer`.
livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')


# Inference configuration
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\n',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()


splits = ['hard_cn', 'hard_en']


def _make_dataset(split, run_idx):
    """Build one dataset entry for the given (split, run) pair.

    Each run repeats the same split under a distinct abbr so that multiple
    independent samples can be collected and aggregated downstream.
    """
    return dict(
        type=CustomDataset,
        abbr=f'livemathbench_hard_custom_{split}_run{run_idx}',
        path='data/LiveMathBench',
        local_mode=True,
        file_name=f'202412/{split}.jsonl',
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=dict(
            # Evaluation configuration using LLM as judge
            evaluator=dict(
                type=GenericLLMEvaluator,
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(
                        begin=[
                            dict(
                                role='SYSTEM',
                                fallback_role='HUMAN',
                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                            )
                        ],
                        round=[
                            dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                        ],
                    ),
                ),
                # The judge reloads the dataset to see question + gold answer.
                dataset_cfg=dict(
                    type=CustomDataset,
                    path='data/LiveMathBench',
                    local_mode=True,
                    file_name=f'202412/{split}.jsonl',
                    reader_cfg=livemathbench_reader_cfg,
                ),
                judge_cfg={},
                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
            ),
        ),
    )


# Dataset configuration: 8 independent runs per split.
livemathbench_datasets = [
    _make_dataset(split, run_idx)
    for split, run_idx in product(splits, range(8))
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench-Hard (version 202412), cn+en, CoT prompting,
# pass@k with k=16 over n=48 generations.
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='',
    k=16,
    n=48,
    dataset_splits=['hard'],
    dataset_languages=['cn', 'en'],
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412-Hard',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench-Hard (version 202412), cn+en, CoT prompting,
# greedy decoding: a single sample (k=1, n=1).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='',
    k=1,
    n=1,
    dataset_splits=['hard'],
    dataset_languages=['cn', 'en'],
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412-Hard',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.livemathbench import LiveMathBenchDataset
from opencompass.datasets import generic_llmjudge_postprocess

# Reader: model input is `question`; gold target lives in `answer`.
livemathbench_reader_cfg = dict(
    input_columns=['question'], output_column='answer'
)


# Inference configuration
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\n',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()


splits = ['hard']
# Dataset configuration: one entry per split, graded by an LLM judge.
# NOTE(review): the inference dataset uses path 'opencompass/LiveMathBench'
# while the judge's dataset_cfg uses 'opencompass/LiveMathBench202412' —
# confirm this asymmetry is intentional.
livemathbench_datasets = [
    dict(
        type=LiveMathBenchDataset,
        abbr=f'livemathbench_{split}',
        path='opencompass/LiveMathBench',
        dataset_splits=[split],
        dataset_languages=['cn', 'en'],
        reader_cfg=livemathbench_reader_cfg,
        infer_cfg=livemathbench_infer_cfg,
        eval_cfg=dict(
            # Evaluation configuration using LLM as judge
            evaluator=dict(
                type=GenericLLMEvaluator,
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(
                        begin=[
                            dict(
                                role='SYSTEM',
                                fallback_role='HUMAN',
                                prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                            )
                        ],
                        round=[
                            dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                        ],
                    ),
                ),
                dataset_cfg=dict(
                    type=LiveMathBenchDataset,
                    path='opencompass/LiveMathBench202412',
                    dataset_splits=[split],
                    reader_cfg=livemathbench_reader_cfg,
                ),
                judge_cfg={},
                dict_postprocessor=dict(type=generic_llmjudge_postprocess),
            ),
        ),
    )
    for split in splits
]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_gen_9befbf.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=8192),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench (version 202505), English, all splits, CoT prompting,
# pass@k with k=16 over n=48 generations.
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    k=16,
    n=48,
    dataset_splits=['all'],
    dataset_languages=['en'],
    cot=True,
    version='202505',
    abbr='LiveMathBench-v202505',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_greedy_gen_9befbf.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=8192),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench (version 202505), English, all splits, CoT prompting,
# greedy decoding: a single sample (k=1, n=1).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    k=1,
    n=1,
    dataset_splits=['all'],
    dataset_languages=['en'],
    cot=True,
    version='202505',
    abbr='LiveMathBench-v202505',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_gen_353ae7.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench-Hard (version 202505), English, CoT prompting,
# pass@k with k=16 over n=48 generations.
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    k=16,
    n=48,
    dataset_splits=['hard'],
    dataset_languages=['en'],
    cot=True,
    version='202505',
    abbr='LiveMathBench-v202505-Hard',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livemathbench/livemathbench_v202505_hard_greedy_gen_353ae7.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator

# Sub-configs factored out for readability; composed into the dataset below.
_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='answer',
)

_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[dict(role='HUMAN', prompt='{prompt}')]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# model_name/url are left blank here and are expected to be filled in by the
# runner configuration — TODO confirm against how this config is consumed.
_eval_cfg = dict(
    evaluator=dict(type=LiveMathBenchEvaluator, model_name='', url=[]),
)

# LiveMathBench-Hard (version 202505), English, CoT prompting,
# greedy decoding: a single sample (k=1, n=1).
livemathbench_dataset = dict(
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    k=1,
    n=1,
    dataset_splits=['hard'],
    dataset_languages=['en'],
    cot=True,
    version='202505',
    abbr='LiveMathBench-v202505-Hard',
    reader_cfg=_reader_cfg,
    infer_cfg=_infer_cfg,
    eval_cfg=_eval_cfg,
)
livemathbench_datasets = [livemathbench_dataset]
|
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Indirection shim: re-export the datasets from the pinned config variant so
# downstream configs can import the version-agnostic module name.
from mmengine.config import read_base

with read_base():
    # NOTE(review): the re-exported symbol is `simpleqa_datasets`, not a
    # livereasonbench-named list — confirm this matches what consumers expect.
    from .livereasonbench_gen_0283c3 import simpleqa_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_gen_f990de.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 5 |
+
# from opencompass.datasets import SimpleQADataset, simpleqa_postprocess
|
| 6 |
+
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 89 |
+
|
| 90 |
+
livereasonbench_infer_cfg = dict(
|
| 91 |
+
prompt_template=dict(
|
| 92 |
+
type=PromptTemplate,
|
| 93 |
+
template=dict(
|
| 94 |
+
round=[
|
| 95 |
+
dict(role='HUMAN', prompt='Question: {question}\n'),
|
| 96 |
+
],
|
| 97 |
+
)),
|
| 98 |
+
retriever=dict(type=ZeroRetriever),
|
| 99 |
+
inferencer=dict(type=GenInferencer, max_out_len=16384))
|
| 100 |
+
|
| 101 |
+
livereasonbench_eval_cfg = dict(
|
| 102 |
+
evaluator=dict(
|
| 103 |
+
type=LMEvaluator,
|
| 104 |
+
prompt_template=dict(
|
| 105 |
+
type=PromptTemplate,
|
| 106 |
+
template=dict(
|
| 107 |
+
begin=[
|
| 108 |
+
dict(
|
| 109 |
+
role='SYSTEM',
|
| 110 |
+
fallback_role='HUMAN',
|
| 111 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 112 |
+
],
|
| 113 |
+
round=[
|
| 114 |
+
dict(
|
| 115 |
+
role='HUMAN',
|
| 116 |
+
prompt = GRADER_TEMPLATE
|
| 117 |
+
),
|
| 118 |
+
]),
|
| 119 |
+
),
|
| 120 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 121 |
+
),
|
| 122 |
+
pred_role='BOT',
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
livereasonbench_datasets = [
|
| 126 |
+
dict(
|
| 127 |
+
abbr='LiveReasonBench-20241202',
|
| 128 |
+
type=LiveReasonBenchDataset,
|
| 129 |
+
path='opencompass/LiveReasonBench',
|
| 130 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 131 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 132 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 133 |
+
version='livereasonbench-20241202',
|
| 134 |
+
mode='singlescore',
|
| 135 |
+
)
|
| 136 |
+
]
|
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_gen_f990de.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 89 |
+
|
| 90 |
+
livereasonbench_infer_cfg = dict(
|
| 91 |
+
prompt_template=dict(
|
| 92 |
+
type=PromptTemplate,
|
| 93 |
+
template=dict(
|
| 94 |
+
round=[
|
| 95 |
+
dict(role='HUMAN', prompt='Question: {question}\n'),
|
| 96 |
+
],
|
| 97 |
+
)),
|
| 98 |
+
retriever=dict(type=ZeroRetriever),
|
| 99 |
+
inferencer=dict(type=GenInferencer, max_out_len=16384))
|
| 100 |
+
|
| 101 |
+
livereasonbench_eval_cfg = dict(
|
| 102 |
+
evaluator=dict(
|
| 103 |
+
type=GenericLLMEvaluator,
|
| 104 |
+
prompt_template=dict(
|
| 105 |
+
type=PromptTemplate,
|
| 106 |
+
template=dict(
|
| 107 |
+
begin=[
|
| 108 |
+
dict(
|
| 109 |
+
role='SYSTEM',
|
| 110 |
+
fallback_role='HUMAN',
|
| 111 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 112 |
+
],
|
| 113 |
+
round=[
|
| 114 |
+
dict(
|
| 115 |
+
role='HUMAN',
|
| 116 |
+
prompt = GRADER_TEMPLATE
|
| 117 |
+
),
|
| 118 |
+
]),
|
| 119 |
+
),
|
| 120 |
+
dataset_cfg=dict(
|
| 121 |
+
type=LiveReasonBenchDataset,
|
| 122 |
+
path='opencompass/LiveReasonBench',
|
| 123 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 124 |
+
),
|
| 125 |
+
judge_cfg=dict(),
|
| 126 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 127 |
+
),
|
| 128 |
+
pred_role='BOT',
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
livereasonbench_datasets = [
|
| 132 |
+
dict(
|
| 133 |
+
abbr='LiveReasonBench-20241202',
|
| 134 |
+
type=LiveReasonBenchDataset,
|
| 135 |
+
path='opencompass/LiveReasonBench',
|
| 136 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 137 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 138 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 139 |
+
version='livereasonbench-20241202',
|
| 140 |
+
mode='singlescore',
|
| 141 |
+
)
|
| 142 |
+
]
|
build/lib/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 89 |
+
|
| 90 |
+
livereasonbench_infer_cfg = dict(
|
| 91 |
+
prompt_template=dict(
|
| 92 |
+
type=PromptTemplate,
|
| 93 |
+
template=dict(
|
| 94 |
+
round=[
|
| 95 |
+
dict(role='HUMAN', prompt='Question: {question}\n'),
|
| 96 |
+
],
|
| 97 |
+
)),
|
| 98 |
+
retriever=dict(type=ZeroRetriever),
|
| 99 |
+
inferencer=dict(type=GenInferencer))
|
| 100 |
+
|
| 101 |
+
livereasonbench_eval_cfg = dict(
|
| 102 |
+
evaluator=dict(
|
| 103 |
+
type=GenericLLMEvaluator,
|
| 104 |
+
prompt_template=dict(
|
| 105 |
+
type=PromptTemplate,
|
| 106 |
+
template=dict(
|
| 107 |
+
begin=[
|
| 108 |
+
dict(
|
| 109 |
+
role='SYSTEM',
|
| 110 |
+
fallback_role='HUMAN',
|
| 111 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 112 |
+
],
|
| 113 |
+
round=[
|
| 114 |
+
dict(
|
| 115 |
+
role='HUMAN',
|
| 116 |
+
prompt = GRADER_TEMPLATE
|
| 117 |
+
),
|
| 118 |
+
]),
|
| 119 |
+
),
|
| 120 |
+
dataset_cfg=dict(
|
| 121 |
+
type=LiveReasonBenchDataset,
|
| 122 |
+
path='opencompass/LiveReasonBench',
|
| 123 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 124 |
+
version='livereasonbench-20250428',
|
| 125 |
+
),
|
| 126 |
+
judge_cfg=dict(),
|
| 127 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 128 |
+
),
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
livereasonbench_datasets = [
|
| 132 |
+
dict(
|
| 133 |
+
abbr='LiveReasonBench-20250428',
|
| 134 |
+
type=LiveReasonBenchDataset,
|
| 135 |
+
path='opencompass/LiveReasonBench',
|
| 136 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 137 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 138 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 139 |
+
version='livereasonbench-20250428',
|
| 140 |
+
n=1
|
| 141 |
+
)
|
| 142 |
+
]
|
build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_gen_2e6d10.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_subsets = {
|
| 89 |
+
'biology': 'livestembench_bio',
|
| 90 |
+
'chemistry': 'livestembench_che',
|
| 91 |
+
'physics': 'livestembench_phy',
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
livestembench_datasets = []
|
| 95 |
+
|
| 96 |
+
for name, subset in livereasonbench_subsets.items():
|
| 97 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 98 |
+
|
| 99 |
+
livereasonbench_infer_cfg = dict(
|
| 100 |
+
prompt_template=dict(
|
| 101 |
+
type=PromptTemplate,
|
| 102 |
+
template=dict(
|
| 103 |
+
round=[
|
| 104 |
+
dict(role='HUMAN', prompt='问题: {question}\n 请回答这道问题'),
|
| 105 |
+
],
|
| 106 |
+
)),
|
| 107 |
+
retriever=dict(type=ZeroRetriever),
|
| 108 |
+
inferencer=dict(type=GenInferencer, max_out_len=8192))
|
| 109 |
+
|
| 110 |
+
livereasonbench_eval_cfg = dict(
|
| 111 |
+
evaluator=dict(
|
| 112 |
+
type=GenericLLMEvaluator,
|
| 113 |
+
prompt_template=dict(
|
| 114 |
+
type=PromptTemplate,
|
| 115 |
+
template=dict(
|
| 116 |
+
begin=[
|
| 117 |
+
dict(
|
| 118 |
+
role='SYSTEM',
|
| 119 |
+
fallback_role='HUMAN',
|
| 120 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 121 |
+
],
|
| 122 |
+
round=[
|
| 123 |
+
dict(
|
| 124 |
+
role='HUMAN',
|
| 125 |
+
prompt = GRADER_TEMPLATE
|
| 126 |
+
),
|
| 127 |
+
]),
|
| 128 |
+
),
|
| 129 |
+
dataset_cfg=dict(
|
| 130 |
+
type=LiveStemBenchDataset,
|
| 131 |
+
path='opencompass/livestembench',
|
| 132 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 133 |
+
version=subset,
|
| 134 |
+
),
|
| 135 |
+
judge_cfg=dict(),
|
| 136 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 137 |
+
),
|
| 138 |
+
pred_role='BOT',
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
livestembench_datasets.append(
|
| 142 |
+
dict(
|
| 143 |
+
abbr=f'LiveStemBench-{name}',
|
| 144 |
+
type=LiveStemBenchDataset,
|
| 145 |
+
path='opencompass/livestembench',
|
| 146 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 147 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 148 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 149 |
+
version=subset,
|
| 150 |
+
mode='singlescore',
|
| 151 |
+
)
|
| 152 |
+
)
|
build/lib/opencompass/configs/datasets/livestembench/livestembench_0shot_noncot_xml_gen_2e6d10.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess
|
| 7 |
+
from opencompass.utils import xml_tag_postprocessor
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
GRADER_TEMPLATE = """
|
| 11 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 12 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
The following are examples of CORRECT predicted answers.
|
| 16 |
+
```
|
| 17 |
+
Question: What are the names of Barack Obama's children?
|
| 18 |
+
Gold target: Malia Obama and Sasha Obama
|
| 19 |
+
Predicted answer 1: sasha and malia obama
|
| 20 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 21 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 22 |
+
```
|
| 23 |
+
These predicted answers are all CORRECT because:
|
| 24 |
+
- They fully contain the important information in the gold target.
|
| 25 |
+
- They do not contain any information that contradicts the gold target.
|
| 26 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 27 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
The following are examples of INCORRECT predicted answers.
|
| 31 |
+
```
|
| 32 |
+
Question: What are the names of Barack Obama's children?
|
| 33 |
+
Gold target: Malia and Sasha
|
| 34 |
+
Predicted answer 1: Malia.
|
| 35 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 36 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 37 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 38 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 39 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 40 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 41 |
+
```
|
| 42 |
+
These predicted answers are all INCORRECT because:
|
| 43 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 47 |
+
```
|
| 48 |
+
Question: What are the names of Barack Obama's children?
|
| 49 |
+
Gold target: Malia and Sasha
|
| 50 |
+
Predicted answer 1: I don't know.
|
| 51 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 52 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 53 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 54 |
+
```
|
| 55 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 56 |
+
- The important information in the gold target is not included in the answer.
|
| 57 |
+
- No statements in the answer contradict the gold target.
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
Also note the following things:
|
| 61 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 62 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 63 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 64 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 65 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 66 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 67 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 68 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 69 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 70 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 71 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 72 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 73 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 74 |
+
|
| 75 |
+
Grade the predicted answer of this new question as one of:
|
| 76 |
+
A: CORRECT
|
| 77 |
+
B: INCORRECT
|
| 78 |
+
C: NOT_ATTEMPTED
|
| 79 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 80 |
+
|
| 81 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 82 |
+
```
|
| 83 |
+
Question: {question}
|
| 84 |
+
Gold target: {answer}
|
| 85 |
+
Predicted answer: {prediction}
|
| 86 |
+
```
|
| 87 |
+
""".strip()
|
| 88 |
+
|
| 89 |
+
livereasonbench_subsets = {
|
| 90 |
+
'biology': 'livestembench_bio',
|
| 91 |
+
'chemistry': 'livestembench_che',
|
| 92 |
+
'physics': 'livestembench_phy',
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
livestembench_datasets = []
|
| 96 |
+
|
| 97 |
+
for name, subset in livereasonbench_subsets.items():
|
| 98 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 99 |
+
|
| 100 |
+
livereasonbench_infer_cfg = dict(
|
| 101 |
+
prompt_template=dict(
|
| 102 |
+
type=PromptTemplate,
|
| 103 |
+
template=dict(
|
| 104 |
+
round=[
|
| 105 |
+
dict(role='HUMAN', prompt='问题: {question}\n 请回答这道问题'),
|
| 106 |
+
],
|
| 107 |
+
)),
|
| 108 |
+
retriever=dict(type=ZeroRetriever),
|
| 109 |
+
inferencer=dict(type=GenInferencer, max_out_len=8192))
|
| 110 |
+
|
| 111 |
+
livereasonbench_eval_cfg = dict(
|
| 112 |
+
evaluator=dict(
|
| 113 |
+
type=GenericLLMEvaluator,
|
| 114 |
+
prompt_template=dict(
|
| 115 |
+
type=PromptTemplate,
|
| 116 |
+
template=dict(
|
| 117 |
+
begin=[
|
| 118 |
+
dict(
|
| 119 |
+
role='SYSTEM',
|
| 120 |
+
fallback_role='HUMAN',
|
| 121 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 122 |
+
],
|
| 123 |
+
round=[
|
| 124 |
+
dict(
|
| 125 |
+
role='HUMAN',
|
| 126 |
+
prompt = GRADER_TEMPLATE
|
| 127 |
+
),
|
| 128 |
+
]),
|
| 129 |
+
),
|
| 130 |
+
dataset_cfg=dict(
|
| 131 |
+
type=LiveStemBenchDataset,
|
| 132 |
+
path='opencompass/livestembench',
|
| 133 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 134 |
+
version=subset,
|
| 135 |
+
),
|
| 136 |
+
judge_cfg=dict(),
|
| 137 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 138 |
+
pred_postprocessor=dict(type=xml_tag_postprocessor, tag='<conclude>'),
|
| 139 |
+
|
| 140 |
+
),
|
| 141 |
+
pred_role='BOT',
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
livestembench_datasets.append(
|
| 145 |
+
dict(
|
| 146 |
+
abbr=f'LiveStemBench-{name}',
|
| 147 |
+
type=LiveStemBenchDataset,
|
| 148 |
+
path='opencompass/livestembench',
|
| 149 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 150 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 151 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 152 |
+
version=subset,
|
| 153 |
+
mode='singlescore',
|
| 154 |
+
)
|
| 155 |
+
)
|
build/lib/opencompass/configs/datasets/livestembench/livestembench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .livestembench_gen_3e3c50 import livestembench_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/livestembench/livestembench_gen_3e3c50.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 5 |
+
from opencompass.evaluator import GenericLLMEvaluator
|
| 6 |
+
from opencompass.datasets import LiveStemBenchDataset, livereasonbench_postprocess
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GRADER_TEMPLATE = """
|
| 10 |
+
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
|
| 11 |
+
First, I will give examples of each grade, and then you will grade a new example.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
The following are examples of CORRECT predicted answers.
|
| 15 |
+
```
|
| 16 |
+
Question: What are the names of Barack Obama's children?
|
| 17 |
+
Gold target: Malia Obama and Sasha Obama
|
| 18 |
+
Predicted answer 1: sasha and malia obama
|
| 19 |
+
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
|
| 20 |
+
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
|
| 21 |
+
```
|
| 22 |
+
These predicted answers are all CORRECT because:
|
| 23 |
+
- They fully contain the important information in the gold target.
|
| 24 |
+
- They do not contain any information that contradicts the gold target.
|
| 25 |
+
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
|
| 26 |
+
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The following are examples of INCORRECT predicted answers.
|
| 30 |
+
```
|
| 31 |
+
Question: What are the names of Barack Obama's children?
|
| 32 |
+
Gold target: Malia and Sasha
|
| 33 |
+
Predicted answer 1: Malia.
|
| 34 |
+
Predicted answer 2: Malia, Sasha, and Susan.
|
| 35 |
+
Predicted answer 3: Barack Obama does not have any children.
|
| 36 |
+
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
|
| 37 |
+
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
|
| 38 |
+
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
|
| 39 |
+
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
|
| 40 |
+
```
|
| 41 |
+
These predicted answers are all INCORRECT because:
|
| 42 |
+
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
The following are examples of NOT_ATTEMPTED predicted answers.
|
| 46 |
+
```
|
| 47 |
+
Question: What are the names of Barack Obama's children?
|
| 48 |
+
Gold target: Malia and Sasha
|
| 49 |
+
Predicted answer 1: I don't know.
|
| 50 |
+
Predicted answer 2: I need more context about which Obama you are talking about.
|
| 51 |
+
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
|
| 52 |
+
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
|
| 53 |
+
```
|
| 54 |
+
These predicted answers are all NOT_ATTEMPTED because:
|
| 55 |
+
- The important information in the gold target is not included in the answer.
|
| 56 |
+
- No statements in the answer contradict the gold target.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
Also note the following things:
|
| 60 |
+
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
|
| 61 |
+
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
|
| 62 |
+
- Predicted answers "100k" and "113k" are INCORRECT.
|
| 63 |
+
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
|
| 64 |
+
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
|
| 65 |
+
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
|
| 66 |
+
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
|
| 67 |
+
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
|
| 68 |
+
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
|
| 69 |
+
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
|
| 70 |
+
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
|
| 71 |
+
- Do not punish for typos in people's name if it's clearly the same name.
|
| 72 |
+
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
|
| 73 |
+
|
| 74 |
+
Grade the predicted answer of this new question as one of:
|
| 75 |
+
A: CORRECT
|
| 76 |
+
B: INCORRECT
|
| 77 |
+
C: NOT_ATTEMPTED
|
| 78 |
+
Just return the letters "A", "B", or "C", with no text around it.
|
| 79 |
+
|
| 80 |
+
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
| 81 |
+
```
|
| 82 |
+
Question: {question}
|
| 83 |
+
Gold target: {answer}
|
| 84 |
+
Predicted answer: {prediction}
|
| 85 |
+
```
|
| 86 |
+
""".strip()
|
| 87 |
+
|
| 88 |
+
livereasonbench_subsets = {
|
| 89 |
+
'biology': 'livestembench_bio',
|
| 90 |
+
'chemistry': 'livestembench_che',
|
| 91 |
+
'physics': 'livestembench_phy',
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
livestembench_datasets = []
|
| 95 |
+
|
| 96 |
+
for name, subset in livereasonbench_subsets.items():
|
| 97 |
+
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 98 |
+
|
| 99 |
+
livereasonbench_infer_cfg = dict(
|
| 100 |
+
prompt_template=dict(
|
| 101 |
+
type=PromptTemplate,
|
| 102 |
+
template=dict(
|
| 103 |
+
round=[
|
| 104 |
+
dict(role='HUMAN', prompt='问题: {question}\n请逐步思考,并给出最终答案,答案放在 \\boxed{{}} 中。'),
|
| 105 |
+
],
|
| 106 |
+
)),
|
| 107 |
+
retriever=dict(type=ZeroRetriever),
|
| 108 |
+
inferencer=dict(type=GenInferencer, max_out_len=8192))
|
| 109 |
+
|
| 110 |
+
livereasonbench_eval_cfg = dict(
|
| 111 |
+
evaluator=dict(
|
| 112 |
+
type=GenericLLMEvaluator,
|
| 113 |
+
prompt_template=dict(
|
| 114 |
+
type=PromptTemplate,
|
| 115 |
+
template=dict(
|
| 116 |
+
begin=[
|
| 117 |
+
dict(
|
| 118 |
+
role='SYSTEM',
|
| 119 |
+
fallback_role='HUMAN',
|
| 120 |
+
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
| 121 |
+
],
|
| 122 |
+
round=[
|
| 123 |
+
dict(
|
| 124 |
+
role='HUMAN',
|
| 125 |
+
prompt = GRADER_TEMPLATE
|
| 126 |
+
),
|
| 127 |
+
]),
|
| 128 |
+
),
|
| 129 |
+
dataset_cfg=dict(
|
| 130 |
+
type=LiveStemBenchDataset,
|
| 131 |
+
path='opencompass/livestembench',
|
| 132 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 133 |
+
version=subset,
|
| 134 |
+
),
|
| 135 |
+
judge_cfg=dict(),
|
| 136 |
+
dict_postprocessor=dict(type=livereasonbench_postprocess),
|
| 137 |
+
),
|
| 138 |
+
pred_role='BOT',
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
livestembench_datasets.append(
|
| 142 |
+
dict(
|
| 143 |
+
abbr=f'LiveStemBench-{name}',
|
| 144 |
+
type=LiveStemBenchDataset,
|
| 145 |
+
path='opencompass/livestembench',
|
| 146 |
+
reader_cfg=livereasonbench_reader_cfg,
|
| 147 |
+
infer_cfg=livereasonbench_infer_cfg,
|
| 148 |
+
eval_cfg=livereasonbench_eval_cfg,
|
| 149 |
+
version=subset,
|
| 150 |
+
mode='singlescore',
|
| 151 |
+
)
|
| 152 |
+
)
|
build/lib/opencompass/configs/datasets/llm_compression/README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LLM Compression
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
The following introduction comes from the abstract of [Compression Represents Intelligence Linearly](https://arxiv.org/abs/2404.09937):
|
| 6 |
+
|
| 7 |
+
>There is a belief that learning to compress well will lead to intelligence. Recently, language modeling has been shown to be equivalent to compression, which offers a compelling rationale for the success of large language models (LLMs): the development of more advanced language models is essentially enhancing compression which facilitates intelligence. ...our findings suggest that compression efficiency, as an unsupervised metric derived from raw text corpora, serves as a reliable evaluation measure that is linearly associated with the model capabilities. We open-source our compression datasets as well as our data collection pipelines to facilitate future researchers to assess compression properly.
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
## Official Links
|
| 11 |
+
|
| 12 |
+
- Paper: [Compression Represents Intelligence Linearly](https://arxiv.org/abs/2404.09937)
|
| 13 |
+
- GitHub Repository: [llm-compression-intelligence](https://github.com/hkust-nlp/llm-compression-intelligence)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
## Overview and Usage
|
| 17 |
+
|
| 18 |
+
### Dataset
|
| 19 |
+
The dataset, which consists of three external corpora, can be downloaded using the following python script:
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
from os import os.path as osp
|
| 23 |
+
from datasets import load_dataset
|
| 24 |
+
|
| 25 |
+
data_path = "data/llm-compression"
|
| 26 |
+
|
| 27 |
+
subset_mapping = {
|
| 28 |
+
'arxiv_math': ['arxiv_math'],
|
| 29 |
+
'commoncraw': ['cc'],
|
| 30 |
+
'python': ['python'],
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
for key, value in subset_mapping.items():
|
| 34 |
+
llmc_dataset = load_dataset(r"hkust-nlp/llm-compression", name=value)
|
| 35 |
+
llmc_dataset["test"].to_json(osp.join(data_path, f"{key}.jsonl"))
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
Note: Refer to the original [repository](https://github.com/hkust-nlp/llm-compression-intelligence) for more details on data collection and design.
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
### Inference
|
| 42 |
+
|
| 43 |
+
The inference stage (`SWCELossInferencer`) consists of the following key steps:
|
| 44 |
+
|
| 45 |
+
1. For each candidate model, we obtain the encodings of each sample of the dataset using its tokenizer.
|
| 46 |
+
2. Concatenate the encodings of all samples into a single array and construct a PyTorch Dataset. Each item of `__getitem__` is a chunk of the array based on a sliding window. To reproduce results from the original paper, set `block_size=1900` and `stride=512`.
|
| 47 |
+
3. For each batch, calculate the cross entropy loss based on model logits and targets. The losses within each batch is reduced to a single loss by summation.
|
| 48 |
+
4. Output the losses and `total_chr_num` to `BPCEvaluator` for evaluation.
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
### Evaluation
|
| 52 |
+
|
| 53 |
+
`BPCEvaluator`: Using the total loss for each batch and the total number of characters in the original dataset from the inference stage, calculate the Bits per Character (BPC) metric for each model:
|
| 54 |
+
|
| 55 |
+
$$ BPC = \frac{TotalCrossEntropyLoss}{TotalCharacterNumber*log(2)} $$
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
### Summarization
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
### Config Files
|
| 63 |
+
|
| 64 |
+
1. Dataset config: `configs/datasets/llm-compression.py`
|
| 65 |
+
2. Evaluation config: `examples/eval_llm_compression.py`
|
| 66 |
+
|
| 67 |
+
## Evaluation Results
|
| 68 |
+
```
|
| 69 |
+
metric version model commoncraw python arxiv_math average
|
| 70 |
+
0 bpc af04af qwen1.5-32b-hf 0.5910 0.2584 0.4080 0.4191
|
| 71 |
+
1 bpc af04af qwen1.5-14b-hf 0.6459 0.2766 0.4310 0.4512
|
| 72 |
+
2 bpc af04af qwen-14b-hf 0.6197 0.2849 0.4498 0.4515
|
| 73 |
+
3 bpc af04af llama-30b-hf 0.5773 0.3212 0.4562 0.4516
|
| 74 |
+
4 bpc af04af llama-2-13b-hf 0.5807 0.3336 0.4752 0.4632
|
| 75 |
+
5 bpc af04af qwen1.5-7b-hf 0.6658 0.2935 0.4500 0.4698
|
| 76 |
+
6 bpc af04af qwen-7b-hf 0.6453 0.3088 0.4830 0.4790
|
| 77 |
+
7 bpc af04af llama-13b-hf 0.6083 0.3555 0.4865 0.4834
|
| 78 |
+
8 bpc af04af llama-2-7b-hf 0.6117 0.3536 0.4995 0.4883
|
| 79 |
+
9 bpc af04af llama-7b-hf 0.6285 0.3794 0.5096 0.5058
|
| 80 |
+
10 bpc af04af qwen1.5-1.8b-hf 0.7448 0.4029 0.5625 0.5701
|
| 81 |
+
11 bpc af04af qwen-1.8b-hf 0.7542 0.4175 0.5842 0.5853
|
| 82 |
+
12 bpc af04af qwen1.5-0.5b-hf 0.8102 0.4520 0.6181 0.6268
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
## FAQ
|
| 87 |
+
|
| 88 |
+
### I am getting this warning during inference, should I truncate long samples to `max_seq_len` to avoid further errors?
|
| 89 |
+
```
|
| 90 |
+
Token indices sequence length is longer than the specified maximum sequence length for this model. Running this sequence through the model will result in indexing errors
|
| 91 |
+
```
|
| 92 |
+
>A: This warning comes from the tokenizer indicating that the input sequence length exceeds the model's input length, but it does not affect the operation of the tokenizer. For loss calculation, as long as we set a `block_size` of the sliding window less than `max_seq_len`, we can safely ignore this warning.
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
## Reference
|
| 96 |
+
```
|
| 97 |
+
@misc{huang2024compression,
|
| 98 |
+
title={Compression Represents Intelligence Linearly},
|
| 99 |
+
author={Yuzhen Huang and Jinghan Zhang and Zifei Shan and Junxian He},
|
| 100 |
+
year={2024},
|
| 101 |
+
eprint={2404.09937},
|
| 102 |
+
archivePrefix={arXiv},
|
| 103 |
+
primaryClass={cs.CL}
|
| 104 |
+
}
|
| 105 |
+
```
|
build/lib/opencompass/configs/datasets/llm_compression/llm_compression.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import SWCELossInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import BPCEvaluator
|
| 5 |
+
from opencompass.datasets import LLMCompressionDataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# The three corpora for llm_compression used in the original paper
|
| 9 |
+
# See configs/datasets/llm_compression/README.md for more details
|
| 10 |
+
subset_mapping = {
|
| 11 |
+
'arxiv_math': ['arxiv_math'],
|
| 12 |
+
'commoncraw': ['cc'],
|
| 13 |
+
'python': ['python'],
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Build LLM Compression datasets
|
| 18 |
+
llm_compression_datasets = []
|
| 19 |
+
for _name in subset_mapping.keys():
|
| 20 |
+
llm_cmp_infer_cfg = dict(
|
| 21 |
+
prompt_template=dict(
|
| 22 |
+
type=PromptTemplate,
|
| 23 |
+
template='{content}',
|
| 24 |
+
),
|
| 25 |
+
# No in-context example, using ZeroRetriever
|
| 26 |
+
retriever=dict(type=ZeroRetriever),
|
| 27 |
+
# Calculates cross entropy loss for each batch based on a sliding context window
|
| 28 |
+
# Setting block_size=1900 and stride=512 according to the original paper
|
| 29 |
+
inferencer=dict(type=SWCELossInferencer, block_size=1900, stride=512),
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
# Calculates Bits per Character (BPC) based on the CE loss from the inference stage
|
| 33 |
+
llm_cmp_eval_cfg = dict(evaluator=dict(type=BPCEvaluator))
|
| 34 |
+
|
| 35 |
+
llm_compression_datasets.append(
|
| 36 |
+
dict(
|
| 37 |
+
abbr=f'llm_compression-{_name}',
|
| 38 |
+
type=LLMCompressionDataset,
|
| 39 |
+
path='./data/llm-compression',
|
| 40 |
+
name=_name,
|
| 41 |
+
samples=None, # Set small samples for testing
|
| 42 |
+
reader_cfg=dict(
|
| 43 |
+
input_columns=['content'],
|
| 44 |
+
output_column=None,
|
| 45 |
+
),
|
| 46 |
+
infer_cfg=llm_cmp_infer_cfg,
|
| 47 |
+
eval_cfg=llm_cmp_eval_cfg,
|
| 48 |
+
))
|
| 49 |
+
|
| 50 |
+
del _name
|
build/lib/opencompass/configs/datasets/longbench/longbench.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench2wikimqa.longbench_2wikimqa_gen import LongBench_2wikimqa_datasets
|
| 5 |
+
from .longbenchhotpotqa.longbench_hotpotqa_gen import LongBench_hotpotqa_datasets
|
| 6 |
+
from .longbenchmusique.longbench_musique_gen import LongBench_musique_datasets
|
| 7 |
+
from .longbenchmultifieldqa_en.longbench_multifieldqa_en_gen import LongBench_multifieldqa_en_datasets
|
| 8 |
+
from .longbenchmultifieldqa_zh.longbench_multifieldqa_zh_gen import LongBench_multifieldqa_zh_datasets
|
| 9 |
+
from .longbenchnarrativeqa.longbench_narrativeqa_gen import LongBench_narrativeqa_datasets
|
| 10 |
+
from .longbenchqasper.longbench_qasper_gen import LongBench_qasper_datasets
|
| 11 |
+
from .longbenchtriviaqa.longbench_triviaqa_gen import LongBench_triviaqa_datasets
|
| 12 |
+
from .longbenchgov_report.longbench_gov_report_gen import LongBench_gov_report_datasets
|
| 13 |
+
from .longbenchqmsum.longbench_qmsum_gen import LongBench_qmsum_datasets
|
| 14 |
+
from .longbenchvcsum.longbench_vcsum_gen import LongBench_vcsum_datasets
|
| 15 |
+
from .longbenchdureader.longbench_dureader_gen import LongBench_dureader_datasets
|
| 16 |
+
from .longbenchlcc.longbench_lcc_gen import LongBench_lcc_datasets
|
| 17 |
+
from .longbenchrepobench.longbench_repobench_gen import LongBench_repobench_datasets
|
| 18 |
+
from .longbenchpassage_retrieval_en.longbench_passage_retrieval_en_gen import LongBench_passage_retrieval_en_datasets
|
| 19 |
+
from .longbenchpassage_retrieval_zh.longbench_passage_retrieval_zh_gen import LongBench_passage_retrieval_zh_datasets
|
| 20 |
+
from .longbenchpassage_count.longbench_passage_count_gen import LongBench_passage_count_datasets
|
| 21 |
+
from .longbenchtrec.longbench_trec_gen import LongBench_trec_datasets
|
| 22 |
+
from .longbenchlsht.longbench_lsht_gen import LongBench_lsht_datasets
|
| 23 |
+
from .longbenchmulti_news.longbench_multi_news_gen import LongBench_multi_news_datasets
|
| 24 |
+
from .longbenchsamsum.longbench_samsum_gen import LongBench_samsum_datasets
|
| 25 |
+
|
| 26 |
+
longbench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbenchv2_gen_75fbba import LongBenchv2_datasets
|
build/lib/opencompass/configs/datasets/longbenchv2/longbenchv2_gen_75fbba.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import LongBenchv2Dataset, LongBenchv2Evaluator
|
| 5 |
+
from opencompass.utils.text_postprocessors import first_option_postprocess
|
| 6 |
+
|
| 7 |
+
LongBenchv2_reader_cfg = dict(
|
| 8 |
+
input_columns=['context', 'question', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'difficulty', 'length'],
|
| 9 |
+
output_column='answer',
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
LongBenchv2_infer_cfg = dict(
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
type=PromptTemplate,
|
| 15 |
+
template=dict(
|
| 16 |
+
round=[
|
| 17 |
+
dict(
|
| 18 |
+
role='HUMAN',
|
| 19 |
+
prompt='Please read the following text and answer the questions below.\n <text> \n {context} \n </text> \n \n What is the correct answer to this question: {question} \n \n Choices: \n (A) {choice_A} \n (B) {choice_B} \n (C) {choice_C} \n (D) {choice_D} \n Let’s think step by step. Based on the above, what is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)',
|
| 20 |
+
),
|
| 21 |
+
],
|
| 22 |
+
),
|
| 23 |
+
),
|
| 24 |
+
retriever=dict(type=ZeroRetriever),
|
| 25 |
+
inferencer=dict(type=GenInferencer),
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
LongBenchv2_eval_cfg = dict(
|
| 29 |
+
evaluator=dict(type=LongBenchv2Evaluator),
|
| 30 |
+
pred_role='BOT',
|
| 31 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
LongBenchv2_datasets = [
|
| 35 |
+
dict(
|
| 36 |
+
type=LongBenchv2Dataset,
|
| 37 |
+
abbr='LongBenchv2',
|
| 38 |
+
path='opencompass/longbenchv2',
|
| 39 |
+
reader_cfg=LongBenchv2_reader_cfg,
|
| 40 |
+
infer_cfg=LongBenchv2_infer_cfg,
|
| 41 |
+
eval_cfg=LongBenchv2_eval_cfg,
|
| 42 |
+
)
|
| 43 |
+
]
|
build/lib/opencompass/configs/datasets/lveval/lveval.md
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LVEval
|
| 2 |
+
## Introduction
|
| 3 |
+
The following introduction comes from the introduction in [LVEval](https://github.com/infinigence/LVEval)
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
LV-Eval是一个具备5个长度等级(16k、32k、64k、128k和256k)、最大文本测试长度达到256k的长文本评测基准。LV-Eval的平均文本长度达到102,380字,最小/最大文本长度为11,896/387,406字。LV-Eval主要有两类评测任务——单跳QA和多跳QA,共包含11个涵盖中英文的评测数据子集。LV-Eval设计时引入3个关键技术:干扰事实插入(Confusiong Facts Insertion,CFI)提高挑战性,关键词和短语替换(Keyword and Phrase Replacement,KPR)减少信息泄漏,以及基于关键词召回的评测指标(Answer Keywords,AK,指代结合答案关键词和字词黑名单的评价指标)提高评测数值客观性。我们希望LV-Eval为未来长文本大语言模型的研究发展提供有价值的性能参考。
|
| 7 |
+
LV-Eval is a challenging long-context benchmark with five length levels (16k, 32k, 64k, 128k, and 256k) reaching up to 256k words. The average number of words is 102,380, and the Min/Max number of words is 11,896/387,406. LV-Eval features two main tasks, single-hop QA and multi-hop QA, comprising 11 bilingual datasets. The design of LV-Eval has incorporated three key techniques, namely confusing facts insertion (CFI), keyword and phrase replacement (KPR), and keyword-recall-based metrics (AK, short for metics with Answer Keywords and word blacklist) design, which jointly provide a challenging, mitigated-knowledge-leakege, and more accurate evaluation of the long-context capability of LLMs. We anticipate that LV-Eval will serve as a valuable resource for supporting future research on long-context LLMs.
|
| 8 |
+
```
|
| 9 |
+
|
| 10 |
+
## Official link
|
| 11 |
+
|
| 12 |
+
### Paper
|
| 13 |
+
|
| 14 |
+
[_LV_-Eval: A Balanced Long-Context Benchmark with 5 Length Levels Up to 256K](https://arxiv.org/abs/2402.05136)
|
| 15 |
+
|
| 16 |
+
### Repository
|
| 17 |
+
|
| 18 |
+
[LVEval](https://github.com/infinigence/LVEval)
|
| 19 |
+
|
| 20 |
+
## Use cases
|
| 21 |
+
|
| 22 |
+
In evaluation scripts, add LVEval dataset as other datasets by using
|
| 23 |
+
```
|
| 24 |
+
from .datasets.lveval.lveval import LVEval_datasets as datasets
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
## Examples
|
| 28 |
+
Input example I (from lic_mixup datasets):
|
| 29 |
+
```
|
| 30 |
+
请根据下面给定的文章回答问题,问题和答案只与其中一篇文章有关。
|
| 31 |
+
|
| 32 |
+
文章:......文章 9\n\n标题:腐质酸\n内容:腐植酸是自然界中广泛存在的大分子有机物质,广泛应用于农林牧、石油、化工、建材、医药卫生、环保等各个领域。横跨几十个行业。特别是眼下提倡生态农业建设、无公害农业生产、绿色食品、无污染环保产品等,更使\"腐植酸\"备受推崇,事实证明,人类的生活和生存离不开腐植酸,它的确是一个发展中的有希望的朝阳产业,属于一个新型的特殊行业......
|
| 33 |
+
|
| 34 |
+
请现在基于上述文章回答下面的问题,问题和答案只与其中一篇文章有关。
|
| 35 |
+
|
| 36 |
+
问题:中国的文学受到印度哪些方面的影响?
|
| 37 |
+
回答:
|
| 38 |
+
```
|
| 39 |
+
Output example I (from chatglm3-6b-32k):
|
| 40 |
+
```
|
| 41 |
+
中国文学自印度文学大量吸收营养,在佛教东流之后,从语汇到修辞,从题材到体裁,即便审美取向也深受佛教与印度文学的感染。
|
| 42 |
+
```
|
| 43 |
+
Input example II (from factrecall_zh datasets):
|
| 44 |
+
```
|
| 45 |
+
请基于给定的文章回答下述问题。
|
| 46 |
+
|
| 47 |
+
文章:......庚子年间,贝多芬,乃一德裔美籍学士,研究于物理理学。彼其良图,探求相对论、量子力学,尤有大进。质能等价公式 E=mc²,千古独步,声名于当世。诺贝尔物理学奖、以资尊荣,兹矣荣耀之大典。论其学术,涉时空能量,影响深远,以其义非常人,广为当世所知,声名播于天下,实乃现代物理学之奠基者......
|
| 48 |
+
|
| 49 |
+
现在请基于上述文章回答下面的问题。
|
| 50 |
+
|
| 51 |
+
问题:被世人广泛推崇为现代物理学奠基人的科学家叫什么名字?
|
| 52 |
+
回答:
|
| 53 |
+
```
|
| 54 |
+
Output example II (from chatglm3-6b-32k):
|
| 55 |
+
```
|
| 56 |
+
贝多芬
|
| 57 |
+
```
|
| 58 |
+
## Evaluation results
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
dataset version metric mode bluelm-7b-chat-32k-hf
|
| 62 |
+
----------------------------------------- --------- ------------- ------ -----------------------
|
| 63 |
+
---------------------------------------- - - - -
|
| 64 |
+
--------- LVEval All --------- - - - -
|
| 65 |
+
---------------------------------------- - - - -
|
| 66 |
+
LVEval_qa - naive_average gen 12.00
|
| 67 |
+
---------------------------------------- - - - -
|
| 68 |
+
--------- LVEval Tasks All --------- - - - -
|
| 69 |
+
---------------------------------------- - - - -
|
| 70 |
+
LVEval_single_hop_qa - naive_average gen 15.11
|
| 71 |
+
LVEval_single_hop_cqa - naive_average gen 9.21
|
| 72 |
+
LVEval_multi_hop_qa - naive_average gen 6.99
|
| 73 |
+
LVEval_multi_hop_cqa - naive_average gen 9.90
|
| 74 |
+
LVEval_factrecall_cqa - naive_average gen 21.28
|
| 75 |
+
---------------------------------------- - - - -
|
| 76 |
+
--------- LVEval Datasets All --------- - - - -
|
| 77 |
+
---------------------------------------- - - - -
|
| 78 |
+
LVEval_loogle_SD_mixup - naive_average gen 12.81
|
| 79 |
+
LVEval_cmrc_mixup - naive_average gen 17.41
|
| 80 |
+
LVEval_multifieldqa_en_mixup - naive_average gen 7.10
|
| 81 |
+
LVEval_multifieldqa_zh_mixup - naive_average gen 11.31
|
| 82 |
+
LVEval_dureader_mixup - naive_average gen 13.19
|
| 83 |
+
LVEval_loogle_CR_mixup - naive_average gen 5.17
|
| 84 |
+
LVEval_loogle_MIR_mixup - naive_average gen 2.60
|
| 85 |
+
LVEval_hotpotwikiqa_mixup - naive_average gen 10.20
|
| 86 |
+
LVEval_lic_mixup - naive_average gen 9.60
|
| 87 |
+
LVEval_factrecall_en - naive_average gen 23.67
|
| 88 |
+
LVEval_factrecall_zh - naive_average gen 18.90
|
| 89 |
+
---------------------------------------- - - - -
|
| 90 |
+
--------- LVEval Single_Hop QA --------- - - - -
|
| 91 |
+
---------------------------------------- - - - -
|
| 92 |
+
LVEval_loogle_SD_mixup_16k 83bc25 LVEval_f1 gen 35.05
|
| 93 |
+
LVEval_loogle_SD_mixup_32k 83bc25 LVEval_f1 gen 13.37
|
| 94 |
+
LVEval_loogle_SD_mixup_64k 83bc25 LVEval_f1 gen 6.32
|
| 95 |
+
LVEval_loogle_SD_mixup_128k 83bc25 LVEval_f1 gen 5.28
|
| 96 |
+
LVEval_loogle_SD_mixup_256k 83bc25 LVEval_f1 gen 4.00
|
| 97 |
+
---------------------------------------- - - - -
|
| 98 |
+
LVEval_cmrc_mixup_16k 8bac4e LVEval_f1 gen 46.45
|
| 99 |
+
LVEval_cmrc_mixup_32k 8bac4e LVEval_f1 gen 19.41
|
| 100 |
+
LVEval_cmrc_mixup_64k 8bac4e LVEval_f1 gen 11.10
|
| 101 |
+
LVEval_cmrc_mixup_128k 8bac4e LVEval_f1 gen 5.89
|
| 102 |
+
LVEval_cmrc_mixup_256k 8bac4e LVEval_f1 gen 4.22
|
| 103 |
+
---------------------------------------- - - - -
|
| 104 |
+
--------- LVEval Single_Hop CQA --------- - - - -
|
| 105 |
+
---------------------------------------- - - - -
|
| 106 |
+
LVEval_multifieldqa_en_mixup_16k 83bc25 LVEval_f1 gen 12.28
|
| 107 |
+
LVEval_multifieldqa_en_mixup_32k 83bc25 LVEval_f1 gen 4.64
|
| 108 |
+
LVEval_multifieldqa_en_mixup_64k 83bc25 LVEval_f1 gen 8.30
|
| 109 |
+
LVEval_multifieldqa_en_mixup_128k 83bc25 LVEval_f1 gen 5.63
|
| 110 |
+
LVEval_multifieldqa_en_mixup_256k 83bc25 LVEval_f1 gen 4.64
|
| 111 |
+
---------------------------------------- - - - -
|
| 112 |
+
LVEval_multifieldqa_zh_mixup_16k ac4a0d LVEval_f1 gen 22.30
|
| 113 |
+
LVEval_multifieldqa_zh_mixup_32k ac4a0d LVEval_f1 gen 17.46
|
| 114 |
+
LVEval_multifieldqa_zh_mixup_64k ac4a0d LVEval_f1 gen 6.27
|
| 115 |
+
LVEval_multifieldqa_zh_mixup_128k ac4a0d LVEval_f1 gen 5.84
|
| 116 |
+
LVEval_multifieldqa_zh_mixup_256k ac4a0d LVEval_f1 gen 4.71
|
| 117 |
+
---------------------------------------- - - - -
|
| 118 |
+
--------- LVEval Multi_Hop QA --------- - - - -
|
| 119 |
+
---------------------------------------- - - - -
|
| 120 |
+
LVEval_dureader_mixup_16k 8bac4e LVEval_rouge gen 18.04
|
| 121 |
+
LVEval_dureader_mixup_32k 8bac4e LVEval_rouge gen 18.33
|
| 122 |
+
LVEval_dureader_mixup_64k 8bac4e LVEval_rouge gen 12.56
|
| 123 |
+
LVEval_dureader_mixup_128k 8bac4e LVEval_rouge gen 10.33
|
| 124 |
+
LVEval_dureader_mixup_256k 8bac4e LVEval_rouge gen 6.69
|
| 125 |
+
---------------------------------------- - - - -
|
| 126 |
+
LVEval_loogle_CR_mixup_16k 83bc25 LVEval_f1 gen 9.35
|
| 127 |
+
LVEval_loogle_CR_mixup_32k 83bc25 LVEval_f1 gen 7.42
|
| 128 |
+
LVEval_loogle_CR_mixup_64k 83bc25 LVEval_f1 gen 3.18
|
| 129 |
+
LVEval_loogle_CR_mixup_128k 83bc25 LVEval_f1 gen 2.65
|
| 130 |
+
LVEval_loogle_CR_mixup_256k 83bc25 LVEval_f1 gen 3.27
|
| 131 |
+
---------------------------------------- - - - -
|
| 132 |
+
LVEval_loogle_MIR_mixup_16k 83bc25 LVEval_f1 gen 4.50
|
| 133 |
+
LVEval_loogle_MIR_mixup_32k 83bc25 LVEval_f1 gen 3.19
|
| 134 |
+
LVEval_loogle_MIR_mixup_64k 83bc25 LVEval_f1 gen 2.34
|
| 135 |
+
LVEval_loogle_MIR_mixup_128k 83bc25 LVEval_f1 gen 1.76
|
| 136 |
+
LVEval_loogle_MIR_mixup_256k 83bc25 LVEval_f1 gen 1.20
|
| 137 |
+
---------------------------------------- - - - -
|
| 138 |
+
--------- LVEval Multi_Hop CQA --------- - - - -
|
| 139 |
+
---------------------------------------- - - - -
|
| 140 |
+
LVEval_hotpotwikiqa_mixup_16k e3c368 LVEval_f1 gen 19.80
|
| 141 |
+
LVEval_hotpotwikiqa_mixup_32k e3c368 LVEval_f1 gen 12.59
|
| 142 |
+
LVEval_hotpotwikiqa_mixup_64k e3c368 LVEval_f1 gen 7.33
|
| 143 |
+
LVEval_hotpotwikiqa_mixup_128k e3c368 LVEval_f1 gen 7.85
|
| 144 |
+
LVEval_hotpotwikiqa_mixup_256k e3c368 LVEval_f1 gen 3.42
|
| 145 |
+
---------------------------------------- - - - -
|
| 146 |
+
LVEval_lic_mixup_16k fdd540 LVEval_f1 gen 21.36
|
| 147 |
+
LVEval_lic_mixup_32k fdd540 LVEval_f1 gen 12.92
|
| 148 |
+
LVEval_lic_mixup_64k fdd540 LVEval_f1 gen 4.62
|
| 149 |
+
LVEval_lic_mixup_128k fdd540 LVEval_f1 gen 4.25
|
| 150 |
+
LVEval_lic_mixup_256k fdd540 LVEval_f1 gen 4.85
|
| 151 |
+
---------------------------------------- - - - -
|
| 152 |
+
--------- LVEval Factrecall CQA --------- - - - -
|
| 153 |
+
---------------------------------------- - - - -
|
| 154 |
+
LVEval_factrecall_en_16k fba966 f1 gen 58.33
|
| 155 |
+
LVEval_factrecall_en_32k fba966 f1 gen 32.17
|
| 156 |
+
LVEval_factrecall_en_64k fba966 f1 gen 15.33
|
| 157 |
+
LVEval_factrecall_en_128k fba966 f1 gen 8.50
|
| 158 |
+
LVEval_factrecall_en_256k fba966 f1 gen 4.00
|
| 159 |
+
---------------------------------------- - - - -
|
| 160 |
+
LVEval_factrecall_zh_16k ef3320 f1 gen 20.00
|
| 161 |
+
LVEval_factrecall_zh_32k ef3320 f1 gen 38.00
|
| 162 |
+
LVEval_factrecall_zh_64k ef3320 f1 gen 20.50
|
| 163 |
+
LVEval_factrecall_zh_128k ef3320 f1 gen 11.00
|
| 164 |
+
LVEval_factrecall_zh_256k ef3320 f1 gen 5.00
|
| 165 |
+
```
|
build/lib/opencompass/configs/datasets/lveval/lveval.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .lvevalcmrc_mixup.lveval_cmrc_mixup_gen import (
|
| 5 |
+
LVEval_cmrc_mixup_datasets,
|
| 6 |
+
)
|
| 7 |
+
from .lvevaldureader_mixup.lveval_dureader_mixup_gen import (
|
| 8 |
+
LVEval_dureader_mixup_datasets,
|
| 9 |
+
)
|
| 10 |
+
from .lvevalfactrecall_en.lveval_factrecall_en_gen import (
|
| 11 |
+
LVEval_factrecall_en_datasets,
|
| 12 |
+
)
|
| 13 |
+
from .lvevalfactrecall_zh.lveval_factrecall_zh_gen import (
|
| 14 |
+
LVEval_factrecall_zh_datasets,
|
| 15 |
+
)
|
| 16 |
+
from .lvevalhotpotwikiqa_mixup.lveval_hotpotwikiqa_mixup_gen import (
|
| 17 |
+
LVEval_hotpotwikiqa_mixup_datasets,
|
| 18 |
+
)
|
| 19 |
+
from .lvevallic_mixup.lveval_lic_mixup_gen import LVEval_lic_mixup_datasets
|
| 20 |
+
from .lvevalloogle_CR_mixup.lveval_loogle_CR_mixup_gen import (
|
| 21 |
+
LVEval_loogle_CR_mixup_datasets,
|
| 22 |
+
)
|
| 23 |
+
from .lvevalloogle_MIR_mixup.lveval_loogle_MIR_mixup_gen import (
|
| 24 |
+
LVEval_loogle_MIR_mixup_datasets,
|
| 25 |
+
)
|
| 26 |
+
from .lvevalloogle_SD_mixup.lveval_loogle_SD_mixup_gen import (
|
| 27 |
+
LVEval_loogle_SD_mixup_datasets,
|
| 28 |
+
)
|
| 29 |
+
from .lvevalmultifieldqa_en_mixup.lveval_multifieldqa_en_mixup_gen import (
|
| 30 |
+
LVEval_multifieldqa_en_mixup_datasets,
|
| 31 |
+
)
|
| 32 |
+
from .lvevalmultifieldqa_zh_mixup.lveval_multifieldqa_zh_mixup_gen import (
|
| 33 |
+
LVEval_multifieldqa_zh_mixup_datasets,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
LVEval_datasets = sum(
|
| 37 |
+
(v for k, v in locals().items() if k.endswith('_datasets')), []
|
| 38 |
+
)
|
build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .mastermath2024v1_gen_be6318 import mastermath2024v1_datasets
|
build/lib/opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen_be6318.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import MastermathDatasetv1, MastermathDatasetv1Evaluator
|
| 5 |
+
from opencompass.utils import first_option_postprocess
|
| 6 |
+
|
| 7 |
+
mastermath2024v1_reader_cfg = dict(
|
| 8 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
| 9 |
+
output_column='answer')
|
| 10 |
+
|
| 11 |
+
mastermath2024v1_infer_cfg = dict(
|
| 12 |
+
prompt_template=dict(
|
| 13 |
+
type=PromptTemplate,
|
| 14 |
+
template=dict(
|
| 15 |
+
round=[
|
| 16 |
+
dict(role='HUMAN', prompt='{question}\n选项:\n'
|
| 17 |
+
'(A){A}\n'
|
| 18 |
+
'(B){B}\n'
|
| 19 |
+
'(C){C}\n'
|
| 20 |
+
'(D){D}\n'
|
| 21 |
+
'你的回答格式如下: "正确答案是 (在这里插入你的答案)"'),
|
| 22 |
+
], )),
|
| 23 |
+
retriever=dict(type=ZeroRetriever),
|
| 24 |
+
inferencer=dict(type=GenInferencer))
|
| 25 |
+
|
| 26 |
+
mastermath2024v1_eval_cfg = dict(evaluator=dict(type=MastermathDatasetv1Evaluator),
|
| 27 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
| 28 |
+
|
| 29 |
+
mastermath2024v1_datasets = [dict(
|
| 30 |
+
abbr='Mastermath2024v1',
|
| 31 |
+
type=MastermathDatasetv1,
|
| 32 |
+
path='./data/mastermath2024v1/',
|
| 33 |
+
name='kaoyan_math_1_mcq_Sheet1.csv',
|
| 34 |
+
reader_cfg=mastermath2024v1_reader_cfg,
|
| 35 |
+
infer_cfg=mastermath2024v1_infer_cfg,
|
| 36 |
+
eval_cfg=mastermath2024v1_eval_cfg)]
|
build/lib/opencompass/configs/datasets/matbench/matbench_gen.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
# from .matbench_gen_regex_judge import matbench_datasets # noqa: F401, F403
|
| 5 |
+
from .matbench_llm_judge_gen_0e9276 import matbench_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/matbench/matbench_gen_f71840.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets.matbench.matbench import MatbenchDataset, MatbenchEvaluator_regression, MatbenchEvaluator_classification
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
matbench_reader_cfg = dict(
|
| 10 |
+
input_columns=['problem'], output_column='answer')
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
matbench_tasks = ['matbench_steels','matbench_expt_gap', 'matbench_expt_is_metal','matbench_glass']
|
| 14 |
+
|
| 15 |
+
matbench_datasets = []
|
| 16 |
+
|
| 17 |
+
for task in matbench_tasks:
|
| 18 |
+
if task in ['matbench_expt_is_metal','matbench_glass']:
|
| 19 |
+
matbench_infer_cfg = dict(
|
| 20 |
+
prompt_template=dict(
|
| 21 |
+
type=PromptTemplate,
|
| 22 |
+
template=dict(
|
| 23 |
+
round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by yes or no, do not output anything else.')])),
|
| 24 |
+
retriever=dict(type=ZeroRetriever),
|
| 25 |
+
inferencer=dict(type=GenInferencer))
|
| 26 |
+
|
| 27 |
+
matbench_eval_cfg = dict(
|
| 28 |
+
evaluator=dict(type=MatbenchEvaluator_classification),
|
| 29 |
+
pred_role='BOT')
|
| 30 |
+
|
| 31 |
+
elif task in ['matbench_steels','matbench_expt_gap']:
|
| 32 |
+
matbench_infer_cfg = dict(
|
| 33 |
+
prompt_template=dict(
|
| 34 |
+
type=PromptTemplate,
|
| 35 |
+
template=dict(
|
| 36 |
+
round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by one float number, do not output anything else.')])),
|
| 37 |
+
retriever=dict(type=ZeroRetriever),
|
| 38 |
+
inferencer=dict(type=GenInferencer))
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
matbench_eval_cfg = dict(
|
| 42 |
+
evaluator=dict(type=MatbenchEvaluator_regression),
|
| 43 |
+
pred_role='BOT')
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
matbench_datasets.append(
|
| 47 |
+
dict(
|
| 48 |
+
type=MatbenchDataset,
|
| 49 |
+
path=f'opencompass/Matbench',
|
| 50 |
+
task=task,
|
| 51 |
+
abbr=task,
|
| 52 |
+
reader_cfg=matbench_reader_cfg,
|
| 53 |
+
infer_cfg=matbench_infer_cfg,
|
| 54 |
+
eval_cfg=matbench_eval_cfg))
|
| 55 |
+
|