Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- examples/eval_OlympiadBench.py +36 -0
- examples/eval_attack.py +28 -0
- examples/eval_base_demo.py +14 -0
- examples/eval_charm_rea.py +66 -0
- examples/eval_chat_agent_baseline.py +38 -0
- examples/eval_code_passk.py +53 -0
- examples/eval_corebench_2409_base_objective.py +175 -0
- examples/eval_deepseek_r1.py +212 -0
- examples/eval_ds1000_interpreter.py +45 -0
- examples/eval_eese_api_judge.py +47 -0
- examples/eval_gpt4.py +44 -0
- examples/eval_hf_llama_7b.py +8 -0
- examples/eval_inference_ppl.py +51 -0
- examples/eval_internLM.py +9 -0
- examples/eval_internlm_7b.py +9 -0
- examples/eval_internlm_chat_turbomind.py +96 -0
- examples/eval_internlm_turbomind.py +55 -0
- examples/eval_judge_dataset_all.py +61 -0
- examples/eval_judgebench.py +52 -0
- examples/eval_judgerbench.py +58 -0
- examples/eval_judgerbenchv2.py +53 -0
- examples/eval_korbench.py +14 -0
- examples/eval_livestembench.py +66 -0
- examples/eval_llm_judge.py +116 -0
- examples/eval_lmdeploy_demo.py +10 -0
- examples/eval_longbenchv2.py +28 -0
- examples/eval_math_llm_judge.py +136 -0
- examples/eval_math_verify.py +77 -0
- examples/eval_mmlu_cf.py +36 -0
- examples/eval_mmlu_pro.py +39 -0
- examples/eval_mmlu_with_zero_retriever_overwritten.py +16 -0
- examples/eval_multi_prompt_demo.py +52 -0
- examples/eval_musr.py +34 -0
- examples/eval_needlebench_v2.py +27 -0
- examples/eval_qwen3.py +142 -0
- examples/eval_qwen_7b_chat.py +58 -0
- examples/eval_qwen_7b_chat_lawbench.py +13 -0
- examples/eval_rewardbench.py +53 -0
- examples/eval_rmb.py +53 -0
- examples/eval_ruler.py +97 -0
- examples/eval_rwkv5_3b.py +7 -0
- examples/eval_simpleqa.py +45 -0
- examples/eval_subjective.py +104 -0
- examples/eval_subjective_bradleyterry.py +120 -0
- examples/eval_teval.py +81 -0
- examples/eval_with_model_dataset_combinations.py +45 -0
- tmp/38bf021a-c80f-4a23-9021-f2adc82afa5d_params.py +1424 -0
- tmp/3baffa8c-bc69-4789-aa49-f30266896eb4_params.py +0 -0
- tmp/3bc1afd5-60f6-4b89-9fc0-909218b5c248_params.py +53 -0
- tmp/401500cf-6431-490c-9e43-14532e24796f_params.py +1424 -0
examples/eval_OlympiadBench.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_gen_be8b13 import olympiadbench_datasets
|
| 5 |
+
|
| 6 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
|
| 7 |
+
|
| 8 |
+
from opencompass.configs.summarizers.OlympiadBench import summarizer
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], [])
|
| 12 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 13 |
+
|
| 14 |
+
from opencompass.runners import LocalRunner
|
| 15 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 16 |
+
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
| 17 |
+
|
| 18 |
+
infer = dict(
|
| 19 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 20 |
+
runner=dict(
|
| 21 |
+
type=LocalRunner,
|
| 22 |
+
max_num_workers=8,
|
| 23 |
+
task=dict(type=OpenICLInferTask)
|
| 24 |
+
),
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
eval = dict(
|
| 28 |
+
partitioner=dict(type=NaivePartitioner, n=10),
|
| 29 |
+
runner=dict(
|
| 30 |
+
type=LocalRunner,
|
| 31 |
+
max_num_workers=256,
|
| 32 |
+
task=dict(type=OpenICLEvalTask)
|
| 33 |
+
),
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
work_dir = 'outputs/debug/OlympiadBench'
|
examples/eval_attack.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
from opencompass.partitioners import NaivePartitioner
|
| 4 |
+
from opencompass.runners import LocalRunner
|
| 5 |
+
from opencompass.tasks import OpenICLAttackTask
|
| 6 |
+
|
| 7 |
+
with read_base():
|
| 8 |
+
# choose a list of datasets
|
| 9 |
+
from opencompass.configs.datasets.promptbench.promptbench_wnli_gen_50662f import \
|
| 10 |
+
wnli_datasets
|
| 11 |
+
from opencompass.configs.models.qwen.hf_qwen2_1_5b import models
|
| 12 |
+
|
| 13 |
+
datasets = wnli_datasets
|
| 14 |
+
|
| 15 |
+
# Please run whole dataset at a time, aka use `NaivePartitioner` only
|
| 16 |
+
# Please use `OpenICLAttackTask` if want to perform attack experiment
|
| 17 |
+
infer = dict(
|
| 18 |
+
partitioner=dict(type=NaivePartitioner),
|
| 19 |
+
runner=dict(type=LocalRunner,
|
| 20 |
+
max_num_workers=8,
|
| 21 |
+
task=dict(type=OpenICLAttackTask)),
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
attack = dict(
|
| 25 |
+
attack='textfooler',
|
| 26 |
+
query_budget=100,
|
| 27 |
+
prompt_topk=1,
|
| 28 |
+
)
|
examples/eval_base_demo.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.demo.demo_gsm8k_base_gen import \
|
| 5 |
+
gsm8k_datasets
|
| 6 |
+
from opencompass.configs.datasets.demo.demo_math_base_gen import \
|
| 7 |
+
math_datasets
|
| 8 |
+
from opencompass.configs.models.hf_internlm.hf_internlm2_1_8b import \
|
| 9 |
+
models as hf_internlm2_1_8b_models
|
| 10 |
+
from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
|
| 11 |
+
models as hf_qwen2_1_5b_models
|
| 12 |
+
|
| 13 |
+
datasets = gsm8k_datasets + math_datasets
|
| 14 |
+
models = hf_qwen2_1_5b_models + hf_internlm2_1_8b_models
|
examples/eval_charm_rea.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.CHARM.charm_reason_gen_f8fca2 import \
|
| 5 |
+
charm_reason_datasets as datasets
|
| 6 |
+
|
| 7 |
+
# ------>>>>>> https://arxiv.org/abs/2403.14112
|
| 8 |
+
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
|
| 9 |
+
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
|
| 10 |
+
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
|
| 11 |
+
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
|
| 12 |
+
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
|
| 13 |
+
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
|
| 14 |
+
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
|
| 15 |
+
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
|
| 16 |
+
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
|
| 17 |
+
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
|
| 18 |
+
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
|
| 19 |
+
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
|
| 20 |
+
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
|
| 21 |
+
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
|
| 22 |
+
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
|
| 23 |
+
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
|
| 24 |
+
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
|
| 25 |
+
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
|
| 26 |
+
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
|
| 27 |
+
# <<<<<<------ https://arxiv.org/abs/2403.14112
|
| 28 |
+
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
|
| 29 |
+
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
|
| 30 |
+
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
|
| 31 |
+
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
|
| 32 |
+
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
|
| 33 |
+
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
|
| 34 |
+
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
|
| 35 |
+
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
|
| 36 |
+
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
|
| 37 |
+
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
|
| 38 |
+
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
|
| 39 |
+
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
|
| 40 |
+
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
|
| 41 |
+
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
|
| 42 |
+
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
|
| 43 |
+
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
|
| 44 |
+
from .summarizers.charm_reason import summarizer
|
| 45 |
+
|
| 46 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 47 |
+
work_dir = './outputs/CHARM_rea/chat/'
|
| 48 |
+
|
| 49 |
+
# dataset version metric mode internlm2-chat-7b-turbomind
|
| 50 |
+
# ------------------------------------------------------------- --------- ------------- ------ -----------------------------
|
| 51 |
+
# charm-reason-Direct - naive_average gen 49.51
|
| 52 |
+
# charm-reason-ZH-CoT - naive_average gen 61.33
|
| 53 |
+
# charm-reason-EN-CoT - naive_average gen 54.55
|
| 54 |
+
# charm-reason-XLT - naive_average gen 58.46
|
| 55 |
+
# charm-reason-Translate-EN - naive_average gen 56.15
|
| 56 |
+
# - - - -
|
| 57 |
+
# charm-reason-Chinese_Direct - naive_average gen 47.14
|
| 58 |
+
# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
|
| 59 |
+
# charm-reason-Chinese_EN-CoT - naive_average gen 48.31
|
| 60 |
+
# charm-reason-Chinese_XLT - naive_average gen 53.57
|
| 61 |
+
# charm-reason-Chinese_Translate-EN - naive_average gen 48.21
|
| 62 |
+
# charm-reason-Global_Direct - naive_average gen 51.88
|
| 63 |
+
# charm-reason-Global_ZH-CoT - naive_average gen 64.26
|
| 64 |
+
# charm-reason-Global_EN-CoT - naive_average gen 60.79
|
| 65 |
+
# charm-reason-Global_XLT - naive_average gen 63.36
|
| 66 |
+
# charm-reason-Global_Translate-EN - naive_average gen 64.10
|
examples/eval_chat_agent_baseline.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
from opencompass.models.openai_api import OpenAI
|
| 4 |
+
from opencompass.partitioners import SizePartitioner
|
| 5 |
+
from opencompass.runners import LocalRunner
|
| 6 |
+
from opencompass.tasks import OpenICLInferTask
|
| 7 |
+
|
| 8 |
+
with read_base():
|
| 9 |
+
from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import \
|
| 10 |
+
gsm8k_datasets
|
| 11 |
+
from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
|
| 12 |
+
from opencompass.configs.datasets.MathBench.mathbench_gen import \
|
| 13 |
+
mathbench_datasets
|
| 14 |
+
from opencompass.configs.summarizers.math_baseline import summarizer
|
| 15 |
+
|
| 16 |
+
datasets = []
|
| 17 |
+
datasets += gsm8k_datasets
|
| 18 |
+
datasets += math_datasets
|
| 19 |
+
datasets += mathbench_datasets
|
| 20 |
+
|
| 21 |
+
models = [
|
| 22 |
+
dict(
|
| 23 |
+
abbr='gpt-3.5-react',
|
| 24 |
+
type=OpenAI,
|
| 25 |
+
path='gpt-3.5-turbo',
|
| 26 |
+
key='ENV',
|
| 27 |
+
query_per_second=1,
|
| 28 |
+
max_seq_len=4096,
|
| 29 |
+
batch_size=1,
|
| 30 |
+
),
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
infer = dict(
|
| 34 |
+
partitioner=dict(type=SizePartitioner, max_task_size=1000),
|
| 35 |
+
runner=dict(type=LocalRunner,
|
| 36 |
+
max_num_workers=16,
|
| 37 |
+
task=dict(type=OpenICLInferTask)),
|
| 38 |
+
)
|
examples/eval_code_passk.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config is used for pass@k evaluation with `num_return_sequences`
|
| 2 |
+
# That model can generate multiple responses for single input
|
| 3 |
+
from mmengine.config import read_base
|
| 4 |
+
|
| 5 |
+
from opencompass.models import HuggingFaceCausalLM
|
| 6 |
+
from opencompass.partitioners import SizePartitioner
|
| 7 |
+
from opencompass.runners import LocalRunner
|
| 8 |
+
from opencompass.tasks import OpenICLInferTask
|
| 9 |
+
|
| 10 |
+
with read_base():
|
| 11 |
+
from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import \
|
| 12 |
+
humaneval_datasets
|
| 13 |
+
from opencompass.configs.datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import \
|
| 14 |
+
mbpp_datasets
|
| 15 |
+
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import \
|
| 16 |
+
sanitized_mbpp_datasets
|
| 17 |
+
|
| 18 |
+
datasets = []
|
| 19 |
+
datasets += humaneval_datasets
|
| 20 |
+
datasets += mbpp_datasets
|
| 21 |
+
datasets += sanitized_mbpp_datasets
|
| 22 |
+
|
| 23 |
+
models = [
|
| 24 |
+
dict(
|
| 25 |
+
type=HuggingFaceCausalLM,
|
| 26 |
+
abbr='CodeLlama-7b-Python',
|
| 27 |
+
path='codellama/CodeLlama-7b-Python-hf',
|
| 28 |
+
tokenizer_path='codellama/CodeLlama-7b-Python-hf',
|
| 29 |
+
tokenizer_kwargs=dict(
|
| 30 |
+
padding_side='left',
|
| 31 |
+
truncation_side='left',
|
| 32 |
+
trust_remote_code=True,
|
| 33 |
+
),
|
| 34 |
+
max_out_len=1024,
|
| 35 |
+
max_seq_len=2048,
|
| 36 |
+
batch_size=8,
|
| 37 |
+
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
|
| 38 |
+
generation_kwargs=dict(
|
| 39 |
+
num_return_sequences=10,
|
| 40 |
+
do_sample=True,
|
| 41 |
+
top_p=0.95,
|
| 42 |
+
temperature=0.8,
|
| 43 |
+
),
|
| 44 |
+
run_cfg=dict(num_gpus=1, num_procs=1),
|
| 45 |
+
),
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
infer = dict(
|
| 49 |
+
partitioner=dict(type=SizePartitioner, max_task_size=300),
|
| 50 |
+
runner=dict(type=LocalRunner,
|
| 51 |
+
max_num_workers=16,
|
| 52 |
+
task=dict(type=OpenICLInferTask)),
|
| 53 |
+
)
|
examples/eval_corebench_2409_base_objective.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
|
| 3 |
+
from mmengine.config import read_base
|
| 4 |
+
|
| 5 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 6 |
+
from opencompass.runners import LocalRunner
|
| 7 |
+
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
| 8 |
+
|
| 9 |
+
#######################################################################
|
| 10 |
+
# PART 0 Essential Configs #
|
| 11 |
+
#######################################################################
|
| 12 |
+
with read_base():
|
| 13 |
+
# Datasets Part
|
| 14 |
+
## Core Set
|
| 15 |
+
# ## Examination
|
| 16 |
+
# ## Reasoning
|
| 17 |
+
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
|
| 18 |
+
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
|
| 19 |
+
cmmlu_datasets
|
| 20 |
+
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
|
| 21 |
+
# ## Scientific
|
| 22 |
+
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
|
| 23 |
+
gpqa_datasets
|
| 24 |
+
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
|
| 25 |
+
gsm8k_datasets
|
| 26 |
+
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
|
| 27 |
+
hellaswag_datasets
|
| 28 |
+
# ## Coding
|
| 29 |
+
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import \
|
| 30 |
+
humaneval_datasets
|
| 31 |
+
# ## Math
|
| 32 |
+
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
|
| 33 |
+
math_datasets
|
| 34 |
+
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
|
| 35 |
+
mathbench_datasets
|
| 36 |
+
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
|
| 37 |
+
sanitized_mbpp_datasets
|
| 38 |
+
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
|
| 39 |
+
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
|
| 40 |
+
mmlu_pro_datasets
|
| 41 |
+
# Model List
|
| 42 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
|
| 43 |
+
models as lmdeploy_qwen2_5_1_5b_model
|
| 44 |
+
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
|
| 45 |
+
from opencompass.configs.summarizers.groups.cmmlu import \
|
| 46 |
+
cmmlu_summary_groups
|
| 47 |
+
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
|
| 48 |
+
mathbench_2024_summary_groups
|
| 49 |
+
# TODO: Add LiveCodeBench
|
| 50 |
+
# ## Instruction Following
|
| 51 |
+
# from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
|
| 52 |
+
# Summarizer
|
| 53 |
+
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
|
| 54 |
+
from opencompass.configs.summarizers.groups.mmlu_pro import \
|
| 55 |
+
mmlu_pro_summary_groups
|
| 56 |
+
|
| 57 |
+
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
|
| 58 |
+
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
|
| 59 |
+
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
|
| 60 |
+
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
|
| 61 |
+
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
|
| 62 |
+
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
|
| 63 |
+
|
| 64 |
+
#######################################################################
|
| 65 |
+
# PART 1 Datasets List #
|
| 66 |
+
#######################################################################
|
| 67 |
+
# datasets list for evaluation
|
| 68 |
+
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
| 69 |
+
|
| 70 |
+
#######################################################################
|
| 71 |
+
# PART 2 Datset Summarizer #
|
| 72 |
+
#######################################################################
|
| 73 |
+
# with read_base():
|
| 74 |
+
|
| 75 |
+
core_summary_groups = [
|
| 76 |
+
{
|
| 77 |
+
'name':
|
| 78 |
+
'core_average',
|
| 79 |
+
'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
|
| 80 |
+
['cmmlu', 'accuracy'], ['bbh', 'naive_average'],
|
| 81 |
+
['hellaswag', 'accuracy'], ['drop', 'accuracy'],
|
| 82 |
+
['math', 'accuracy'], ['gsm8k', 'accuracy'],
|
| 83 |
+
['mathbench-t (average)', 'naive_average'],
|
| 84 |
+
['GPQA_diamond', 'accuracy'],
|
| 85 |
+
['openai_humaneval', 'humaneval_pass@1'],
|
| 86 |
+
['IFEval', 'Prompt-level-strict-accuracy'],
|
| 87 |
+
['sanitized_mbpp', 'score'],
|
| 88 |
+
['mathbench-t (average)', 'naive_average']],
|
| 89 |
+
},
|
| 90 |
+
]
|
| 91 |
+
|
| 92 |
+
summarizer = dict(
|
| 93 |
+
dataset_abbrs=[
|
| 94 |
+
['mmlu', 'accuracy'],
|
| 95 |
+
['mmlu_pro', 'accuracy'],
|
| 96 |
+
['cmmlu', 'accuracy'],
|
| 97 |
+
['bbh', 'naive_average'],
|
| 98 |
+
['hellaswag', 'accuracy'],
|
| 99 |
+
['drop', 'accuracy'],
|
| 100 |
+
['math', 'accuracy'],
|
| 101 |
+
['gsm8k', 'accuracy'],
|
| 102 |
+
['mathbench-t (average)', 'naive_average'],
|
| 103 |
+
['GPQA_diamond', 'accuracy'],
|
| 104 |
+
['openai_humaneval', 'humaneval_pass@1'],
|
| 105 |
+
['IFEval', 'Prompt-level-strict-accuracy'],
|
| 106 |
+
['sanitized_mbpp', 'score'],
|
| 107 |
+
'mathbench-a (average)',
|
| 108 |
+
'mathbench-t (average)'
|
| 109 |
+
'',
|
| 110 |
+
['mmlu', 'accuracy'],
|
| 111 |
+
['mmlu-stem', 'accuracy'],
|
| 112 |
+
['mmlu-social-science', 'accuracy'],
|
| 113 |
+
['mmlu-humanities', 'accuracy'],
|
| 114 |
+
['mmlu-other', 'accuracy'],
|
| 115 |
+
'',
|
| 116 |
+
['mmlu_pro', 'accuracy'],
|
| 117 |
+
['mmlu_pro_math', 'accuracy'],
|
| 118 |
+
['mmlu_pro_physics', 'accuracy'],
|
| 119 |
+
['mmlu_pro_chemistry', 'accuracy'],
|
| 120 |
+
['mmlu_pro_law', 'accuracy'],
|
| 121 |
+
['mmlu_pro_engineering', 'accuracy'],
|
| 122 |
+
['mmlu_pro_other', 'accuracy'],
|
| 123 |
+
['mmlu_pro_economics', 'accuracy'],
|
| 124 |
+
['mmlu_pro_health', 'accuracy'],
|
| 125 |
+
['mmlu_pro_psychology', 'accuracy'],
|
| 126 |
+
['mmlu_pro_business', 'accuracy'],
|
| 127 |
+
['mmlu_pro_biology', 'accuracy'],
|
| 128 |
+
['mmlu_pro_philosophy', 'accuracy'],
|
| 129 |
+
['mmlu_pro_computer_science', 'accuracy'],
|
| 130 |
+
['mmlu_pro_history', 'accuracy'],
|
| 131 |
+
'',
|
| 132 |
+
['cmmlu', 'accuracy'],
|
| 133 |
+
['cmmlu-stem', 'accuracy'],
|
| 134 |
+
['cmmlu-social-science', 'accuracy'],
|
| 135 |
+
['cmmlu-humanities', 'accuracy'],
|
| 136 |
+
['cmmlu-other', 'accuracy'],
|
| 137 |
+
['cmmlu-china-specific', 'accuracy'],
|
| 138 |
+
],
|
| 139 |
+
summary_groups=sum(
|
| 140 |
+
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
#######################################################################
|
| 144 |
+
# PART 3 Models List #
|
| 145 |
+
#######################################################################
|
| 146 |
+
|
| 147 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 148 |
+
|
| 149 |
+
#######################################################################
|
| 150 |
+
# PART 4 Inference/Evaluation Configuaration #
|
| 151 |
+
#######################################################################
|
| 152 |
+
|
| 153 |
+
# Local Runner
|
| 154 |
+
infer = dict(
|
| 155 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 156 |
+
runner=dict(
|
| 157 |
+
type=LocalRunner,
|
| 158 |
+
max_num_workers=16,
|
| 159 |
+
retry=0, # Modify if needed
|
| 160 |
+
task=dict(type=OpenICLInferTask)),
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
# eval with local runner
|
| 164 |
+
eval = dict(
|
| 165 |
+
partitioner=dict(type=NaivePartitioner, n=10),
|
| 166 |
+
runner=dict(type=LocalRunner,
|
| 167 |
+
max_num_workers=16,
|
| 168 |
+
task=dict(type=OpenICLEvalTask)),
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
#######################################################################
|
| 172 |
+
# PART 5 Utils Configuaration #
|
| 173 |
+
#######################################################################
|
| 174 |
+
base_exp_dir = 'outputs/corebench_2409_objective/'
|
| 175 |
+
work_dir = osp.join(base_exp_dir, 'base_objective')
|
examples/eval_deepseek_r1.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluation config for DeepSeek-R1-Distill reasoning models on math benchmarks,
# scored by an LLM verifier (judge) served over an OpenAI-compatible API.
# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard

import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
    TurboMindModelwithChatTemplate,
)

#######################################################################
#                          PART 1  Datasets List                      #
#######################################################################
with read_base():
    # You can comment out the datasets you don't want to evaluate

    # Datasets
    # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets

    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

# Collect every imported `*_datasets` list into one flat list.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# Set LLM Verifier used for each dataset

verifier_cfg = dict(
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
    key='sk-1234', # You need to set your own API key
    openai_api_base=[
        'http://172.30.56.1:4000/v1', # You need to set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    # max_seq_len=32768,
    max_seq_len=49152,
)

# Attach the verifier only to datasets whose evaluator already declares a
# judge_cfg slot (i.e. LLM-verified datasets); others are left untouched.
for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768 # You can unset this line if you want to avoid length cutoff
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg


#######################################################################
#                       PART 2  Model List                            #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

models += [
    # You can comment out the models you don't want to evaluate
    # All models use sampling mode
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        gen_config=dict(
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
        # Strip the <think>...</think> reasoning before answer extraction.
        pred_postprocessor=dict(type=extract_non_reasoning_content)
    ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=32768),
    #     max_seq_len=32768,
    #     max_out_len=32768,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=2),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content)
    # ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=16384),
    #     max_seq_len=32768,
    #     max_out_len=16384,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=4),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content)
    # ),
]

#######################################################################
#                 PART 3  Inference/Evaluation                        #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=1
        # Similar with data-parallelism, how many workers for evaluation,
        # each worker will evaluate a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker
        # For example, If you have 8 GPUs, for 7B model using 1 GPU for one instance, you can set num_worker=8
        # to max-utilize the GPUs.
        # If you have 8 GPUs, for 14B model using 2 GPUs for one instance, you can set num_worker=4
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask)
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner, n=8
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(
            type=OpenICLEvalTask)
    ),
)


#######################################################################
#                       PART 4  Summarizer                            #
#######################################################################


summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

# Average accuracy across the repeated runs of each benchmark.
# NOTE: fixed misspelling 'Aveage8' -> 'Average8' (renamed consistently with
# the summarizer references below; changes only the displayed metric name).
summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets':[[
            f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ]
    }
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        # ['aime2024', 'accuracy'],
        ['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
        ['OlympiadBenchMath', 'accuracy'],
        ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)


#######################################################################
#                       PART 5  Utils                                 #
#######################################################################

work_dir = 'outputs/deepseek_r1_reasoning'
examples/eval_ds1000_interpreter.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate DS-1000 with a ReAct-style code agent: an OpenAI LLM that can
# execute Python through a lagent PythonInterpreter action.
from mmengine.config import read_base

from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

# Tool description shown to the agent when it decides whether to call the tool.
PYTHON_INTERPRETER_DESCRIPTION = """\
It can run a Python code. The code must be a valid code that contains only python method.
"""

# Actions (tools) available to the agent.
actions = [
    dict(
        type=PythonInterpreter,
        description=PYTHON_INTERPRETER_DESCRIPTION,
        # answer_expr=None: the interpreter returns raw execution output
        # instead of evaluating a designated answer expression.
        answer_expr=None,
    )
]

with read_base():
    from opencompass.configs.datasets.ds1000.ds1000_gen_5c4bec import \
        ds1000_datasets as datasets

models = [
    dict(abbr='gpt-3.5-react',
         type=CodeAgent,
         llm=dict(
             type=OpenAI,
             path='gpt-3.5-turbo',
             key='ENV',  # read from $OPENAI_API_KEY
             query_per_second=1,
             max_seq_len=4096,
         ),
         actions=actions,
         batch_size=8),
]

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=40000),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)
examples/eval_eese_api_judge.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate the EESE benchmark with an API-served judge model.

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.eese.eese_judge_gen import \
        eese_datasets
    # choose a model of interest
    from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
        models as gpt4

from opencompass.models import OpenAISDK

# configure the judge model
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

# Placeholder judge settings — replace path/key/base with your own endpoint.
judge_cfg = dict(
    abbr='model-judge',
    type=OpenAISDK,
    path='model-name',
    key='your-api-key',
    openai_api_base=['openai-url'],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=1,
    temperature=0.001,
    tokenizer_path='gpt-4o',
    verbose=True,
    max_out_len=16384,
    max_seq_len=49152,
)

datasets = eese_datasets
models = gpt4

# Merge judge_cfg into each dataset instead of overwriting it outright.
for dataset in datasets:
    if 'eval_cfg' in dataset and 'evaluator' in dataset['eval_cfg']:
        # Fetch the existing judge_cfg, or start from an empty dict.
        existing_judge_cfg = dataset['eval_cfg']['evaluator'].get('judge_cfg', {})
        # Update it in place: keep prior settings, add/override with the new ones.
        existing_judge_cfg.update(judge_cfg)
        # Write the merged config back to the evaluator.
        dataset['eval_cfg']['evaluator']['judge_cfg'] = existing_judge_cfg
examples/eval_gpt4.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate GPT-4 via the OpenAI API on the medium chat dataset collection.
from mmengine.config import read_base

from opencompass.models import OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.collections.chat_medium import datasets
    from opencompass.configs.summarizers.medium import summarizer

# GPT4 needs a special humaneval postprocessor
from opencompass.datasets.humaneval import humaneval_gpt_postprocess

# Swap in the GPT-specific postprocessor for the HumanEval dataset only.
for _dataset in datasets:
    if _dataset['path'] == 'openai_humaneval':
        _dataset['eval_cfg']['pred_postprocessor'][
            'type'] = humaneval_gpt_postprocess

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

models = [
    dict(
        abbr='GPT4',
        type=OpenAI,
        path='gpt-4-0613',
        key=
        'ENV',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=4,
                task=dict(type=OpenICLInferTask)),
)
examples/eval_hf_llama_7b.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate HuggingFace LLaMA-7B on the PIQA and SIQA subsets of the
# base-medium-llama collection.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.collections.base_medium_llama import (
        piqa_datasets, siqa_datasets)
    from opencompass.configs.models.hf_llama.hf_llama_7b import models

datasets = [*piqa_datasets, *siqa_datasets]
examples/eval_inference_ppl.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Compute inference perplexity (loss) for several base models.
from mmengine.config import read_base

with read_base():
    # Inference PPL datasets
    from opencompass.configs.datasets.inference_ppl.inference_ppl import inference_ppl_datasets

    # Model configs
    from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
    from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
    from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
    from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b

from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

# -------------Inference Stage ----------------------------------------

datasets = [*inference_ppl_datasets]
# Fix: this was previously named `workdir`, which OpenCompass never reads
# (the framework looks for `work_dir`), so outputs silently went to the
# default directory instead.
work_dir = 'outputs/inference_ppl'

models = [
    *qwen1_5_7b,
    *qwen1_5_14b,
    *llama2_7b,
    *llama2_13b,
]

# Set custom batch_size and num_gpus for faster loss calculation
# Smaller batch_size should give more precise results, at the cost of worse efficiency
model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))

# Apply the shared runtime settings to every model config.
for mdl in models:
    mdl.update(model_cfg)

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
        max_num_workers=256,  # Maximum concurrent evaluation task count
    ),
)

# -------------Evaluation Stage ----------------------------------------
eval = dict(partitioner=dict(type=NaivePartitioner),
            runner=dict(
                type=LocalRunner,
                task=dict(type=OpenICLEvalTask),
                max_num_workers=256,
            ))
examples/eval_internLM.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate InternLM-7B (native InternLM runner) on the base-medium collection.
from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.collections.base_medium import datasets
    # choose a model of interest
    from opencompass.configs.models.internlm.internlm_7b import models
    # and output the results in a choosen format
    from opencompass.configs.summarizers.medium import summarizer
examples/eval_internlm_7b.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate InternLM-7B (HuggingFace backend) on the base-medium collection.
from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.collections.base_medium import datasets
    # choose a model of interest
    from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
    # and output the results in a choosen format
    from opencompass.configs.summarizers.medium import summarizer
examples/eval_internlm_chat_turbomind.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate InternLM chat models served through the TurboMind (LMDeploy)
# inference engine. Three model configs are defined; only the one placed in
# `models` at the bottom is actually run.
from mmengine.config import read_base

from opencompass.models.turbomind import TurboMindModel

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
        crowspairs_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \
        WSC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a choosen format
    from opencompass.configs.summarizers.medium import summarizer

# Collect all imported `*_datasets` lists into one flat list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# Chat template for InternLM v1 (eos 103028 = <eoa>).
internlm_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
                              eos_token_id=103028)

# Chat template for InternLM2 (ChatML-style markers, eos 92542 = <|im_end|>).
internlm2_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
    dict(role='BOT',
         begin='<|im_start|>assistant\n',
         end='<|im_end|>\n',
         generate=True),
],
                               eos_token_id=92542)

# config for internlm-chat-7b
internlm_chat_7b = dict(
    type=TurboMindModel,
    abbr='internlm-chat-7b-turbomind',
    path='internlm/internlm-chat-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    meta_template=internlm_meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)

# config for internlm2-chat-7b
internlm2_chat_7b = dict(type=TurboMindModel,
                         abbr='internlm2-chat-7b-turbomind',
                         path='internlm/internlm2-chat-7b',
                         engine_config=dict(session_len=2048,
                                            max_batch_size=32,
                                            rope_scaling_factor=1.0),
                         gen_config=dict(top_k=1,
                                         top_p=0.8,
                                         temperature=1.0,
                                         max_new_tokens=100),
                         max_out_len=100,
                         max_seq_len=2048,
                         batch_size=32,
                         concurrency=32,
                         meta_template=internlm2_meta_template,
                         run_cfg=dict(num_gpus=1, num_procs=1),
                         end_str='<|im_end|>')

# config for internlm-chat-20b
internlm_chat_20b = dict(
    type=TurboMindModel,
    abbr='internlm-chat-20b-turbomind',
    path='internlm/internlm-chat-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    meta_template=internlm_meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)

# Only the 20B chat model is selected for this run; swap in the others above
# to evaluate them instead.
models = [internlm_chat_20b]
examples/eval_internlm_turbomind.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate InternLM base models served through the TurboMind (LMDeploy)
# inference engine. Two model configs are defined; only the one placed in
# `models` at the bottom is actually run.
from mmengine.config import read_base

from opencompass.models.turbomind import TurboMindModel

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a choosen format
    from opencompass.configs.summarizers.medium import summarizer

# Collect all imported `*_datasets` lists into one flat list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# # config for internlm-7b model
internlm_7b = dict(
    type=TurboMindModel,
    abbr='internlm-7b-turbomind',
    path='internlm/internlm-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    run_cfg=dict(num_gpus=1, num_procs=1),
)

# config for internlm-20b model
internlm_20b = dict(
    type=TurboMindModel,
    abbr='internlm-20b-turbomind',
    path='internlm/internlm-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    run_cfg=dict(num_gpus=1, num_procs=1),
)

# Only the 20B model is selected for this run; use `internlm_7b` instead (or
# both) to evaluate the 7B model.
models = [internlm_20b]
examples/eval_judge_dataset_all.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run a judge model (here Qwen-7B via TurboMind) across all judge-evaluation
# datasets: JudgerBench v2, RMB, RewardBench and JudgeBench.
from mmengine.config import read_base
with read_base():
    # Aliased to `*_datasets` so the locals() scan below picks them up.
    from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset as get_judgerbenchv2_datasets
    from opencompass.configs.datasets.judge.rmb import get_rmb_dataset as get_rmb_datasets
    from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
    from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets

    from opencompass.configs.summarizers.judgedataset_all import summarizer

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.models import TurboMindModelwithChatTemplate


api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
# Collect every `*_datasets` list imported above into one flat list.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)


models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # near-zero temperature + top_k=1 for effectively greedy decoding
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]


infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)


work_dir = './outputs/judge_dataset_all/'
examples/eval_judgebench.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run a judge model (here Qwen-7B via TurboMind) on the JudgeBench dataset.
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
datasets = [*get_judgebench_datasets]

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # near-zero temperature + top_k=1 for effectively greedy decoding
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]


infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)


work_dir = './outputs/judgebench/'
examples/eval_judgerbench.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate CompassJudger-1-7B-Instruct on JudgerBench via TurboMind.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.judgerbench.judgerbench import judgerbench_datasets

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='CompassJudger-1-7B-Instruct',
        path='opencompass/CompassJudger-1-7B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # near-zero temperature + top_k=1 for effectively greedy decoding
        gen_config=dict(top_k=1,
                        temperature=1e-6,
                        top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]

datasets = judgerbench_datasets

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)
# -------------Evalation Stage ----------------------------------------

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner,
        n=10,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)

work_dir = 'outputs/judgerbench/'
examples/eval_judgerbenchv2.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate a candidate judge model on JudgerBench v2."""
from mmengine.config import read_base

with read_base():
    # Dataset builder and result summarizer for JudgerBench v2.
    from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset
    from opencompass.configs.summarizers.judgerbenchv2 import summarizer

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

# Maps template roles to API roles for API-style chat models.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
# Flatten the imported dataset collection into the top-level `datasets` list.
datasets = [*get_judgerbenchv2_dataset]

from opencompass.models import TurboMindModelwithChatTemplate

# Model under evaluation, served via LMDeploy/TurboMind.
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # Near-greedy decoding: top_k=1 with a near-zero temperature.
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]


# Inference stage: split work across 2 shards and run them locally.
infer = dict(
    # partitioner=dict(type=NaivePartitioner),
    partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)


work_dir = './outputs/judgerbenchv2/'
|
examples/eval_korbench.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate InternLM2.5-7B on the KOR-Bench suite (mixed, 0-shot, 3-shot)."""
from mmengine import read_base

with read_base():
    # KOR-Bench dataset variants.
    from opencompass.configs.datasets.korbench.korbench_mixed_gen_d00bdd import (
        korbench_mixed_datasets as mixed_datasets)
    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import (
        korbench_0shot_single_datasets as zero_shot_datasets)
    from opencompass.configs.datasets.korbench.korbench_single_3_shot_gen import (
        korbench_3shot_single_datasets as three_shot_datasets)
    # Model under evaluation.
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import (
        models as hf_internlm2_5_7b)

# Same ordering as concatenation: zero-shot, three-shot, then mixed.
datasets = [*zero_shot_datasets, *three_shot_datasets, *mixed_datasets]
models = hf_internlm2_5_7b
|
examples/eval_livestembench.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate Qwen2.5 instruct models on LiveSTEMBench with an API judge."""
from mmengine.config import read_base

from opencompass.models import OpenAISDK

with read_base():
    # Select a dataset list.
    from opencompass.configs.datasets.livestembench.livestembench_gen_3e3c50 import \
        livestembench_datasets
    # Select the models of interest.
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as qwen2_5_7b_instruct_lmdeploy_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as qwen2_5_72b_instruct_lmdeploy_model

# Collect every `*_datasets` list defined above into one flat list.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = [
    *qwen2_5_7b_instruct_lmdeploy_model, *qwen2_5_72b_instruct_lmdeploy_model
]

# Judge model configuration.
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

# OpenAI-compatible endpoint used as the LLM judge during evaluation.
judge_cfg = dict(
    abbr='qwen2-5-72b-instruct',
    type=OpenAISDK,
    path='YOUR_SERVER_MODEL_NAME',  # name of your deployed model
    key='None',
    openai_api_base=[
        'http://localhost:23333/v1',  # address where your model is deployed
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=16,
    temperature=0.001,
    max_completion_tokens=32768,
)

# Attach the judge to every dataset's evaluator (mutates the imported cfgs).
for dataset in datasets:
    dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg

# -------------Inference Stage ----------------------------------------

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask),
    ),
)

work_dir = './outputs/livestembench'
|
examples/eval_llm_judge.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MATH-500 evaluation of Qwen2.5-7B with a Qwen2.5-14B LLM-as-judge grader."""
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK

# Import pre-configured models from OpenCompass
with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
        models as lmdeploy_qwen2_5_14b_instruct_model,
    )

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import CustomDataset


# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')

# Inference configuration (zero-shot generation of a boxed final answer)
math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge.
# NOTE(review): the template asks for "A"/"B" in one place and
# CORRECT/INCORRECT in another — the downstream postprocessor appears to
# tolerate this, but confirm against generic_llmjudge_postprocess.
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

# Evaluation configuration using LLM as judge
math_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    )
                ],
                round=[
                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                ],
            ),
        ),
        dataset_cfg=dict(
            type=CustomDataset,
            path='opencompass/math',
            file_name='test_prm800k_500.jsonl',
            reader_cfg=math_reader_cfg,
        ),
        # The 14B model config (first entry) acts as the judge.
        judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
)

# Dataset configuration
datasets = [
    dict(
        type=CustomDataset,
        path='opencompass/math',
        file_name='test_prm800k_500.jsonl',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]

# Model to be evaluated
models = lmdeploy_qwen2_5_7b_instruct_model

# Limiting test to first 8 examples for quick testing
math_reader_cfg['test_range'] = '[0:8]'

# Output directory
work_dir = 'outputs/llm_judge'
|
examples/eval_lmdeploy_demo.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Minimal LMDeploy demo: InternLM2.5-1.8B-chat on the GSM8K demo subset."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import (
        gsm8k_datasets)
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_1_8b_chat import (
        models)

# Expose the standard top-level variables OpenCompass expects.
datasets = gsm8k_datasets
models = models  # re-bound for explicitness; imported above
|
examples/eval_longbenchv2.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate several chat models on LongBench v2 at 128k context length."""
from mmengine.config import read_base

with read_base():
    # Models
    # Datasets
    from opencompass.configs.datasets.longbenchv2.longbenchv2_gen import \
        LongBenchv2_datasets as LongBenchv2_datasets
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as lmdeploy_qwen2_5_7b_instruct_model

# Gather all imported `*_datasets` / `*_model` lists into flat lists.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

# Override every model config in place for long-context evaluation:
# 128k window, 2-way tensor parallel on 2 GPUs.
for model in models:
    model['max_seq_len'] = 128 * 1024
    model['engine_config']['session_len'] = 128 * 1024
    model['engine_config']['tp'] = 2
    model['run_cfg']['num_gpus'] = 2
    # Drop middle tokens to make input length shorter than session_len, use 128k to keep sync with Longbenchv2 original code
    # Drop middle now only support LMDeploy models
    model['drop_middle'] = True

work_dir = './outputs/longbenchv2'
|
examples/eval_math_llm_judge.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
"""MATH evaluation where Llama3-70B judges Llama3-8B's answers for equivalence."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model  # noqa: F401, F403
    from opencompass.configs.models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model  # noqa: F401, F403
    from opencompass.configs.datasets.math.math_llm_judge import math_datasets  # noqa: F401, F403

from opencompass.datasets import math_judement_preprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import AllObjSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# -------------Prompt Settings ----------------------------------------
# Few-shot judge prompt: decides whether two math expressions are equivalent
# up to trivial simplification. {obj_gold}/{prediction} are filled in later.
eng_obj_prompt = """
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications

Examples:

Expression 1: $2x+3$
Expression 2: $3+2x$

[Yes]

Expression 1: 3/2
Expression 2: 1.5

[Yes]

Expression 1: $x^2+2x+1$
Expression 2: $y^2+2y+1$

[No]

Expression 1: $x^2+2x+1$
Expression 2: $(x+1)^2$

[Yes]

Expression 1: 3245/5
Expression 2: 649

[No]
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)

Expression 1: 2/(-3)
Expression 2: -2/3

[Yes]
(trivial simplifications are allowed)

Expression 1: 72 degrees
Expression 2: 72

[Yes]
(give benefit of the doubt to units)

Expression 1: 64
Expression 2: 64 square feet

[Yes]
(give benefit of the doubt to units)

Expression 1: 64
Expression 2:

[No]
(only mark as equivalent if both expressions are nonempty)

---

YOUR TASK


Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
Expression 1: {obj_gold}
Expression 2: {prediction}

"""

# -------------Inference Stage ----------------------------------------
# eval models
models = [*hf_llama3_8b_instruct_model]
# judge models
judge_models = hf_llama3_70b_instruct_model

eng_datasets = [*math_datasets]
chn_datasets = []
datasets = eng_datasets + chn_datasets
work_dir = 'outputs/obj_all/'

# Replace each dataset's evaluator with the LLM judge configured above.
for d in eng_datasets:
    d['eval_cfg'] = dict(
        evaluator=dict(
            type=LMEvaluator,
            # If you need to preprocess the prediction before judging,
            # you can specify the pred_postprocessor function here
            pred_postprocessor=dict(type=math_judement_preprocess),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt=eng_obj_prompt),
                ]),
            ),
        ),
        pred_role='BOT',
    )

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=40000),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)

# ------------- Evaluation Configuration --------------------------------
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=80000,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=AllObjSummarizer)
|
examples/eval_math_verify.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MATH-500 evaluation of several DeepSeek-R1 distilled models via TurboMind."""
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

with read_base():
    from opencompass.configs.datasets.math.math_500_gen import math_datasets

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-llama-8b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        # NOTE(review): max_new_tokens=4096 caps generation well below
        # max_out_len=32768 (unlike the 7B/14B entries below) — confirm
        # whether this shorter budget is intentional.
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        # Strip the <think> reasoning section before answer extraction.
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        # Sampling setup (temperature 0.6 / top_p 0.95).
        gen_config=dict(
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
        # NOTE(review): same 4096-token cap as the llama-8b entry — verify.
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-14b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
        # 14B model runs 2-way tensor parallel on 2 GPUs.
        engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
        gen_config=dict(
            top_k=1,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
]

datasets = [*math_datasets]


work_dir = './outputs/math_500'
|
examples/eval_mmlu_cf.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate Llama3-8B (LMDeploy) and Qwen2.5-7B (HF) on MMLU-CF."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import \
        mmlu_cf_datasets
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import \
        models as hf_qwen2_5_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_cf import summarizer

# Flatten every imported `*_datasets` list (and any pre-existing `datasets`).
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)

work_dir = 'outputs/debug/mmlu_cf'
|
examples/eval_mmlu_pro.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate Qwen2-7B and Llama3-8B instruct models on MMLU-Pro."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import \
        mmlu_pro_datasets
    # Reuse shared infer/eval stage configs from the internal cluster presets.
    from opencompass.configs.internal.clusters.local import eval
    from opencompass.configs.internal.clusters.local import \
        infer_num_worker as infer
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_pro import summarizer

# Flatten every imported `*_datasets` list (and any pre-existing `datasets`).
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

work_dir = 'outputs/debug/mmlu_pro'

# Reference results from a previous run of this config:
# dataset                    version    metric         mode      qwen2-7b-instruct-turbomind    llama-3-8b-instruct-turbomind
# -------------------------  ---------  -------------  ------  -----------------------------  -------------------------------
# mmlu_pro                   -          naive_average  gen                             46.18                            43.92
# mmlu_pro_biology           736233     accuracy       gen                             63.74                            64.02
# mmlu_pro_business          736233     accuracy       gen                             53.23                            46.01
# mmlu_pro_chemistry         736233     accuracy       gen                             35.25                            32.42
# mmlu_pro_computer_science  736233     accuracy       gen                             47.07                            44.88
# mmlu_pro_economics         736233     accuracy       gen                             59.00                            53.79
# mmlu_pro_engineering       736233     accuracy       gen                             26.73                            33.54
# mmlu_pro_health            736233     accuracy       gen                             47.31                            51.34
# mmlu_pro_history           736233     accuracy       gen                             42.78                            42.26
# mmlu_pro_law               736233     accuracy       gen                             28.07                            26.98
# mmlu_pro_math              736233     accuracy       gen                             53.59                            37.53
# mmlu_pro_philosophy        736233     accuracy       gen                             42.28                            42.48
# mmlu_pro_physics           736233     accuracy       gen                             39.11                            33.64
# mmlu_pro_psychology        736233     accuracy       gen                             60.90                            59.65
# mmlu_pro_other             736233     accuracy       gen                             47.40                            46.32
examples/eval_mmlu_with_zero_retriever_overwritten.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate Qwen-7B-chat on MMLU with the retriever forced to zero-shot.

The imported MMLU config is a 5-shot setup; each dataset copy below gets its
retriever overwritten with ZeroRetriever so no in-context examples are used.
"""
from copy import deepcopy

from mmengine.config import read_base

from opencompass.openicl.icl_retriever import ZeroRetriever

with read_base():
    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
        mmlu_datasets  # this is a dataset evaluated with 5-shot
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models

datasets = []
for cfg in mmlu_datasets:
    # Deep-copy first so the shared imported config stays untouched.
    patched = deepcopy(cfg)
    patched['infer_cfg']['retriever'] = dict(type=ZeroRetriever)
    datasets.append(patched)
|
examples/eval_multi_prompt_demo.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate InternLM-chat-7B on Winogrande with a multi-prompt summarizer."""
from mmengine.config import read_base

from opencompass.models import HuggingFaceCausalLM

with read_base():
    from opencompass.configs.datasets.winogrande.winogrande_gen_a027b6 import \
        winogrande_datasets

datasets = [*winogrande_datasets]

# InternLM chat markup: user/bot turns delimited by <eoh>/<eoa>.
_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='internlm-chat-7b-hf',
        path='internlm/internlm-chat-7b',
        tokenizer_path='internlm/internlm-chat-7b',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

# Abbreviations of every Winogrande prompt-variant sub-dataset.
_winogrande_all = [d['abbr'] for d in winogrande_datasets]

# Report the mean and the std-dev across prompt variants.
summarizer = dict(summary_groups=[
    {
        'name': 'winogrande',
        'subsets': _winogrande_all
    },
    {
        'name': 'winogrande_std',
        'subsets': _winogrande_all,
        'std': True
    },
])
|
examples/eval_musr.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
|
| 3 |
+
from mmengine.config import read_base
|
| 4 |
+
|
| 5 |
+
with read_base():
|
| 6 |
+
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
|
| 7 |
+
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
|
| 8 |
+
models as lmdeploy_glm4_9b_chat_model
|
| 9 |
+
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
|
| 10 |
+
models as lmdeploy_gemma_9b_it_model
|
| 11 |
+
from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
|
| 12 |
+
models as lmdeploy_gemma_27b_it_model
|
| 13 |
+
# from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models
|
| 14 |
+
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
| 15 |
+
models as lmdeploy_internlm2_5_7b_chat_model
|
| 16 |
+
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
|
| 17 |
+
models as lmdeploy_llama3_1_8b_instruct_model
|
| 18 |
+
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import \
|
| 19 |
+
models as lmdeploy_ministral_8b_instruct_2410_model
|
| 20 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
|
| 21 |
+
models as lmdeploy_qwen2_5_7b_instruct_model
|
| 22 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
|
| 23 |
+
models as lmdeploy_qwen2_5_14b_instruct_model
|
| 24 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
|
| 25 |
+
models as lmdeploy_qwen2_5_32b_instruct_model
|
| 26 |
+
from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
|
| 27 |
+
models as lmdeploy_yi_1_5_9b_chat_model
|
| 28 |
+
from opencompass.configs.summarizers.groups.musr_average import summarizer
|
| 29 |
+
|
| 30 |
+
datasets = [*musr_datasets]
|
| 31 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 32 |
+
|
| 33 |
+
base_exp_dir = 'outputs/musr/'
|
| 34 |
+
work_dir = osp.join(base_exp_dir, 'musr_eval')
|
examples/eval_needlebench_v2.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
# we use mmengine.config to import other config files
|
| 3 |
+
|
| 4 |
+
with read_base():
|
| 5 |
+
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b
|
| 6 |
+
|
| 7 |
+
# Evaluate needlebench_32k, adjust the configuration to use 4k, 32k, 128k, 200k, or 1000k if necessary.
|
| 8 |
+
# from opencompass.configs.datasets.needlebench_v2.needlebench_v2_32k.needlebench_v2_32k import needlebench_datasets
|
| 9 |
+
# from opencompass.configs.summarizers.needlebench import needlebench_32k_summarizer as summarizer
|
| 10 |
+
|
| 11 |
+
# only eval original "needle in a haystack test" in needlebench_32k
|
| 12 |
+
from opencompass.configs.datasets.needlebench_v2.needlebench_v2_32k.needlebench_v2_single_32k import needlebench_zh_datasets, needlebench_en_datasets
|
| 13 |
+
from opencompass.configs.summarizers.needlebench import needlebench_v2_32k_summarizer as summarizer
|
| 14 |
+
|
| 15 |
+
# eval Ancestral Tracing Challenge(ATC)
|
| 16 |
+
# from opencompass.configs.datasets.needlebench_v2.atc.atc_0shot_nocot_2_power_en import needlebench_datasets
|
| 17 |
+
# ATC use default summarizer thus no need to import summarizer
|
| 18 |
+
|
| 19 |
+
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
|
| 20 |
+
|
| 21 |
+
for m in internlm2_chat_7b:
|
| 22 |
+
m['max_seq_len'] = 32768 # Ensure InternLM2-7B model can receive the full long text; for other models, adjust according to their supported maximum sequence length.
|
| 23 |
+
m['max_out_len'] = 4096
|
| 24 |
+
|
| 25 |
+
models = internlm2_chat_7b
|
| 26 |
+
|
| 27 |
+
work_dir = './outputs/needlebench'
|
examples/eval_qwen3.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os.path as osp
|
| 3 |
+
from opencompass.models import OpenAISDK
|
| 4 |
+
from mmengine.config import read_base
|
| 5 |
+
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
| 6 |
+
from opencompass.runners import LocalRunner
|
| 7 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 8 |
+
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
| 9 |
+
|
| 10 |
+
with read_base():
|
| 11 |
+
from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets
|
| 12 |
+
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
|
| 13 |
+
from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets
|
| 14 |
+
|
| 15 |
+
#######################################################################
|
| 16 |
+
# PART 0 Meta Info #
|
| 17 |
+
#######################################################################
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
api_meta_template = dict(round=[
|
| 21 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 22 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 23 |
+
],
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
judge_cfg = dict(
|
| 28 |
+
abbr='qwen2-5-32B-Instruct',
|
| 29 |
+
type=OpenAISDK,
|
| 30 |
+
path='Qwen/Qwen2.5-32B-Instruct',
|
| 31 |
+
key='sk-1234',
|
| 32 |
+
openai_api_base=[
|
| 33 |
+
'http://x.x.x.x:4000/v1',
|
| 34 |
+
],
|
| 35 |
+
meta_template=api_meta_template,
|
| 36 |
+
query_per_second=8,
|
| 37 |
+
batch_size=256,
|
| 38 |
+
temperature=0.001,
|
| 39 |
+
# max_completion_tokens=32768,
|
| 40 |
+
tokenizer_path='gpt-4o-2024-05-13',
|
| 41 |
+
# verbose=True,
|
| 42 |
+
max_out_len=16384,
|
| 43 |
+
max_seq_len=32768,
|
| 44 |
+
# max_seq_len=49152,
|
| 45 |
+
mode='mid',
|
| 46 |
+
retry=10
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
#######################################################################
|
| 50 |
+
# PART 1 Datasets List #
|
| 51 |
+
#######################################################################
|
| 52 |
+
|
| 53 |
+
repeated_info = [
|
| 54 |
+
(math_datasets, 4),
|
| 55 |
+
(aime2024_datasets, 32),
|
| 56 |
+
(aime2025_datasets, 32),
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
for datasets_, num in repeated_info:
|
| 60 |
+
for dataset_ in datasets_:
|
| 61 |
+
dataset_['n'] = num
|
| 62 |
+
|
| 63 |
+
datasets = sum(
|
| 64 |
+
(v for k, v in locals().items() if k.endswith('_datasets')),
|
| 65 |
+
[],
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
for item in datasets:
|
| 69 |
+
item['infer_cfg']['inferencer']['max_out_len'] = 32768
|
| 70 |
+
try:
|
| 71 |
+
if 'judge_cfg' in item['eval_cfg']['evaluator']:
|
| 72 |
+
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
|
| 73 |
+
elif'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
|
| 74 |
+
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
|
| 75 |
+
except:
|
| 76 |
+
pass
|
| 77 |
+
#######################################################################
|
| 78 |
+
# PART 2 Dataset Summarizer #
|
| 79 |
+
#######################################################################
|
| 80 |
+
|
| 81 |
+
summarizer = dict(
|
| 82 |
+
dataset_abbrs=[
|
| 83 |
+
'MATH',
|
| 84 |
+
['math_prm800k_500', 'accuracy (4 runs average)'],
|
| 85 |
+
['aime2024', 'accuracy (32 runs average)'],
|
| 86 |
+
['aime2025', 'accuracy (32 runs average)'],
|
| 87 |
+
['livemathbench_hard', 'naive_average'],
|
| 88 |
+
['OlympiadBenchMath', 'accuracy'],
|
| 89 |
+
['olymmath', 'naive_average'],
|
| 90 |
+
],
|
| 91 |
+
summary_groups = sum(
|
| 92 |
+
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
|
| 93 |
+
),
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
#######################################################################
|
| 97 |
+
# PART 3 Models List #
|
| 98 |
+
#######################################################################
|
| 99 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 100 |
+
models += [
|
| 101 |
+
|
| 102 |
+
dict(
|
| 103 |
+
abbr='Qwen_Qwen3-235B-A22B',
|
| 104 |
+
type=OpenAISDK,
|
| 105 |
+
path='Qwen/Qwen3-235B-A22B',
|
| 106 |
+
key='sk-admin',
|
| 107 |
+
openai_api_base=[
|
| 108 |
+
'http://106.15.231.215:40007/v1/',
|
| 109 |
+
],
|
| 110 |
+
meta_template=dict(
|
| 111 |
+
# begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''),
|
| 112 |
+
round=[
|
| 113 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 114 |
+
# XXX: all system roles are mapped to human in purpose
|
| 115 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 116 |
+
]
|
| 117 |
+
),
|
| 118 |
+
query_per_second=16,
|
| 119 |
+
batch_size=128,
|
| 120 |
+
# batch_size=1,
|
| 121 |
+
temperature=0.6,
|
| 122 |
+
# max_completion_tokens=32768,
|
| 123 |
+
tokenizer_path='gpt-4',
|
| 124 |
+
# verbose=True,
|
| 125 |
+
max_out_len=32768,
|
| 126 |
+
max_seq_len=32768,
|
| 127 |
+
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
| 128 |
+
),
|
| 129 |
+
]
|
| 130 |
+
|
| 131 |
+
infer = dict(
|
| 132 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 133 |
+
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
eval = dict(
|
| 137 |
+
partitioner=dict(type=NaivePartitioner, n=8),
|
| 138 |
+
runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
base_exp_dir = 'outputs/qwen3_reasoning'
|
| 142 |
+
work_dir = osp.join(base_exp_dir, 'chat_objective')
|
examples/eval_qwen_7b_chat.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.collections.leaderboard.qwen_chat import \
|
| 5 |
+
datasets
|
| 6 |
+
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
|
| 7 |
+
from opencompass.configs.summarizers.leaderboard import summarizer
|
| 8 |
+
'''
|
| 9 |
+
dataset version metric mode qwen-7b-chat-hf
|
| 10 |
+
-------------------------------------- --------- ---------------- ------ -----------------
|
| 11 |
+
--------- 考试 Exam --------- - - - -
|
| 12 |
+
ceval - naive_average gen 56.07
|
| 13 |
+
agieval - naive_average mixed 39.51
|
| 14 |
+
mmlu - naive_average gen 53.49
|
| 15 |
+
cmmlu - naive_average gen 55.29
|
| 16 |
+
GaokaoBench - weighted_average gen 48.01
|
| 17 |
+
ARC-c ca1e8e accuracy ppl 74.92
|
| 18 |
+
ARC-e ca1e8e accuracy ppl 85.71
|
| 19 |
+
--------- 语言 Language --------- - - - -
|
| 20 |
+
WiC efbd01 accuracy gen 51.41
|
| 21 |
+
chid-dev 25f3d3 accuracy ppl 77.72
|
| 22 |
+
afqmc-dev 4a1636 accuracy gen 69.00
|
| 23 |
+
WSC 678cb5 accuracy ppl 67.31
|
| 24 |
+
tydiqa-goldp - naive_average gen 15.32
|
| 25 |
+
flores_100 - naive_average gen 10.00
|
| 26 |
+
--------- 知识 Knowledge --------- - - - -
|
| 27 |
+
BoolQ 463fee accuracy ppl 83.18
|
| 28 |
+
commonsense_qa ddaabf accuracy gen 76.41
|
| 29 |
+
triviaqa b6904f score gen 43.25
|
| 30 |
+
nq 23dc1a score gen 16.26
|
| 31 |
+
--------- 理解 Understanding --------- - - - -
|
| 32 |
+
C3 e6778d accuracy gen 81.53
|
| 33 |
+
race-middle e0908b accuracy gen 83.01
|
| 34 |
+
race-high e0908b accuracy gen 77.79
|
| 35 |
+
openbookqa_fact 49689a accuracy ppl 86.40
|
| 36 |
+
csl_dev 3c4211 accuracy ppl 64.38
|
| 37 |
+
lcsts 0b3969 rouge1 gen 12.75
|
| 38 |
+
Xsum 207e69 rouge1 gen 20.21
|
| 39 |
+
eprstmt-dev ed0c5d accuracy ppl 85.00
|
| 40 |
+
lambada de1af2 accuracy gen 59.19
|
| 41 |
+
--------- 推理 Reasoning --------- - - - -
|
| 42 |
+
cmnli 15e783 accuracy ppl 48.08
|
| 43 |
+
ocnli 15e783 accuracy ppl 51.40
|
| 44 |
+
AX_b 689df1 accuracy ppl 65.67
|
| 45 |
+
AX_g 808a19 accuracy ppl 76.12
|
| 46 |
+
RTE 808a19 accuracy ppl 68.95
|
| 47 |
+
COPA 59f42c accuracy gen 92.00
|
| 48 |
+
ReCoRD 6f7cfc score gen 0.16
|
| 49 |
+
hellaswag 8d79e0 accuracy ppl 69.28
|
| 50 |
+
piqa 34eee7 accuracy ppl 72.20
|
| 51 |
+
siqa ea30d1 accuracy ppl 72.88
|
| 52 |
+
math 2c0b9e accuracy gen 7.84
|
| 53 |
+
gsm8k 4c7f6e accuracy gen 45.41
|
| 54 |
+
drop 53a0a7 score gen 39.62
|
| 55 |
+
openai_humaneval dd0dff humaneval_pass@1 gen 10.98
|
| 56 |
+
mbpp 60ca11 score gen 20.60
|
| 57 |
+
bbh - naive_average gen 42.61
|
| 58 |
+
'''
|
examples/eval_qwen_7b_chat_lawbench.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.lawbench.lawbench_one_shot_gen_002588 import \
|
| 5 |
+
lawbench_datasets as lawbench_one_shot_datasets
|
| 6 |
+
from opencompass.configs.datasets.lawbench.lawbench_zero_shot_gen_002588 import \
|
| 7 |
+
lawbench_datasets as lawbench_zero_shot_datasets
|
| 8 |
+
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
|
| 9 |
+
from opencompass.configs.summarizers.lawbench import summarizer
|
| 10 |
+
|
| 11 |
+
datasets = lawbench_zero_shot_datasets + lawbench_one_shot_datasets
|
| 12 |
+
for d in datasets:
|
| 13 |
+
d['infer_cfg']['inferencer']['save_every'] = 1
|
examples/eval_rewardbench.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
with read_base():
|
| 3 |
+
from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
|
| 4 |
+
from opencompass.configs.summarizers.rewardbench import summarizer
|
| 5 |
+
|
| 6 |
+
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
| 7 |
+
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
| 8 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 9 |
+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
| 10 |
+
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
| 11 |
+
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
| 12 |
+
from opencompass.runners import SlurmSequentialRunner
|
| 13 |
+
from opencompass.tasks import OpenICLInferTask
|
| 14 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 15 |
+
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
| 16 |
+
|
| 17 |
+
api_meta_template = dict(
|
| 18 |
+
round=[
|
| 19 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 20 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 21 |
+
]
|
| 22 |
+
)
|
| 23 |
+
datasets = [*get_rewardbench_datasets]
|
| 24 |
+
|
| 25 |
+
from opencompass.models import TurboMindModelwithChatTemplate
|
| 26 |
+
|
| 27 |
+
models = [
|
| 28 |
+
dict(
|
| 29 |
+
type=TurboMindModelwithChatTemplate,
|
| 30 |
+
abbr='qwen-7b-hf',
|
| 31 |
+
path='Qwen/Qwen-7B',
|
| 32 |
+
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
| 33 |
+
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
| 34 |
+
max_seq_len=16384,
|
| 35 |
+
max_out_len=2048,
|
| 36 |
+
batch_size=16,
|
| 37 |
+
run_cfg=dict(num_gpus=1),
|
| 38 |
+
),
|
| 39 |
+
]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
infer = dict(
|
| 43 |
+
partitioner=dict(type=NaivePartitioner),
|
| 44 |
+
runner=dict(
|
| 45 |
+
type=LocalRunner,
|
| 46 |
+
max_num_workers=72,
|
| 47 |
+
task=dict(type=OpenICLInferTask),
|
| 48 |
+
),
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
work_dir = './outputs/rewardbench/'
|
examples/eval_rmb.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
with read_base():
|
| 3 |
+
from opencompass.configs.datasets.judge.rmb import get_rmb_dataset
|
| 4 |
+
|
| 5 |
+
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
| 6 |
+
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
| 7 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 8 |
+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
| 9 |
+
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
| 10 |
+
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
| 11 |
+
from opencompass.runners import SlurmSequentialRunner
|
| 12 |
+
from opencompass.tasks import OpenICLInferTask
|
| 13 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 14 |
+
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
| 15 |
+
|
| 16 |
+
api_meta_template = dict(
|
| 17 |
+
round=[
|
| 18 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 19 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 20 |
+
]
|
| 21 |
+
)
|
| 22 |
+
datasets = [*get_rmb_dataset]
|
| 23 |
+
|
| 24 |
+
from opencompass.models import TurboMindModelwithChatTemplate
|
| 25 |
+
|
| 26 |
+
models = [
|
| 27 |
+
dict(
|
| 28 |
+
type=TurboMindModelwithChatTemplate,
|
| 29 |
+
abbr='qwen-7b-hf',
|
| 30 |
+
path='Qwen/Qwen-7B',
|
| 31 |
+
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
| 32 |
+
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
| 33 |
+
max_seq_len=16384,
|
| 34 |
+
max_out_len=2048,
|
| 35 |
+
batch_size=16,
|
| 36 |
+
run_cfg=dict(num_gpus=1),
|
| 37 |
+
),
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
infer = dict(
|
| 42 |
+
# partitioner=dict(type=NaivePartitioner),
|
| 43 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 44 |
+
runner=dict(
|
| 45 |
+
type=LocalRunner,
|
| 46 |
+
max_num_workers=72,
|
| 47 |
+
task=dict(type=OpenICLInferTask),
|
| 48 |
+
),
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
work_dir = './outputs/rmb/'
|
examples/eval_ruler.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 4 |
+
from opencompass.runners import LocalRunner
|
| 5 |
+
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
| 6 |
+
|
| 7 |
+
with read_base():
|
| 8 |
+
from opencompass.configs.datasets.ruler.ruler_cwe_gen import cwe_datasets # CWE
|
| 9 |
+
from opencompass.configs.datasets.ruler.ruler_fwe_gen import fwe_datasets # FWE
|
| 10 |
+
from opencompass.configs.datasets.ruler.ruler_niah_gen import niah_datasets # Niah
|
| 11 |
+
from opencompass.configs.datasets.ruler.ruler_qa_gen import qa_datasets # QA
|
| 12 |
+
from opencompass.configs.datasets.ruler.ruler_vt_gen import vt_datasets # VT
|
| 13 |
+
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import (
|
| 14 |
+
models as internlm2_5_7b_chat_1m,
|
| 15 |
+
)
|
| 16 |
+
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
|
| 17 |
+
models as llama3_8b_instruct_model,
|
| 18 |
+
)
|
| 19 |
+
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
|
| 20 |
+
models as qwen2_7b_instruct_model,
|
| 21 |
+
)
|
| 22 |
+
from opencompass.configs.summarizers.groups.ruler import ruler_summary_groups
|
| 23 |
+
|
| 24 |
+
import_datasets = sum(
|
| 25 |
+
[niah_datasets, vt_datasets, fwe_datasets, cwe_datasets, qa_datasets], [])
|
| 26 |
+
|
| 27 |
+
# Evaluation config
|
| 28 |
+
NUM_SAMPLES = 500
|
| 29 |
+
# Change the context lengths to be tested
|
| 30 |
+
max_seq_lens = [1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32]
|
| 31 |
+
abbr_suffixs = ['4k', '8k', '16k', '32k']
|
| 32 |
+
work_dir = './outputs/ruler'
|
| 33 |
+
|
| 34 |
+
# Model Settings
|
| 35 |
+
qwen2_7b_instruct_model[0]['max_seq_len'] = 33792
|
| 36 |
+
qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 33792
|
| 37 |
+
qwen2_7b_instruct_model[0]['engine_config']['tp'] = 2
|
| 38 |
+
qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 2
|
| 39 |
+
llama3_8b_instruct_model[0]['max_seq_len'] = 33792
|
| 40 |
+
llama3_8b_instruct_model[0]['engine_config']['session_len'] = 33792
|
| 41 |
+
llama3_8b_instruct_model[0]['engine_config']['tp'] = 2
|
| 42 |
+
llama3_8b_instruct_model[0]['run_cfg']['num_gpus'] = 2
|
| 43 |
+
model_settings = [
|
| 44 |
+
[qwen2_7b_instruct_model[0], 'Qwen/Qwen2-7B-Instruct'],
|
| 45 |
+
[llama3_8b_instruct_model[0], 'meta-llama/Meta-Llama-3-8B-Instruct'],
|
| 46 |
+
[internlm2_5_7b_chat_1m[0], 'internlm/internlm2_5-7b-chat-1m'],
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
# Dataset Model Combination
|
| 50 |
+
datasets = []
|
| 51 |
+
models = []
|
| 52 |
+
model_dataset_combinations = []
|
| 53 |
+
|
| 54 |
+
# Different seq length
|
| 55 |
+
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
|
| 56 |
+
for model, model_path in model_settings:
|
| 57 |
+
_tmp_datasets = []
|
| 58 |
+
for dataset in import_datasets:
|
| 59 |
+
tmp_dataset = dataset.deepcopy()
|
| 60 |
+
tmp_dataset['tokenizer_model'] = model_path
|
| 61 |
+
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
|
| 62 |
+
tmp_dataset['num_samples'] = NUM_SAMPLES
|
| 63 |
+
tmp_dataset['max_seq_length'] = max_seq_len
|
| 64 |
+
_tmp_datasets.append(tmp_dataset)
|
| 65 |
+
model_dataset_combinations.append(
|
| 66 |
+
dict(models=[model], datasets=_tmp_datasets))
|
| 67 |
+
models.append(model)
|
| 68 |
+
datasets.extend(_tmp_datasets)
|
| 69 |
+
|
| 70 |
+
infer = dict(
|
| 71 |
+
partitioner=dict(type=NumWorkerPartitioner),
|
| 72 |
+
runner=dict(type=LocalRunner,
|
| 73 |
+
max_num_workers=16,
|
| 74 |
+
task=dict(type=OpenICLInferTask),
|
| 75 |
+
retry=5),
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
eval = dict(
|
| 79 |
+
partitioner=dict(type=NaivePartitioner),
|
| 80 |
+
runner=dict(type=LocalRunner,
|
| 81 |
+
max_num_workers=32,
|
| 82 |
+
task=dict(type=OpenICLEvalTask)),
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
summarizer = dict(
|
| 86 |
+
dataset_abbrs=abbr_suffixs,
|
| 87 |
+
summary_groups=sum([ruler_summary_groups], []),
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 91 |
+
# dataset version metric mode qwen2-7b-instruct-turbomind llama-3-8b-instruct-turbomind internlm2_5-7b-chat-1m-turbomind
|
| 92 |
+
# --------- --------- ------------- ------ ----------------------------- ------------------------------- ----------------------------------
|
| 93 |
+
# 4k - naive_average gen 93.66 93.48 91.20
|
| 94 |
+
# 8k - naive_average gen 88.38 89.95 89.07
|
| 95 |
+
# 16k - naive_average gen 84.27 0.14 87.61
|
| 96 |
+
# 32k - naive_average gen 81.36 0.00 84.59
|
| 97 |
+
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
|
examples/eval_rwkv5_3b.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.collections.base_medium_llama import \
|
| 5 |
+
datasets
|
| 6 |
+
from opencompass.configs.models.rwkv.rwkv5_3b import models
|
| 7 |
+
from opencompass.configs.summarizers.leaderboard import summarizer
|
examples/eval_simpleqa.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
|
| 2 |
+
from mmengine.config import read_base
|
| 3 |
+
|
| 4 |
+
from opencompass.partitioners import NaivePartitioner
|
| 5 |
+
from opencompass.runners import LocalRunner
|
| 6 |
+
from opencompass.summarizers import DefaultSubjectiveSummarizer
|
| 7 |
+
from opencompass.tasks import OpenICLInferTask
|
| 8 |
+
|
| 9 |
+
with read_base():
|
| 10 |
+
from opencompass.configs.datasets.SimpleQA.simpleqa_gen import \
|
| 11 |
+
simpleqa_datasets
|
| 12 |
+
from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
|
| 13 |
+
models as gpt_4o_2024_05_13_model
|
| 14 |
+
|
| 15 |
+
models = gpt_4o_2024_05_13_model # model for generation
|
| 16 |
+
judge_models = gpt_4o_2024_05_13_model # model for evaluation
|
| 17 |
+
|
| 18 |
+
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
|
| 19 |
+
summarizer = dict(type=DefaultSubjectiveSummarizer)
|
| 20 |
+
|
| 21 |
+
# -------------Inferen Stage ----------------------------------------
|
| 22 |
+
|
| 23 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 24 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 25 |
+
from opencompass.runners import LocalRunner
|
| 26 |
+
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
| 27 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 28 |
+
|
| 29 |
+
infer = dict(
|
| 30 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 31 |
+
runner=dict(type=LocalRunner,
|
| 32 |
+
max_num_workers=8,
|
| 33 |
+
task=dict(type=OpenICLInferTask)),
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
eval = dict(
|
| 37 |
+
partitioner=dict(
|
| 38 |
+
type=SubjectiveNaivePartitioner,
|
| 39 |
+
models=models,
|
| 40 |
+
judge_models=judge_models,
|
| 41 |
+
),
|
| 42 |
+
runner=dict(type=LocalRunner,
|
| 43 |
+
max_num_workers=256,
|
| 44 |
+
task=dict(type=SubjectiveEvalTask)),
|
| 45 |
+
)
|
examples/eval_subjective.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
|
| 5 |
+
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
|
| 6 |
+
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import compassarena_datasets
|
| 7 |
+
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
|
| 8 |
+
from opencompass.configs.datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
|
| 9 |
+
from opencompass.configs.datasets.subjective.fofo.fofo_judge import fofo_datasets
|
| 10 |
+
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets
|
| 11 |
+
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
|
| 12 |
+
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
|
| 13 |
+
|
| 14 |
+
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
|
| 15 |
+
HuggingFaceChatGLM3, OpenAI)
|
| 16 |
+
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
| 17 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 18 |
+
from opencompass.partitioners.sub_num_worker import \
|
| 19 |
+
SubjectiveNumWorkerPartitioner
|
| 20 |
+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
| 21 |
+
from opencompass.runners import LocalRunner, SlurmSequentialRunner
|
| 22 |
+
from opencompass.summarizers import SubjectiveSummarizer
|
| 23 |
+
from opencompass.tasks import OpenICLInferTask
|
| 24 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 25 |
+
|
| 26 |
+
api_meta_template = dict(round=[
|
| 27 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 28 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 29 |
+
])
|
| 30 |
+
|
| 31 |
+
# -------------Inference Stage ----------------------------------------
|
| 32 |
+
# For subjective evaluation, we often set do sample for models
|
| 33 |
+
models = [
|
| 34 |
+
dict(
|
| 35 |
+
type=HuggingFaceChatGLM3,
|
| 36 |
+
abbr='chatglm3-6b-hf',
|
| 37 |
+
path='THUDM/chatglm3-6b',
|
| 38 |
+
tokenizer_path='THUDM/chatglm3-6b',
|
| 39 |
+
model_kwargs=dict(
|
| 40 |
+
device_map='auto',
|
| 41 |
+
trust_remote_code=True,
|
| 42 |
+
),
|
| 43 |
+
tokenizer_kwargs=dict(
|
| 44 |
+
padding_side='left',
|
| 45 |
+
truncation_side='left',
|
| 46 |
+
trust_remote_code=True,
|
| 47 |
+
),
|
| 48 |
+
generation_kwargs=dict(
|
| 49 |
+
do_sample=
|
| 50 |
+
True, #For subjective evaluation, we suggest you do set do_sample when running model inference!
|
| 51 |
+
),
|
| 52 |
+
meta_template=api_meta_template,
|
| 53 |
+
max_out_len=2048,
|
| 54 |
+
max_seq_len=4096,
|
| 55 |
+
batch_size=8,
|
| 56 |
+
run_cfg=dict(num_gpus=1, num_procs=1),
|
| 57 |
+
)
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
datasets = [
|
| 61 |
+
*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets,
|
| 62 |
+
*compassarena_datasets, *compassbench_datasets, *fofo_datasets,
|
| 63 |
+
*mtbench_datasets, *mtbench101_datasets, *wildbench_datasets
|
| 64 |
+
] # add datasets you want
|
| 65 |
+
|
| 66 |
+
infer = dict(
|
| 67 |
+
partitioner=dict(type=NaivePartitioner),
|
| 68 |
+
runner=dict(type=LocalRunner,
|
| 69 |
+
max_num_workers=16,
|
| 70 |
+
task=dict(type=OpenICLInferTask)),
|
| 71 |
+
)
|
| 72 |
+
# -------------Evalation Stage ----------------------------------------
|
| 73 |
+
|
| 74 |
+
## ------------- JudgeLLM Configuration
|
| 75 |
+
judge_models = [
|
| 76 |
+
dict(
|
| 77 |
+
abbr='GPT4-Turbo',
|
| 78 |
+
type=OpenAI,
|
| 79 |
+
path='gpt-4-1106-preview',
|
| 80 |
+
key=
|
| 81 |
+
'xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
| 82 |
+
meta_template=api_meta_template,
|
| 83 |
+
query_per_second=16,
|
| 84 |
+
max_out_len=2048,
|
| 85 |
+
max_seq_len=2048,
|
| 86 |
+
batch_size=8,
|
| 87 |
+
temperature=0,
|
| 88 |
+
)
|
| 89 |
+
]
|
| 90 |
+
|
| 91 |
+
## ------------- Evaluation Configuration
|
| 92 |
+
eval = dict(
|
| 93 |
+
partitioner=dict(
|
| 94 |
+
type=SubjectiveNaivePartitioner,
|
| 95 |
+
models=models,
|
| 96 |
+
judge_models=judge_models,
|
| 97 |
+
),
|
| 98 |
+
runner=dict(type=LocalRunner,
|
| 99 |
+
max_num_workers=16,
|
| 100 |
+
task=dict(type=SubjectiveEvalTask)),
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
|
| 104 |
+
work_dir = 'outputs/subjective/'
|
examples/eval_subjective_bradleyterry.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base

with read_base():
    # Bradley-Terry variants of the pairwise subjective datasets.
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
        alpacav2_datasets, )
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_bradleyterry import (
        arenahard_datasets, )
    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_bradleyterry import (
        compassarena_datasets, )
    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_bradleyterry import (
        wildbench_datasets, )

    # Candidate models served through LMDeploy.
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as lmdeploy_internlm2_5_7b_chat, )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
        models as lmdeploy_internlm2_5_20b_chat, )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct, )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
        models as lmdeploy_qwen2_5_14b_instruct, )
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
        models as lmdeploy_qwen2_7b_instruct, )

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
    SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import (CompassArenaBradleyTerrySummarizer,
                                     SubjectiveSummarizer)
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# Two-role chat template used when talking to API-style models.
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

# ------------- Inference Stage ----------------------------------------
# For subjective evaluation, sampling is usually enabled on the candidate
# models when running inference.
models = [
    *lmdeploy_internlm2_5_7b_chat,
    *lmdeploy_internlm2_5_20b_chat,
    *lmdeploy_qwen2_5_14b_instruct,
    *lmdeploy_qwen2_5_7b_instruct,
    *lmdeploy_qwen2_7b_instruct,
]

datasets = [
    *alpacav2_datasets,
    *arenahard_datasets,
    *compassarena_datasets,
    *wildbench_datasets,
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)
# ------------- Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
# A locally served CompassJudger model (via TurboMind, 4-way tensor
# parallelism) scores the pairwise comparisons.
judge_models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='CompassJudger-1-32B-Instruct',
        path='opencompass/CompassJudger-1-32B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
        # Near-greedy decoding so judgements are reproducible.
        gen_config=dict(top_k=1,
                        temperature=1e-6,
                        top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=4),
    )
]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=SubjectiveEvalTask),
    ),
)

## ------------- Summary Configuration
# This step fits a Bradley-Terry model (statistical model) with an option
# to include style features and control variables based on groups
# (group variables must be available in the input dataset for each
# observation).
summarizer = dict(
    type=CompassArenaBradleyTerrySummarizer,
    rating_system='bradleyterry',
    report_pred_win_rates=True,
    num_bootstrap=100,
    num_cpu=None,
    with_control_vars=True,
    normalize_style_features=False,
    odds_ratio=True,
)

work_dir = 'outputs/subjective/bradleyterry'
|
examples/eval_teval.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from copy import deepcopy

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
        teval_datasets as teval_en_datasets
    from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
        teval_datasets as teval_zh_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models as hf_internlm2_chat_7b_model
    from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import \
        models as hf_llama2_7b_chat_model
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
        models as hf_qwen_7b_chat_model
    from opencompass.configs.summarizers.teval import summarizer

# Explicit SYSTEM-round definitions for models whose chat template has a
# dedicated system role; other models fall back to a relabelled HUMAN round.
meta_template_system_patches = {
    'internlm2-chat-7b-hf':
    dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
    'internlm2-chat-20b-hf':
    dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
}

# Collect every ``*_model`` list imported above into one flat list.
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
                     [])
models = []
for model_cfg in _origin_models:
    model_cfg = deepcopy(model_cfg)
    if 'meta_template' not in model_cfg or 'round' not in model_cfg[
            'meta_template']:
        raise ValueError(f'no meta_template.round in {model_cfg.get("abbr", None)}')

    rounds = model_cfg['meta_template']['round']
    if all(item['role'].upper() != 'SYSTEM' for item in rounds):
        # No SYSTEM round yet: prefer the per-model patch, otherwise clone
        # the HUMAN round and relabel it as SYSTEM.
        if model_cfg['abbr'] in meta_template_system_patches:
            sys_round = meta_template_system_patches[model_cfg['abbr']]
        else:
            sys_round = [
                item for item in rounds if item['role'].upper() == 'HUMAN'
            ][0]
        sys_round = deepcopy(sys_round)
        sys_round['role'] = 'SYSTEM'
        rounds.append(sys_round)

    print(
        f'model {model_cfg["abbr"]} is using the following meta_template: {model_cfg["meta_template"]}'
    )
    models.append(model_cfg)

datasets = teval_en_datasets + teval_zh_datasets
work_dir = './outputs/teval'
"""Dataset version metric mode
qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf.

------------------------------------------- --------- -------------- ------- ----------------- ---------------------- --------------------
teval - naive_average unknown 57.69 78.18 36.63
teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27
teval-instruct_v1 10482d json_metric unknown 94.32 97.08 0.15
teval-plan_str_v1 10482d f1_score unknown 66.24 84.12 45.72
teval-plan_json_v1 10482d f1_score unknown 63.62 77.71 19.95
teval-reason_str_v1 10482d thought unknown 54.14 63.58 44.92
teval-reason_retrieve_understand_json_v1 10482d thought unknown 33.77 54.72 21.49
teval-retrieve_str_v1 10482d name unknown 73.89 85.28 60.6
teval-reason_retrieve_understand_json_v1 10482d name unknown 31.15 68.97 15.34
teval-understand_str_v1 10482d args unknown 77.76 93.03 65.61
teval-reason_retrieve_understand_json_v1 10482d args unknown 44.16 72.23 26.84
teval-review_str_v1 10482d review_quality unknown 62.22 71.66 44.35
teval_zh - naive_average unknown 61.31 75.01 32.33
teval-instruct_v1_zh 10482d string_metric unknown 88.69 98.19 23.64
teval-instruct_v1_zh 10482d json_metric unknown 75.77 96.62 0.89
teval-plan_str_v1_zh 10482d f1_score unknown 62.43 70.69 47.82
teval-plan_json_v1_zh 10482d f1_score unknown 61.46 68.95 15.87
teval-reason_str_v1_zh 10482d thought unknown 59.43 68.14 46.96
teval-reason_retrieve_understand_json_v1_zh 10482d thought unknown 39.19 60.37 23.91
teval-retrieve_str_v1_zh 10482d name unknown 69.41 84.22 54.44
teval-reason_retrieve_understand_json_v1_zh 10482d name unknown 32.87 70.46 14.16
teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29
teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83
teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1
"""
|
examples/eval_with_model_dataset_combinations.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets as chat_ceval_datasets
    from opencompass.configs.datasets.ceval.ceval_ppl_578f8d import \
        ceval_datasets as base_ceval_datasets
    from opencompass.configs.internal.clusters.slurm import eval, infer
    from opencompass.configs.models.qwen.hf_qwen_7b import \
        models as hf_qwen_7b_base_models
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
        models as hf_qwen_7b_chat_models

# Alternative cluster runner setups, kept for reference:
# from opencompass.configs.internal.clusters.slurm import infer_split as infer, eval
# from opencompass.configs.internal.clusters.slurm import infer_size as infer, eval
# from opencompass.configs.internal.clusters.slurm import infer_size_split as infer, eval

# Keep this demo fast: one ppl subset for the base model, one gen subset for
# the chat model.
base_ceval_datasets = base_ceval_datasets[:1]
chat_ceval_datasets = chat_ceval_datasets[-1:]

# If you do not want to run all the combinations of models and datasets, you
# can specify the combinations you want to run here. This is useful when you
# deliberately want to skip some subset of the combinations.
# Models and datasets in different combinations are recommended to be disjoint
# (different `abbr` in model & dataset configs), as we haven't tested this case
# thoroughly.
model_dataset_combinations = [
    dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
    dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
    # dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
]

# The union of models and datasets in model_dataset_combinations should be
# stored in the `models` and `datasets` variables below. Otherwise, modules
# like summarizer will miss out some information.
models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
datasets = [*base_ceval_datasets, *chat_ceval_datasets]

work_dir = './outputs/default/mdcomb/'
"""
dataset version metric mode qwen-7b-hf qwen-7b-chat-hf
---------------------- --------- -------- ------ ------------ -----------------
ceval-computer_network 9b9417 accuracy ppl 52.63 -
ceval-physician 6e277d accuracy gen - 59.18
"""
|
tmp/38bf021a-c80f-4a23-9021-f2adc82afa5d_params.py
ADDED
|
@@ -0,0 +1,1424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets = [
|
| 2 |
+
[
|
| 3 |
+
dict(
|
| 4 |
+
abbr='LongBench_2wikimqa_3',
|
| 5 |
+
eval_cfg=dict(
|
| 6 |
+
evaluator=dict(
|
| 7 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 8 |
+
pred_role='BOT'),
|
| 9 |
+
infer_cfg=dict(
|
| 10 |
+
inferencer=dict(
|
| 11 |
+
max_out_len=32,
|
| 12 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
template=dict(round=[
|
| 15 |
+
dict(
|
| 16 |
+
prompt=
|
| 17 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 18 |
+
role='HUMAN'),
|
| 19 |
+
]),
|
| 20 |
+
type=
|
| 21 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 22 |
+
retriever=dict(
|
| 23 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 24 |
+
name='2wikimqa',
|
| 25 |
+
path='opencompass/Longbench',
|
| 26 |
+
reader_cfg=dict(
|
| 27 |
+
input_columns=[
|
| 28 |
+
'context',
|
| 29 |
+
'input',
|
| 30 |
+
],
|
| 31 |
+
output_column='answers',
|
| 32 |
+
test_range='[75:100]',
|
| 33 |
+
test_split='test',
|
| 34 |
+
train_split='test'),
|
| 35 |
+
type='opencompass.datasets.LongBench2wikimqaDataset'),
|
| 36 |
+
dict(
|
| 37 |
+
abbr='LongBench_hotpotqa_3',
|
| 38 |
+
eval_cfg=dict(
|
| 39 |
+
evaluator=dict(
|
| 40 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 41 |
+
pred_role='BOT'),
|
| 42 |
+
infer_cfg=dict(
|
| 43 |
+
inferencer=dict(
|
| 44 |
+
max_out_len=32,
|
| 45 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 46 |
+
prompt_template=dict(
|
| 47 |
+
template=dict(round=[
|
| 48 |
+
dict(
|
| 49 |
+
prompt=
|
| 50 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 51 |
+
role='HUMAN'),
|
| 52 |
+
]),
|
| 53 |
+
type=
|
| 54 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 55 |
+
retriever=dict(
|
| 56 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 57 |
+
name='hotpotqa',
|
| 58 |
+
path='opencompass/Longbench',
|
| 59 |
+
reader_cfg=dict(
|
| 60 |
+
input_columns=[
|
| 61 |
+
'context',
|
| 62 |
+
'input',
|
| 63 |
+
],
|
| 64 |
+
output_column='answers',
|
| 65 |
+
test_range='[75:100]',
|
| 66 |
+
test_split='test',
|
| 67 |
+
train_split='test'),
|
| 68 |
+
type='opencompass.datasets.LongBenchhotpotqaDataset'),
|
| 69 |
+
dict(
|
| 70 |
+
abbr='LongBench_musique_3',
|
| 71 |
+
eval_cfg=dict(
|
| 72 |
+
evaluator=dict(
|
| 73 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 74 |
+
pred_role='BOT'),
|
| 75 |
+
infer_cfg=dict(
|
| 76 |
+
inferencer=dict(
|
| 77 |
+
max_out_len=32,
|
| 78 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 79 |
+
prompt_template=dict(
|
| 80 |
+
template=dict(round=[
|
| 81 |
+
dict(
|
| 82 |
+
prompt=
|
| 83 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 84 |
+
role='HUMAN'),
|
| 85 |
+
]),
|
| 86 |
+
type=
|
| 87 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 88 |
+
retriever=dict(
|
| 89 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 90 |
+
name='musique',
|
| 91 |
+
path='opencompass/Longbench',
|
| 92 |
+
reader_cfg=dict(
|
| 93 |
+
input_columns=[
|
| 94 |
+
'context',
|
| 95 |
+
'input',
|
| 96 |
+
],
|
| 97 |
+
output_column='answers',
|
| 98 |
+
test_range='[75:100]',
|
| 99 |
+
test_split='test',
|
| 100 |
+
train_split='test'),
|
| 101 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 102 |
+
dict(
|
| 103 |
+
abbr='LongBench_multifieldqa_en_3',
|
| 104 |
+
eval_cfg=dict(
|
| 105 |
+
evaluator=dict(
|
| 106 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 107 |
+
pred_role='BOT'),
|
| 108 |
+
infer_cfg=dict(
|
| 109 |
+
inferencer=dict(
|
| 110 |
+
max_out_len=64,
|
| 111 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 112 |
+
prompt_template=dict(
|
| 113 |
+
template=dict(round=[
|
| 114 |
+
dict(
|
| 115 |
+
prompt=
|
| 116 |
+
'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 117 |
+
role='HUMAN'),
|
| 118 |
+
]),
|
| 119 |
+
type=
|
| 120 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 121 |
+
retriever=dict(
|
| 122 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 123 |
+
name='multifieldqa_en',
|
| 124 |
+
path='opencompass/Longbench',
|
| 125 |
+
reader_cfg=dict(
|
| 126 |
+
input_columns=[
|
| 127 |
+
'context',
|
| 128 |
+
'input',
|
| 129 |
+
],
|
| 130 |
+
output_column='answers',
|
| 131 |
+
test_range='[57:76]',
|
| 132 |
+
test_split='test',
|
| 133 |
+
train_split='test'),
|
| 134 |
+
type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
|
| 135 |
+
dict(
|
| 136 |
+
abbr='LongBench_multifieldqa_zh_3',
|
| 137 |
+
eval_cfg=dict(
|
| 138 |
+
evaluator=dict(
|
| 139 |
+
language='zh',
|
| 140 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 141 |
+
pred_role='BOT'),
|
| 142 |
+
infer_cfg=dict(
|
| 143 |
+
inferencer=dict(
|
| 144 |
+
max_out_len=64,
|
| 145 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 146 |
+
prompt_template=dict(
|
| 147 |
+
template=dict(round=[
|
| 148 |
+
dict(
|
| 149 |
+
prompt=
|
| 150 |
+
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
|
| 151 |
+
role='HUMAN'),
|
| 152 |
+
]),
|
| 153 |
+
type=
|
| 154 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 155 |
+
retriever=dict(
|
| 156 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 157 |
+
name='multifieldqa_zh',
|
| 158 |
+
path='opencompass/Longbench',
|
| 159 |
+
reader_cfg=dict(
|
| 160 |
+
input_columns=[
|
| 161 |
+
'context',
|
| 162 |
+
'input',
|
| 163 |
+
],
|
| 164 |
+
output_column='answers',
|
| 165 |
+
test_range='[75:100]',
|
| 166 |
+
test_split='test',
|
| 167 |
+
train_split='test'),
|
| 168 |
+
type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
|
| 169 |
+
dict(
|
| 170 |
+
abbr='LongBench_narrativeqa_3',
|
| 171 |
+
eval_cfg=dict(
|
| 172 |
+
evaluator=dict(
|
| 173 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 174 |
+
pred_role='BOT'),
|
| 175 |
+
infer_cfg=dict(
|
| 176 |
+
inferencer=dict(
|
| 177 |
+
max_out_len=128,
|
| 178 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 179 |
+
prompt_template=dict(
|
| 180 |
+
template=dict(round=[
|
| 181 |
+
dict(
|
| 182 |
+
prompt=
|
| 183 |
+
'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
|
| 184 |
+
role='HUMAN'),
|
| 185 |
+
]),
|
| 186 |
+
type=
|
| 187 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 188 |
+
retriever=dict(
|
| 189 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 190 |
+
name='narrativeqa',
|
| 191 |
+
path='opencompass/Longbench',
|
| 192 |
+
reader_cfg=dict(
|
| 193 |
+
input_columns=[
|
| 194 |
+
'context',
|
| 195 |
+
'input',
|
| 196 |
+
],
|
| 197 |
+
output_column='answers',
|
| 198 |
+
test_range='[75:100]',
|
| 199 |
+
test_split='test',
|
| 200 |
+
train_split='test'),
|
| 201 |
+
type='opencompass.datasets.LongBenchnarrativeqaDataset'),
|
| 202 |
+
dict(
|
| 203 |
+
abbr='LongBench_qasper_3',
|
| 204 |
+
eval_cfg=dict(
|
| 205 |
+
evaluator=dict(
|
| 206 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 207 |
+
pred_role='BOT'),
|
| 208 |
+
infer_cfg=dict(
|
| 209 |
+
inferencer=dict(
|
| 210 |
+
max_out_len=32,
|
| 211 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 212 |
+
prompt_template=dict(
|
| 213 |
+
template=dict(round=[
|
| 214 |
+
dict(
|
| 215 |
+
prompt=
|
| 216 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 217 |
+
role='HUMAN'),
|
| 218 |
+
]),
|
| 219 |
+
type=
|
| 220 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 221 |
+
retriever=dict(
|
| 222 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 223 |
+
name='qasper',
|
| 224 |
+
path='opencompass/Longbench',
|
| 225 |
+
reader_cfg=dict(
|
| 226 |
+
input_columns=[
|
| 227 |
+
'context',
|
| 228 |
+
'input',
|
| 229 |
+
],
|
| 230 |
+
output_column='answers',
|
| 231 |
+
test_range='[75:100]',
|
| 232 |
+
test_split='test',
|
| 233 |
+
train_split='test'),
|
| 234 |
+
type='opencompass.datasets.LongBenchqasperDataset'),
|
| 235 |
+
dict(
|
| 236 |
+
abbr='LongBench_triviaqa_3',
|
| 237 |
+
eval_cfg=dict(
|
| 238 |
+
evaluator=dict(
|
| 239 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 240 |
+
pred_postprocessor=dict(
|
| 241 |
+
type='opencompass.datasets.triviaqa_postprocess'),
|
| 242 |
+
pred_role='BOT'),
|
| 243 |
+
infer_cfg=dict(
|
| 244 |
+
inferencer=dict(
|
| 245 |
+
max_out_len=32,
|
| 246 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 247 |
+
prompt_template=dict(
|
| 248 |
+
template=dict(round=[
|
| 249 |
+
dict(
|
| 250 |
+
prompt=
|
| 251 |
+
'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 252 |
+
role='HUMAN'),
|
| 253 |
+
]),
|
| 254 |
+
type=
|
| 255 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 256 |
+
retriever=dict(
|
| 257 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 258 |
+
name='triviaqa',
|
| 259 |
+
path='opencompass/Longbench',
|
| 260 |
+
reader_cfg=dict(
|
| 261 |
+
input_columns=[
|
| 262 |
+
'context',
|
| 263 |
+
'input',
|
| 264 |
+
],
|
| 265 |
+
output_column='answers',
|
| 266 |
+
test_range='[75:100]',
|
| 267 |
+
test_split='test',
|
| 268 |
+
train_split='test'),
|
| 269 |
+
type='opencompass.datasets.LongBenchtriviaqaDataset'),
|
| 270 |
+
dict(
|
| 271 |
+
abbr='LongBench_gov_report_3',
|
| 272 |
+
eval_cfg=dict(
|
| 273 |
+
evaluator=dict(
|
| 274 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 275 |
+
pred_role='BOT'),
|
| 276 |
+
infer_cfg=dict(
|
| 277 |
+
inferencer=dict(
|
| 278 |
+
max_out_len=512,
|
| 279 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 280 |
+
prompt_template=dict(
|
| 281 |
+
template=dict(round=[
|
| 282 |
+
dict(
|
| 283 |
+
prompt=
|
| 284 |
+
'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
|
| 285 |
+
role='HUMAN'),
|
| 286 |
+
]),
|
| 287 |
+
type=
|
| 288 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 289 |
+
retriever=dict(
|
| 290 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 291 |
+
name='gov_report',
|
| 292 |
+
path='opencompass/Longbench',
|
| 293 |
+
reader_cfg=dict(
|
| 294 |
+
input_columns=[
|
| 295 |
+
'context',
|
| 296 |
+
],
|
| 297 |
+
output_column='answers',
|
| 298 |
+
test_range='[75:100]',
|
| 299 |
+
test_split='test',
|
| 300 |
+
train_split='test'),
|
| 301 |
+
type='opencompass.datasets.LongBenchgov_reportDataset'),
|
| 302 |
+
dict(
|
| 303 |
+
abbr='LongBench_qmsum_3',
|
| 304 |
+
eval_cfg=dict(
|
| 305 |
+
evaluator=dict(
|
| 306 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 307 |
+
pred_role='BOT'),
|
| 308 |
+
infer_cfg=dict(
|
| 309 |
+
inferencer=dict(
|
| 310 |
+
max_out_len=512,
|
| 311 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 312 |
+
prompt_template=dict(
|
| 313 |
+
template=dict(round=[
|
| 314 |
+
dict(
|
| 315 |
+
prompt=
|
| 316 |
+
'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 317 |
+
role='HUMAN'),
|
| 318 |
+
]),
|
| 319 |
+
type=
|
| 320 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 321 |
+
retriever=dict(
|
| 322 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 323 |
+
name='qmsum',
|
| 324 |
+
path='opencompass/Longbench',
|
| 325 |
+
reader_cfg=dict(
|
| 326 |
+
input_columns=[
|
| 327 |
+
'context',
|
| 328 |
+
'input',
|
| 329 |
+
],
|
| 330 |
+
output_column='answers',
|
| 331 |
+
test_range='[75:100]',
|
| 332 |
+
test_split='test',
|
| 333 |
+
train_split='test'),
|
| 334 |
+
type='opencompass.datasets.LongBenchqmsumDataset'),
|
| 335 |
+
dict(
|
| 336 |
+
abbr='LongBench_vcsum_3',
|
| 337 |
+
eval_cfg=dict(
|
| 338 |
+
evaluator=dict(
|
| 339 |
+
language='zh',
|
| 340 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 341 |
+
pred_role='BOT'),
|
| 342 |
+
infer_cfg=dict(
|
| 343 |
+
inferencer=dict(
|
| 344 |
+
max_out_len=512,
|
| 345 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 346 |
+
prompt_template=dict(
|
| 347 |
+
template=dict(round=[
|
| 348 |
+
dict(
|
| 349 |
+
prompt=
|
| 350 |
+
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 351 |
+
role='HUMAN'),
|
| 352 |
+
]),
|
| 353 |
+
type=
|
| 354 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 355 |
+
retriever=dict(
|
| 356 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 357 |
+
name='vcsum',
|
| 358 |
+
path='opencompass/Longbench',
|
| 359 |
+
reader_cfg=dict(
|
| 360 |
+
input_columns=[
|
| 361 |
+
'context',
|
| 362 |
+
],
|
| 363 |
+
output_column='answers',
|
| 364 |
+
test_range='[75:100]',
|
| 365 |
+
test_split='test',
|
| 366 |
+
train_split='test'),
|
| 367 |
+
type='opencompass.datasets.LongBenchvcsumDataset'),
|
| 368 |
+
dict(
|
| 369 |
+
abbr='LongBench_dureader_3',
|
| 370 |
+
eval_cfg=dict(
|
| 371 |
+
evaluator=dict(
|
| 372 |
+
language='zh',
|
| 373 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 374 |
+
pred_role='BOT'),
|
| 375 |
+
infer_cfg=dict(
|
| 376 |
+
inferencer=dict(
|
| 377 |
+
max_out_len=128,
|
| 378 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 379 |
+
prompt_template=dict(
|
| 380 |
+
template=dict(round=[
|
| 381 |
+
dict(
|
| 382 |
+
prompt=
|
| 383 |
+
'请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
|
| 384 |
+
role='HUMAN'),
|
| 385 |
+
]),
|
| 386 |
+
type=
|
| 387 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 388 |
+
retriever=dict(
|
| 389 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 390 |
+
name='dureader',
|
| 391 |
+
path='opencompass/Longbench',
|
| 392 |
+
reader_cfg=dict(
|
| 393 |
+
input_columns=[
|
| 394 |
+
'context',
|
| 395 |
+
'input',
|
| 396 |
+
],
|
| 397 |
+
output_column='answers',
|
| 398 |
+
test_range='[75:100]',
|
| 399 |
+
test_split='test',
|
| 400 |
+
train_split='test'),
|
| 401 |
+
type='opencompass.datasets.LongBenchdureaderDataset'),
|
| 402 |
+
dict(
|
| 403 |
+
abbr='LongBench_lcc_3',
|
| 404 |
+
eval_cfg=dict(
|
| 405 |
+
evaluator=dict(
|
| 406 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 407 |
+
pred_role='BOT'),
|
| 408 |
+
infer_cfg=dict(
|
| 409 |
+
inferencer=dict(
|
| 410 |
+
max_out_len=64,
|
| 411 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 412 |
+
prompt_template=dict(
|
| 413 |
+
template=dict(round=[
|
| 414 |
+
dict(
|
| 415 |
+
prompt=
|
| 416 |
+
'Please complete the code given below. \n{context}Next line of code:\n',
|
| 417 |
+
role='HUMAN'),
|
| 418 |
+
]),
|
| 419 |
+
type=
|
| 420 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 421 |
+
retriever=dict(
|
| 422 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 423 |
+
name='lcc',
|
| 424 |
+
path='opencompass/Longbench',
|
| 425 |
+
reader_cfg=dict(
|
| 426 |
+
input_columns=[
|
| 427 |
+
'context',
|
| 428 |
+
],
|
| 429 |
+
output_column='answers',
|
| 430 |
+
test_range='[189:252]',
|
| 431 |
+
test_split='test',
|
| 432 |
+
train_split='test'),
|
| 433 |
+
type='opencompass.datasets.LongBenchlccDataset'),
|
| 434 |
+
dict(
|
| 435 |
+
abbr='LongBench_repobench-p_3',
|
| 436 |
+
eval_cfg=dict(
|
| 437 |
+
evaluator=dict(
|
| 438 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 439 |
+
pred_role='BOT'),
|
| 440 |
+
infer_cfg=dict(
|
| 441 |
+
inferencer=dict(
|
| 442 |
+
max_out_len=64,
|
| 443 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 444 |
+
prompt_template=dict(
|
| 445 |
+
template=dict(round=[
|
| 446 |
+
dict(
|
| 447 |
+
prompt=
|
| 448 |
+
'Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 449 |
+
role='HUMAN'),
|
| 450 |
+
]),
|
| 451 |
+
type=
|
| 452 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 453 |
+
retriever=dict(
|
| 454 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 455 |
+
name='repobench-p',
|
| 456 |
+
path='opencompass/Longbench',
|
| 457 |
+
reader_cfg=dict(
|
| 458 |
+
input_columns=[
|
| 459 |
+
'context',
|
| 460 |
+
'input',
|
| 461 |
+
],
|
| 462 |
+
output_column='answers',
|
| 463 |
+
test_range='[189:252]',
|
| 464 |
+
test_split='test',
|
| 465 |
+
train_split='test'),
|
| 466 |
+
type='opencompass.datasets.LongBenchrepobenchDataset'),
|
| 467 |
+
dict(
|
| 468 |
+
abbr='LongBench_passage_retrieval_en_3',
|
| 469 |
+
eval_cfg=dict(
|
| 470 |
+
evaluator=dict(
|
| 471 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 472 |
+
pred_role='BOT'),
|
| 473 |
+
infer_cfg=dict(
|
| 474 |
+
inferencer=dict(
|
| 475 |
+
max_out_len=32,
|
| 476 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 477 |
+
prompt_template=dict(
|
| 478 |
+
template=dict(round=[
|
| 479 |
+
dict(
|
| 480 |
+
prompt=
|
| 481 |
+
'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 482 |
+
role='HUMAN'),
|
| 483 |
+
]),
|
| 484 |
+
type=
|
| 485 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 486 |
+
retriever=dict(
|
| 487 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 488 |
+
name='passage_retrieval_en',
|
| 489 |
+
path='opencompass/Longbench',
|
| 490 |
+
reader_cfg=dict(
|
| 491 |
+
input_columns=[
|
| 492 |
+
'context',
|
| 493 |
+
'input',
|
| 494 |
+
],
|
| 495 |
+
output_column='answers',
|
| 496 |
+
test_range='[75:100]',
|
| 497 |
+
test_split='test',
|
| 498 |
+
train_split='test'),
|
| 499 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
|
| 500 |
+
dict(
|
| 501 |
+
abbr='LongBench_passage_retrieval_zh_3',
|
| 502 |
+
eval_cfg=dict(
|
| 503 |
+
evaluator=dict(
|
| 504 |
+
language='zh',
|
| 505 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 506 |
+
pred_role='BOT'),
|
| 507 |
+
infer_cfg=dict(
|
| 508 |
+
inferencer=dict(
|
| 509 |
+
max_out_len=32,
|
| 510 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 511 |
+
prompt_template=dict(
|
| 512 |
+
template=dict(round=[
|
| 513 |
+
dict(
|
| 514 |
+
prompt=
|
| 515 |
+
'以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 516 |
+
role='HUMAN'),
|
| 517 |
+
]),
|
| 518 |
+
type=
|
| 519 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 520 |
+
retriever=dict(
|
| 521 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 522 |
+
name='passage_retrieval_zh',
|
| 523 |
+
path='opencompass/Longbench',
|
| 524 |
+
reader_cfg=dict(
|
| 525 |
+
input_columns=[
|
| 526 |
+
'context',
|
| 527 |
+
'input',
|
| 528 |
+
],
|
| 529 |
+
output_column='answers',
|
| 530 |
+
test_range='[75:100]',
|
| 531 |
+
test_split='test',
|
| 532 |
+
train_split='test'),
|
| 533 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
|
| 534 |
+
dict(
|
| 535 |
+
abbr='LongBench_passage_count_3',
|
| 536 |
+
eval_cfg=dict(
|
| 537 |
+
evaluator=dict(
|
| 538 |
+
type='opencompass.datasets.LongBenchCountEvaluator'),
|
| 539 |
+
pred_role='BOT'),
|
| 540 |
+
infer_cfg=dict(
|
| 541 |
+
inferencer=dict(
|
| 542 |
+
max_out_len=32,
|
| 543 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 544 |
+
prompt_template=dict(
|
| 545 |
+
template=dict(round=[
|
| 546 |
+
dict(
|
| 547 |
+
prompt=
|
| 548 |
+
'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
|
| 549 |
+
role='HUMAN'),
|
| 550 |
+
]),
|
| 551 |
+
type=
|
| 552 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 553 |
+
retriever=dict(
|
| 554 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 555 |
+
name='passage_count',
|
| 556 |
+
path='opencompass/Longbench',
|
| 557 |
+
reader_cfg=dict(
|
| 558 |
+
input_columns=[
|
| 559 |
+
'context',
|
| 560 |
+
'input',
|
| 561 |
+
],
|
| 562 |
+
output_column='answers',
|
| 563 |
+
test_range='[75:100]',
|
| 564 |
+
test_split='test',
|
| 565 |
+
train_split='test'),
|
| 566 |
+
type='opencompass.datasets.LongBenchpassage_countDataset'),
|
| 567 |
+
dict(
|
| 568 |
+
abbr='LongBench_trec_3',
|
| 569 |
+
eval_cfg=dict(
|
| 570 |
+
evaluator=dict(
|
| 571 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 572 |
+
),
|
| 573 |
+
pred_postprocessor=dict(
|
| 574 |
+
type='opencompass.datasets.trec_postprocess'),
|
| 575 |
+
pred_role='BOT'),
|
| 576 |
+
infer_cfg=dict(
|
| 577 |
+
inferencer=dict(
|
| 578 |
+
max_out_len=64,
|
| 579 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 580 |
+
prompt_template=dict(
|
| 581 |
+
template=dict(round=[
|
| 582 |
+
dict(
|
| 583 |
+
prompt=
|
| 584 |
+
'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 585 |
+
role='HUMAN'),
|
| 586 |
+
]),
|
| 587 |
+
type=
|
| 588 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 589 |
+
retriever=dict(
|
| 590 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 591 |
+
name='trec',
|
| 592 |
+
path='opencompass/Longbench',
|
| 593 |
+
reader_cfg=dict(
|
| 594 |
+
input_columns=[
|
| 595 |
+
'context',
|
| 596 |
+
'input',
|
| 597 |
+
],
|
| 598 |
+
output_column='all_labels',
|
| 599 |
+
test_range='[75:100]',
|
| 600 |
+
test_split='test',
|
| 601 |
+
train_split='test'),
|
| 602 |
+
type='opencompass.datasets.LongBenchtrecDataset'),
|
| 603 |
+
dict(
|
| 604 |
+
abbr='LongBench_lsht_3',
|
| 605 |
+
eval_cfg=dict(
|
| 606 |
+
evaluator=dict(
|
| 607 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 608 |
+
),
|
| 609 |
+
pred_postprocessor=dict(
|
| 610 |
+
type='opencompass.datasets.lsht_postprocess'),
|
| 611 |
+
pred_role='BOT'),
|
| 612 |
+
infer_cfg=dict(
|
| 613 |
+
inferencer=dict(
|
| 614 |
+
max_out_len=64,
|
| 615 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 616 |
+
prompt_template=dict(
|
| 617 |
+
template=dict(round=[
|
| 618 |
+
dict(
|
| 619 |
+
prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
|
| 620 |
+
role='HUMAN'),
|
| 621 |
+
]),
|
| 622 |
+
type=
|
| 623 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 624 |
+
retriever=dict(
|
| 625 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 626 |
+
name='lsht',
|
| 627 |
+
path='opencompass/Longbench',
|
| 628 |
+
reader_cfg=dict(
|
| 629 |
+
input_columns=[
|
| 630 |
+
'context',
|
| 631 |
+
'input',
|
| 632 |
+
],
|
| 633 |
+
output_column='all_labels',
|
| 634 |
+
test_range='[75:100]',
|
| 635 |
+
test_split='test',
|
| 636 |
+
train_split='test'),
|
| 637 |
+
type='opencompass.datasets.LongBenchlshtDataset'),
|
| 638 |
+
dict(
|
| 639 |
+
abbr='LongBench_multi_news_3',
|
| 640 |
+
eval_cfg=dict(
|
| 641 |
+
evaluator=dict(
|
| 642 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 643 |
+
pred_role='BOT'),
|
| 644 |
+
infer_cfg=dict(
|
| 645 |
+
inferencer=dict(
|
| 646 |
+
max_out_len=512,
|
| 647 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 648 |
+
prompt_template=dict(
|
| 649 |
+
template=dict(round=[
|
| 650 |
+
dict(
|
| 651 |
+
prompt=
|
| 652 |
+
'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
|
| 653 |
+
role='HUMAN'),
|
| 654 |
+
]),
|
| 655 |
+
type=
|
| 656 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 657 |
+
retriever=dict(
|
| 658 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 659 |
+
name='multi_news',
|
| 660 |
+
path='opencompass/Longbench',
|
| 661 |
+
reader_cfg=dict(
|
| 662 |
+
input_columns=[
|
| 663 |
+
'context',
|
| 664 |
+
],
|
| 665 |
+
output_column='answers',
|
| 666 |
+
test_range='[75:100]',
|
| 667 |
+
test_split='test',
|
| 668 |
+
train_split='test'),
|
| 669 |
+
type='opencompass.datasets.LongBenchmulti_newsDataset'),
|
| 670 |
+
dict(
|
| 671 |
+
abbr='LongBench_samsum_3',
|
| 672 |
+
eval_cfg=dict(
|
| 673 |
+
evaluator=dict(
|
| 674 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 675 |
+
pred_postprocessor=dict(
|
| 676 |
+
type='opencompass.datasets.samsum_postprocess'),
|
| 677 |
+
pred_role='BOT'),
|
| 678 |
+
infer_cfg=dict(
|
| 679 |
+
inferencer=dict(
|
| 680 |
+
max_out_len=128,
|
| 681 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 682 |
+
prompt_template=dict(
|
| 683 |
+
template=dict(round=[
|
| 684 |
+
dict(
|
| 685 |
+
prompt=
|
| 686 |
+
'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 687 |
+
role='HUMAN'),
|
| 688 |
+
]),
|
| 689 |
+
type=
|
| 690 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 691 |
+
retriever=dict(
|
| 692 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 693 |
+
name='samsum',
|
| 694 |
+
path='opencompass/Longbench',
|
| 695 |
+
reader_cfg=dict(
|
| 696 |
+
input_columns=[
|
| 697 |
+
'context',
|
| 698 |
+
'input',
|
| 699 |
+
],
|
| 700 |
+
output_column='answers',
|
| 701 |
+
test_range='[75:100]',
|
| 702 |
+
test_split='test',
|
| 703 |
+
train_split='test'),
|
| 704 |
+
type='opencompass.datasets.LongBenchsamsumDataset'),
|
| 705 |
+
dict(
|
| 706 |
+
abbr='LongBench_2wikimqa_3',
|
| 707 |
+
eval_cfg=dict(
|
| 708 |
+
evaluator=dict(
|
| 709 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 710 |
+
pred_role='BOT'),
|
| 711 |
+
infer_cfg=dict(
|
| 712 |
+
inferencer=dict(
|
| 713 |
+
max_out_len=32,
|
| 714 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 715 |
+
prompt_template=dict(
|
| 716 |
+
template=dict(round=[
|
| 717 |
+
dict(
|
| 718 |
+
prompt=
|
| 719 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 720 |
+
role='HUMAN'),
|
| 721 |
+
]),
|
| 722 |
+
type=
|
| 723 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 724 |
+
retriever=dict(
|
| 725 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 726 |
+
name='2wikimqa',
|
| 727 |
+
path='opencompass/Longbench',
|
| 728 |
+
reader_cfg=dict(
|
| 729 |
+
input_columns=[
|
| 730 |
+
'context',
|
| 731 |
+
'input',
|
| 732 |
+
],
|
| 733 |
+
output_column='answers',
|
| 734 |
+
test_range='[75:100]',
|
| 735 |
+
test_split='test',
|
| 736 |
+
train_split='test'),
|
| 737 |
+
type='opencompass.datasets.LongBench2wikimqaDataset'),
|
| 738 |
+
dict(
|
| 739 |
+
abbr='LongBench_hotpotqa_3',
|
| 740 |
+
eval_cfg=dict(
|
| 741 |
+
evaluator=dict(
|
| 742 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 743 |
+
pred_role='BOT'),
|
| 744 |
+
infer_cfg=dict(
|
| 745 |
+
inferencer=dict(
|
| 746 |
+
max_out_len=32,
|
| 747 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 748 |
+
prompt_template=dict(
|
| 749 |
+
template=dict(round=[
|
| 750 |
+
dict(
|
| 751 |
+
prompt=
|
| 752 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 753 |
+
role='HUMAN'),
|
| 754 |
+
]),
|
| 755 |
+
type=
|
| 756 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 757 |
+
retriever=dict(
|
| 758 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 759 |
+
name='hotpotqa',
|
| 760 |
+
path='opencompass/Longbench',
|
| 761 |
+
reader_cfg=dict(
|
| 762 |
+
input_columns=[
|
| 763 |
+
'context',
|
| 764 |
+
'input',
|
| 765 |
+
],
|
| 766 |
+
output_column='answers',
|
| 767 |
+
test_range='[75:100]',
|
| 768 |
+
test_split='test',
|
| 769 |
+
train_split='test'),
|
| 770 |
+
type='opencompass.datasets.LongBenchhotpotqaDataset'),
|
| 771 |
+
dict(
|
| 772 |
+
abbr='LongBench_musique_3',
|
| 773 |
+
eval_cfg=dict(
|
| 774 |
+
evaluator=dict(
|
| 775 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 776 |
+
pred_role='BOT'),
|
| 777 |
+
infer_cfg=dict(
|
| 778 |
+
inferencer=dict(
|
| 779 |
+
max_out_len=32,
|
| 780 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 781 |
+
prompt_template=dict(
|
| 782 |
+
template=dict(round=[
|
| 783 |
+
dict(
|
| 784 |
+
prompt=
|
| 785 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 786 |
+
role='HUMAN'),
|
| 787 |
+
]),
|
| 788 |
+
type=
|
| 789 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 790 |
+
retriever=dict(
|
| 791 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 792 |
+
name='musique',
|
| 793 |
+
path='opencompass/Longbench',
|
| 794 |
+
reader_cfg=dict(
|
| 795 |
+
input_columns=[
|
| 796 |
+
'context',
|
| 797 |
+
'input',
|
| 798 |
+
],
|
| 799 |
+
output_column='answers',
|
| 800 |
+
test_range='[75:100]',
|
| 801 |
+
test_split='test',
|
| 802 |
+
train_split='test'),
|
| 803 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 804 |
+
dict(
|
| 805 |
+
abbr='LongBench_multifieldqa_en_3',
|
| 806 |
+
eval_cfg=dict(
|
| 807 |
+
evaluator=dict(
|
| 808 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 809 |
+
pred_role='BOT'),
|
| 810 |
+
infer_cfg=dict(
|
| 811 |
+
inferencer=dict(
|
| 812 |
+
max_out_len=64,
|
| 813 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 814 |
+
prompt_template=dict(
|
| 815 |
+
template=dict(round=[
|
| 816 |
+
dict(
|
| 817 |
+
prompt=
|
| 818 |
+
'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 819 |
+
role='HUMAN'),
|
| 820 |
+
]),
|
| 821 |
+
type=
|
| 822 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 823 |
+
retriever=dict(
|
| 824 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 825 |
+
name='multifieldqa_en',
|
| 826 |
+
path='opencompass/Longbench',
|
| 827 |
+
reader_cfg=dict(
|
| 828 |
+
input_columns=[
|
| 829 |
+
'context',
|
| 830 |
+
'input',
|
| 831 |
+
],
|
| 832 |
+
output_column='answers',
|
| 833 |
+
test_range='[57:76]',
|
| 834 |
+
test_split='test',
|
| 835 |
+
train_split='test'),
|
| 836 |
+
type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
|
| 837 |
+
dict(
|
| 838 |
+
abbr='LongBench_multifieldqa_zh_3',
|
| 839 |
+
eval_cfg=dict(
|
| 840 |
+
evaluator=dict(
|
| 841 |
+
language='zh',
|
| 842 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 843 |
+
pred_role='BOT'),
|
| 844 |
+
infer_cfg=dict(
|
| 845 |
+
inferencer=dict(
|
| 846 |
+
max_out_len=64,
|
| 847 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 848 |
+
prompt_template=dict(
|
| 849 |
+
template=dict(round=[
|
| 850 |
+
dict(
|
| 851 |
+
prompt=
|
| 852 |
+
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
|
| 853 |
+
role='HUMAN'),
|
| 854 |
+
]),
|
| 855 |
+
type=
|
| 856 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 857 |
+
retriever=dict(
|
| 858 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 859 |
+
name='multifieldqa_zh',
|
| 860 |
+
path='opencompass/Longbench',
|
| 861 |
+
reader_cfg=dict(
|
| 862 |
+
input_columns=[
|
| 863 |
+
'context',
|
| 864 |
+
'input',
|
| 865 |
+
],
|
| 866 |
+
output_column='answers',
|
| 867 |
+
test_range='[75:100]',
|
| 868 |
+
test_split='test',
|
| 869 |
+
train_split='test'),
|
| 870 |
+
type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
|
| 871 |
+
dict(
|
| 872 |
+
abbr='LongBench_narrativeqa_3',
|
| 873 |
+
eval_cfg=dict(
|
| 874 |
+
evaluator=dict(
|
| 875 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 876 |
+
pred_role='BOT'),
|
| 877 |
+
infer_cfg=dict(
|
| 878 |
+
inferencer=dict(
|
| 879 |
+
max_out_len=128,
|
| 880 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 881 |
+
prompt_template=dict(
|
| 882 |
+
template=dict(round=[
|
| 883 |
+
dict(
|
| 884 |
+
prompt=
|
| 885 |
+
'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
|
| 886 |
+
role='HUMAN'),
|
| 887 |
+
]),
|
| 888 |
+
type=
|
| 889 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 890 |
+
retriever=dict(
|
| 891 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 892 |
+
name='narrativeqa',
|
| 893 |
+
path='opencompass/Longbench',
|
| 894 |
+
reader_cfg=dict(
|
| 895 |
+
input_columns=[
|
| 896 |
+
'context',
|
| 897 |
+
'input',
|
| 898 |
+
],
|
| 899 |
+
output_column='answers',
|
| 900 |
+
test_range='[75:100]',
|
| 901 |
+
test_split='test',
|
| 902 |
+
train_split='test'),
|
| 903 |
+
type='opencompass.datasets.LongBenchnarrativeqaDataset'),
|
| 904 |
+
dict(
|
| 905 |
+
abbr='LongBench_qasper_3',
|
| 906 |
+
eval_cfg=dict(
|
| 907 |
+
evaluator=dict(
|
| 908 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 909 |
+
pred_role='BOT'),
|
| 910 |
+
infer_cfg=dict(
|
| 911 |
+
inferencer=dict(
|
| 912 |
+
max_out_len=32,
|
| 913 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 914 |
+
prompt_template=dict(
|
| 915 |
+
template=dict(round=[
|
| 916 |
+
dict(
|
| 917 |
+
prompt=
|
| 918 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 919 |
+
role='HUMAN'),
|
| 920 |
+
]),
|
| 921 |
+
type=
|
| 922 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 923 |
+
retriever=dict(
|
| 924 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 925 |
+
name='qasper',
|
| 926 |
+
path='opencompass/Longbench',
|
| 927 |
+
reader_cfg=dict(
|
| 928 |
+
input_columns=[
|
| 929 |
+
'context',
|
| 930 |
+
'input',
|
| 931 |
+
],
|
| 932 |
+
output_column='answers',
|
| 933 |
+
test_range='[75:100]',
|
| 934 |
+
test_split='test',
|
| 935 |
+
train_split='test'),
|
| 936 |
+
type='opencompass.datasets.LongBenchqasperDataset'),
|
| 937 |
+
dict(
|
| 938 |
+
abbr='LongBench_triviaqa_3',
|
| 939 |
+
eval_cfg=dict(
|
| 940 |
+
evaluator=dict(
|
| 941 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 942 |
+
pred_postprocessor=dict(
|
| 943 |
+
type='opencompass.datasets.triviaqa_postprocess'),
|
| 944 |
+
pred_role='BOT'),
|
| 945 |
+
infer_cfg=dict(
|
| 946 |
+
inferencer=dict(
|
| 947 |
+
max_out_len=32,
|
| 948 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 949 |
+
prompt_template=dict(
|
| 950 |
+
template=dict(round=[
|
| 951 |
+
dict(
|
| 952 |
+
prompt=
|
| 953 |
+
'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 954 |
+
role='HUMAN'),
|
| 955 |
+
]),
|
| 956 |
+
type=
|
| 957 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 958 |
+
retriever=dict(
|
| 959 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 960 |
+
name='triviaqa',
|
| 961 |
+
path='opencompass/Longbench',
|
| 962 |
+
reader_cfg=dict(
|
| 963 |
+
input_columns=[
|
| 964 |
+
'context',
|
| 965 |
+
'input',
|
| 966 |
+
],
|
| 967 |
+
output_column='answers',
|
| 968 |
+
test_range='[75:100]',
|
| 969 |
+
test_split='test',
|
| 970 |
+
train_split='test'),
|
| 971 |
+
type='opencompass.datasets.LongBenchtriviaqaDataset'),
|
| 972 |
+
dict(
|
| 973 |
+
abbr='LongBench_gov_report_3',
|
| 974 |
+
eval_cfg=dict(
|
| 975 |
+
evaluator=dict(
|
| 976 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 977 |
+
pred_role='BOT'),
|
| 978 |
+
infer_cfg=dict(
|
| 979 |
+
inferencer=dict(
|
| 980 |
+
max_out_len=512,
|
| 981 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 982 |
+
prompt_template=dict(
|
| 983 |
+
template=dict(round=[
|
| 984 |
+
dict(
|
| 985 |
+
prompt=
|
| 986 |
+
'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
|
| 987 |
+
role='HUMAN'),
|
| 988 |
+
]),
|
| 989 |
+
type=
|
| 990 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 991 |
+
retriever=dict(
|
| 992 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 993 |
+
name='gov_report',
|
| 994 |
+
path='opencompass/Longbench',
|
| 995 |
+
reader_cfg=dict(
|
| 996 |
+
input_columns=[
|
| 997 |
+
'context',
|
| 998 |
+
],
|
| 999 |
+
output_column='answers',
|
| 1000 |
+
test_range='[75:100]',
|
| 1001 |
+
test_split='test',
|
| 1002 |
+
train_split='test'),
|
| 1003 |
+
type='opencompass.datasets.LongBenchgov_reportDataset'),
|
| 1004 |
+
dict(
|
| 1005 |
+
abbr='LongBench_qmsum_3',
|
| 1006 |
+
eval_cfg=dict(
|
| 1007 |
+
evaluator=dict(
|
| 1008 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1009 |
+
pred_role='BOT'),
|
| 1010 |
+
infer_cfg=dict(
|
| 1011 |
+
inferencer=dict(
|
| 1012 |
+
max_out_len=512,
|
| 1013 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1014 |
+
prompt_template=dict(
|
| 1015 |
+
template=dict(round=[
|
| 1016 |
+
dict(
|
| 1017 |
+
prompt=
|
| 1018 |
+
'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 1019 |
+
role='HUMAN'),
|
| 1020 |
+
]),
|
| 1021 |
+
type=
|
| 1022 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1023 |
+
retriever=dict(
|
| 1024 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1025 |
+
name='qmsum',
|
| 1026 |
+
path='opencompass/Longbench',
|
| 1027 |
+
reader_cfg=dict(
|
| 1028 |
+
input_columns=[
|
| 1029 |
+
'context',
|
| 1030 |
+
'input',
|
| 1031 |
+
],
|
| 1032 |
+
output_column='answers',
|
| 1033 |
+
test_range='[75:100]',
|
| 1034 |
+
test_split='test',
|
| 1035 |
+
train_split='test'),
|
| 1036 |
+
type='opencompass.datasets.LongBenchqmsumDataset'),
|
| 1037 |
+
dict(
|
| 1038 |
+
abbr='LongBench_vcsum_3',
|
| 1039 |
+
eval_cfg=dict(
|
| 1040 |
+
evaluator=dict(
|
| 1041 |
+
language='zh',
|
| 1042 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1043 |
+
pred_role='BOT'),
|
| 1044 |
+
infer_cfg=dict(
|
| 1045 |
+
inferencer=dict(
|
| 1046 |
+
max_out_len=512,
|
| 1047 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1048 |
+
prompt_template=dict(
|
| 1049 |
+
template=dict(round=[
|
| 1050 |
+
dict(
|
| 1051 |
+
prompt=
|
| 1052 |
+
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 1053 |
+
role='HUMAN'),
|
| 1054 |
+
]),
|
| 1055 |
+
type=
|
| 1056 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1057 |
+
retriever=dict(
|
| 1058 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1059 |
+
name='vcsum',
|
| 1060 |
+
path='opencompass/Longbench',
|
| 1061 |
+
reader_cfg=dict(
|
| 1062 |
+
input_columns=[
|
| 1063 |
+
'context',
|
| 1064 |
+
],
|
| 1065 |
+
output_column='answers',
|
| 1066 |
+
test_range='[75:100]',
|
| 1067 |
+
test_split='test',
|
| 1068 |
+
train_split='test'),
|
| 1069 |
+
type='opencompass.datasets.LongBenchvcsumDataset'),
|
| 1070 |
+
dict(
|
| 1071 |
+
abbr='LongBench_dureader_3',
|
| 1072 |
+
eval_cfg=dict(
|
| 1073 |
+
evaluator=dict(
|
| 1074 |
+
language='zh',
|
| 1075 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1076 |
+
pred_role='BOT'),
|
| 1077 |
+
infer_cfg=dict(
|
| 1078 |
+
inferencer=dict(
|
| 1079 |
+
max_out_len=128,
|
| 1080 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1081 |
+
prompt_template=dict(
|
| 1082 |
+
template=dict(round=[
|
| 1083 |
+
dict(
|
| 1084 |
+
prompt=
|
| 1085 |
+
'请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
|
| 1086 |
+
role='HUMAN'),
|
| 1087 |
+
]),
|
| 1088 |
+
type=
|
| 1089 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1090 |
+
retriever=dict(
|
| 1091 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1092 |
+
name='dureader',
|
| 1093 |
+
path='opencompass/Longbench',
|
| 1094 |
+
reader_cfg=dict(
|
| 1095 |
+
input_columns=[
|
| 1096 |
+
'context',
|
| 1097 |
+
'input',
|
| 1098 |
+
],
|
| 1099 |
+
output_column='answers',
|
| 1100 |
+
test_range='[75:100]',
|
| 1101 |
+
test_split='test',
|
| 1102 |
+
train_split='test'),
|
| 1103 |
+
type='opencompass.datasets.LongBenchdureaderDataset'),
|
| 1104 |
+
dict(
|
| 1105 |
+
abbr='LongBench_lcc_3',
|
| 1106 |
+
eval_cfg=dict(
|
| 1107 |
+
evaluator=dict(
|
| 1108 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 1109 |
+
pred_role='BOT'),
|
| 1110 |
+
infer_cfg=dict(
|
| 1111 |
+
inferencer=dict(
|
| 1112 |
+
max_out_len=64,
|
| 1113 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1114 |
+
prompt_template=dict(
|
| 1115 |
+
template=dict(round=[
|
| 1116 |
+
dict(
|
| 1117 |
+
prompt=
|
| 1118 |
+
'Please complete the code given below. \n{context}Next line of code:\n',
|
| 1119 |
+
role='HUMAN'),
|
| 1120 |
+
]),
|
| 1121 |
+
type=
|
| 1122 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1123 |
+
retriever=dict(
|
| 1124 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1125 |
+
name='lcc',
|
| 1126 |
+
path='opencompass/Longbench',
|
| 1127 |
+
reader_cfg=dict(
|
| 1128 |
+
input_columns=[
|
| 1129 |
+
'context',
|
| 1130 |
+
],
|
| 1131 |
+
output_column='answers',
|
| 1132 |
+
test_range='[189:252]',
|
| 1133 |
+
test_split='test',
|
| 1134 |
+
train_split='test'),
|
| 1135 |
+
type='opencompass.datasets.LongBenchlccDataset'),
|
| 1136 |
+
dict(
|
| 1137 |
+
abbr='LongBench_repobench-p_3',
|
| 1138 |
+
eval_cfg=dict(
|
| 1139 |
+
evaluator=dict(
|
| 1140 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 1141 |
+
pred_role='BOT'),
|
| 1142 |
+
infer_cfg=dict(
|
| 1143 |
+
inferencer=dict(
|
| 1144 |
+
max_out_len=64,
|
| 1145 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1146 |
+
prompt_template=dict(
|
| 1147 |
+
template=dict(round=[
|
| 1148 |
+
dict(
|
| 1149 |
+
prompt=
|
| 1150 |
+
'Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 1151 |
+
role='HUMAN'),
|
| 1152 |
+
]),
|
| 1153 |
+
type=
|
| 1154 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1155 |
+
retriever=dict(
|
| 1156 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1157 |
+
name='repobench-p',
|
| 1158 |
+
path='opencompass/Longbench',
|
| 1159 |
+
reader_cfg=dict(
|
| 1160 |
+
input_columns=[
|
| 1161 |
+
'context',
|
| 1162 |
+
'input',
|
| 1163 |
+
],
|
| 1164 |
+
output_column='answers',
|
| 1165 |
+
test_range='[189:252]',
|
| 1166 |
+
test_split='test',
|
| 1167 |
+
train_split='test'),
|
| 1168 |
+
type='opencompass.datasets.LongBenchrepobenchDataset'),
|
| 1169 |
+
dict(
|
| 1170 |
+
abbr='LongBench_passage_retrieval_en_3',
|
| 1171 |
+
eval_cfg=dict(
|
| 1172 |
+
evaluator=dict(
|
| 1173 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 1174 |
+
pred_role='BOT'),
|
| 1175 |
+
infer_cfg=dict(
|
| 1176 |
+
inferencer=dict(
|
| 1177 |
+
max_out_len=32,
|
| 1178 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1179 |
+
prompt_template=dict(
|
| 1180 |
+
template=dict(round=[
|
| 1181 |
+
dict(
|
| 1182 |
+
prompt=
|
| 1183 |
+
'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 1184 |
+
role='HUMAN'),
|
| 1185 |
+
]),
|
| 1186 |
+
type=
|
| 1187 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1188 |
+
retriever=dict(
|
| 1189 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1190 |
+
name='passage_retrieval_en',
|
| 1191 |
+
path='opencompass/Longbench',
|
| 1192 |
+
reader_cfg=dict(
|
| 1193 |
+
input_columns=[
|
| 1194 |
+
'context',
|
| 1195 |
+
'input',
|
| 1196 |
+
],
|
| 1197 |
+
output_column='answers',
|
| 1198 |
+
test_range='[75:100]',
|
| 1199 |
+
test_split='test',
|
| 1200 |
+
train_split='test'),
|
| 1201 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
|
| 1202 |
+
dict(
|
| 1203 |
+
abbr='LongBench_passage_retrieval_zh_3',
|
| 1204 |
+
eval_cfg=dict(
|
| 1205 |
+
evaluator=dict(
|
| 1206 |
+
language='zh',
|
| 1207 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 1208 |
+
pred_role='BOT'),
|
| 1209 |
+
infer_cfg=dict(
|
| 1210 |
+
inferencer=dict(
|
| 1211 |
+
max_out_len=32,
|
| 1212 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1213 |
+
prompt_template=dict(
|
| 1214 |
+
template=dict(round=[
|
| 1215 |
+
dict(
|
| 1216 |
+
prompt=
|
| 1217 |
+
'以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 1218 |
+
role='HUMAN'),
|
| 1219 |
+
]),
|
| 1220 |
+
type=
|
| 1221 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1222 |
+
retriever=dict(
|
| 1223 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1224 |
+
name='passage_retrieval_zh',
|
| 1225 |
+
path='opencompass/Longbench',
|
| 1226 |
+
reader_cfg=dict(
|
| 1227 |
+
input_columns=[
|
| 1228 |
+
'context',
|
| 1229 |
+
'input',
|
| 1230 |
+
],
|
| 1231 |
+
output_column='answers',
|
| 1232 |
+
test_range='[75:100]',
|
| 1233 |
+
test_split='test',
|
| 1234 |
+
train_split='test'),
|
| 1235 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
|
| 1236 |
+
dict(
|
| 1237 |
+
abbr='LongBench_passage_count_3',
|
| 1238 |
+
eval_cfg=dict(
|
| 1239 |
+
evaluator=dict(
|
| 1240 |
+
type='opencompass.datasets.LongBenchCountEvaluator'),
|
| 1241 |
+
pred_role='BOT'),
|
| 1242 |
+
infer_cfg=dict(
|
| 1243 |
+
inferencer=dict(
|
| 1244 |
+
max_out_len=32,
|
| 1245 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1246 |
+
prompt_template=dict(
|
| 1247 |
+
template=dict(round=[
|
| 1248 |
+
dict(
|
| 1249 |
+
prompt=
|
| 1250 |
+
'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
|
| 1251 |
+
role='HUMAN'),
|
| 1252 |
+
]),
|
| 1253 |
+
type=
|
| 1254 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1255 |
+
retriever=dict(
|
| 1256 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1257 |
+
name='passage_count',
|
| 1258 |
+
path='opencompass/Longbench',
|
| 1259 |
+
reader_cfg=dict(
|
| 1260 |
+
input_columns=[
|
| 1261 |
+
'context',
|
| 1262 |
+
'input',
|
| 1263 |
+
],
|
| 1264 |
+
output_column='answers',
|
| 1265 |
+
test_range='[75:100]',
|
| 1266 |
+
test_split='test',
|
| 1267 |
+
train_split='test'),
|
| 1268 |
+
type='opencompass.datasets.LongBenchpassage_countDataset'),
|
| 1269 |
+
dict(
|
| 1270 |
+
abbr='LongBench_trec_3',
|
| 1271 |
+
eval_cfg=dict(
|
| 1272 |
+
evaluator=dict(
|
| 1273 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 1274 |
+
),
|
| 1275 |
+
pred_postprocessor=dict(
|
| 1276 |
+
type='opencompass.datasets.trec_postprocess'),
|
| 1277 |
+
pred_role='BOT'),
|
| 1278 |
+
infer_cfg=dict(
|
| 1279 |
+
inferencer=dict(
|
| 1280 |
+
max_out_len=64,
|
| 1281 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1282 |
+
prompt_template=dict(
|
| 1283 |
+
template=dict(round=[
|
| 1284 |
+
dict(
|
| 1285 |
+
prompt=
|
| 1286 |
+
'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 1287 |
+
role='HUMAN'),
|
| 1288 |
+
]),
|
| 1289 |
+
type=
|
| 1290 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1291 |
+
retriever=dict(
|
| 1292 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1293 |
+
name='trec',
|
| 1294 |
+
path='opencompass/Longbench',
|
| 1295 |
+
reader_cfg=dict(
|
| 1296 |
+
input_columns=[
|
| 1297 |
+
'context',
|
| 1298 |
+
'input',
|
| 1299 |
+
],
|
| 1300 |
+
output_column='all_labels',
|
| 1301 |
+
test_range='[75:100]',
|
| 1302 |
+
test_split='test',
|
| 1303 |
+
train_split='test'),
|
| 1304 |
+
type='opencompass.datasets.LongBenchtrecDataset'),
|
| 1305 |
+
dict(
|
| 1306 |
+
abbr='LongBench_lsht_3',
|
| 1307 |
+
eval_cfg=dict(
|
| 1308 |
+
evaluator=dict(
|
| 1309 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 1310 |
+
),
|
| 1311 |
+
pred_postprocessor=dict(
|
| 1312 |
+
type='opencompass.datasets.lsht_postprocess'),
|
| 1313 |
+
pred_role='BOT'),
|
| 1314 |
+
infer_cfg=dict(
|
| 1315 |
+
inferencer=dict(
|
| 1316 |
+
max_out_len=64,
|
| 1317 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1318 |
+
prompt_template=dict(
|
| 1319 |
+
template=dict(round=[
|
| 1320 |
+
dict(
|
| 1321 |
+
prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
|
| 1322 |
+
role='HUMAN'),
|
| 1323 |
+
]),
|
| 1324 |
+
type=
|
| 1325 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1326 |
+
retriever=dict(
|
| 1327 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1328 |
+
name='lsht',
|
| 1329 |
+
path='opencompass/Longbench',
|
| 1330 |
+
reader_cfg=dict(
|
| 1331 |
+
input_columns=[
|
| 1332 |
+
'context',
|
| 1333 |
+
'input',
|
| 1334 |
+
],
|
| 1335 |
+
output_column='all_labels',
|
| 1336 |
+
test_range='[75:100]',
|
| 1337 |
+
test_split='test',
|
| 1338 |
+
train_split='test'),
|
| 1339 |
+
type='opencompass.datasets.LongBenchlshtDataset'),
|
| 1340 |
+
dict(
|
| 1341 |
+
abbr='LongBench_multi_news_3',
|
| 1342 |
+
eval_cfg=dict(
|
| 1343 |
+
evaluator=dict(
|
| 1344 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1345 |
+
pred_role='BOT'),
|
| 1346 |
+
infer_cfg=dict(
|
| 1347 |
+
inferencer=dict(
|
| 1348 |
+
max_out_len=512,
|
| 1349 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1350 |
+
prompt_template=dict(
|
| 1351 |
+
template=dict(round=[
|
| 1352 |
+
dict(
|
| 1353 |
+
prompt=
|
| 1354 |
+
'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
|
| 1355 |
+
role='HUMAN'),
|
| 1356 |
+
]),
|
| 1357 |
+
type=
|
| 1358 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1359 |
+
retriever=dict(
|
| 1360 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1361 |
+
name='multi_news',
|
| 1362 |
+
path='opencompass/Longbench',
|
| 1363 |
+
reader_cfg=dict(
|
| 1364 |
+
input_columns=[
|
| 1365 |
+
'context',
|
| 1366 |
+
],
|
| 1367 |
+
output_column='answers',
|
| 1368 |
+
test_range='[75:100]',
|
| 1369 |
+
test_split='test',
|
| 1370 |
+
train_split='test'),
|
| 1371 |
+
type='opencompass.datasets.LongBenchmulti_newsDataset'),
|
| 1372 |
+
dict(
|
| 1373 |
+
abbr='LongBench_samsum_3',
|
| 1374 |
+
eval_cfg=dict(
|
| 1375 |
+
evaluator=dict(
|
| 1376 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1377 |
+
pred_postprocessor=dict(
|
| 1378 |
+
type='opencompass.datasets.samsum_postprocess'),
|
| 1379 |
+
pred_role='BOT'),
|
| 1380 |
+
infer_cfg=dict(
|
| 1381 |
+
inferencer=dict(
|
| 1382 |
+
max_out_len=128,
|
| 1383 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1384 |
+
prompt_template=dict(
|
| 1385 |
+
template=dict(round=[
|
| 1386 |
+
dict(
|
| 1387 |
+
prompt=
|
| 1388 |
+
'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 1389 |
+
role='HUMAN'),
|
| 1390 |
+
]),
|
| 1391 |
+
type=
|
| 1392 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1393 |
+
retriever=dict(
|
| 1394 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1395 |
+
name='samsum',
|
| 1396 |
+
path='opencompass/Longbench',
|
| 1397 |
+
reader_cfg=dict(
|
| 1398 |
+
input_columns=[
|
| 1399 |
+
'context',
|
| 1400 |
+
'input',
|
| 1401 |
+
],
|
| 1402 |
+
output_column='answers',
|
| 1403 |
+
test_range='[75:100]',
|
| 1404 |
+
test_split='test',
|
| 1405 |
+
train_split='test'),
|
| 1406 |
+
type='opencompass.datasets.LongBenchsamsumDataset'),
|
| 1407 |
+
],
|
| 1408 |
+
]
|
| 1409 |
+
models = [
|
| 1410 |
+
dict(
|
| 1411 |
+
abbr='delta_net',
|
| 1412 |
+
batch_size=128,
|
| 1413 |
+
max_seq_len=2048,
|
| 1414 |
+
model_kwargs=dict(
|
| 1415 |
+
device_map='auto',
|
| 1416 |
+
torch_dtype='torch.bfloat16',
|
| 1417 |
+
trust_remote_code=True),
|
| 1418 |
+
path='/mnt/jfzn/msj/delta_net-1.3B-100B',
|
| 1419 |
+
run_cfg=dict(num_gpus=1),
|
| 1420 |
+
tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
|
| 1421 |
+
tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
|
| 1422 |
+
type='opencompass.models.HuggingFaceBaseModel'),
|
| 1423 |
+
]
|
| 1424 |
+
work_dir = 'outputs/default/20251127_221150'
|
tmp/3baffa8c-bc69-4789-aa49-f30266896eb4_params.py
ADDED
|
File without changes
|
tmp/3bc1afd5-60f6-4b89-9fc0-909218b5c248_params.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets = [
|
| 2 |
+
[
|
| 3 |
+
dict(
|
| 4 |
+
abbr='LongBench_musique',
|
| 5 |
+
eval_cfg=dict(
|
| 6 |
+
evaluator=dict(
|
| 7 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 8 |
+
pred_role='BOT'),
|
| 9 |
+
infer_cfg=dict(
|
| 10 |
+
inferencer=dict(
|
| 11 |
+
max_out_len=32,
|
| 12 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
template=dict(round=[
|
| 15 |
+
dict(
|
| 16 |
+
prompt=
|
| 17 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 18 |
+
role='HUMAN'),
|
| 19 |
+
]),
|
| 20 |
+
type=
|
| 21 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 22 |
+
retriever=dict(
|
| 23 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 24 |
+
name='musique',
|
| 25 |
+
path='opencompass/Longbench',
|
| 26 |
+
reader_cfg=dict(
|
| 27 |
+
input_columns=[
|
| 28 |
+
'context',
|
| 29 |
+
'input',
|
| 30 |
+
],
|
| 31 |
+
output_column='answers',
|
| 32 |
+
test_split='test',
|
| 33 |
+
train_split='test'),
|
| 34 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 35 |
+
],
|
| 36 |
+
]
|
| 37 |
+
eval = dict(runner=dict(task=dict(dump_details=True)))
|
| 38 |
+
models = [
|
| 39 |
+
dict(
|
| 40 |
+
abbr='gated_deltanet',
|
| 41 |
+
batch_size=128,
|
| 42 |
+
max_seq_len=2048,
|
| 43 |
+
model_kwargs=dict(
|
| 44 |
+
device_map='auto',
|
| 45 |
+
torch_dtype='torch.bfloat16',
|
| 46 |
+
trust_remote_code=True),
|
| 47 |
+
path='download_model/hgrn2-1.3B-100B',
|
| 48 |
+
run_cfg=dict(num_gpus=1),
|
| 49 |
+
tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
|
| 50 |
+
tokenizer_path='download_model/hgrn2-1.3B-100B',
|
| 51 |
+
type='opencompass.models.HuggingFaceBaseModel'),
|
| 52 |
+
]
|
| 53 |
+
work_dir = 'outputs/default/20251219_163447'
|
tmp/401500cf-6431-490c-9e43-14532e24796f_params.py
ADDED
|
@@ -0,0 +1,1424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets = [
|
| 2 |
+
[
|
| 3 |
+
dict(
|
| 4 |
+
abbr='LongBench_2wikimqa_0',
|
| 5 |
+
eval_cfg=dict(
|
| 6 |
+
evaluator=dict(
|
| 7 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 8 |
+
pred_role='BOT'),
|
| 9 |
+
infer_cfg=dict(
|
| 10 |
+
inferencer=dict(
|
| 11 |
+
max_out_len=32,
|
| 12 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
template=dict(round=[
|
| 15 |
+
dict(
|
| 16 |
+
prompt=
|
| 17 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 18 |
+
role='HUMAN'),
|
| 19 |
+
]),
|
| 20 |
+
type=
|
| 21 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 22 |
+
retriever=dict(
|
| 23 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 24 |
+
name='2wikimqa',
|
| 25 |
+
path='opencompass/Longbench',
|
| 26 |
+
reader_cfg=dict(
|
| 27 |
+
input_columns=[
|
| 28 |
+
'context',
|
| 29 |
+
'input',
|
| 30 |
+
],
|
| 31 |
+
output_column='answers',
|
| 32 |
+
test_range='[0:25]',
|
| 33 |
+
test_split='test',
|
| 34 |
+
train_split='test'),
|
| 35 |
+
type='opencompass.datasets.LongBench2wikimqaDataset'),
|
| 36 |
+
dict(
|
| 37 |
+
abbr='LongBench_hotpotqa_0',
|
| 38 |
+
eval_cfg=dict(
|
| 39 |
+
evaluator=dict(
|
| 40 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 41 |
+
pred_role='BOT'),
|
| 42 |
+
infer_cfg=dict(
|
| 43 |
+
inferencer=dict(
|
| 44 |
+
max_out_len=32,
|
| 45 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 46 |
+
prompt_template=dict(
|
| 47 |
+
template=dict(round=[
|
| 48 |
+
dict(
|
| 49 |
+
prompt=
|
| 50 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 51 |
+
role='HUMAN'),
|
| 52 |
+
]),
|
| 53 |
+
type=
|
| 54 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 55 |
+
retriever=dict(
|
| 56 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 57 |
+
name='hotpotqa',
|
| 58 |
+
path='opencompass/Longbench',
|
| 59 |
+
reader_cfg=dict(
|
| 60 |
+
input_columns=[
|
| 61 |
+
'context',
|
| 62 |
+
'input',
|
| 63 |
+
],
|
| 64 |
+
output_column='answers',
|
| 65 |
+
test_range='[0:25]',
|
| 66 |
+
test_split='test',
|
| 67 |
+
train_split='test'),
|
| 68 |
+
type='opencompass.datasets.LongBenchhotpotqaDataset'),
|
| 69 |
+
dict(
|
| 70 |
+
abbr='LongBench_musique_0',
|
| 71 |
+
eval_cfg=dict(
|
| 72 |
+
evaluator=dict(
|
| 73 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 74 |
+
pred_role='BOT'),
|
| 75 |
+
infer_cfg=dict(
|
| 76 |
+
inferencer=dict(
|
| 77 |
+
max_out_len=32,
|
| 78 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 79 |
+
prompt_template=dict(
|
| 80 |
+
template=dict(round=[
|
| 81 |
+
dict(
|
| 82 |
+
prompt=
|
| 83 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 84 |
+
role='HUMAN'),
|
| 85 |
+
]),
|
| 86 |
+
type=
|
| 87 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 88 |
+
retriever=dict(
|
| 89 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 90 |
+
name='musique',
|
| 91 |
+
path='opencompass/Longbench',
|
| 92 |
+
reader_cfg=dict(
|
| 93 |
+
input_columns=[
|
| 94 |
+
'context',
|
| 95 |
+
'input',
|
| 96 |
+
],
|
| 97 |
+
output_column='answers',
|
| 98 |
+
test_range='[0:25]',
|
| 99 |
+
test_split='test',
|
| 100 |
+
train_split='test'),
|
| 101 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 102 |
+
dict(
|
| 103 |
+
abbr='LongBench_multifieldqa_en_0',
|
| 104 |
+
eval_cfg=dict(
|
| 105 |
+
evaluator=dict(
|
| 106 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 107 |
+
pred_role='BOT'),
|
| 108 |
+
infer_cfg=dict(
|
| 109 |
+
inferencer=dict(
|
| 110 |
+
max_out_len=64,
|
| 111 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 112 |
+
prompt_template=dict(
|
| 113 |
+
template=dict(round=[
|
| 114 |
+
dict(
|
| 115 |
+
prompt=
|
| 116 |
+
'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 117 |
+
role='HUMAN'),
|
| 118 |
+
]),
|
| 119 |
+
type=
|
| 120 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 121 |
+
retriever=dict(
|
| 122 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 123 |
+
name='multifieldqa_en',
|
| 124 |
+
path='opencompass/Longbench',
|
| 125 |
+
reader_cfg=dict(
|
| 126 |
+
input_columns=[
|
| 127 |
+
'context',
|
| 128 |
+
'input',
|
| 129 |
+
],
|
| 130 |
+
output_column='answers',
|
| 131 |
+
test_range='[0:19]',
|
| 132 |
+
test_split='test',
|
| 133 |
+
train_split='test'),
|
| 134 |
+
type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
|
| 135 |
+
dict(
|
| 136 |
+
abbr='LongBench_multifieldqa_zh_0',
|
| 137 |
+
eval_cfg=dict(
|
| 138 |
+
evaluator=dict(
|
| 139 |
+
language='zh',
|
| 140 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 141 |
+
pred_role='BOT'),
|
| 142 |
+
infer_cfg=dict(
|
| 143 |
+
inferencer=dict(
|
| 144 |
+
max_out_len=64,
|
| 145 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 146 |
+
prompt_template=dict(
|
| 147 |
+
template=dict(round=[
|
| 148 |
+
dict(
|
| 149 |
+
prompt=
|
| 150 |
+
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
|
| 151 |
+
role='HUMAN'),
|
| 152 |
+
]),
|
| 153 |
+
type=
|
| 154 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 155 |
+
retriever=dict(
|
| 156 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 157 |
+
name='multifieldqa_zh',
|
| 158 |
+
path='opencompass/Longbench',
|
| 159 |
+
reader_cfg=dict(
|
| 160 |
+
input_columns=[
|
| 161 |
+
'context',
|
| 162 |
+
'input',
|
| 163 |
+
],
|
| 164 |
+
output_column='answers',
|
| 165 |
+
test_range='[0:25]',
|
| 166 |
+
test_split='test',
|
| 167 |
+
train_split='test'),
|
| 168 |
+
type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
|
| 169 |
+
dict(
|
| 170 |
+
abbr='LongBench_narrativeqa_0',
|
| 171 |
+
eval_cfg=dict(
|
| 172 |
+
evaluator=dict(
|
| 173 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 174 |
+
pred_role='BOT'),
|
| 175 |
+
infer_cfg=dict(
|
| 176 |
+
inferencer=dict(
|
| 177 |
+
max_out_len=128,
|
| 178 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 179 |
+
prompt_template=dict(
|
| 180 |
+
template=dict(round=[
|
| 181 |
+
dict(
|
| 182 |
+
prompt=
|
| 183 |
+
'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
|
| 184 |
+
role='HUMAN'),
|
| 185 |
+
]),
|
| 186 |
+
type=
|
| 187 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 188 |
+
retriever=dict(
|
| 189 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 190 |
+
name='narrativeqa',
|
| 191 |
+
path='opencompass/Longbench',
|
| 192 |
+
reader_cfg=dict(
|
| 193 |
+
input_columns=[
|
| 194 |
+
'context',
|
| 195 |
+
'input',
|
| 196 |
+
],
|
| 197 |
+
output_column='answers',
|
| 198 |
+
test_range='[0:25]',
|
| 199 |
+
test_split='test',
|
| 200 |
+
train_split='test'),
|
| 201 |
+
type='opencompass.datasets.LongBenchnarrativeqaDataset'),
|
| 202 |
+
dict(
|
| 203 |
+
abbr='LongBench_qasper_0',
|
| 204 |
+
eval_cfg=dict(
|
| 205 |
+
evaluator=dict(
|
| 206 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 207 |
+
pred_role='BOT'),
|
| 208 |
+
infer_cfg=dict(
|
| 209 |
+
inferencer=dict(
|
| 210 |
+
max_out_len=32,
|
| 211 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 212 |
+
prompt_template=dict(
|
| 213 |
+
template=dict(round=[
|
| 214 |
+
dict(
|
| 215 |
+
prompt=
|
| 216 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 217 |
+
role='HUMAN'),
|
| 218 |
+
]),
|
| 219 |
+
type=
|
| 220 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 221 |
+
retriever=dict(
|
| 222 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 223 |
+
name='qasper',
|
| 224 |
+
path='opencompass/Longbench',
|
| 225 |
+
reader_cfg=dict(
|
| 226 |
+
input_columns=[
|
| 227 |
+
'context',
|
| 228 |
+
'input',
|
| 229 |
+
],
|
| 230 |
+
output_column='answers',
|
| 231 |
+
test_range='[0:25]',
|
| 232 |
+
test_split='test',
|
| 233 |
+
train_split='test'),
|
| 234 |
+
type='opencompass.datasets.LongBenchqasperDataset'),
|
| 235 |
+
dict(
|
| 236 |
+
abbr='LongBench_triviaqa_0',
|
| 237 |
+
eval_cfg=dict(
|
| 238 |
+
evaluator=dict(
|
| 239 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 240 |
+
pred_postprocessor=dict(
|
| 241 |
+
type='opencompass.datasets.triviaqa_postprocess'),
|
| 242 |
+
pred_role='BOT'),
|
| 243 |
+
infer_cfg=dict(
|
| 244 |
+
inferencer=dict(
|
| 245 |
+
max_out_len=32,
|
| 246 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 247 |
+
prompt_template=dict(
|
| 248 |
+
template=dict(round=[
|
| 249 |
+
dict(
|
| 250 |
+
prompt=
|
| 251 |
+
'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 252 |
+
role='HUMAN'),
|
| 253 |
+
]),
|
| 254 |
+
type=
|
| 255 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 256 |
+
retriever=dict(
|
| 257 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 258 |
+
name='triviaqa',
|
| 259 |
+
path='opencompass/Longbench',
|
| 260 |
+
reader_cfg=dict(
|
| 261 |
+
input_columns=[
|
| 262 |
+
'context',
|
| 263 |
+
'input',
|
| 264 |
+
],
|
| 265 |
+
output_column='answers',
|
| 266 |
+
test_range='[0:25]',
|
| 267 |
+
test_split='test',
|
| 268 |
+
train_split='test'),
|
| 269 |
+
type='opencompass.datasets.LongBenchtriviaqaDataset'),
|
| 270 |
+
dict(
|
| 271 |
+
abbr='LongBench_gov_report_0',
|
| 272 |
+
eval_cfg=dict(
|
| 273 |
+
evaluator=dict(
|
| 274 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 275 |
+
pred_role='BOT'),
|
| 276 |
+
infer_cfg=dict(
|
| 277 |
+
inferencer=dict(
|
| 278 |
+
max_out_len=512,
|
| 279 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 280 |
+
prompt_template=dict(
|
| 281 |
+
template=dict(round=[
|
| 282 |
+
dict(
|
| 283 |
+
prompt=
|
| 284 |
+
'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
|
| 285 |
+
role='HUMAN'),
|
| 286 |
+
]),
|
| 287 |
+
type=
|
| 288 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 289 |
+
retriever=dict(
|
| 290 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 291 |
+
name='gov_report',
|
| 292 |
+
path='opencompass/Longbench',
|
| 293 |
+
reader_cfg=dict(
|
| 294 |
+
input_columns=[
|
| 295 |
+
'context',
|
| 296 |
+
],
|
| 297 |
+
output_column='answers',
|
| 298 |
+
test_range='[0:25]',
|
| 299 |
+
test_split='test',
|
| 300 |
+
train_split='test'),
|
| 301 |
+
type='opencompass.datasets.LongBenchgov_reportDataset'),
|
| 302 |
+
dict(
|
| 303 |
+
abbr='LongBench_qmsum_0',
|
| 304 |
+
eval_cfg=dict(
|
| 305 |
+
evaluator=dict(
|
| 306 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 307 |
+
pred_role='BOT'),
|
| 308 |
+
infer_cfg=dict(
|
| 309 |
+
inferencer=dict(
|
| 310 |
+
max_out_len=512,
|
| 311 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 312 |
+
prompt_template=dict(
|
| 313 |
+
template=dict(round=[
|
| 314 |
+
dict(
|
| 315 |
+
prompt=
|
| 316 |
+
'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 317 |
+
role='HUMAN'),
|
| 318 |
+
]),
|
| 319 |
+
type=
|
| 320 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 321 |
+
retriever=dict(
|
| 322 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 323 |
+
name='qmsum',
|
| 324 |
+
path='opencompass/Longbench',
|
| 325 |
+
reader_cfg=dict(
|
| 326 |
+
input_columns=[
|
| 327 |
+
'context',
|
| 328 |
+
'input',
|
| 329 |
+
],
|
| 330 |
+
output_column='answers',
|
| 331 |
+
test_range='[0:25]',
|
| 332 |
+
test_split='test',
|
| 333 |
+
train_split='test'),
|
| 334 |
+
type='opencompass.datasets.LongBenchqmsumDataset'),
|
| 335 |
+
dict(
|
| 336 |
+
abbr='LongBench_vcsum_0',
|
| 337 |
+
eval_cfg=dict(
|
| 338 |
+
evaluator=dict(
|
| 339 |
+
language='zh',
|
| 340 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 341 |
+
pred_role='BOT'),
|
| 342 |
+
infer_cfg=dict(
|
| 343 |
+
inferencer=dict(
|
| 344 |
+
max_out_len=512,
|
| 345 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 346 |
+
prompt_template=dict(
|
| 347 |
+
template=dict(round=[
|
| 348 |
+
dict(
|
| 349 |
+
prompt=
|
| 350 |
+
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 351 |
+
role='HUMAN'),
|
| 352 |
+
]),
|
| 353 |
+
type=
|
| 354 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 355 |
+
retriever=dict(
|
| 356 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 357 |
+
name='vcsum',
|
| 358 |
+
path='opencompass/Longbench',
|
| 359 |
+
reader_cfg=dict(
|
| 360 |
+
input_columns=[
|
| 361 |
+
'context',
|
| 362 |
+
],
|
| 363 |
+
output_column='answers',
|
| 364 |
+
test_range='[0:25]',
|
| 365 |
+
test_split='test',
|
| 366 |
+
train_split='test'),
|
| 367 |
+
type='opencompass.datasets.LongBenchvcsumDataset'),
|
| 368 |
+
dict(
|
| 369 |
+
abbr='LongBench_dureader_0',
|
| 370 |
+
eval_cfg=dict(
|
| 371 |
+
evaluator=dict(
|
| 372 |
+
language='zh',
|
| 373 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 374 |
+
pred_role='BOT'),
|
| 375 |
+
infer_cfg=dict(
|
| 376 |
+
inferencer=dict(
|
| 377 |
+
max_out_len=128,
|
| 378 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 379 |
+
prompt_template=dict(
|
| 380 |
+
template=dict(round=[
|
| 381 |
+
dict(
|
| 382 |
+
prompt=
|
| 383 |
+
'请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
|
| 384 |
+
role='HUMAN'),
|
| 385 |
+
]),
|
| 386 |
+
type=
|
| 387 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 388 |
+
retriever=dict(
|
| 389 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 390 |
+
name='dureader',
|
| 391 |
+
path='opencompass/Longbench',
|
| 392 |
+
reader_cfg=dict(
|
| 393 |
+
input_columns=[
|
| 394 |
+
'context',
|
| 395 |
+
'input',
|
| 396 |
+
],
|
| 397 |
+
output_column='answers',
|
| 398 |
+
test_range='[0:25]',
|
| 399 |
+
test_split='test',
|
| 400 |
+
train_split='test'),
|
| 401 |
+
type='opencompass.datasets.LongBenchdureaderDataset'),
|
| 402 |
+
dict(
|
| 403 |
+
abbr='LongBench_lcc_0',
|
| 404 |
+
eval_cfg=dict(
|
| 405 |
+
evaluator=dict(
|
| 406 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 407 |
+
pred_role='BOT'),
|
| 408 |
+
infer_cfg=dict(
|
| 409 |
+
inferencer=dict(
|
| 410 |
+
max_out_len=64,
|
| 411 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 412 |
+
prompt_template=dict(
|
| 413 |
+
template=dict(round=[
|
| 414 |
+
dict(
|
| 415 |
+
prompt=
|
| 416 |
+
'Please complete the code given below. \n{context}Next line of code:\n',
|
| 417 |
+
role='HUMAN'),
|
| 418 |
+
]),
|
| 419 |
+
type=
|
| 420 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 421 |
+
retriever=dict(
|
| 422 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 423 |
+
name='lcc',
|
| 424 |
+
path='opencompass/Longbench',
|
| 425 |
+
reader_cfg=dict(
|
| 426 |
+
input_columns=[
|
| 427 |
+
'context',
|
| 428 |
+
],
|
| 429 |
+
output_column='answers',
|
| 430 |
+
test_range='[0:63]',
|
| 431 |
+
test_split='test',
|
| 432 |
+
train_split='test'),
|
| 433 |
+
type='opencompass.datasets.LongBenchlccDataset'),
|
| 434 |
+
dict(
|
| 435 |
+
abbr='LongBench_repobench-p_0',
|
| 436 |
+
eval_cfg=dict(
|
| 437 |
+
evaluator=dict(
|
| 438 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 439 |
+
pred_role='BOT'),
|
| 440 |
+
infer_cfg=dict(
|
| 441 |
+
inferencer=dict(
|
| 442 |
+
max_out_len=64,
|
| 443 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 444 |
+
prompt_template=dict(
|
| 445 |
+
template=dict(round=[
|
| 446 |
+
dict(
|
| 447 |
+
prompt=
|
| 448 |
+
'Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 449 |
+
role='HUMAN'),
|
| 450 |
+
]),
|
| 451 |
+
type=
|
| 452 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 453 |
+
retriever=dict(
|
| 454 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 455 |
+
name='repobench-p',
|
| 456 |
+
path='opencompass/Longbench',
|
| 457 |
+
reader_cfg=dict(
|
| 458 |
+
input_columns=[
|
| 459 |
+
'context',
|
| 460 |
+
'input',
|
| 461 |
+
],
|
| 462 |
+
output_column='answers',
|
| 463 |
+
test_range='[0:63]',
|
| 464 |
+
test_split='test',
|
| 465 |
+
train_split='test'),
|
| 466 |
+
type='opencompass.datasets.LongBenchrepobenchDataset'),
|
| 467 |
+
dict(
|
| 468 |
+
abbr='LongBench_passage_retrieval_en_0',
|
| 469 |
+
eval_cfg=dict(
|
| 470 |
+
evaluator=dict(
|
| 471 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 472 |
+
pred_role='BOT'),
|
| 473 |
+
infer_cfg=dict(
|
| 474 |
+
inferencer=dict(
|
| 475 |
+
max_out_len=32,
|
| 476 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 477 |
+
prompt_template=dict(
|
| 478 |
+
template=dict(round=[
|
| 479 |
+
dict(
|
| 480 |
+
prompt=
|
| 481 |
+
'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 482 |
+
role='HUMAN'),
|
| 483 |
+
]),
|
| 484 |
+
type=
|
| 485 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 486 |
+
retriever=dict(
|
| 487 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 488 |
+
name='passage_retrieval_en',
|
| 489 |
+
path='opencompass/Longbench',
|
| 490 |
+
reader_cfg=dict(
|
| 491 |
+
input_columns=[
|
| 492 |
+
'context',
|
| 493 |
+
'input',
|
| 494 |
+
],
|
| 495 |
+
output_column='answers',
|
| 496 |
+
test_range='[0:25]',
|
| 497 |
+
test_split='test',
|
| 498 |
+
train_split='test'),
|
| 499 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
|
| 500 |
+
dict(
|
| 501 |
+
abbr='LongBench_passage_retrieval_zh_0',
|
| 502 |
+
eval_cfg=dict(
|
| 503 |
+
evaluator=dict(
|
| 504 |
+
language='zh',
|
| 505 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 506 |
+
pred_role='BOT'),
|
| 507 |
+
infer_cfg=dict(
|
| 508 |
+
inferencer=dict(
|
| 509 |
+
max_out_len=32,
|
| 510 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 511 |
+
prompt_template=dict(
|
| 512 |
+
template=dict(round=[
|
| 513 |
+
dict(
|
| 514 |
+
prompt=
|
| 515 |
+
'以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 516 |
+
role='HUMAN'),
|
| 517 |
+
]),
|
| 518 |
+
type=
|
| 519 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 520 |
+
retriever=dict(
|
| 521 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 522 |
+
name='passage_retrieval_zh',
|
| 523 |
+
path='opencompass/Longbench',
|
| 524 |
+
reader_cfg=dict(
|
| 525 |
+
input_columns=[
|
| 526 |
+
'context',
|
| 527 |
+
'input',
|
| 528 |
+
],
|
| 529 |
+
output_column='answers',
|
| 530 |
+
test_range='[0:25]',
|
| 531 |
+
test_split='test',
|
| 532 |
+
train_split='test'),
|
| 533 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
|
| 534 |
+
dict(
|
| 535 |
+
abbr='LongBench_passage_count_0',
|
| 536 |
+
eval_cfg=dict(
|
| 537 |
+
evaluator=dict(
|
| 538 |
+
type='opencompass.datasets.LongBenchCountEvaluator'),
|
| 539 |
+
pred_role='BOT'),
|
| 540 |
+
infer_cfg=dict(
|
| 541 |
+
inferencer=dict(
|
| 542 |
+
max_out_len=32,
|
| 543 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 544 |
+
prompt_template=dict(
|
| 545 |
+
template=dict(round=[
|
| 546 |
+
dict(
|
| 547 |
+
prompt=
|
| 548 |
+
'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
|
| 549 |
+
role='HUMAN'),
|
| 550 |
+
]),
|
| 551 |
+
type=
|
| 552 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 553 |
+
retriever=dict(
|
| 554 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 555 |
+
name='passage_count',
|
| 556 |
+
path='opencompass/Longbench',
|
| 557 |
+
reader_cfg=dict(
|
| 558 |
+
input_columns=[
|
| 559 |
+
'context',
|
| 560 |
+
'input',
|
| 561 |
+
],
|
| 562 |
+
output_column='answers',
|
| 563 |
+
test_range='[0:25]',
|
| 564 |
+
test_split='test',
|
| 565 |
+
train_split='test'),
|
| 566 |
+
type='opencompass.datasets.LongBenchpassage_countDataset'),
|
| 567 |
+
dict(
|
| 568 |
+
abbr='LongBench_trec_0',
|
| 569 |
+
eval_cfg=dict(
|
| 570 |
+
evaluator=dict(
|
| 571 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 572 |
+
),
|
| 573 |
+
pred_postprocessor=dict(
|
| 574 |
+
type='opencompass.datasets.trec_postprocess'),
|
| 575 |
+
pred_role='BOT'),
|
| 576 |
+
infer_cfg=dict(
|
| 577 |
+
inferencer=dict(
|
| 578 |
+
max_out_len=64,
|
| 579 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 580 |
+
prompt_template=dict(
|
| 581 |
+
template=dict(round=[
|
| 582 |
+
dict(
|
| 583 |
+
prompt=
|
| 584 |
+
'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 585 |
+
role='HUMAN'),
|
| 586 |
+
]),
|
| 587 |
+
type=
|
| 588 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 589 |
+
retriever=dict(
|
| 590 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 591 |
+
name='trec',
|
| 592 |
+
path='opencompass/Longbench',
|
| 593 |
+
reader_cfg=dict(
|
| 594 |
+
input_columns=[
|
| 595 |
+
'context',
|
| 596 |
+
'input',
|
| 597 |
+
],
|
| 598 |
+
output_column='all_labels',
|
| 599 |
+
test_range='[0:25]',
|
| 600 |
+
test_split='test',
|
| 601 |
+
train_split='test'),
|
| 602 |
+
type='opencompass.datasets.LongBenchtrecDataset'),
|
| 603 |
+
dict(
|
| 604 |
+
abbr='LongBench_lsht_0',
|
| 605 |
+
eval_cfg=dict(
|
| 606 |
+
evaluator=dict(
|
| 607 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 608 |
+
),
|
| 609 |
+
pred_postprocessor=dict(
|
| 610 |
+
type='opencompass.datasets.lsht_postprocess'),
|
| 611 |
+
pred_role='BOT'),
|
| 612 |
+
infer_cfg=dict(
|
| 613 |
+
inferencer=dict(
|
| 614 |
+
max_out_len=64,
|
| 615 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 616 |
+
prompt_template=dict(
|
| 617 |
+
template=dict(round=[
|
| 618 |
+
dict(
|
| 619 |
+
prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
|
| 620 |
+
role='HUMAN'),
|
| 621 |
+
]),
|
| 622 |
+
type=
|
| 623 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 624 |
+
retriever=dict(
|
| 625 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 626 |
+
name='lsht',
|
| 627 |
+
path='opencompass/Longbench',
|
| 628 |
+
reader_cfg=dict(
|
| 629 |
+
input_columns=[
|
| 630 |
+
'context',
|
| 631 |
+
'input',
|
| 632 |
+
],
|
| 633 |
+
output_column='all_labels',
|
| 634 |
+
test_range='[0:25]',
|
| 635 |
+
test_split='test',
|
| 636 |
+
train_split='test'),
|
| 637 |
+
type='opencompass.datasets.LongBenchlshtDataset'),
|
| 638 |
+
dict(
|
| 639 |
+
abbr='LongBench_multi_news_0',
|
| 640 |
+
eval_cfg=dict(
|
| 641 |
+
evaluator=dict(
|
| 642 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 643 |
+
pred_role='BOT'),
|
| 644 |
+
infer_cfg=dict(
|
| 645 |
+
inferencer=dict(
|
| 646 |
+
max_out_len=512,
|
| 647 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 648 |
+
prompt_template=dict(
|
| 649 |
+
template=dict(round=[
|
| 650 |
+
dict(
|
| 651 |
+
prompt=
|
| 652 |
+
'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
|
| 653 |
+
role='HUMAN'),
|
| 654 |
+
]),
|
| 655 |
+
type=
|
| 656 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 657 |
+
retriever=dict(
|
| 658 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 659 |
+
name='multi_news',
|
| 660 |
+
path='opencompass/Longbench',
|
| 661 |
+
reader_cfg=dict(
|
| 662 |
+
input_columns=[
|
| 663 |
+
'context',
|
| 664 |
+
],
|
| 665 |
+
output_column='answers',
|
| 666 |
+
test_range='[0:25]',
|
| 667 |
+
test_split='test',
|
| 668 |
+
train_split='test'),
|
| 669 |
+
type='opencompass.datasets.LongBenchmulti_newsDataset'),
|
| 670 |
+
dict(
|
| 671 |
+
abbr='LongBench_samsum_0',
|
| 672 |
+
eval_cfg=dict(
|
| 673 |
+
evaluator=dict(
|
| 674 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 675 |
+
pred_postprocessor=dict(
|
| 676 |
+
type='opencompass.datasets.samsum_postprocess'),
|
| 677 |
+
pred_role='BOT'),
|
| 678 |
+
infer_cfg=dict(
|
| 679 |
+
inferencer=dict(
|
| 680 |
+
max_out_len=128,
|
| 681 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 682 |
+
prompt_template=dict(
|
| 683 |
+
template=dict(round=[
|
| 684 |
+
dict(
|
| 685 |
+
prompt=
|
| 686 |
+
'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 687 |
+
role='HUMAN'),
|
| 688 |
+
]),
|
| 689 |
+
type=
|
| 690 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 691 |
+
retriever=dict(
|
| 692 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 693 |
+
name='samsum',
|
| 694 |
+
path='opencompass/Longbench',
|
| 695 |
+
reader_cfg=dict(
|
| 696 |
+
input_columns=[
|
| 697 |
+
'context',
|
| 698 |
+
'input',
|
| 699 |
+
],
|
| 700 |
+
output_column='answers',
|
| 701 |
+
test_range='[0:25]',
|
| 702 |
+
test_split='test',
|
| 703 |
+
train_split='test'),
|
| 704 |
+
type='opencompass.datasets.LongBenchsamsumDataset'),
|
| 705 |
+
dict(
|
| 706 |
+
abbr='LongBench_2wikimqa_0',
|
| 707 |
+
eval_cfg=dict(
|
| 708 |
+
evaluator=dict(
|
| 709 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 710 |
+
pred_role='BOT'),
|
| 711 |
+
infer_cfg=dict(
|
| 712 |
+
inferencer=dict(
|
| 713 |
+
max_out_len=32,
|
| 714 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 715 |
+
prompt_template=dict(
|
| 716 |
+
template=dict(round=[
|
| 717 |
+
dict(
|
| 718 |
+
prompt=
|
| 719 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 720 |
+
role='HUMAN'),
|
| 721 |
+
]),
|
| 722 |
+
type=
|
| 723 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 724 |
+
retriever=dict(
|
| 725 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 726 |
+
name='2wikimqa',
|
| 727 |
+
path='opencompass/Longbench',
|
| 728 |
+
reader_cfg=dict(
|
| 729 |
+
input_columns=[
|
| 730 |
+
'context',
|
| 731 |
+
'input',
|
| 732 |
+
],
|
| 733 |
+
output_column='answers',
|
| 734 |
+
test_range='[0:25]',
|
| 735 |
+
test_split='test',
|
| 736 |
+
train_split='test'),
|
| 737 |
+
type='opencompass.datasets.LongBench2wikimqaDataset'),
|
| 738 |
+
dict(
|
| 739 |
+
abbr='LongBench_hotpotqa_0',
|
| 740 |
+
eval_cfg=dict(
|
| 741 |
+
evaluator=dict(
|
| 742 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 743 |
+
pred_role='BOT'),
|
| 744 |
+
infer_cfg=dict(
|
| 745 |
+
inferencer=dict(
|
| 746 |
+
max_out_len=32,
|
| 747 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 748 |
+
prompt_template=dict(
|
| 749 |
+
template=dict(round=[
|
| 750 |
+
dict(
|
| 751 |
+
prompt=
|
| 752 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 753 |
+
role='HUMAN'),
|
| 754 |
+
]),
|
| 755 |
+
type=
|
| 756 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 757 |
+
retriever=dict(
|
| 758 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 759 |
+
name='hotpotqa',
|
| 760 |
+
path='opencompass/Longbench',
|
| 761 |
+
reader_cfg=dict(
|
| 762 |
+
input_columns=[
|
| 763 |
+
'context',
|
| 764 |
+
'input',
|
| 765 |
+
],
|
| 766 |
+
output_column='answers',
|
| 767 |
+
test_range='[0:25]',
|
| 768 |
+
test_split='test',
|
| 769 |
+
train_split='test'),
|
| 770 |
+
type='opencompass.datasets.LongBenchhotpotqaDataset'),
|
| 771 |
+
dict(
|
| 772 |
+
abbr='LongBench_musique_0',
|
| 773 |
+
eval_cfg=dict(
|
| 774 |
+
evaluator=dict(
|
| 775 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 776 |
+
pred_role='BOT'),
|
| 777 |
+
infer_cfg=dict(
|
| 778 |
+
inferencer=dict(
|
| 779 |
+
max_out_len=32,
|
| 780 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 781 |
+
prompt_template=dict(
|
| 782 |
+
template=dict(round=[
|
| 783 |
+
dict(
|
| 784 |
+
prompt=
|
| 785 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 786 |
+
role='HUMAN'),
|
| 787 |
+
]),
|
| 788 |
+
type=
|
| 789 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 790 |
+
retriever=dict(
|
| 791 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 792 |
+
name='musique',
|
| 793 |
+
path='opencompass/Longbench',
|
| 794 |
+
reader_cfg=dict(
|
| 795 |
+
input_columns=[
|
| 796 |
+
'context',
|
| 797 |
+
'input',
|
| 798 |
+
],
|
| 799 |
+
output_column='answers',
|
| 800 |
+
test_range='[0:25]',
|
| 801 |
+
test_split='test',
|
| 802 |
+
train_split='test'),
|
| 803 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 804 |
+
dict(
|
| 805 |
+
abbr='LongBench_multifieldqa_en_0',
|
| 806 |
+
eval_cfg=dict(
|
| 807 |
+
evaluator=dict(
|
| 808 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 809 |
+
pred_role='BOT'),
|
| 810 |
+
infer_cfg=dict(
|
| 811 |
+
inferencer=dict(
|
| 812 |
+
max_out_len=64,
|
| 813 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 814 |
+
prompt_template=dict(
|
| 815 |
+
template=dict(round=[
|
| 816 |
+
dict(
|
| 817 |
+
prompt=
|
| 818 |
+
'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 819 |
+
role='HUMAN'),
|
| 820 |
+
]),
|
| 821 |
+
type=
|
| 822 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 823 |
+
retriever=dict(
|
| 824 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 825 |
+
name='multifieldqa_en',
|
| 826 |
+
path='opencompass/Longbench',
|
| 827 |
+
reader_cfg=dict(
|
| 828 |
+
input_columns=[
|
| 829 |
+
'context',
|
| 830 |
+
'input',
|
| 831 |
+
],
|
| 832 |
+
output_column='answers',
|
| 833 |
+
test_range='[0:19]',
|
| 834 |
+
test_split='test',
|
| 835 |
+
train_split='test'),
|
| 836 |
+
type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
|
| 837 |
+
dict(
|
| 838 |
+
abbr='LongBench_multifieldqa_zh_0',
|
| 839 |
+
eval_cfg=dict(
|
| 840 |
+
evaluator=dict(
|
| 841 |
+
language='zh',
|
| 842 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 843 |
+
pred_role='BOT'),
|
| 844 |
+
infer_cfg=dict(
|
| 845 |
+
inferencer=dict(
|
| 846 |
+
max_out_len=64,
|
| 847 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 848 |
+
prompt_template=dict(
|
| 849 |
+
template=dict(round=[
|
| 850 |
+
dict(
|
| 851 |
+
prompt=
|
| 852 |
+
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
|
| 853 |
+
role='HUMAN'),
|
| 854 |
+
]),
|
| 855 |
+
type=
|
| 856 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 857 |
+
retriever=dict(
|
| 858 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 859 |
+
name='multifieldqa_zh',
|
| 860 |
+
path='opencompass/Longbench',
|
| 861 |
+
reader_cfg=dict(
|
| 862 |
+
input_columns=[
|
| 863 |
+
'context',
|
| 864 |
+
'input',
|
| 865 |
+
],
|
| 866 |
+
output_column='answers',
|
| 867 |
+
test_range='[0:25]',
|
| 868 |
+
test_split='test',
|
| 869 |
+
train_split='test'),
|
| 870 |
+
type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
|
| 871 |
+
dict(
|
| 872 |
+
abbr='LongBench_narrativeqa_0',
|
| 873 |
+
eval_cfg=dict(
|
| 874 |
+
evaluator=dict(
|
| 875 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 876 |
+
pred_role='BOT'),
|
| 877 |
+
infer_cfg=dict(
|
| 878 |
+
inferencer=dict(
|
| 879 |
+
max_out_len=128,
|
| 880 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 881 |
+
prompt_template=dict(
|
| 882 |
+
template=dict(round=[
|
| 883 |
+
dict(
|
| 884 |
+
prompt=
|
| 885 |
+
'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
|
| 886 |
+
role='HUMAN'),
|
| 887 |
+
]),
|
| 888 |
+
type=
|
| 889 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 890 |
+
retriever=dict(
|
| 891 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 892 |
+
name='narrativeqa',
|
| 893 |
+
path='opencompass/Longbench',
|
| 894 |
+
reader_cfg=dict(
|
| 895 |
+
input_columns=[
|
| 896 |
+
'context',
|
| 897 |
+
'input',
|
| 898 |
+
],
|
| 899 |
+
output_column='answers',
|
| 900 |
+
test_range='[0:25]',
|
| 901 |
+
test_split='test',
|
| 902 |
+
train_split='test'),
|
| 903 |
+
type='opencompass.datasets.LongBenchnarrativeqaDataset'),
|
| 904 |
+
dict(
|
| 905 |
+
abbr='LongBench_qasper_0',
|
| 906 |
+
eval_cfg=dict(
|
| 907 |
+
evaluator=dict(
|
| 908 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 909 |
+
pred_role='BOT'),
|
| 910 |
+
infer_cfg=dict(
|
| 911 |
+
inferencer=dict(
|
| 912 |
+
max_out_len=32,
|
| 913 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 914 |
+
prompt_template=dict(
|
| 915 |
+
template=dict(round=[
|
| 916 |
+
dict(
|
| 917 |
+
prompt=
|
| 918 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 919 |
+
role='HUMAN'),
|
| 920 |
+
]),
|
| 921 |
+
type=
|
| 922 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 923 |
+
retriever=dict(
|
| 924 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 925 |
+
name='qasper',
|
| 926 |
+
path='opencompass/Longbench',
|
| 927 |
+
reader_cfg=dict(
|
| 928 |
+
input_columns=[
|
| 929 |
+
'context',
|
| 930 |
+
'input',
|
| 931 |
+
],
|
| 932 |
+
output_column='answers',
|
| 933 |
+
test_range='[0:25]',
|
| 934 |
+
test_split='test',
|
| 935 |
+
train_split='test'),
|
| 936 |
+
type='opencompass.datasets.LongBenchqasperDataset'),
|
| 937 |
+
dict(
|
| 938 |
+
abbr='LongBench_triviaqa_0',
|
| 939 |
+
eval_cfg=dict(
|
| 940 |
+
evaluator=dict(
|
| 941 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 942 |
+
pred_postprocessor=dict(
|
| 943 |
+
type='opencompass.datasets.triviaqa_postprocess'),
|
| 944 |
+
pred_role='BOT'),
|
| 945 |
+
infer_cfg=dict(
|
| 946 |
+
inferencer=dict(
|
| 947 |
+
max_out_len=32,
|
| 948 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 949 |
+
prompt_template=dict(
|
| 950 |
+
template=dict(round=[
|
| 951 |
+
dict(
|
| 952 |
+
prompt=
|
| 953 |
+
'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 954 |
+
role='HUMAN'),
|
| 955 |
+
]),
|
| 956 |
+
type=
|
| 957 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 958 |
+
retriever=dict(
|
| 959 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 960 |
+
name='triviaqa',
|
| 961 |
+
path='opencompass/Longbench',
|
| 962 |
+
reader_cfg=dict(
|
| 963 |
+
input_columns=[
|
| 964 |
+
'context',
|
| 965 |
+
'input',
|
| 966 |
+
],
|
| 967 |
+
output_column='answers',
|
| 968 |
+
test_range='[0:25]',
|
| 969 |
+
test_split='test',
|
| 970 |
+
train_split='test'),
|
| 971 |
+
type='opencompass.datasets.LongBenchtriviaqaDataset'),
|
| 972 |
+
dict(
|
| 973 |
+
abbr='LongBench_gov_report_0',
|
| 974 |
+
eval_cfg=dict(
|
| 975 |
+
evaluator=dict(
|
| 976 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 977 |
+
pred_role='BOT'),
|
| 978 |
+
infer_cfg=dict(
|
| 979 |
+
inferencer=dict(
|
| 980 |
+
max_out_len=512,
|
| 981 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 982 |
+
prompt_template=dict(
|
| 983 |
+
template=dict(round=[
|
| 984 |
+
dict(
|
| 985 |
+
prompt=
|
| 986 |
+
'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
|
| 987 |
+
role='HUMAN'),
|
| 988 |
+
]),
|
| 989 |
+
type=
|
| 990 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 991 |
+
retriever=dict(
|
| 992 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 993 |
+
name='gov_report',
|
| 994 |
+
path='opencompass/Longbench',
|
| 995 |
+
reader_cfg=dict(
|
| 996 |
+
input_columns=[
|
| 997 |
+
'context',
|
| 998 |
+
],
|
| 999 |
+
output_column='answers',
|
| 1000 |
+
test_range='[0:25]',
|
| 1001 |
+
test_split='test',
|
| 1002 |
+
train_split='test'),
|
| 1003 |
+
type='opencompass.datasets.LongBenchgov_reportDataset'),
|
| 1004 |
+
dict(
|
| 1005 |
+
abbr='LongBench_qmsum_0',
|
| 1006 |
+
eval_cfg=dict(
|
| 1007 |
+
evaluator=dict(
|
| 1008 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1009 |
+
pred_role='BOT'),
|
| 1010 |
+
infer_cfg=dict(
|
| 1011 |
+
inferencer=dict(
|
| 1012 |
+
max_out_len=512,
|
| 1013 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1014 |
+
prompt_template=dict(
|
| 1015 |
+
template=dict(round=[
|
| 1016 |
+
dict(
|
| 1017 |
+
prompt=
|
| 1018 |
+
'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 1019 |
+
role='HUMAN'),
|
| 1020 |
+
]),
|
| 1021 |
+
type=
|
| 1022 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1023 |
+
retriever=dict(
|
| 1024 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1025 |
+
name='qmsum',
|
| 1026 |
+
path='opencompass/Longbench',
|
| 1027 |
+
reader_cfg=dict(
|
| 1028 |
+
input_columns=[
|
| 1029 |
+
'context',
|
| 1030 |
+
'input',
|
| 1031 |
+
],
|
| 1032 |
+
output_column='answers',
|
| 1033 |
+
test_range='[0:25]',
|
| 1034 |
+
test_split='test',
|
| 1035 |
+
train_split='test'),
|
| 1036 |
+
type='opencompass.datasets.LongBenchqmsumDataset'),
|
| 1037 |
+
dict(
|
| 1038 |
+
abbr='LongBench_vcsum_0',
|
| 1039 |
+
eval_cfg=dict(
|
| 1040 |
+
evaluator=dict(
|
| 1041 |
+
language='zh',
|
| 1042 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1043 |
+
pred_role='BOT'),
|
| 1044 |
+
infer_cfg=dict(
|
| 1045 |
+
inferencer=dict(
|
| 1046 |
+
max_out_len=512,
|
| 1047 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1048 |
+
prompt_template=dict(
|
| 1049 |
+
template=dict(round=[
|
| 1050 |
+
dict(
|
| 1051 |
+
prompt=
|
| 1052 |
+
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 1053 |
+
role='HUMAN'),
|
| 1054 |
+
]),
|
| 1055 |
+
type=
|
| 1056 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1057 |
+
retriever=dict(
|
| 1058 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1059 |
+
name='vcsum',
|
| 1060 |
+
path='opencompass/Longbench',
|
| 1061 |
+
reader_cfg=dict(
|
| 1062 |
+
input_columns=[
|
| 1063 |
+
'context',
|
| 1064 |
+
],
|
| 1065 |
+
output_column='answers',
|
| 1066 |
+
test_range='[0:25]',
|
| 1067 |
+
test_split='test',
|
| 1068 |
+
train_split='test'),
|
| 1069 |
+
type='opencompass.datasets.LongBenchvcsumDataset'),
|
| 1070 |
+
dict(
|
| 1071 |
+
abbr='LongBench_dureader_0',
|
| 1072 |
+
eval_cfg=dict(
|
| 1073 |
+
evaluator=dict(
|
| 1074 |
+
language='zh',
|
| 1075 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1076 |
+
pred_role='BOT'),
|
| 1077 |
+
infer_cfg=dict(
|
| 1078 |
+
inferencer=dict(
|
| 1079 |
+
max_out_len=128,
|
| 1080 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1081 |
+
prompt_template=dict(
|
| 1082 |
+
template=dict(round=[
|
| 1083 |
+
dict(
|
| 1084 |
+
prompt=
|
| 1085 |
+
'请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
|
| 1086 |
+
role='HUMAN'),
|
| 1087 |
+
]),
|
| 1088 |
+
type=
|
| 1089 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1090 |
+
retriever=dict(
|
| 1091 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1092 |
+
name='dureader',
|
| 1093 |
+
path='opencompass/Longbench',
|
| 1094 |
+
reader_cfg=dict(
|
| 1095 |
+
input_columns=[
|
| 1096 |
+
'context',
|
| 1097 |
+
'input',
|
| 1098 |
+
],
|
| 1099 |
+
output_column='answers',
|
| 1100 |
+
test_range='[0:25]',
|
| 1101 |
+
test_split='test',
|
| 1102 |
+
train_split='test'),
|
| 1103 |
+
type='opencompass.datasets.LongBenchdureaderDataset'),
|
| 1104 |
+
dict(
|
| 1105 |
+
abbr='LongBench_lcc_0',
|
| 1106 |
+
eval_cfg=dict(
|
| 1107 |
+
evaluator=dict(
|
| 1108 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 1109 |
+
pred_role='BOT'),
|
| 1110 |
+
infer_cfg=dict(
|
| 1111 |
+
inferencer=dict(
|
| 1112 |
+
max_out_len=64,
|
| 1113 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1114 |
+
prompt_template=dict(
|
| 1115 |
+
template=dict(round=[
|
| 1116 |
+
dict(
|
| 1117 |
+
prompt=
|
| 1118 |
+
'Please complete the code given below. \n{context}Next line of code:\n',
|
| 1119 |
+
role='HUMAN'),
|
| 1120 |
+
]),
|
| 1121 |
+
type=
|
| 1122 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1123 |
+
retriever=dict(
|
| 1124 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1125 |
+
name='lcc',
|
| 1126 |
+
path='opencompass/Longbench',
|
| 1127 |
+
reader_cfg=dict(
|
| 1128 |
+
input_columns=[
|
| 1129 |
+
'context',
|
| 1130 |
+
],
|
| 1131 |
+
output_column='answers',
|
| 1132 |
+
test_range='[0:63]',
|
| 1133 |
+
test_split='test',
|
| 1134 |
+
train_split='test'),
|
| 1135 |
+
type='opencompass.datasets.LongBenchlccDataset'),
|
| 1136 |
+
dict(
|
| 1137 |
+
abbr='LongBench_repobench-p_0',
|
| 1138 |
+
eval_cfg=dict(
|
| 1139 |
+
evaluator=dict(
|
| 1140 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 1141 |
+
pred_role='BOT'),
|
| 1142 |
+
infer_cfg=dict(
|
| 1143 |
+
inferencer=dict(
|
| 1144 |
+
max_out_len=64,
|
| 1145 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1146 |
+
prompt_template=dict(
|
| 1147 |
+
template=dict(round=[
|
| 1148 |
+
dict(
|
| 1149 |
+
prompt=
|
| 1150 |
+
'Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 1151 |
+
role='HUMAN'),
|
| 1152 |
+
]),
|
| 1153 |
+
type=
|
| 1154 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1155 |
+
retriever=dict(
|
| 1156 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1157 |
+
name='repobench-p',
|
| 1158 |
+
path='opencompass/Longbench',
|
| 1159 |
+
reader_cfg=dict(
|
| 1160 |
+
input_columns=[
|
| 1161 |
+
'context',
|
| 1162 |
+
'input',
|
| 1163 |
+
],
|
| 1164 |
+
output_column='answers',
|
| 1165 |
+
test_range='[0:63]',
|
| 1166 |
+
test_split='test',
|
| 1167 |
+
train_split='test'),
|
| 1168 |
+
type='opencompass.datasets.LongBenchrepobenchDataset'),
|
| 1169 |
+
dict(
|
| 1170 |
+
abbr='LongBench_passage_retrieval_en_0',
|
| 1171 |
+
eval_cfg=dict(
|
| 1172 |
+
evaluator=dict(
|
| 1173 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 1174 |
+
pred_role='BOT'),
|
| 1175 |
+
infer_cfg=dict(
|
| 1176 |
+
inferencer=dict(
|
| 1177 |
+
max_out_len=32,
|
| 1178 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1179 |
+
prompt_template=dict(
|
| 1180 |
+
template=dict(round=[
|
| 1181 |
+
dict(
|
| 1182 |
+
prompt=
|
| 1183 |
+
'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 1184 |
+
role='HUMAN'),
|
| 1185 |
+
]),
|
| 1186 |
+
type=
|
| 1187 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1188 |
+
retriever=dict(
|
| 1189 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1190 |
+
name='passage_retrieval_en',
|
| 1191 |
+
path='opencompass/Longbench',
|
| 1192 |
+
reader_cfg=dict(
|
| 1193 |
+
input_columns=[
|
| 1194 |
+
'context',
|
| 1195 |
+
'input',
|
| 1196 |
+
],
|
| 1197 |
+
output_column='answers',
|
| 1198 |
+
test_range='[0:25]',
|
| 1199 |
+
test_split='test',
|
| 1200 |
+
train_split='test'),
|
| 1201 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
|
| 1202 |
+
dict(
|
| 1203 |
+
abbr='LongBench_passage_retrieval_zh_0',
|
| 1204 |
+
eval_cfg=dict(
|
| 1205 |
+
evaluator=dict(
|
| 1206 |
+
language='zh',
|
| 1207 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 1208 |
+
pred_role='BOT'),
|
| 1209 |
+
infer_cfg=dict(
|
| 1210 |
+
inferencer=dict(
|
| 1211 |
+
max_out_len=32,
|
| 1212 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1213 |
+
prompt_template=dict(
|
| 1214 |
+
template=dict(round=[
|
| 1215 |
+
dict(
|
| 1216 |
+
prompt=
|
| 1217 |
+
'以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 1218 |
+
role='HUMAN'),
|
| 1219 |
+
]),
|
| 1220 |
+
type=
|
| 1221 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1222 |
+
retriever=dict(
|
| 1223 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1224 |
+
name='passage_retrieval_zh',
|
| 1225 |
+
path='opencompass/Longbench',
|
| 1226 |
+
reader_cfg=dict(
|
| 1227 |
+
input_columns=[
|
| 1228 |
+
'context',
|
| 1229 |
+
'input',
|
| 1230 |
+
],
|
| 1231 |
+
output_column='answers',
|
| 1232 |
+
test_range='[0:25]',
|
| 1233 |
+
test_split='test',
|
| 1234 |
+
train_split='test'),
|
| 1235 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
|
| 1236 |
+
dict(
|
| 1237 |
+
abbr='LongBench_passage_count_0',
|
| 1238 |
+
eval_cfg=dict(
|
| 1239 |
+
evaluator=dict(
|
| 1240 |
+
type='opencompass.datasets.LongBenchCountEvaluator'),
|
| 1241 |
+
pred_role='BOT'),
|
| 1242 |
+
infer_cfg=dict(
|
| 1243 |
+
inferencer=dict(
|
| 1244 |
+
max_out_len=32,
|
| 1245 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1246 |
+
prompt_template=dict(
|
| 1247 |
+
template=dict(round=[
|
| 1248 |
+
dict(
|
| 1249 |
+
prompt=
|
| 1250 |
+
'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
|
| 1251 |
+
role='HUMAN'),
|
| 1252 |
+
]),
|
| 1253 |
+
type=
|
| 1254 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1255 |
+
retriever=dict(
|
| 1256 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1257 |
+
name='passage_count',
|
| 1258 |
+
path='opencompass/Longbench',
|
| 1259 |
+
reader_cfg=dict(
|
| 1260 |
+
input_columns=[
|
| 1261 |
+
'context',
|
| 1262 |
+
'input',
|
| 1263 |
+
],
|
| 1264 |
+
output_column='answers',
|
| 1265 |
+
test_range='[0:25]',
|
| 1266 |
+
test_split='test',
|
| 1267 |
+
train_split='test'),
|
| 1268 |
+
type='opencompass.datasets.LongBenchpassage_countDataset'),
|
| 1269 |
+
dict(
|
| 1270 |
+
abbr='LongBench_trec_0',
|
| 1271 |
+
eval_cfg=dict(
|
| 1272 |
+
evaluator=dict(
|
| 1273 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 1274 |
+
),
|
| 1275 |
+
pred_postprocessor=dict(
|
| 1276 |
+
type='opencompass.datasets.trec_postprocess'),
|
| 1277 |
+
pred_role='BOT'),
|
| 1278 |
+
infer_cfg=dict(
|
| 1279 |
+
inferencer=dict(
|
| 1280 |
+
max_out_len=64,
|
| 1281 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1282 |
+
prompt_template=dict(
|
| 1283 |
+
template=dict(round=[
|
| 1284 |
+
dict(
|
| 1285 |
+
prompt=
|
| 1286 |
+
'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 1287 |
+
role='HUMAN'),
|
| 1288 |
+
]),
|
| 1289 |
+
type=
|
| 1290 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1291 |
+
retriever=dict(
|
| 1292 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1293 |
+
name='trec',
|
| 1294 |
+
path='opencompass/Longbench',
|
| 1295 |
+
reader_cfg=dict(
|
| 1296 |
+
input_columns=[
|
| 1297 |
+
'context',
|
| 1298 |
+
'input',
|
| 1299 |
+
],
|
| 1300 |
+
output_column='all_labels',
|
| 1301 |
+
test_range='[0:25]',
|
| 1302 |
+
test_split='test',
|
| 1303 |
+
train_split='test'),
|
| 1304 |
+
type='opencompass.datasets.LongBenchtrecDataset'),
|
| 1305 |
+
dict(
|
| 1306 |
+
abbr='LongBench_lsht_0',
|
| 1307 |
+
eval_cfg=dict(
|
| 1308 |
+
evaluator=dict(
|
| 1309 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 1310 |
+
),
|
| 1311 |
+
pred_postprocessor=dict(
|
| 1312 |
+
type='opencompass.datasets.lsht_postprocess'),
|
| 1313 |
+
pred_role='BOT'),
|
| 1314 |
+
infer_cfg=dict(
|
| 1315 |
+
inferencer=dict(
|
| 1316 |
+
max_out_len=64,
|
| 1317 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1318 |
+
prompt_template=dict(
|
| 1319 |
+
template=dict(round=[
|
| 1320 |
+
dict(
|
| 1321 |
+
prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
|
| 1322 |
+
role='HUMAN'),
|
| 1323 |
+
]),
|
| 1324 |
+
type=
|
| 1325 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1326 |
+
retriever=dict(
|
| 1327 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1328 |
+
name='lsht',
|
| 1329 |
+
path='opencompass/Longbench',
|
| 1330 |
+
reader_cfg=dict(
|
| 1331 |
+
input_columns=[
|
| 1332 |
+
'context',
|
| 1333 |
+
'input',
|
| 1334 |
+
],
|
| 1335 |
+
output_column='all_labels',
|
| 1336 |
+
test_range='[0:25]',
|
| 1337 |
+
test_split='test',
|
| 1338 |
+
train_split='test'),
|
| 1339 |
+
type='opencompass.datasets.LongBenchlshtDataset'),
|
| 1340 |
+
dict(
|
| 1341 |
+
abbr='LongBench_multi_news_0',
|
| 1342 |
+
eval_cfg=dict(
|
| 1343 |
+
evaluator=dict(
|
| 1344 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1345 |
+
pred_role='BOT'),
|
| 1346 |
+
infer_cfg=dict(
|
| 1347 |
+
inferencer=dict(
|
| 1348 |
+
max_out_len=512,
|
| 1349 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1350 |
+
prompt_template=dict(
|
| 1351 |
+
template=dict(round=[
|
| 1352 |
+
dict(
|
| 1353 |
+
prompt=
|
| 1354 |
+
'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
|
| 1355 |
+
role='HUMAN'),
|
| 1356 |
+
]),
|
| 1357 |
+
type=
|
| 1358 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1359 |
+
retriever=dict(
|
| 1360 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1361 |
+
name='multi_news',
|
| 1362 |
+
path='opencompass/Longbench',
|
| 1363 |
+
reader_cfg=dict(
|
| 1364 |
+
input_columns=[
|
| 1365 |
+
'context',
|
| 1366 |
+
],
|
| 1367 |
+
output_column='answers',
|
| 1368 |
+
test_range='[0:25]',
|
| 1369 |
+
test_split='test',
|
| 1370 |
+
train_split='test'),
|
| 1371 |
+
type='opencompass.datasets.LongBenchmulti_newsDataset'),
|
| 1372 |
+
dict(
|
| 1373 |
+
abbr='LongBench_samsum_0',
|
| 1374 |
+
eval_cfg=dict(
|
| 1375 |
+
evaluator=dict(
|
| 1376 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1377 |
+
pred_postprocessor=dict(
|
| 1378 |
+
type='opencompass.datasets.samsum_postprocess'),
|
| 1379 |
+
pred_role='BOT'),
|
| 1380 |
+
infer_cfg=dict(
|
| 1381 |
+
inferencer=dict(
|
| 1382 |
+
max_out_len=128,
|
| 1383 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1384 |
+
prompt_template=dict(
|
| 1385 |
+
template=dict(round=[
|
| 1386 |
+
dict(
|
| 1387 |
+
prompt=
|
| 1388 |
+
'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 1389 |
+
role='HUMAN'),
|
| 1390 |
+
]),
|
| 1391 |
+
type=
|
| 1392 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1393 |
+
retriever=dict(
|
| 1394 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1395 |
+
name='samsum',
|
| 1396 |
+
path='opencompass/Longbench',
|
| 1397 |
+
reader_cfg=dict(
|
| 1398 |
+
input_columns=[
|
| 1399 |
+
'context',
|
| 1400 |
+
'input',
|
| 1401 |
+
],
|
| 1402 |
+
output_column='answers',
|
| 1403 |
+
test_range='[0:25]',
|
| 1404 |
+
test_split='test',
|
| 1405 |
+
train_split='test'),
|
| 1406 |
+
type='opencompass.datasets.LongBenchsamsumDataset'),
|
| 1407 |
+
],
|
| 1408 |
+
]
|
| 1409 |
+
models = [
|
| 1410 |
+
dict(
|
| 1411 |
+
abbr='delta_net',
|
| 1412 |
+
batch_size=128,
|
| 1413 |
+
max_seq_len=2048,
|
| 1414 |
+
model_kwargs=dict(
|
| 1415 |
+
device_map='auto',
|
| 1416 |
+
torch_dtype='torch.bfloat16',
|
| 1417 |
+
trust_remote_code=True),
|
| 1418 |
+
path='/mnt/jfzn/msj/delta_net-1.3B-100B',
|
| 1419 |
+
run_cfg=dict(num_gpus=1),
|
| 1420 |
+
tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
|
| 1421 |
+
tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
|
| 1422 |
+
type='opencompass.models.HuggingFaceBaseModel'),
|
| 1423 |
+
]
|
| 1424 |
+
work_dir = 'outputs/default/20251127_221150'
|