msj19 committed on
Commit
a908f55
·
verified ·
1 Parent(s): 8082566

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. examples/eval_OlympiadBench.py +36 -0
  2. examples/eval_attack.py +28 -0
  3. examples/eval_base_demo.py +14 -0
  4. examples/eval_charm_rea.py +66 -0
  5. examples/eval_chat_agent_baseline.py +38 -0
  6. examples/eval_code_passk.py +53 -0
  7. examples/eval_corebench_2409_base_objective.py +175 -0
  8. examples/eval_deepseek_r1.py +212 -0
  9. examples/eval_ds1000_interpreter.py +45 -0
  10. examples/eval_eese_api_judge.py +47 -0
  11. examples/eval_gpt4.py +44 -0
  12. examples/eval_hf_llama_7b.py +8 -0
  13. examples/eval_inference_ppl.py +51 -0
  14. examples/eval_internLM.py +9 -0
  15. examples/eval_internlm_7b.py +9 -0
  16. examples/eval_internlm_chat_turbomind.py +96 -0
  17. examples/eval_internlm_turbomind.py +55 -0
  18. examples/eval_judge_dataset_all.py +61 -0
  19. examples/eval_judgebench.py +52 -0
  20. examples/eval_judgerbench.py +58 -0
  21. examples/eval_judgerbenchv2.py +53 -0
  22. examples/eval_korbench.py +14 -0
  23. examples/eval_livestembench.py +66 -0
  24. examples/eval_llm_judge.py +116 -0
  25. examples/eval_lmdeploy_demo.py +10 -0
  26. examples/eval_longbenchv2.py +28 -0
  27. examples/eval_math_llm_judge.py +136 -0
  28. examples/eval_math_verify.py +77 -0
  29. examples/eval_mmlu_cf.py +36 -0
  30. examples/eval_mmlu_pro.py +39 -0
  31. examples/eval_mmlu_with_zero_retriever_overwritten.py +16 -0
  32. examples/eval_multi_prompt_demo.py +52 -0
  33. examples/eval_musr.py +34 -0
  34. examples/eval_needlebench_v2.py +27 -0
  35. examples/eval_qwen3.py +142 -0
  36. examples/eval_qwen_7b_chat.py +58 -0
  37. examples/eval_qwen_7b_chat_lawbench.py +13 -0
  38. examples/eval_rewardbench.py +53 -0
  39. examples/eval_rmb.py +53 -0
  40. examples/eval_ruler.py +97 -0
  41. examples/eval_rwkv5_3b.py +7 -0
  42. examples/eval_simpleqa.py +45 -0
  43. examples/eval_subjective.py +104 -0
  44. examples/eval_subjective_bradleyterry.py +120 -0
  45. examples/eval_teval.py +81 -0
  46. examples/eval_with_model_dataset_combinations.py +45 -0
  47. tmp/38bf021a-c80f-4a23-9021-f2adc82afa5d_params.py +1424 -0
  48. tmp/3baffa8c-bc69-4789-aa49-f30266896eb4_params.py +0 -0
  49. tmp/3bc1afd5-60f6-4b89-9fc0-909218b5c248_params.py +53 -0
  50. tmp/401500cf-6431-490c-9e43-14532e24796f_params.py +1424 -0
examples/eval_OlympiadBench.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_gen_be8b13 import olympiadbench_datasets
5
+
6
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
7
+
8
+ from opencompass.configs.summarizers.OlympiadBench import summarizer
9
+
10
+
11
+ datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], [])
12
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
13
+
14
+ from opencompass.runners import LocalRunner
15
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
16
+ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
17
+
18
+ infer = dict(
19
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
20
+ runner=dict(
21
+ type=LocalRunner,
22
+ max_num_workers=8,
23
+ task=dict(type=OpenICLInferTask)
24
+ ),
25
+ )
26
+
27
+ eval = dict(
28
+ partitioner=dict(type=NaivePartitioner, n=10),
29
+ runner=dict(
30
+ type=LocalRunner,
31
+ max_num_workers=256,
32
+ task=dict(type=OpenICLEvalTask)
33
+ ),
34
+ )
35
+
36
+ work_dir = 'outputs/debug/OlympiadBench'
examples/eval_attack.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.partitioners import NaivePartitioner
4
+ from opencompass.runners import LocalRunner
5
+ from opencompass.tasks import OpenICLAttackTask
6
+
7
+ with read_base():
8
+ # choose a list of datasets
9
+ from opencompass.configs.datasets.promptbench.promptbench_wnli_gen_50662f import \
10
+ wnli_datasets
11
+ from opencompass.configs.models.qwen.hf_qwen2_1_5b import models
12
+
13
+ datasets = wnli_datasets
14
+
15
+ # Please run whole dataset at a time, aka use `NaivePartitioner` only
16
+ # Please use `OpenICLAttackTask` if want to perform attack experiment
17
+ infer = dict(
18
+ partitioner=dict(type=NaivePartitioner),
19
+ runner=dict(type=LocalRunner,
20
+ max_num_workers=8,
21
+ task=dict(type=OpenICLAttackTask)),
22
+ )
23
+
24
+ attack = dict(
25
+ attack='textfooler',
26
+ query_budget=100,
27
+ prompt_topk=1,
28
+ )
examples/eval_base_demo.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.demo.demo_gsm8k_base_gen import \
5
+ gsm8k_datasets
6
+ from opencompass.configs.datasets.demo.demo_math_base_gen import \
7
+ math_datasets
8
+ from opencompass.configs.models.hf_internlm.hf_internlm2_1_8b import \
9
+ models as hf_internlm2_1_8b_models
10
+ from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
11
+ models as hf_qwen2_1_5b_models
12
+
13
+ datasets = gsm8k_datasets + math_datasets
14
+ models = hf_qwen2_1_5b_models + hf_internlm2_1_8b_models
examples/eval_charm_rea.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.CHARM.charm_reason_gen_f8fca2 import \
5
+ charm_reason_datasets as datasets
6
+
7
+ # ------>>>>>> https://arxiv.org/abs/2403.14112
8
+ # from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
9
+ # from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
10
+ # from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
11
+ # from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
12
+ # from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
13
+ # from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
14
+ # from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
15
+ # from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
16
+ # from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
17
+ # from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
18
+ # from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
19
+ # from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
20
+ # from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
21
+ # from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
22
+ # from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
23
+ # from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
24
+ # from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
25
+ # from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
26
+ # from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
27
+ # <<<<<<------ https://arxiv.org/abs/2403.14112
28
+ # from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
29
+ # from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
30
+ # from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
31
+ # from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
32
+ # from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
33
+ # from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
34
+ # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
35
+ # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
36
+ # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
37
+ # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
38
+ # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
39
+ # from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
40
+ # from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
41
+ # from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
42
+ # from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
43
+ # from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
44
+ from .summarizers.charm_reason import summarizer
45
+
46
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
47
+ work_dir = './outputs/CHARM_rea/chat/'
48
+
49
+ # dataset version metric mode internlm2-chat-7b-turbomind
50
+ # ------------------------------------------------------------- --------- ------------- ------ -----------------------------
51
+ # charm-reason-Direct - naive_average gen 49.51
52
+ # charm-reason-ZH-CoT - naive_average gen 61.33
53
+ # charm-reason-EN-CoT - naive_average gen 54.55
54
+ # charm-reason-XLT - naive_average gen 58.46
55
+ # charm-reason-Translate-EN - naive_average gen 56.15
56
+ # - - - -
57
+ # charm-reason-Chinese_Direct - naive_average gen 47.14
58
+ # charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
59
+ # charm-reason-Chinese_EN-CoT - naive_average gen 48.31
60
+ # charm-reason-Chinese_XLT - naive_average gen 53.57
61
+ # charm-reason-Chinese_Translate-EN - naive_average gen 48.21
62
+ # charm-reason-Global_Direct - naive_average gen 51.88
63
+ # charm-reason-Global_ZH-CoT - naive_average gen 64.26
64
+ # charm-reason-Global_EN-CoT - naive_average gen 60.79
65
+ # charm-reason-Global_XLT - naive_average gen 63.36
66
+ # charm-reason-Global_Translate-EN - naive_average gen 64.10
examples/eval_chat_agent_baseline.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.models.openai_api import OpenAI
4
+ from opencompass.partitioners import SizePartitioner
5
+ from opencompass.runners import LocalRunner
6
+ from opencompass.tasks import OpenICLInferTask
7
+
8
+ with read_base():
9
+ from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import \
10
+ gsm8k_datasets
11
+ from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
12
+ from opencompass.configs.datasets.MathBench.mathbench_gen import \
13
+ mathbench_datasets
14
+ from opencompass.configs.summarizers.math_baseline import summarizer
15
+
16
+ datasets = []
17
+ datasets += gsm8k_datasets
18
+ datasets += math_datasets
19
+ datasets += mathbench_datasets
20
+
21
+ models = [
22
+ dict(
23
+ abbr='gpt-3.5-react',
24
+ type=OpenAI,
25
+ path='gpt-3.5-turbo',
26
+ key='ENV',
27
+ query_per_second=1,
28
+ max_seq_len=4096,
29
+ batch_size=1,
30
+ ),
31
+ ]
32
+
33
+ infer = dict(
34
+ partitioner=dict(type=SizePartitioner, max_task_size=1000),
35
+ runner=dict(type=LocalRunner,
36
+ max_num_workers=16,
37
+ task=dict(type=OpenICLInferTask)),
38
+ )
examples/eval_code_passk.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This config is used for pass@k evaluation with `num_return_sequences`
2
+ # so that the model can generate multiple responses for a single input
3
+ from mmengine.config import read_base
4
+
5
+ from opencompass.models import HuggingFaceCausalLM
6
+ from opencompass.partitioners import SizePartitioner
7
+ from opencompass.runners import LocalRunner
8
+ from opencompass.tasks import OpenICLInferTask
9
+
10
+ with read_base():
11
+ from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import \
12
+ humaneval_datasets
13
+ from opencompass.configs.datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import \
14
+ mbpp_datasets
15
+ from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import \
16
+ sanitized_mbpp_datasets
17
+
18
+ datasets = []
19
+ datasets += humaneval_datasets
20
+ datasets += mbpp_datasets
21
+ datasets += sanitized_mbpp_datasets
22
+
23
+ models = [
24
+ dict(
25
+ type=HuggingFaceCausalLM,
26
+ abbr='CodeLlama-7b-Python',
27
+ path='codellama/CodeLlama-7b-Python-hf',
28
+ tokenizer_path='codellama/CodeLlama-7b-Python-hf',
29
+ tokenizer_kwargs=dict(
30
+ padding_side='left',
31
+ truncation_side='left',
32
+ trust_remote_code=True,
33
+ ),
34
+ max_out_len=1024,
35
+ max_seq_len=2048,
36
+ batch_size=8,
37
+ model_kwargs=dict(trust_remote_code=True, device_map='auto'),
38
+ generation_kwargs=dict(
39
+ num_return_sequences=10,
40
+ do_sample=True,
41
+ top_p=0.95,
42
+ temperature=0.8,
43
+ ),
44
+ run_cfg=dict(num_gpus=1, num_procs=1),
45
+ ),
46
+ ]
47
+
48
+ infer = dict(
49
+ partitioner=dict(type=SizePartitioner, max_task_size=300),
50
+ runner=dict(type=LocalRunner,
51
+ max_num_workers=16,
52
+ task=dict(type=OpenICLInferTask)),
53
+ )
examples/eval_corebench_2409_base_objective.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path as osp
2
+
3
+ from mmengine.config import read_base
4
+
5
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
6
+ from opencompass.runners import LocalRunner
7
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
8
+
9
+ #######################################################################
10
+ # PART 0 Essential Configs #
11
+ #######################################################################
12
+ with read_base():
13
+ # Datasets Part
14
+ ## Core Set
15
+ # ## Examination
16
+ # ## Reasoning
17
+ from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
18
+ from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
19
+ cmmlu_datasets
20
+ from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
21
+ # ## Scientific
22
+ from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
23
+ gpqa_datasets
24
+ from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
25
+ gsm8k_datasets
26
+ from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
27
+ hellaswag_datasets
28
+ # ## Coding
29
+ from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import \
30
+ humaneval_datasets
31
+ # ## Math
32
+ from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
33
+ math_datasets
34
+ from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
35
+ mathbench_datasets
36
+ from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
37
+ sanitized_mbpp_datasets
38
+ from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
39
+ from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
40
+ mmlu_pro_datasets
41
+ # Model List
42
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
43
+ models as lmdeploy_qwen2_5_1_5b_model
44
+ from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
45
+ from opencompass.configs.summarizers.groups.cmmlu import \
46
+ cmmlu_summary_groups
47
+ from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
48
+ mathbench_2024_summary_groups
49
+ # TODO: Add LiveCodeBench
50
+ # ## Instruction Following
51
+ # from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
52
+ # Summarizer
53
+ from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
54
+ from opencompass.configs.summarizers.groups.mmlu_pro import \
55
+ mmlu_pro_summary_groups
56
+
57
+ # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
58
+ # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
59
+ # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
60
+ # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
61
+ # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
62
+ # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
63
+
64
+ #######################################################################
65
+ # PART 1 Datasets List #
66
+ #######################################################################
67
+ # datasets list for evaluation
68
+ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
69
+
70
+ #######################################################################
71
+ # PART 2 Dataset Summarizer #
72
+ #######################################################################
73
+ # with read_base():
74
+
75
+ core_summary_groups = [
76
+ {
77
+ 'name':
78
+ 'core_average',
79
+ 'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
80
+ ['cmmlu', 'accuracy'], ['bbh', 'naive_average'],
81
+ ['hellaswag', 'accuracy'], ['drop', 'accuracy'],
82
+ ['math', 'accuracy'], ['gsm8k', 'accuracy'],
83
+ ['mathbench-t (average)', 'naive_average'],
84
+ ['GPQA_diamond', 'accuracy'],
85
+ ['openai_humaneval', 'humaneval_pass@1'],
86
+ ['IFEval', 'Prompt-level-strict-accuracy'],
87
+ ['sanitized_mbpp', 'score'],
88
+ ['mathbench-t (average)', 'naive_average']],
89
+ },
90
+ ]
91
+
92
+ summarizer = dict(
93
+ dataset_abbrs=[
94
+ ['mmlu', 'accuracy'],
95
+ ['mmlu_pro', 'accuracy'],
96
+ ['cmmlu', 'accuracy'],
97
+ ['bbh', 'naive_average'],
98
+ ['hellaswag', 'accuracy'],
99
+ ['drop', 'accuracy'],
100
+ ['math', 'accuracy'],
101
+ ['gsm8k', 'accuracy'],
102
+ ['mathbench-t (average)', 'naive_average'],
103
+ ['GPQA_diamond', 'accuracy'],
104
+ ['openai_humaneval', 'humaneval_pass@1'],
105
+ ['IFEval', 'Prompt-level-strict-accuracy'],
106
+ ['sanitized_mbpp', 'score'],
107
+ 'mathbench-a (average)',
108
+ 'mathbench-t (average)'
109
+ '',
110
+ ['mmlu', 'accuracy'],
111
+ ['mmlu-stem', 'accuracy'],
112
+ ['mmlu-social-science', 'accuracy'],
113
+ ['mmlu-humanities', 'accuracy'],
114
+ ['mmlu-other', 'accuracy'],
115
+ '',
116
+ ['mmlu_pro', 'accuracy'],
117
+ ['mmlu_pro_math', 'accuracy'],
118
+ ['mmlu_pro_physics', 'accuracy'],
119
+ ['mmlu_pro_chemistry', 'accuracy'],
120
+ ['mmlu_pro_law', 'accuracy'],
121
+ ['mmlu_pro_engineering', 'accuracy'],
122
+ ['mmlu_pro_other', 'accuracy'],
123
+ ['mmlu_pro_economics', 'accuracy'],
124
+ ['mmlu_pro_health', 'accuracy'],
125
+ ['mmlu_pro_psychology', 'accuracy'],
126
+ ['mmlu_pro_business', 'accuracy'],
127
+ ['mmlu_pro_biology', 'accuracy'],
128
+ ['mmlu_pro_philosophy', 'accuracy'],
129
+ ['mmlu_pro_computer_science', 'accuracy'],
130
+ ['mmlu_pro_history', 'accuracy'],
131
+ '',
132
+ ['cmmlu', 'accuracy'],
133
+ ['cmmlu-stem', 'accuracy'],
134
+ ['cmmlu-social-science', 'accuracy'],
135
+ ['cmmlu-humanities', 'accuracy'],
136
+ ['cmmlu-other', 'accuracy'],
137
+ ['cmmlu-china-specific', 'accuracy'],
138
+ ],
139
+ summary_groups=sum(
140
+ [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
141
+ )
142
+
143
+ #######################################################################
144
+ # PART 3 Models List #
145
+ #######################################################################
146
+
147
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
148
+
149
+ #######################################################################
150
+ # PART 4 Inference/Evaluation Configuration #
151
+ #######################################################################
152
+
153
+ # Local Runner
154
+ infer = dict(
155
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
156
+ runner=dict(
157
+ type=LocalRunner,
158
+ max_num_workers=16,
159
+ retry=0, # Modify if needed
160
+ task=dict(type=OpenICLInferTask)),
161
+ )
162
+
163
+ # eval with local runner
164
+ eval = dict(
165
+ partitioner=dict(type=NaivePartitioner, n=10),
166
+ runner=dict(type=LocalRunner,
167
+ max_num_workers=16,
168
+ task=dict(type=OpenICLEvalTask)),
169
+ )
170
+
171
+ #######################################################################
172
+ # PART 5 Utils Configuration #
173
+ #######################################################################
174
+ base_exp_dir = 'outputs/corebench_2409_objective/'
175
+ work_dir = osp.join(base_exp_dir, 'base_objective')
examples/eval_deepseek_r1.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Support AIME-2024 with Repeat8
2
+ # Support MATH-500
3
+ # Support OlympiadBench
4
+ # Support OmniMath
5
+ # Support LiveMathBench-202412-Hard
6
+
7
+ import os.path as osp
8
+ from itertools import product
9
+ from opencompass.models import OpenAISDK
10
+ from mmengine.config import read_base
11
+ from opencompass.utils.text_postprocessors import extract_non_reasoning_content
12
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
13
+ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
14
+ from opencompass.runners import LocalRunner
15
+ from opencompass.models import (
16
+ TurboMindModelwithChatTemplate,
17
+ )
18
+
19
+ #######################################################################
20
+ # PART 1 Datasets List #
21
+ #######################################################################
22
+ with read_base():
23
+ # You can comment out the datasets you don't want to evaluate
24
+
25
+ # Datasets
26
+ # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
27
+ from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
28
+ # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
29
+ # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
30
+ # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
31
+
32
+
33
+ # Summarizer
34
+ from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups
35
+
36
+ datasets = sum(
37
+ (v for k, v in locals().items() if k.endswith('_datasets')),
38
+ [],
39
+ )
40
+
41
+ # Set LLM Verifier used for each dataset
42
+
43
+ verifier_cfg = dict(
44
+ abbr='qwen2-5-32B-Instruct',
45
+ type=OpenAISDK,
46
+ path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
47
+ key='sk-1234', # You need to set your own API key
48
+ openai_api_base=[
49
+ 'http://172.30.56.1:4000/v1', # You need to set your own API base
50
+ ],
51
+ meta_template=dict(
52
+ round=[
53
+ dict(role='HUMAN', api_role='HUMAN'),
54
+ dict(role='BOT', api_role='BOT', generate=True),
55
+ ],
56
+ ),
57
+ query_per_second=16,
58
+ batch_size=1024,
59
+ temperature=0.001,
60
+ tokenizer_path='gpt-4o-2024-05-13',
61
+ verbose=True,
62
+ max_out_len=16384,
63
+ # max_seq_len=32768,
64
+ max_seq_len=49152,
65
+ )
66
+
67
+ for item in datasets:
68
+ # item['infer_cfg']['inferencer']['max_out_len'] = 32768 # You can unset this line if you want to avoid length cutoff
69
+ if 'judge_cfg' in item['eval_cfg']['evaluator']:
70
+ item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
71
+
72
+
73
+ #######################################################################
74
+ # PART 2 Model List #
75
+ #######################################################################
76
+
77
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
78
+
79
+ models += [
80
+ # You can comment out the models you don't want to evaluate
81
+ # All models use sampling mode
82
+ dict(
83
+ type=TurboMindModelwithChatTemplate,
84
+ abbr='deepseek-r1-distill-qwen-7b-turbomind',
85
+ path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
86
+ engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
87
+ gen_config=dict(
88
+ do_sample=True,
89
+ temperature=0.6,
90
+ top_p=0.95,
91
+ max_new_tokens=32768),
92
+ max_seq_len=32768,
93
+ max_out_len=32768,
94
+ batch_size=64,
95
+ run_cfg=dict(num_gpus=1),
96
+ pred_postprocessor=dict(type=extract_non_reasoning_content)
97
+ ),
98
+ # dict(
99
+ # type=TurboMindModelwithChatTemplate,
100
+ # abbr='deepseek-r1-distill-qwen-14b-turbomind',
101
+ # path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
102
+ # engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
103
+ # gen_config=dict(
104
+ # do_sample=True,
105
+ # temperature=0.6,
106
+ # top_p=0.95,
107
+ # max_new_tokens=32768),
108
+ # max_seq_len=32768,
109
+ # max_out_len=32768,
110
+ # batch_size=128,
111
+ # run_cfg=dict(num_gpus=2),
112
+ # pred_postprocessor=dict(type=extract_non_reasoning_content)
113
+ # ),
114
+ # dict(
115
+ # type=TurboMindModelwithChatTemplate,
116
+ # abbr='deepseek-r1-distill-qwen-32b-turbomind',
117
+ # path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
118
+ # engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
119
+ # gen_config=dict(
120
+ # do_sample=True,
121
+ # temperature=0.6,
122
+ # top_p=0.95,
123
+ # max_new_tokens=16384),
124
+ # max_seq_len=32768,
125
+ # max_out_len=16384,
126
+ # batch_size=128,
127
+ # run_cfg=dict(num_gpus=4),
128
+ # pred_postprocessor=dict(type=extract_non_reasoning_content)
129
+ # ),
130
+ ]
131
+
132
+ #######################################################################
133
+ # PART 3 Inference/Evaluation #
134
+ #######################################################################
135
+
136
+ # Inference configuration
137
+ infer = dict(
138
+ partitioner=dict(
139
+ type=NumWorkerPartitioner,
140
+ num_worker=1
141
+ # Similar with data-parallelism, how many workers for evaluation,
142
+ # each worker will evaluate a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker
143
+ # For example, If you have 8 GPUs, for 7B model using 1 GPU for one instance, you can set num_worker=8
144
+ # to max-utilize the GPUs.
145
+ # If you have 8 GPUs, for 14B model using 2 GPUs for one instance, you can set num_worker=4
146
+ ),
147
+ runner=dict(
148
+ type=LocalRunner,
149
+ task=dict(type=OpenICLInferTask)
150
+ ),
151
+ )
152
+
153
+ # Evaluation configuration
154
+ eval = dict(
155
+ partitioner=dict(
156
+ type=NaivePartitioner, n=8
157
+ ),
158
+ runner=dict(
159
+ type=LocalRunner,
160
+ task=dict(
161
+ type=OpenICLEvalTask)
162
+ ),
163
+ )
164
+
165
+
166
+ #######################################################################
167
+ # PART 4 Summarizer #
168
+ #######################################################################
169
+
170
+
171
+ summary_groups = sum(
172
+ [v for k, v in locals().items() if k.endswith('_summary_groups')], []
173
+ )
174
+
175
+ summary_groups.extend([
176
+ {
177
+ 'name': 'AIME2024-Aveage8',
178
+ 'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
179
+ },
180
+ {
181
+ 'name': 'LiveMathBench-v202412-Hard-Aveage8',
182
+ 'subsets':[[
183
+ f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
184
+ for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
185
+ ]
186
+ }
187
+ ])
188
+
189
+ # Summarizer
190
+ summarizer = dict(
191
+ dataset_abbrs=[
192
+ 'MATH',
193
+ # ['LiveMathBench-k1-n1', 'pass@1'],
194
+ # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
195
+ # ['aime2024', 'accuracy'],
196
+ ['math_prm800k_500-llmjudge', 'accuracy'],
197
+ ['AIME2024-Aveage8', 'naive_average'],
198
+ ['LiveMathBench-v202412-Hard-Aveage8', 'naive_average'],
199
+ ['OlympiadBenchMath', 'accuracy'],
200
+ ['OmniMath', 'accuracy'],
201
+ ],
202
+ summary_groups=summary_groups,
203
+ )
204
+
205
+
206
+ #######################################################################
207
+ # PART 5 Utils #
208
+ #######################################################################
209
+
210
+ work_dir = 'outputs/deepseek_r1_reasoning'
211
+
212
+
examples/eval_ds1000_interpreter.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.lagent.actions.python_interpreter import PythonInterpreter
4
+ from opencompass.models import OpenAI
5
+ from opencompass.models.lagent import CodeAgent
6
+ from opencompass.partitioners import SizePartitioner
7
+ from opencompass.runners import LocalRunner
8
+ from opencompass.tasks import OpenICLInferTask
9
+
10
+ PYTHON_INTERPRETER_DESCRIPTION = """\
11
+ It can run a Python code. The code must be a valid code that contains only python method.
12
+ """
13
+
14
+ actions = [
15
+ dict(
16
+ type=PythonInterpreter,
17
+ description=PYTHON_INTERPRETER_DESCRIPTION,
18
+ answer_expr=None,
19
+ )
20
+ ]
21
+
22
+ with read_base():
23
+ from opencompass.configs.datasets.ds1000.ds1000_gen_5c4bec import \
24
+ ds1000_datasets as datasets
25
+
26
+ models = [
27
+ dict(abbr='gpt-3.5-react',
28
+ type=CodeAgent,
29
+ llm=dict(
30
+ type=OpenAI,
31
+ path='gpt-3.5-turbo',
32
+ key='ENV',
33
+ query_per_second=1,
34
+ max_seq_len=4096,
35
+ ),
36
+ actions=actions,
37
+ batch_size=8),
38
+ ]
39
+
40
+ infer = dict(
41
+ partitioner=dict(type=SizePartitioner, max_task_size=40000),
42
+ runner=dict(type=LocalRunner,
43
+ max_num_workers=16,
44
+ task=dict(type=OpenICLInferTask)),
45
+ )
examples/eval_eese_api_judge.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from mmengine.config import read_base
3
+
4
+ with read_base():
5
+ from opencompass.configs.datasets.eese.eese_judge_gen import \
6
+ eese_datasets
7
+ # 选择一个感兴趣的模型
8
+ from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
9
+ models as gpt4
10
+
11
+ from opencompass.models import OpenAISDK
12
+
13
+ # 配置评判模型
14
+ api_meta_template = dict(round=[
15
+ dict(role='HUMAN', api_role='HUMAN'),
16
+ dict(role='BOT', api_role='BOT', generate=True),
17
+ ], )
18
+
19
+ judge_cfg = dict(
20
+ abbr='model-judge',
21
+ type=OpenAISDK,
22
+ path='model-name',
23
+ key='your-api-key',
24
+ openai_api_base=['openai-url'],
25
+ meta_template=api_meta_template,
26
+ query_per_second=16,
27
+ batch_size=1,
28
+ temperature=0.001,
29
+ tokenizer_path='gpt-4o',
30
+ verbose=True,
31
+ max_out_len=16384,
32
+ max_seq_len=49152,
33
+ )
34
+
35
+ datasets = eese_datasets
36
+ models = gpt4
37
+
38
+ # 为每个数据集增加judge_cfg信息,而不是覆盖
39
+ for dataset in datasets:
40
+ if 'eval_cfg' in dataset and 'evaluator' in dataset['eval_cfg']:
41
+ # 获取现有的judge_cfg,如果不存在则创建空字典
42
+ existing_judge_cfg = dataset['eval_cfg']['evaluator'].get('judge_cfg', {})
43
+ # 更新现有的judge_cfg,保留原有配置并添加新配置
44
+ existing_judge_cfg.update(judge_cfg)
45
+ # 将更新后的配置设置回去
46
+ dataset['eval_cfg']['evaluator']['judge_cfg'] = existing_judge_cfg
47
+
examples/eval_gpt4.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.models import OpenAI
4
+ from opencompass.partitioners import NaivePartitioner
5
+ from opencompass.runners import LocalRunner
6
+ from opencompass.tasks import OpenICLInferTask
7
+
8
+ with read_base():
9
+ from opencompass.configs.datasets.collections.chat_medium import datasets
10
+ from opencompass.configs.summarizers.medium import summarizer
11
+
12
+ # GPT4 needs a special humaneval postprocessor
13
+ from opencompass.datasets.humaneval import humaneval_gpt_postprocess
14
+
15
+ for _dataset in datasets:
16
+ if _dataset['path'] == 'openai_humaneval':
17
+ _dataset['eval_cfg']['pred_postprocessor'][
18
+ 'type'] = humaneval_gpt_postprocess
19
+
20
+ api_meta_template = dict(round=[
21
+ dict(role='HUMAN', api_role='HUMAN'),
22
+ dict(role='BOT', api_role='BOT', generate=True),
23
+ ], )
24
+
25
+ models = [
26
+ dict(
27
+ abbr='GPT4',
28
+ type=OpenAI,
29
+ path='gpt-4-0613',
30
+ key=
31
+ 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
32
+ meta_template=api_meta_template,
33
+ query_per_second=1,
34
+ max_out_len=2048,
35
+ max_seq_len=2048,
36
+ batch_size=8),
37
+ ]
38
+
39
+ infer = dict(
40
+ partitioner=dict(type=NaivePartitioner),
41
+ runner=dict(type=LocalRunner,
42
+ max_num_workers=4,
43
+ task=dict(type=OpenICLInferTask)),
44
+ )
examples/eval_hf_llama_7b.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.collections.base_medium_llama import (
5
+ piqa_datasets, siqa_datasets)
6
+ from opencompass.configs.models.hf_llama.hf_llama_7b import models
7
+
8
+ datasets = [*piqa_datasets, *siqa_datasets]
examples/eval_inference_ppl.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ # Inference PPL datasets
5
+ from opencompass.configs.datasets.inference_ppl.inference_ppl import inference_ppl_datasets
6
+
7
+ # Model configs
8
+ from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
9
+ from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
10
+ from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
11
+ from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b
12
+
13
+ from opencompass.partitioners import NaivePartitioner
14
+ from opencompass.runners import LocalRunner
15
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
16
+
17
+ # -------------Inference Stage ----------------------------------------
18
+
19
+ datasets = [*inference_ppl_datasets]
20
+ workdir = 'outputs/inference_ppl'
21
+
22
+ models = [
23
+ *qwen1_5_7b,
24
+ *qwen1_5_14b,
25
+ *llama2_7b,
26
+ *llama2_13b,
27
+ ]
28
+
29
+ # Set custom batch_size and num_gpus for faster loss calculation
30
+ # Smaller batch_size should give more precise results, at the cost of worse efficiency
31
+ model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))
32
+
33
+ for mdl in models:
34
+ mdl.update(model_cfg)
35
+
36
+ infer = dict(
37
+ partitioner=dict(type=NaivePartitioner),
38
+ runner=dict(
39
+ type=LocalRunner,
40
+ task=dict(type=OpenICLInferTask),
41
+ max_num_workers=256, # Maximum concurrent evaluation task count
42
+ ),
43
+ )
44
+
45
+ # -------------Evaluation Stage ----------------------------------------
46
+ eval = dict(partitioner=dict(type=NaivePartitioner),
47
+ runner=dict(
48
+ type=LocalRunner,
49
+ task=dict(type=OpenICLEvalTask),
50
+ max_num_workers=256,
51
+ ))
examples/eval_internLM.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ # choose a list of datasets
5
+ from opencompass.configs.datasets.collections.base_medium import datasets
6
+ # choose a model of interest
7
+ from opencompass.configs.models.internlm.internlm_7b import models
8
+ # and output the results in a choosen format
9
+ from opencompass.configs.summarizers.medium import summarizer
examples/eval_internlm_7b.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ # choose a list of datasets
5
+ from opencompass.configs.datasets.collections.base_medium import datasets
6
+ # choose a model of interest
7
+ from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
8
+ # and output the results in a choosen format
9
+ from opencompass.configs.summarizers.medium import summarizer
examples/eval_internlm_chat_turbomind.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.models.turbomind import TurboMindModel
4
+
5
+ with read_base():
6
+ # choose a list of datasets
7
+ from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
8
+ ceval_datasets
9
+ from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
10
+ crowspairs_datasets
11
+ from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
12
+ gsm8k_datasets
13
+ from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
14
+ from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
15
+ from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
16
+ WiC_datasets
17
+ from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \
18
+ WSC_datasets
19
+ from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
20
+ triviaqa_datasets
21
+ # and output the results in a choosen format
22
+ from opencompass.configs.summarizers.medium import summarizer
23
+
24
+ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
25
+
26
+ internlm_meta_template = dict(round=[
27
+ dict(role='HUMAN', begin='<|User|>:', end='\n'),
28
+ dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
29
+ ],
30
+ eos_token_id=103028)
31
+
32
+ internlm2_meta_template = dict(round=[
33
+ dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
34
+ dict(role='BOT',
35
+ begin='<|im_start|>assistant\n',
36
+ end='<|im_end|>\n',
37
+ generate=True),
38
+ ],
39
+ eos_token_id=92542)
40
+
41
+ # config for internlm-chat-7b
42
+ internlm_chat_7b = dict(
43
+ type=TurboMindModel,
44
+ abbr='internlm-chat-7b-turbomind',
45
+ path='internlm/internlm-chat-7b',
46
+ engine_config=dict(session_len=2048,
47
+ max_batch_size=32,
48
+ rope_scaling_factor=1.0),
49
+ gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
50
+ max_out_len=100,
51
+ max_seq_len=2048,
52
+ batch_size=32,
53
+ concurrency=32,
54
+ meta_template=internlm_meta_template,
55
+ run_cfg=dict(num_gpus=1, num_procs=1),
56
+ end_str='<eoa>',
57
+ )
58
+
59
+ # config for internlm-chat-7b
60
+ internlm2_chat_7b = dict(type=TurboMindModel,
61
+ abbr='internlm2-chat-7b-turbomind',
62
+ path='internlm/internlm2-chat-7b',
63
+ engine_config=dict(session_len=2048,
64
+ max_batch_size=32,
65
+ rope_scaling_factor=1.0),
66
+ gen_config=dict(top_k=1,
67
+ top_p=0.8,
68
+ temperature=1.0,
69
+ max_new_tokens=100),
70
+ max_out_len=100,
71
+ max_seq_len=2048,
72
+ batch_size=32,
73
+ concurrency=32,
74
+ meta_template=internlm2_meta_template,
75
+ run_cfg=dict(num_gpus=1, num_procs=1),
76
+ end_str='<|im_end|>')
77
+
78
+ # config for internlm-chat-20b
79
+ internlm_chat_20b = dict(
80
+ type=TurboMindModel,
81
+ abbr='internlm-chat-20b-turbomind',
82
+ path='internlm/internlm-chat-20b',
83
+ engine_config=dict(session_len=2048,
84
+ max_batch_size=8,
85
+ rope_scaling_factor=1.0),
86
+ gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
87
+ max_out_len=100,
88
+ max_seq_len=2048,
89
+ batch_size=8,
90
+ concurrency=8,
91
+ meta_template=internlm_meta_template,
92
+ run_cfg=dict(num_gpus=1, num_procs=1),
93
+ end_str='<eoa>',
94
+ )
95
+
96
+ models = [internlm_chat_20b]
examples/eval_internlm_turbomind.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.models.turbomind import TurboMindModel
4
+
5
+ with read_base():
6
+ # choose a list of datasets
7
+ from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
8
+ ceval_datasets
9
+ from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
10
+ gsm8k_datasets
11
+ from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
12
+ humaneval_datasets
13
+ from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
14
+ from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
15
+ WiC_datasets
16
+ from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
17
+ triviaqa_datasets
18
+ # and output the results in a choosen format
19
+ from opencompass.configs.summarizers.medium import summarizer
20
+
21
+ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
22
+
23
+ # # config for internlm-7b model
24
+ internlm_7b = dict(
25
+ type=TurboMindModel,
26
+ abbr='internlm-7b-turbomind',
27
+ path='internlm/internlm-7b',
28
+ engine_config=dict(session_len=2048,
29
+ max_batch_size=32,
30
+ rope_scaling_factor=1.0),
31
+ gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
32
+ max_out_len=100,
33
+ max_seq_len=2048,
34
+ batch_size=32,
35
+ concurrency=32,
36
+ run_cfg=dict(num_gpus=1, num_procs=1),
37
+ )
38
+
39
+ # config for internlm-20b model
40
+ internlm_20b = dict(
41
+ type=TurboMindModel,
42
+ abbr='internlm-20b-turbomind',
43
+ path='internlm/internlm-20b',
44
+ engine_config=dict(session_len=2048,
45
+ max_batch_size=8,
46
+ rope_scaling_factor=1.0),
47
+ gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
48
+ max_out_len=100,
49
+ max_seq_len=2048,
50
+ batch_size=8,
51
+ concurrency=8,
52
+ run_cfg=dict(num_gpus=1, num_procs=1),
53
+ )
54
+
55
+ models = [internlm_20b]
examples/eval_judge_dataset_all.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+ with read_base():
3
+ from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset as get_judgerbenchv2_datasets
4
+ from opencompass.configs.datasets.judge.rmb import get_rmb_dataset as get_rmb_datasets
5
+ from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
6
+ from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets
7
+
8
+ from opencompass.configs.summarizers.judgedataset_all import summarizer
9
+ from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
10
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
11
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
12
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
13
+ from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
14
+ from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
15
+ from opencompass.runners import SlurmSequentialRunner
16
+ from opencompass.tasks import OpenICLInferTask
17
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
18
+ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
19
+ from opencompass.models import TurboMindModelwithChatTemplate
20
+
21
+
22
+ api_meta_template = dict(
23
+ round=[
24
+ dict(role='HUMAN', api_role='HUMAN'),
25
+ dict(role='BOT', api_role='BOT', generate=True),
26
+ ]
27
+ )
28
+ datasets = sum(
29
+ (v for k, v in locals().items() if k.endswith('_datasets')),
30
+ [],
31
+ )
32
+
33
+
34
+ models = [
35
+ dict(
36
+ type=TurboMindModelwithChatTemplate,
37
+ abbr='qwen-7b-hf',
38
+ path='Qwen/Qwen-7B',
39
+ engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
40
+ gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
41
+ max_seq_len=16384,
42
+ max_out_len=2048,
43
+ batch_size=16,
44
+ run_cfg=dict(num_gpus=1),
45
+ ),
46
+ ]
47
+
48
+
49
+
50
+ infer = dict(
51
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
52
+ runner=dict(
53
+ type=LocalRunner,
54
+ max_num_workers=72,
55
+ task=dict(type=OpenICLInferTask),
56
+ ),
57
+ )
58
+
59
+
60
+
61
+ work_dir = './outputs/judge_dataset_all/'
examples/eval_judgebench.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+ with read_base():
3
+ from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets
4
+
5
+ from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
6
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
7
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
8
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
9
+ from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
10
+ from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
11
+ from opencompass.runners import SlurmSequentialRunner
12
+ from opencompass.tasks import OpenICLInferTask
13
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
14
+ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
15
+
16
+ api_meta_template = dict(
17
+ round=[
18
+ dict(role='HUMAN', api_role='HUMAN'),
19
+ dict(role='BOT', api_role='BOT', generate=True),
20
+ ]
21
+ )
22
+ datasets = [*get_judgebench_datasets]
23
+
24
+ from opencompass.models import TurboMindModelwithChatTemplate
25
+
26
+ models = [
27
+ dict(
28
+ type=TurboMindModelwithChatTemplate,
29
+ abbr='qwen-7b-hf',
30
+ path='Qwen/Qwen-7B',
31
+ engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
32
+ gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
33
+ max_seq_len=16384,
34
+ max_out_len=2048,
35
+ batch_size=16,
36
+ run_cfg=dict(num_gpus=1),
37
+ ),
38
+ ]
39
+
40
+
41
+ infer = dict(
42
+ partitioner=dict(type=NaivePartitioner),
43
+ runner=dict(
44
+ type=LocalRunner,
45
+ max_num_workers=72,
46
+ task=dict(type=OpenICLInferTask),
47
+ ),
48
+ )
49
+
50
+
51
+
52
+ work_dir = './outputs/judgebench/'
examples/eval_judgerbench.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.subjective.judgerbench.judgerbench import judgerbench_datasets
5
+
6
+ from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
7
+ HuggingFaceChatGLM3, OpenAI,
8
+ TurboMindModelwithChatTemplate)
9
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner
10
+ from opencompass.runners import LocalRunner, SlurmSequentialRunner
11
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
12
+
13
+ api_meta_template = dict(round=[
14
+ dict(role='HUMAN', api_role='HUMAN'),
15
+ dict(role='BOT', api_role='BOT', generate=True),
16
+ ])
17
+
18
+ # -------------Inference Stage ----------------------------------------
19
+ # For subjective evaluation, we often set do sample for models
20
+ models = [
21
+ dict(
22
+ type=TurboMindModelwithChatTemplate,
23
+ abbr='CompassJudger-1-7B-Instruct',
24
+ path='opencompass/CompassJudger-1-7B-Instruct',
25
+ engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
26
+ gen_config=dict(top_k=1,
27
+ temperature=1e-6,
28
+ top_p=0.9,
29
+ max_new_tokens=2048),
30
+ max_seq_len=16384,
31
+ max_out_len=2048,
32
+ batch_size=16,
33
+ run_cfg=dict(num_gpus=1),
34
+ )
35
+ ]
36
+
37
+ datasets = judgerbench_datasets
38
+
39
+ infer = dict(
40
+ partitioner=dict(type=NaivePartitioner),
41
+ runner=dict(type=LocalRunner,
42
+ max_num_workers=16,
43
+ task=dict(type=OpenICLInferTask)),
44
+ )
45
+ # -------------Evalation Stage ----------------------------------------
46
+
47
+ ## ------------- Evaluation Configuration
48
+ eval = dict(
49
+ partitioner=dict(
50
+ type=NaivePartitioner,
51
+ n=10,
52
+ ),
53
+ runner=dict(type=LocalRunner,
54
+ max_num_workers=16,
55
+ task=dict(type=OpenICLEvalTask)),
56
+ )
57
+
58
+ work_dir = 'outputs/judgerbench/'
examples/eval_judgerbenchv2.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+ with read_base():
3
+ from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset
4
+ from opencompass.configs.summarizers.judgerbenchv2 import summarizer
5
+ from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
6
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
7
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
8
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
9
+ from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
10
+ from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
11
+ from opencompass.runners import SlurmSequentialRunner
12
+ from opencompass.tasks import OpenICLInferTask
13
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
14
+ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
15
+
16
+ api_meta_template = dict(
17
+ round=[
18
+ dict(role='HUMAN', api_role='HUMAN'),
19
+ dict(role='BOT', api_role='BOT', generate=True),
20
+ ]
21
+ )
22
+ datasets = [*get_judgerbenchv2_dataset]
23
+
24
+ from opencompass.models import TurboMindModelwithChatTemplate
25
+
26
+ models = [
27
+ dict(
28
+ type=TurboMindModelwithChatTemplate,
29
+ abbr='qwen-7b-hf',
30
+ path='Qwen/Qwen-7B',
31
+ engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
32
+ gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
33
+ max_seq_len=16384,
34
+ max_out_len=2048,
35
+ batch_size=16,
36
+ run_cfg=dict(num_gpus=1),
37
+ ),
38
+ ]
39
+
40
+
41
+ infer = dict(
42
+ # partitioner=dict(type=NaivePartitioner),
43
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
44
+ runner=dict(
45
+ type=LocalRunner,
46
+ max_num_workers=72,
47
+ task=dict(type=OpenICLInferTask),
48
+ ),
49
+ )
50
+
51
+
52
+
53
+ work_dir = './outputs/judgerbenchv2/'
examples/eval_korbench.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.korbench.korbench_mixed_gen_d00bdd import \
5
+ korbench_mixed_datasets as mixed_datasets
6
+ from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
7
+ korbench_0shot_single_datasets as zero_shot_datasets
8
+ from opencompass.configs.datasets.korbench.korbench_single_3_shot_gen import \
9
+ korbench_3shot_single_datasets as three_shot_datasets
10
+ from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
11
+ models as hf_internlm2_5_7b
12
+
13
+ datasets = zero_shot_datasets + three_shot_datasets + mixed_datasets
14
+ models = hf_internlm2_5_7b
examples/eval_livestembench.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.models import OpenAISDK
4
+
5
+ with read_base():
6
+ # 选择一个数据集列表
7
+ from opencompass.configs.datasets.livestembench.livestembench_gen_3e3c50 import \
8
+ livestembench_datasets
9
+ # 选择一个感兴趣的模型
10
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
11
+ models as qwen2_5_7b_instruct_lmdeploy_model
12
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
13
+ models as qwen2_5_72b_instruct_lmdeploy_model
14
+
15
+ datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
16
+ models = [
17
+ *qwen2_5_7b_instruct_lmdeploy_model, *qwen2_5_72b_instruct_lmdeploy_model
18
+ ]
19
+
20
+ # Judge 模型配置
21
+ api_meta_template = dict(round=[
22
+ dict(role='HUMAN', api_role='HUMAN'),
23
+ dict(role='BOT', api_role='BOT', generate=True),
24
+ ], )
25
+
26
+ judge_cfg = dict(
27
+ abbr='qwen2-5-72b-instruct',
28
+ type=OpenAISDK,
29
+ path='YOUR_SERVER_MODEL_NAME', # 你的部署的模型名称
30
+ key='None',
31
+ openai_api_base=[
32
+ 'http://localhost:23333/v1', # 你的模型部署的地址
33
+ ],
34
+ meta_template=api_meta_template,
35
+ query_per_second=16,
36
+ batch_size=16,
37
+ temperature=0.001,
38
+ max_completion_tokens=32768,
39
+ )
40
+
41
+ for dataset in datasets:
42
+ dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
43
+
44
+ # -------------Inferen Stage ----------------------------------------
45
+
46
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
47
+ from opencompass.runners import LocalRunner
48
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
49
+
50
+ infer = dict(
51
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
52
+ runner=dict(type=LocalRunner,
53
+ max_num_workers=8,
54
+ task=dict(type=OpenICLInferTask)),
55
+ )
56
+
57
+ eval = dict(
58
+ partitioner=dict(type=NaivePartitioner, n=8),
59
+ runner=dict(
60
+ type=LocalRunner,
61
+ max_num_workers=256,
62
+ task=dict(type=OpenICLEvalTask),
63
+ ),
64
+ )
65
+
66
+ work_dir = './outputs/livestembench'
examples/eval_llm_judge.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+ from opencompass.models.openai_api import OpenAISDK
3
+
4
+ # Import pre-configured models from OpenCompass
5
+ with read_base():
6
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
7
+ models as lmdeploy_qwen2_5_7b_instruct_model,
8
+ )
9
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
10
+ models as lmdeploy_qwen2_5_14b_instruct_model,
11
+ )
12
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
13
+ from opencompass.openicl.icl_retriever import ZeroRetriever
14
+ from opencompass.openicl.icl_inferencer import GenInferencer
15
+ from opencompass.evaluator import GenericLLMEvaluator
16
+ from opencompass.datasets import generic_llmjudge_postprocess
17
+ from opencompass.datasets import CustomDataset
18
+
19
+
20
+ # Dataset reader configuration
21
+ math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
22
+
23
+ # Inference configuration
24
+ math_infer_cfg = dict(
25
+ prompt_template=dict(
26
+ type=PromptTemplate,
27
+ template=dict(
28
+ round=[
29
+ dict(
30
+ role='HUMAN',
31
+ prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
32
+ ),
33
+ ]
34
+ ),
35
+ ),
36
+ retriever=dict(type=ZeroRetriever),
37
+ inferencer=dict(type=GenInferencer),
38
+ )
39
+
40
+
41
+ # Template for the LLM judge
42
+ GRADER_TEMPLATE = """
43
+ Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
44
+
45
+ Here are some evaluation criteria:
46
+ 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
47
+ 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
48
+ 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
49
+ 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
50
+ 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
51
+
52
+ Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
53
+ A: CORRECT
54
+ B: INCORRECT
55
+ Just return the letters "A" or "B", with no text around it.
56
+
57
+ Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
58
+
59
+
60
+ <Original Question Begin>: \n{problem}\n<Original Question End>\n\n
61
+ <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
62
+ <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
63
+
64
+ Judging the correctness of candidates' answers:
65
+ """.strip()
66
+
67
+ # Evaluation configuration using LLM as judge
68
+ math_eval_cfg = dict(
69
+ evaluator=dict(
70
+ type=GenericLLMEvaluator,
71
+ prompt_template=dict(
72
+ type=PromptTemplate,
73
+ template=dict(
74
+ begin=[
75
+ dict(
76
+ role='SYSTEM',
77
+ fallback_role='HUMAN',
78
+ prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
79
+ )
80
+ ],
81
+ round=[
82
+ dict(role='HUMAN', prompt=GRADER_TEMPLATE),
83
+ ],
84
+ ),
85
+ ),
86
+ dataset_cfg=dict(
87
+ type=CustomDataset,
88
+ path='opencompass/math',
89
+ file_name='test_prm800k_500.jsonl',
90
+ reader_cfg=math_reader_cfg,
91
+ ),
92
+ judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
93
+ dict_postprocessor=dict(type=generic_llmjudge_postprocess),
94
+ ),
95
+ )
96
+
97
+ # Dataset configuration
98
+ datasets = [
99
+ dict(
100
+ type=CustomDataset,
101
+ path='opencompass/math',
102
+ file_name='test_prm800k_500.jsonl',
103
+ reader_cfg=math_reader_cfg,
104
+ infer_cfg=math_infer_cfg,
105
+ eval_cfg=math_eval_cfg,
106
+ )
107
+ ]
108
+
109
+ # Model to be evaluated
110
+ models = lmdeploy_qwen2_5_7b_instruct_model
111
+
112
+ # Limiting test to first 8 examples for quick testing
113
+ math_reader_cfg['test_range'] = '[0:8]'
114
+
115
+ # Output directory
116
+ work_dir = 'outputs/llm_judge'
examples/eval_lmdeploy_demo.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
5
+ gsm8k_datasets
6
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_1_8b_chat import \
7
+ models
8
+
9
+ datasets = gsm8k_datasets
10
+ models = models
examples/eval_longbenchv2.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ # Models
5
+ # Datasets
6
+ from opencompass.configs.datasets.longbenchv2.longbenchv2_gen import \
7
+ LongBenchv2_datasets as LongBenchv2_datasets
8
+ from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
9
+ models as lmdeploy_glm4_9b_chat_model
10
+ from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
11
+ models as lmdeploy_llama3_1_8b_instruct_model
12
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
13
+ models as lmdeploy_qwen2_5_7b_instruct_model
14
+
15
+ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
16
+
17
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
18
+
19
+ for model in models:
20
+ model['max_seq_len'] = 128 * 1024
21
+ model['engine_config']['session_len'] = 128 * 1024
22
+ model['engine_config']['tp'] = 2
23
+ model['run_cfg']['num_gpus'] = 2
24
+ # Drop middle tokens to make input length shorter than session_len, use 128k to keep sync with Longbenchv2 original code
25
+ # Drop middle now only support LMDeploy models
26
+ model['drop_middle'] = True
27
+
28
+ work_dir = './outputs/longbenchv2'
examples/eval_math_llm_judge.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
2
+ from mmengine.config import read_base
3
+
4
+ with read_base():
5
+ from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model # noqa: F401, F403
6
+ from opencompass.configs.models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model # noqa: F401, F403
7
+ from opencompass.configs.datasets.math.math_llm_judge import math_datasets # noqa: F401, F403
8
+
9
+ from opencompass.datasets import math_judement_preprocess
10
+ from opencompass.openicl.icl_evaluator import LMEvaluator
11
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
12
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner
13
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
14
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
15
+ from opencompass.runners import LocalRunner, SlurmSequentialRunner
16
+ from opencompass.summarizers import AllObjSummarizer
17
+ from opencompass.tasks import OpenICLInferTask
18
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
19
+
20
+ # -------------Prompt Settings ----------------------------------------
21
+ eng_obj_prompt = """
22
+ Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
23
+
24
+ Examples:
25
+
26
+ Expression 1: $2x+3$
27
+ Expression 2: $3+2x$
28
+
29
+ [Yes]
30
+
31
+ Expression 1: 3/2
32
+ Expression 2: 1.5
33
+
34
+ [Yes]
35
+
36
+ Expression 1: $x^2+2x+1$
37
+ Expression 2: $y^2+2y+1$
38
+
39
+ [No]
40
+
41
+ Expression 1: $x^2+2x+1$
42
+ Expression 2: $(x+1)^2$
43
+
44
+ [Yes]
45
+
46
+ Expression 1: 3245/5
47
+ Expression 2: 649
48
+
49
+ [No]
50
+ (these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
51
+
52
+ Expression 1: 2/(-3)
53
+ Expression 2: -2/3
54
+
55
+ [Yes]
56
+ (trivial simplifications are allowed)
57
+
58
+ Expression 1: 72 degrees
59
+ Expression 2: 72
60
+
61
+ [Yes]
62
+ (give benefit of the doubt to units)
63
+
64
+ Expression 1: 64
65
+ Expression 2: 64 square feet
66
+
67
+ [Yes]
68
+ (give benefit of the doubt to units)
69
+
70
+ Expression 1: 64
71
+ Expression 2:
72
+
73
+ [No]
74
+ (only mark as equivalent if both expressions are nonempty)
75
+
76
+ ---
77
+
78
+ YOUR TASK
79
+
80
+
81
+ Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
82
+ Expression 1: {obj_gold}
83
+ Expression 2: {prediction}
84
+
85
+ """
86
+
87
+ # -------------Inferen Stage ----------------------------------------
88
+ # eval models
89
+ models = [*hf_llama3_8b_instruct_model]
90
+ # judge models
91
+ judge_models = hf_llama3_70b_instruct_model
92
+
93
+ eng_datasets = [*math_datasets]
94
+ chn_datasets = []
95
+ datasets = eng_datasets + chn_datasets
96
+ work_dir = 'outputs/obj_all/'
97
+
98
+ for d in eng_datasets:
99
+ d['eval_cfg'] = dict(
100
+ evaluator=dict(
101
+ type=LMEvaluator,
102
+ # If you need to preprocess the prediction before judging,
103
+ # you can specify the pred_postprocessor function here
104
+ pred_postprocessor=dict(type=math_judement_preprocess),
105
+ prompt_template=dict(
106
+ type=PromptTemplate,
107
+ template=dict(round=[
108
+ dict(role='HUMAN', prompt=eng_obj_prompt),
109
+ ]),
110
+ ),
111
+ ),
112
+ pred_role='BOT',
113
+ )
114
+
115
+ infer = dict(
116
+ partitioner=dict(type=SizePartitioner, max_task_size=40000),
117
+ runner=dict(type=LocalRunner,
118
+ max_num_workers=256,
119
+ task=dict(type=OpenICLInferTask)),
120
+ )
121
+
122
+ # ------------- Evaluation Configuration --------------------------------
123
+ eval = dict(
124
+ partitioner=dict(
125
+ type=SubjectiveSizePartitioner,
126
+ max_task_size=80000,
127
+ mode='singlescore',
128
+ models=models,
129
+ judge_models=judge_models,
130
+ ),
131
+ runner=dict(type=LocalRunner,
132
+ max_num_workers=16,
133
+ task=dict(type=SubjectiveEvalTask)),
134
+ )
135
+
136
+ summarizer = dict(type=AllObjSummarizer)
examples/eval_math_verify.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate DeepSeek-R1 distilled models on MATH-500 using LMDeploy (TurboMind).
# The reasoning (<think>) portion of each response is stripped before scoring
# via `extract_non_reasoning_content`.
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

with read_base():
    from opencompass.configs.datasets.math.math_500_gen import math_datasets

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-llama-8b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        # Greedy decoding (top_k=1, near-zero temperature).
        # NOTE(review): max_new_tokens=4096 here is smaller than
        # max_out_len=32768 below — confirm which limit should win.
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        # Drop the chain-of-thought block; keep only the final answer text.
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        # Sampling decode (temperature 0.6 / top_p 0.95), unlike the
        # greedy configs above/below.
        gen_config=dict(
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
        # Greedy decoding; same max_new_tokens/max_out_len mismatch as the
        # llama-8b entry — confirm intended output limit.
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-14b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
        # 14B model: tensor-parallel over 2 GPUs.
        engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
        gen_config=dict(
            top_k=1,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
]

datasets = [*math_datasets]


work_dir = './outputs/math_500'
examples/eval_mmlu_cf.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate MMLU-CF with one LMDeploy and one HuggingFace model; results are
# aggregated by the mmlu_cf summarizer.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import \
        mmlu_cf_datasets
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import \
        models as hf_qwen2_5_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_cf import summarizer

# Collect every '*_datasets' list that read_base() injected into this module.
# (The variable names above are therefore load-bearing — do not rename.)
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
# Same trick for the imported '*_model' lists.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

# Inference stage: shard each dataset across 8 workers, run locally.
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

# Evaluation stage: cheap scoring tasks, so allow many parallel workers.
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)

work_dir = 'outputs/debug/mmlu_cf'
examples/eval_mmlu_pro.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate MMLU-Pro with two LMDeploy-served models; infer/eval stage configs
# come from the internal local-cluster presets imported below.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import \
        mmlu_pro_datasets
    from opencompass.configs.internal.clusters.local import eval
    from opencompass.configs.internal.clusters.local import \
        infer_num_worker as infer
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_pro import summarizer

# Gather all '*_datasets' lists injected by read_base(); variable names above
# are load-bearing for this scan — do not rename.
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

work_dir = 'outputs/debug/mmlu_pro'

# Reference results produced with this config:
# dataset version metric mode qwen2-7b-instruct-turbomind llama-3-8b-instruct-turbomind
# ------------------------- --------- ------------- ------ ----------------------------- -------------------------------
# mmlu_pro - naive_average gen 46.18 43.92
# mmlu_pro_biology 736233 accuracy gen 63.74 64.02
# mmlu_pro_business 736233 accuracy gen 53.23 46.01
# mmlu_pro_chemistry 736233 accuracy gen 35.25 32.42
# mmlu_pro_computer_science 736233 accuracy gen 47.07 44.88
# mmlu_pro_economics 736233 accuracy gen 59.00 53.79
# mmlu_pro_engineering 736233 accuracy gen 26.73 33.54
# mmlu_pro_health 736233 accuracy gen 47.31 51.34
# mmlu_pro_history 736233 accuracy gen 42.78 42.26
# mmlu_pro_law 736233 accuracy gen 28.07 26.98
# mmlu_pro_math 736233 accuracy gen 53.59 37.53
# mmlu_pro_philosophy 736233 accuracy gen 42.28 42.48
# mmlu_pro_physics 736233 accuracy gen 39.11 33.64
# mmlu_pro_psychology 736233 accuracy gen 60.90 59.65
# mmlu_pro_other 736233 accuracy gen 47.40 46.32
examples/eval_mmlu_with_zero_retriever_overwritten.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Demo: take the 5-shot MMLU config and force zero-shot by overwriting each
# dataset's retriever with ZeroRetriever.
from copy import deepcopy

from mmengine.config import read_base

from opencompass.openicl.icl_retriever import ZeroRetriever

with read_base():
    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
        mmlu_datasets  # this is a dataset evaluated with 5-shot
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models

datasets = []
for d in mmlu_datasets:
    # Deep-copy first so the shared imported config is left untouched.
    d = deepcopy(d)
    d['infer_cfg']['retriever'] = dict(type=ZeroRetriever)
    datasets.append(d)
examples/eval_multi_prompt_demo.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Demo: evaluate internlm-chat-7b on multiple Winogrande prompt variants and
# summarize both the mean and the std-dev across prompts.
from mmengine.config import read_base

from opencompass.models import HuggingFaceCausalLM

with read_base():
    from opencompass.configs.datasets.winogrande.winogrande_gen_a027b6 import \
        winogrande_datasets

datasets = [*winogrande_datasets]

# InternLM chat turn markers expected by the internlm-chat-7b checkpoint.
_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='internlm-chat-7b-hf',
        path='internlm/internlm-chat-7b',
        tokenizer_path='internlm/internlm-chat-7b',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

# One abbr per prompt variant; used to group all variants in the summary.
_winogrande_all = [d['abbr'] for d in winogrande_datasets]

summarizer = dict(summary_groups=[
    {
        'name': 'winogrande',
        'subsets': _winogrande_all
    },
    # Same subsets, but reported as std-dev across prompt variants.
    {
        'name': 'winogrande_std',
        'subsets': _winogrande_all,
        'std': True
    },
])
examples/eval_musr.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate the MuSR (multi-step soft reasoning) benchmark across a suite of
# chat models served via LMDeploy.
import os.path as osp

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model
    from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
        models as lmdeploy_gemma_9b_it_model
    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
        models as lmdeploy_gemma_27b_it_model
    # from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model
    from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import \
        models as lmdeploy_ministral_8b_instruct_2410_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as lmdeploy_qwen2_5_7b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
        models as lmdeploy_qwen2_5_14b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
        models as lmdeploy_qwen2_5_32b_instruct_model
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
        models as lmdeploy_yi_1_5_9b_chat_model
    from opencompass.configs.summarizers.groups.musr_average import summarizer

datasets = [*musr_datasets]
# Collect every '*_model' list imported above; those aliases are load-bearing.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

base_exp_dir = 'outputs/musr/'
work_dir = osp.join(base_exp_dir, 'musr_eval')
examples/eval_needlebench_v2.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate NeedleBench-v2 (32k single-needle variant) with InternLM2-chat-7B.
from mmengine.config import read_base
# we use mmengine.config to import other config files

with read_base():
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b

    # Evaluate needlebench_32k, adjust the configuration to use 4k, 32k, 128k, 200k, or 1000k if necessary.
    # from opencompass.configs.datasets.needlebench_v2.needlebench_v2_32k.needlebench_v2_32k import needlebench_datasets
    # from opencompass.configs.summarizers.needlebench import needlebench_32k_summarizer as summarizer

    # only eval original "needle in a haystack test" in needlebench_32k
    from opencompass.configs.datasets.needlebench_v2.needlebench_v2_32k.needlebench_v2_single_32k import needlebench_zh_datasets, needlebench_en_datasets
    from opencompass.configs.summarizers.needlebench import needlebench_v2_32k_summarizer as summarizer

    # eval Ancestral Tracing Challenge(ATC)
    # from opencompass.configs.datasets.needlebench_v2.atc.atc_0shot_nocot_2_power_en import needlebench_datasets
    # ATC use default summarizer thus no need to import summarizer

# Merge every imported '*datasets' list (zh + en variants here).
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])

for m in internlm2_chat_7b:
    m['max_seq_len'] = 32768  # Ensure InternLM2-7B model can receive the full long text; for other models, adjust according to their supported maximum sequence length.
    m['max_out_len'] = 4096

models = internlm2_chat_7b

work_dir = './outputs/needlebench'
examples/eval_qwen3.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate Qwen3-235B-A22B (served over an OpenAI-compatible endpoint) on
# MATH-500 / AIME2024 / AIME2025 with cascade (LLM-judge) evaluation.

import os.path as osp
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.runners import LocalRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

with read_base():
    from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets
    from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
    from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets

#######################################################################
#                          PART 0  Meta Info                          #
#######################################################################


api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
],
)


# Judge model used by the cascade evaluators (near-greedy decoding).
judge_cfg = dict(
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',
    key='sk-1234',
    openai_api_base=[
        'http://x.x.x.x:4000/v1',
    ],
    meta_template=api_meta_template,
    query_per_second=8,
    batch_size=256,
    temperature=0.001,
    # max_completion_tokens=32768,
    tokenizer_path='gpt-4o-2024-05-13',
    # verbose=True,
    max_out_len=16384,
    max_seq_len=32768,
    # max_seq_len=49152,
    mode='mid',
    retry=10
)

#######################################################################
#                          PART 1  Datasets List                      #
#######################################################################

# (dataset list, number of repeated runs) — small sets get more repeats to
# reduce variance in the averaged accuracy.
repeated_info = [
    (math_datasets, 4),
    (aime2024_datasets, 32),
    (aime2025_datasets, 32),
]

for datasets_, num in repeated_info:
    for dataset_ in datasets_:
        dataset_['n'] = num

# Merge all '*_datasets' lists visible in this module.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

for item in datasets:
    item['infer_cfg']['inferencer']['max_out_len'] = 32768
    # Attach the judge config wherever this dataset's evaluator expects it:
    # either directly on the evaluator or nested under 'llm_evaluator'.
    try:
        if 'judge_cfg' in item['eval_cfg']['evaluator']:
            item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
        elif 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
            item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
    except (KeyError, TypeError):
        # Dataset has no LLM-judge slot (e.g. rule-based evaluator) — skip.
        pass
#######################################################################
#                       PART 2  Dataset Summarizer                    #
#######################################################################

summarizer = dict(
    dataset_abbrs=[
        'MATH',
        ['math_prm800k_500', 'accuracy (4 runs average)'],
        ['aime2024', 'accuracy (32 runs average)'],
        ['aime2025', 'accuracy (32 runs average)'],
        ['livemathbench_hard', 'naive_average'],
        ['OlympiadBenchMath', 'accuracy'],
        ['olymmath', 'naive_average'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
    ),
)

#######################################################################
#                         PART 3  Models List                         #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
models += [

    dict(
        abbr='Qwen_Qwen3-235B-A22B',
        type=OpenAISDK,
        path='Qwen/Qwen3-235B-A22B',
        key='sk-admin',
        openai_api_base=[
            'http://106.15.231.215:40007/v1/',
        ],
        meta_template=dict(
            # begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''),
            round=[
                dict(role='HUMAN', api_role='HUMAN'),
                # XXX: all system roles are mapped to human in purpose
                dict(role='BOT', api_role='BOT', generate=True),
            ]
        ),
        query_per_second=16,
        batch_size=128,
        # batch_size=1,
        temperature=0.6,
        # max_completion_tokens=32768,
        tokenizer_path='gpt-4',
        # verbose=True,
        max_out_len=32768,
        max_seq_len=32768,
        # Strip the <think> block before evaluation.
        pred_postprocessor=dict(type=extract_non_reasoning_content)
    ),
]

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
)

base_exp_dir = 'outputs/qwen3_reasoning'
work_dir = osp.join(base_exp_dir, 'chat_objective')
examples/eval_qwen_7b_chat.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate qwen-7b-chat on the leaderboard dataset collection; the reference
# results obtained with this config are recorded in the string below.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.collections.leaderboard.qwen_chat import \
        datasets
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
    from opencompass.configs.summarizers.leaderboard import summarizer
'''
dataset version metric mode qwen-7b-chat-hf
-------------------------------------- --------- ---------------- ------ -----------------
--------- 考试 Exam --------- - - - -
ceval - naive_average gen 56.07
agieval - naive_average mixed 39.51
mmlu - naive_average gen 53.49
cmmlu - naive_average gen 55.29
GaokaoBench - weighted_average gen 48.01
ARC-c ca1e8e accuracy ppl 74.92
ARC-e ca1e8e accuracy ppl 85.71
--------- 语言 Language --------- - - - -
WiC efbd01 accuracy gen 51.41
chid-dev 25f3d3 accuracy ppl 77.72
afqmc-dev 4a1636 accuracy gen 69.00
WSC 678cb5 accuracy ppl 67.31
tydiqa-goldp - naive_average gen 15.32
flores_100 - naive_average gen 10.00
--------- 知识 Knowledge --------- - - - -
BoolQ 463fee accuracy ppl 83.18
commonsense_qa ddaabf accuracy gen 76.41
triviaqa b6904f score gen 43.25
nq 23dc1a score gen 16.26
--------- 理解 Understanding --------- - - - -
C3 e6778d accuracy gen 81.53
race-middle e0908b accuracy gen 83.01
race-high e0908b accuracy gen 77.79
openbookqa_fact 49689a accuracy ppl 86.40
csl_dev 3c4211 accuracy ppl 64.38
lcsts 0b3969 rouge1 gen 12.75
Xsum 207e69 rouge1 gen 20.21
eprstmt-dev ed0c5d accuracy ppl 85.00
lambada de1af2 accuracy gen 59.19
--------- 推理 Reasoning --------- - - - -
cmnli 15e783 accuracy ppl 48.08
ocnli 15e783 accuracy ppl 51.40
AX_b 689df1 accuracy ppl 65.67
AX_g 808a19 accuracy ppl 76.12
RTE 808a19 accuracy ppl 68.95
COPA 59f42c accuracy gen 92.00
ReCoRD 6f7cfc score gen 0.16
hellaswag 8d79e0 accuracy ppl 69.28
piqa 34eee7 accuracy ppl 72.20
siqa ea30d1 accuracy ppl 72.88
math 2c0b9e accuracy gen 7.84
gsm8k 4c7f6e accuracy gen 45.41
drop 53a0a7 score gen 39.62
openai_humaneval dd0dff humaneval_pass@1 gen 10.98
mbpp 60ca11 score gen 20.60
bbh - naive_average gen 42.61
'''
examples/eval_qwen_7b_chat_lawbench.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate qwen-7b-chat on LawBench in both zero-shot and one-shot settings.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.lawbench.lawbench_one_shot_gen_002588 import \
        lawbench_datasets as lawbench_one_shot_datasets
    from opencompass.configs.datasets.lawbench.lawbench_zero_shot_gen_002588 import \
        lawbench_datasets as lawbench_zero_shot_datasets
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
    from opencompass.configs.summarizers.lawbench import summarizer

datasets = lawbench_zero_shot_datasets + lawbench_one_shot_datasets
for d in datasets:
    # Persist predictions after every sample so partial runs are resumable.
    d['infer_cfg']['inferencer']['save_every'] = 1
examples/eval_rewardbench.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate a reward-model-style judge on RewardBench with an LMDeploy-served
# Qwen-7B.
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
    from opencompass.configs.summarizers.rewardbench import summarizer

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
# NOTE(review): despite the 'get_' prefix, this name is unpacked as a list of
# dataset configs — verify against the rewardbench config module.
datasets = [*get_rewardbench_datasets]

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # Greedy decoding (top_k=1, near-zero temperature).
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]


infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)



work_dir = './outputs/rewardbench/'
examples/eval_rmb.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate on the RMB (reward-model benchmark) dataset with an LMDeploy-served
# Qwen-7B.
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.rmb import get_rmb_dataset

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
# NOTE(review): despite the 'get_' prefix, this name is unpacked as a list of
# dataset configs — verify against the rmb config module.
datasets = [*get_rmb_dataset]

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # Greedy decoding (top_k=1, near-zero temperature).
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]


infer = dict(
    # partitioner=dict(type=NaivePartitioner),
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)



work_dir = './outputs/rmb/'
examples/eval_ruler.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Evaluate RULER (long-context benchmark) across several context lengths and
# models; each model gets its own dataset copies tokenized with its tokenizer.
from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.ruler.ruler_cwe_gen import cwe_datasets  # CWE
    from opencompass.configs.datasets.ruler.ruler_fwe_gen import fwe_datasets  # FWE
    from opencompass.configs.datasets.ruler.ruler_niah_gen import niah_datasets  # Niah
    from opencompass.configs.datasets.ruler.ruler_qa_gen import qa_datasets  # QA
    from opencompass.configs.datasets.ruler.ruler_vt_gen import vt_datasets  # VT
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import (
        models as internlm2_5_7b_chat_1m,
    )
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
        models as llama3_8b_instruct_model,
    )
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
        models as qwen2_7b_instruct_model,
    )
    from opencompass.configs.summarizers.groups.ruler import ruler_summary_groups

import_datasets = sum(
    [niah_datasets, vt_datasets, fwe_datasets, cwe_datasets, qa_datasets], [])

# Evaluation config
NUM_SAMPLES = 500  # samples drawn per dataset variant
# Change the context lengths to be tested
max_seq_lens = [1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32]
abbr_suffixs = ['4k', '8k', '16k', '32k']
work_dir = './outputs/ruler'

# Model Settings: bump session length to >32k and shard over 2 GPUs for the
# 7B/8B models so the 32k variant fits.
qwen2_7b_instruct_model[0]['max_seq_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 33792
qwen2_7b_instruct_model[0]['engine_config']['tp'] = 2
qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 2
llama3_8b_instruct_model[0]['max_seq_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['session_len'] = 33792
llama3_8b_instruct_model[0]['engine_config']['tp'] = 2
llama3_8b_instruct_model[0]['run_cfg']['num_gpus'] = 2
# (model config, HF tokenizer path used to build its dataset copies)
model_settings = [
    [qwen2_7b_instruct_model[0], 'Qwen/Qwen2-7B-Instruct'],
    [llama3_8b_instruct_model[0], 'meta-llama/Meta-Llama-3-8B-Instruct'],
    [internlm2_5_7b_chat_1m[0], 'internlm/internlm2_5-7b-chat-1m'],
]

# Dataset Model Combination
datasets = []
models = []
model_dataset_combinations = []

# Different seq length: build a per-(length, model) copy of every dataset so
# each copy can carry its own tokenizer and max length.
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for model, model_path in model_settings:
        _tmp_datasets = []
        for dataset in import_datasets:
            # NOTE(review): .deepcopy() is assumed to be the mmengine
            # ConfigDict method (plain dicts have no such method) — confirm.
            tmp_dataset = dataset.deepcopy()
            tmp_dataset['tokenizer_model'] = model_path
            tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
            tmp_dataset['num_samples'] = NUM_SAMPLES
            tmp_dataset['max_seq_length'] = max_seq_len
            _tmp_datasets.append(tmp_dataset)
        # Restrict each model to its own dataset copies.
        model_dataset_combinations.append(
            dict(models=[model], datasets=_tmp_datasets))
        models.append(model)
        datasets.extend(_tmp_datasets)

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask),
                retry=5),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=32,
                task=dict(type=OpenICLEvalTask)),
)

summarizer = dict(
    dataset_abbrs=abbr_suffixs,
    summary_groups=sum([ruler_summary_groups], []),
)

# Reference results produced with this config:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset version metric mode qwen2-7b-instruct-turbomind llama-3-8b-instruct-turbomind internlm2_5-7b-chat-1m-turbomind
# --------- --------- ------------- ------ ----------------------------- ------------------------------- ----------------------------------
# 4k - naive_average gen 93.66 93.48 91.20
# 8k - naive_average gen 88.38 89.95 89.07
# 16k - naive_average gen 84.27 0.14 87.61
# 32k - naive_average gen 81.36 0.00 84.59
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
examples/eval_rwkv5_3b.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
# Evaluate RWKV5-3B on the base_medium_llama dataset collection with the
# leaderboard summarizer.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.collections.base_medium_llama import \
        datasets
    from opencompass.configs.models.rwkv.rwkv5_3b import models
    from opencompass.configs.summarizers.leaderboard import summarizer
examples/eval_simpleqa.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
# Evaluate SimpleQA with GPT-4o as both the answering model and the judge.
from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.SimpleQA.simpleqa_gen import \
        simpleqa_datasets
    from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
        models as gpt_4o_2024_05_13_model

models = gpt_4o_2024_05_13_model  # model for generation
judge_models = gpt_4o_2024_05_13_model  # model for evaluation

# Collect every '*_datasets' list injected by read_base().
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
summarizer = dict(type=DefaultSubjectiveSummarizer)

# ------------- Inference Stage ----------------------------------------

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

# Evaluation is subjective: the partitioner pairs each evaluated model with
# the judge model(s).
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=SubjectiveEvalTask)),
)
examples/eval_subjective.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
5
+ from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
6
+ from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import compassarena_datasets
7
+ from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
8
+ from opencompass.configs.datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
9
+ from opencompass.configs.datasets.subjective.fofo.fofo_judge import fofo_datasets
10
+ from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets
11
+ from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
12
+ from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
13
+
14
+ from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
15
+ HuggingFaceChatGLM3, OpenAI)
16
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner
17
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
18
+ from opencompass.partitioners.sub_num_worker import \
19
+ SubjectiveNumWorkerPartitioner
20
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
21
+ from opencompass.runners import LocalRunner, SlurmSequentialRunner
22
+ from opencompass.summarizers import SubjectiveSummarizer
23
+ from opencompass.tasks import OpenICLInferTask
24
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
25
+
26
+ api_meta_template = dict(round=[
27
+ dict(role='HUMAN', api_role='HUMAN'),
28
+ dict(role='BOT', api_role='BOT', generate=True),
29
+ ])
30
+
31
+ # -------------Inference Stage ----------------------------------------
32
+ # For subjective evaluation, we often set do sample for models
33
+ models = [
34
+ dict(
35
+ type=HuggingFaceChatGLM3,
36
+ abbr='chatglm3-6b-hf',
37
+ path='THUDM/chatglm3-6b',
38
+ tokenizer_path='THUDM/chatglm3-6b',
39
+ model_kwargs=dict(
40
+ device_map='auto',
41
+ trust_remote_code=True,
42
+ ),
43
+ tokenizer_kwargs=dict(
44
+ padding_side='left',
45
+ truncation_side='left',
46
+ trust_remote_code=True,
47
+ ),
48
+ generation_kwargs=dict(
49
+ do_sample=
50
+ True, #For subjective evaluation, we suggest you do set do_sample when running model inference!
51
+ ),
52
+ meta_template=api_meta_template,
53
+ max_out_len=2048,
54
+ max_seq_len=4096,
55
+ batch_size=8,
56
+ run_cfg=dict(num_gpus=1, num_procs=1),
57
+ )
58
+ ]
59
+
60
+ datasets = [
61
+ *alignbench_datasets, *alpacav2_datasets, *arenahard_datasets,
62
+ *compassarena_datasets, *compassbench_datasets, *fofo_datasets,
63
+ *mtbench_datasets, *mtbench101_datasets, *wildbench_datasets
64
+ ] # add datasets you want
65
+
66
+ infer = dict(
67
+ partitioner=dict(type=NaivePartitioner),
68
+ runner=dict(type=LocalRunner,
69
+ max_num_workers=16,
70
+ task=dict(type=OpenICLInferTask)),
71
+ )
72
+ # -------------Evalation Stage ----------------------------------------
73
+
74
+ ## ------------- JudgeLLM Configuration
75
+ judge_models = [
76
+ dict(
77
+ abbr='GPT4-Turbo',
78
+ type=OpenAI,
79
+ path='gpt-4-1106-preview',
80
+ key=
81
+ 'xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
82
+ meta_template=api_meta_template,
83
+ query_per_second=16,
84
+ max_out_len=2048,
85
+ max_seq_len=2048,
86
+ batch_size=8,
87
+ temperature=0,
88
+ )
89
+ ]
90
+
91
+ ## ------------- Evaluation Configuration
92
+ eval = dict(
93
+ partitioner=dict(
94
+ type=SubjectiveNaivePartitioner,
95
+ models=models,
96
+ judge_models=judge_models,
97
+ ),
98
+ runner=dict(type=LocalRunner,
99
+ max_num_workers=16,
100
+ task=dict(type=SubjectiveEvalTask)),
101
+ )
102
+
103
+ summarizer = dict(type=SubjectiveSummarizer, function='subjective')
104
+ work_dir = 'outputs/subjective/'
examples/eval_subjective_bradleyterry.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
5
+ alpacav2_datasets, )
6
+
7
+ from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_bradleyterry import (
8
+ arenahard_datasets, )
9
+
10
+ from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_bradleyterry import (
11
+ compassarena_datasets, )
12
+
13
+ from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_bradleyterry import (
14
+ wildbench_datasets, )
15
+
16
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
17
+ models as lmdeploy_internlm2_5_7b_chat, )
18
+
19
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
20
+ models as lmdeploy_internlm2_5_20b_chat, )
21
+
22
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
23
+ models as lmdeploy_qwen2_5_7b_instruct, )
24
+
25
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
26
+ models as lmdeploy_qwen2_5_14b_instruct, )
27
+
28
+ from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
29
+ models as lmdeploy_qwen2_7b_instruct, )
30
+
31
+ from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
32
+ HuggingFaceChatGLM3, OpenAI,
33
+ TurboMindModelwithChatTemplate)
34
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner
35
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
36
+ from opencompass.partitioners.sub_num_worker import \
37
+ SubjectiveNumWorkerPartitioner
38
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
39
+ from opencompass.runners import LocalRunner, SlurmSequentialRunner
40
+ from opencompass.summarizers import (CompassArenaBradleyTerrySummarizer,
41
+ SubjectiveSummarizer)
42
+ from opencompass.tasks import OpenICLInferTask
43
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
44
+
45
+ api_meta_template = dict(round=[
46
+ dict(role='HUMAN', api_role='HUMAN'),
47
+ dict(role='BOT', api_role='BOT', generate=True),
48
+ ])
49
+
50
+ # -------------Inference Stage ----------------------------------------
51
+ # For subjective evaluation, we often set do sample for models
52
+ models = [
53
+ *lmdeploy_internlm2_5_7b_chat,
54
+ *lmdeploy_internlm2_5_20b_chat,
55
+ *lmdeploy_qwen2_5_14b_instruct,
56
+ *lmdeploy_qwen2_5_7b_instruct,
57
+ *lmdeploy_qwen2_7b_instruct,
58
+ ]
59
+
60
+ datasets = [
61
+ *alpacav2_datasets,
62
+ *arenahard_datasets,
63
+ *compassarena_datasets,
64
+ *wildbench_datasets,
65
+ ]
66
+
67
+ infer = dict(
68
+ partitioner=dict(type=NaivePartitioner),
69
+ runner=dict(type=LocalRunner,
70
+ max_num_workers=16,
71
+ task=dict(type=OpenICLInferTask)),
72
+ )
73
+ # -------------Evalation Stage ----------------------------------------
74
+
75
+ ## ------------- JudgeLLM Configuration
76
+ judge_models = [
77
+ dict(
78
+ type=TurboMindModelwithChatTemplate,
79
+ abbr='CompassJudger-1-32B-Instruct',
80
+ path='opencompass/CompassJudger-1-32B-Instruct',
81
+ engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
82
+ gen_config=dict(top_k=1,
83
+ temperature=1e-6,
84
+ top_p=0.9,
85
+ max_new_tokens=2048),
86
+ max_seq_len=16384,
87
+ max_out_len=2048,
88
+ batch_size=16,
89
+ run_cfg=dict(num_gpus=4),
90
+ )
91
+ ]
92
+
93
+ ## ------------- Evaluation Configuration
94
+ eval = dict(
95
+ partitioner=dict(
96
+ type=SubjectiveNaivePartitioner,
97
+ models=models,
98
+ judge_models=judge_models,
99
+ ),
100
+ runner=dict(type=LocalRunner,
101
+ max_num_workers=16,
102
+ task=dict(type=SubjectiveEvalTask)),
103
+ )
104
+
105
+ ## ------------- Summary Configuration
106
+ # This step fits a Bradley-Terry model (statistical model) with an option
107
+ # to include style features and control variables based on groups
108
+ # (group variables must be available in the input dataset for each observation).
109
+ summarizer = dict(
110
+ type=CompassArenaBradleyTerrySummarizer,
111
+ rating_system='bradleyterry',
112
+ report_pred_win_rates=True,
113
+ num_bootstrap=100,
114
+ num_cpu=None,
115
+ with_control_vars=True,
116
+ normalize_style_features=False,
117
+ odds_ratio=True,
118
+ )
119
+
120
+ work_dir = 'outputs/subjective/bradleyterry'
examples/eval_teval.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+
3
+ from mmengine.config import read_base
4
+
5
+ with read_base():
6
+ from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
7
+ teval_datasets as teval_en_datasets
8
+ from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
9
+ teval_datasets as teval_zh_datasets
10
+ from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
11
+ models as hf_internlm2_chat_7b_model
12
+ from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import \
13
+ models as hf_llama2_7b_chat_model
14
+ from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
15
+ models as hf_qwen_7b_chat_model
16
+ from opencompass.configs.summarizers.teval import summarizer
17
+
18
+ meta_template_system_patches = {
19
+ 'internlm2-chat-7b-hf':
20
+ dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
21
+ 'internlm2-chat-20b-hf':
22
+ dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
23
+ }
24
+
25
+ _origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
26
+ [])
27
+ models = []
28
+ for m in _origin_models:
29
+ m = deepcopy(m)
30
+ if 'meta_template' in m and 'round' in m['meta_template']:
31
+ round = m['meta_template']['round']
32
+ if all(r['role'].upper() != 'SYSTEM'
33
+ for r in round): # no system round
34
+ if m['abbr'] in meta_template_system_patches:
35
+ system_round = meta_template_system_patches[m['abbr']]
36
+ else:
37
+ system_round = [
38
+ r for r in round if r['role'].upper() == 'HUMAN'
39
+ ][0]
40
+ system_round = deepcopy(system_round)
41
+ system_round['role'] = 'SYSTEM'
42
+ m['meta_template']['round'].append(system_round)
43
+ else:
44
+ raise ValueError(f'no meta_template.round in {m.get("abbr", None)}')
45
+
46
+ print(
47
+ f'model {m["abbr"]} is using the following meta_template: {m["meta_template"]}'
48
+ )
49
+ models.append(m)
50
+
51
+ datasets = teval_en_datasets + teval_zh_datasets
52
+ work_dir = './outputs/teval'
53
+ """Dataset version metric mode
54
+ qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf.
55
+
56
+ ------------------------------------------- --------- -------------- ------- ----------------- ---------------------- --------------------
57
+ teval - naive_average unknown 57.69 78.18 36.63
58
+ teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27
59
+ teval-instruct_v1 10482d json_metric unknown 94.32 97.08 0.15
60
+ teval-plan_str_v1 10482d f1_score unknown 66.24 84.12 45.72
61
+ teval-plan_json_v1 10482d f1_score unknown 63.62 77.71 19.95
62
+ teval-reason_str_v1 10482d thought unknown 54.14 63.58 44.92
63
+ teval-reason_retrieve_understand_json_v1 10482d thought unknown 33.77 54.72 21.49
64
+ teval-retrieve_str_v1 10482d name unknown 73.89 85.28 60.6
65
+ teval-reason_retrieve_understand_json_v1 10482d name unknown 31.15 68.97 15.34
66
+ teval-understand_str_v1 10482d args unknown 77.76 93.03 65.61
67
+ teval-reason_retrieve_understand_json_v1 10482d args unknown 44.16 72.23 26.84
68
+ teval-review_str_v1 10482d review_quality unknown 62.22 71.66 44.35
69
+ teval_zh - naive_average unknown 61.31 75.01 32.33
70
+ teval-instruct_v1_zh 10482d string_metric unknown 88.69 98.19 23.64
71
+ teval-instruct_v1_zh 10482d json_metric unknown 75.77 96.62 0.89
72
+ teval-plan_str_v1_zh 10482d f1_score unknown 62.43 70.69 47.82
73
+ teval-plan_json_v1_zh 10482d f1_score unknown 61.46 68.95 15.87
74
+ teval-reason_str_v1_zh 10482d thought unknown 59.43 68.14 46.96
75
+ teval-reason_retrieve_understand_json_v1_zh 10482d thought unknown 39.19 60.37 23.91
76
+ teval-retrieve_str_v1_zh 10482d name unknown 69.41 84.22 54.44
77
+ teval-reason_retrieve_understand_json_v1_zh 10482d name unknown 32.87 70.46 14.16
78
+ teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29
79
+ teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83
80
+ teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1
81
+ """
examples/eval_with_model_dataset_combinations.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
5
+ ceval_datasets as chat_ceval_datasets
6
+ from opencompass.configs.datasets.ceval.ceval_ppl_578f8d import \
7
+ ceval_datasets as base_ceval_datasets
8
+ from opencompass.configs.internal.clusters.slurm import eval, infer
9
+ from opencompass.configs.models.qwen.hf_qwen_7b import \
10
+ models as hf_qwen_7b_base_models
11
+ from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
12
+ models as hf_qwen_7b_chat_models
13
+
14
+ # from opencompass.configs.internal.clusters.slurm import infer_split as infer, eval
15
+ # from opencompass.configs.internal.clusters.slurm import infer_size as infer, eval
16
+ # from opencompass.configs.internal.clusters.slurm import infer_size_split as infer, eval
17
+
18
+ base_ceval_datasets = base_ceval_datasets[:1]
19
+ chat_ceval_datasets = chat_ceval_datasets[-1:]
20
+
21
+ # If you do not want to run all the combinations of models and datasets, you
22
+ # can specify the combinations you want to run here. This is useful when you
23
+ # deleberately want to skip some subset of the combinations.
24
+ # Models and datasets in different combinations are recommended to be disjoint
25
+ # (different `abbr` in model & dataset configs), as we haven't tested this case
26
+ # throughly.
27
+ model_dataset_combinations = [
28
+ dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
29
+ dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
30
+ # dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
31
+ ]
32
+
33
+ # This union of models and datasets in model_dataset_combinations should be
34
+ # stored in the `models` and `datasets` variables below. Otherwise, modules
35
+ # like summarizer will miss out some information.
36
+ models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
37
+ datasets = [*base_ceval_datasets, *chat_ceval_datasets]
38
+
39
+ work_dir = './outputs/default/mdcomb/'
40
+ """
41
+ dataset version metric mode qwen-7b-hf qwen-7b-chat-hf
42
+ ---------------------- --------- -------- ------ ------------ -----------------
43
+ ceval-computer_network 9b9417 accuracy ppl 52.63 -
44
+ ceval-physician 6e277d accuracy gen - 59.18
45
+ """
tmp/38bf021a-c80f-4a23-9021-f2adc82afa5d_params.py ADDED
@@ -0,0 +1,1424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_2wikimqa_3',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchF1Evaluator'),
8
+ pred_role='BOT'),
9
+ infer_cfg=dict(
10
+ inferencer=dict(
11
+ max_out_len=32,
12
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
13
+ prompt_template=dict(
14
+ template=dict(round=[
15
+ dict(
16
+ prompt=
17
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
18
+ role='HUMAN'),
19
+ ]),
20
+ type=
21
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
22
+ retriever=dict(
23
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
24
+ name='2wikimqa',
25
+ path='opencompass/Longbench',
26
+ reader_cfg=dict(
27
+ input_columns=[
28
+ 'context',
29
+ 'input',
30
+ ],
31
+ output_column='answers',
32
+ test_range='[75:100]',
33
+ test_split='test',
34
+ train_split='test'),
35
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
36
+ dict(
37
+ abbr='LongBench_hotpotqa_3',
38
+ eval_cfg=dict(
39
+ evaluator=dict(
40
+ type='opencompass.datasets.LongBenchF1Evaluator'),
41
+ pred_role='BOT'),
42
+ infer_cfg=dict(
43
+ inferencer=dict(
44
+ max_out_len=32,
45
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
46
+ prompt_template=dict(
47
+ template=dict(round=[
48
+ dict(
49
+ prompt=
50
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
51
+ role='HUMAN'),
52
+ ]),
53
+ type=
54
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
55
+ retriever=dict(
56
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
57
+ name='hotpotqa',
58
+ path='opencompass/Longbench',
59
+ reader_cfg=dict(
60
+ input_columns=[
61
+ 'context',
62
+ 'input',
63
+ ],
64
+ output_column='answers',
65
+ test_range='[75:100]',
66
+ test_split='test',
67
+ train_split='test'),
68
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
69
+ dict(
70
+ abbr='LongBench_musique_3',
71
+ eval_cfg=dict(
72
+ evaluator=dict(
73
+ type='opencompass.datasets.LongBenchF1Evaluator'),
74
+ pred_role='BOT'),
75
+ infer_cfg=dict(
76
+ inferencer=dict(
77
+ max_out_len=32,
78
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
79
+ prompt_template=dict(
80
+ template=dict(round=[
81
+ dict(
82
+ prompt=
83
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
84
+ role='HUMAN'),
85
+ ]),
86
+ type=
87
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
88
+ retriever=dict(
89
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
90
+ name='musique',
91
+ path='opencompass/Longbench',
92
+ reader_cfg=dict(
93
+ input_columns=[
94
+ 'context',
95
+ 'input',
96
+ ],
97
+ output_column='answers',
98
+ test_range='[75:100]',
99
+ test_split='test',
100
+ train_split='test'),
101
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
102
+ dict(
103
+ abbr='LongBench_multifieldqa_en_3',
104
+ eval_cfg=dict(
105
+ evaluator=dict(
106
+ type='opencompass.datasets.LongBenchF1Evaluator'),
107
+ pred_role='BOT'),
108
+ infer_cfg=dict(
109
+ inferencer=dict(
110
+ max_out_len=64,
111
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
112
+ prompt_template=dict(
113
+ template=dict(round=[
114
+ dict(
115
+ prompt=
116
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
117
+ role='HUMAN'),
118
+ ]),
119
+ type=
120
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
121
+ retriever=dict(
122
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
123
+ name='multifieldqa_en',
124
+ path='opencompass/Longbench',
125
+ reader_cfg=dict(
126
+ input_columns=[
127
+ 'context',
128
+ 'input',
129
+ ],
130
+ output_column='answers',
131
+ test_range='[57:76]',
132
+ test_split='test',
133
+ train_split='test'),
134
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
135
+ dict(
136
+ abbr='LongBench_multifieldqa_zh_3',
137
+ eval_cfg=dict(
138
+ evaluator=dict(
139
+ language='zh',
140
+ type='opencompass.datasets.LongBenchF1Evaluator'),
141
+ pred_role='BOT'),
142
+ infer_cfg=dict(
143
+ inferencer=dict(
144
+ max_out_len=64,
145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
146
+ prompt_template=dict(
147
+ template=dict(round=[
148
+ dict(
149
+ prompt=
150
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
151
+ role='HUMAN'),
152
+ ]),
153
+ type=
154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
155
+ retriever=dict(
156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
157
+ name='multifieldqa_zh',
158
+ path='opencompass/Longbench',
159
+ reader_cfg=dict(
160
+ input_columns=[
161
+ 'context',
162
+ 'input',
163
+ ],
164
+ output_column='answers',
165
+ test_range='[75:100]',
166
+ test_split='test',
167
+ train_split='test'),
168
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
169
+ dict(
170
+ abbr='LongBench_narrativeqa_3',
171
+ eval_cfg=dict(
172
+ evaluator=dict(
173
+ type='opencompass.datasets.LongBenchF1Evaluator'),
174
+ pred_role='BOT'),
175
+ infer_cfg=dict(
176
+ inferencer=dict(
177
+ max_out_len=128,
178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
179
+ prompt_template=dict(
180
+ template=dict(round=[
181
+ dict(
182
+ prompt=
183
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
184
+ role='HUMAN'),
185
+ ]),
186
+ type=
187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
188
+ retriever=dict(
189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
190
+ name='narrativeqa',
191
+ path='opencompass/Longbench',
192
+ reader_cfg=dict(
193
+ input_columns=[
194
+ 'context',
195
+ 'input',
196
+ ],
197
+ output_column='answers',
198
+ test_range='[75:100]',
199
+ test_split='test',
200
+ train_split='test'),
201
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
202
+ dict(
203
+ abbr='LongBench_qasper_3',
204
+ eval_cfg=dict(
205
+ evaluator=dict(
206
+ type='opencompass.datasets.LongBenchF1Evaluator'),
207
+ pred_role='BOT'),
208
+ infer_cfg=dict(
209
+ inferencer=dict(
210
+ max_out_len=32,
211
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
212
+ prompt_template=dict(
213
+ template=dict(round=[
214
+ dict(
215
+ prompt=
216
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
217
+ role='HUMAN'),
218
+ ]),
219
+ type=
220
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
221
+ retriever=dict(
222
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
223
+ name='qasper',
224
+ path='opencompass/Longbench',
225
+ reader_cfg=dict(
226
+ input_columns=[
227
+ 'context',
228
+ 'input',
229
+ ],
230
+ output_column='answers',
231
+ test_range='[75:100]',
232
+ test_split='test',
233
+ train_split='test'),
234
+ type='opencompass.datasets.LongBenchqasperDataset'),
235
+ dict(
236
+ abbr='LongBench_triviaqa_3',
237
+ eval_cfg=dict(
238
+ evaluator=dict(
239
+ type='opencompass.datasets.LongBenchF1Evaluator'),
240
+ pred_postprocessor=dict(
241
+ type='opencompass.datasets.triviaqa_postprocess'),
242
+ pred_role='BOT'),
243
+ infer_cfg=dict(
244
+ inferencer=dict(
245
+ max_out_len=32,
246
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
247
+ prompt_template=dict(
248
+ template=dict(round=[
249
+ dict(
250
+ prompt=
251
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
252
+ role='HUMAN'),
253
+ ]),
254
+ type=
255
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
256
+ retriever=dict(
257
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
258
+ name='triviaqa',
259
+ path='opencompass/Longbench',
260
+ reader_cfg=dict(
261
+ input_columns=[
262
+ 'context',
263
+ 'input',
264
+ ],
265
+ output_column='answers',
266
+ test_range='[75:100]',
267
+ test_split='test',
268
+ train_split='test'),
269
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
270
+ dict(
271
+ abbr='LongBench_gov_report_3',
272
+ eval_cfg=dict(
273
+ evaluator=dict(
274
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
275
+ pred_role='BOT'),
276
+ infer_cfg=dict(
277
+ inferencer=dict(
278
+ max_out_len=512,
279
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
280
+ prompt_template=dict(
281
+ template=dict(round=[
282
+ dict(
283
+ prompt=
284
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
285
+ role='HUMAN'),
286
+ ]),
287
+ type=
288
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
289
+ retriever=dict(
290
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
291
+ name='gov_report',
292
+ path='opencompass/Longbench',
293
+ reader_cfg=dict(
294
+ input_columns=[
295
+ 'context',
296
+ ],
297
+ output_column='answers',
298
+ test_range='[75:100]',
299
+ test_split='test',
300
+ train_split='test'),
301
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
302
+ dict(
303
+ abbr='LongBench_qmsum_3',
304
+ eval_cfg=dict(
305
+ evaluator=dict(
306
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
307
+ pred_role='BOT'),
308
+ infer_cfg=dict(
309
+ inferencer=dict(
310
+ max_out_len=512,
311
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
312
+ prompt_template=dict(
313
+ template=dict(round=[
314
+ dict(
315
+ prompt=
316
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
317
+ role='HUMAN'),
318
+ ]),
319
+ type=
320
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
321
+ retriever=dict(
322
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
323
+ name='qmsum',
324
+ path='opencompass/Longbench',
325
+ reader_cfg=dict(
326
+ input_columns=[
327
+ 'context',
328
+ 'input',
329
+ ],
330
+ output_column='answers',
331
+ test_range='[75:100]',
332
+ test_split='test',
333
+ train_split='test'),
334
+ type='opencompass.datasets.LongBenchqmsumDataset'),
335
+ dict(
336
+ abbr='LongBench_vcsum_3',
337
+ eval_cfg=dict(
338
+ evaluator=dict(
339
+ language='zh',
340
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
341
+ pred_role='BOT'),
342
+ infer_cfg=dict(
343
+ inferencer=dict(
344
+ max_out_len=512,
345
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
346
+ prompt_template=dict(
347
+ template=dict(round=[
348
+ dict(
349
+ prompt=
350
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
351
+ role='HUMAN'),
352
+ ]),
353
+ type=
354
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
355
+ retriever=dict(
356
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
357
+ name='vcsum',
358
+ path='opencompass/Longbench',
359
+ reader_cfg=dict(
360
+ input_columns=[
361
+ 'context',
362
+ ],
363
+ output_column='answers',
364
+ test_range='[75:100]',
365
+ test_split='test',
366
+ train_split='test'),
367
+ type='opencompass.datasets.LongBenchvcsumDataset'),
368
+ dict(
369
+ abbr='LongBench_dureader_3',
370
+ eval_cfg=dict(
371
+ evaluator=dict(
372
+ language='zh',
373
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
374
+ pred_role='BOT'),
375
+ infer_cfg=dict(
376
+ inferencer=dict(
377
+ max_out_len=128,
378
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
379
+ prompt_template=dict(
380
+ template=dict(round=[
381
+ dict(
382
+ prompt=
383
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
384
+ role='HUMAN'),
385
+ ]),
386
+ type=
387
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
388
+ retriever=dict(
389
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
390
+ name='dureader',
391
+ path='opencompass/Longbench',
392
+ reader_cfg=dict(
393
+ input_columns=[
394
+ 'context',
395
+ 'input',
396
+ ],
397
+ output_column='answers',
398
+ test_range='[75:100]',
399
+ test_split='test',
400
+ train_split='test'),
401
+ type='opencompass.datasets.LongBenchdureaderDataset'),
402
+ dict(
403
+ abbr='LongBench_lcc_3',
404
+ eval_cfg=dict(
405
+ evaluator=dict(
406
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
407
+ pred_role='BOT'),
408
+ infer_cfg=dict(
409
+ inferencer=dict(
410
+ max_out_len=64,
411
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
412
+ prompt_template=dict(
413
+ template=dict(round=[
414
+ dict(
415
+ prompt=
416
+ 'Please complete the code given below. \n{context}Next line of code:\n',
417
+ role='HUMAN'),
418
+ ]),
419
+ type=
420
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
421
+ retriever=dict(
422
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
423
+ name='lcc',
424
+ path='opencompass/Longbench',
425
+ reader_cfg=dict(
426
+ input_columns=[
427
+ 'context',
428
+ ],
429
+ output_column='answers',
430
+ test_range='[189:252]',
431
+ test_split='test',
432
+ train_split='test'),
433
+ type='opencompass.datasets.LongBenchlccDataset'),
434
+ dict(
435
+ abbr='LongBench_repobench-p_3',
436
+ eval_cfg=dict(
437
+ evaluator=dict(
438
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
439
+ pred_role='BOT'),
440
+ infer_cfg=dict(
441
+ inferencer=dict(
442
+ max_out_len=64,
443
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
444
+ prompt_template=dict(
445
+ template=dict(round=[
446
+ dict(
447
+ prompt=
448
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
449
+ role='HUMAN'),
450
+ ]),
451
+ type=
452
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
453
+ retriever=dict(
454
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
455
+ name='repobench-p',
456
+ path='opencompass/Longbench',
457
+ reader_cfg=dict(
458
+ input_columns=[
459
+ 'context',
460
+ 'input',
461
+ ],
462
+ output_column='answers',
463
+ test_range='[189:252]',
464
+ test_split='test',
465
+ train_split='test'),
466
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
467
+ dict(
468
+ abbr='LongBench_passage_retrieval_en_3',
469
+ eval_cfg=dict(
470
+ evaluator=dict(
471
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
472
+ pred_role='BOT'),
473
+ infer_cfg=dict(
474
+ inferencer=dict(
475
+ max_out_len=32,
476
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
477
+ prompt_template=dict(
478
+ template=dict(round=[
479
+ dict(
480
+ prompt=
481
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
482
+ role='HUMAN'),
483
+ ]),
484
+ type=
485
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
486
+ retriever=dict(
487
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
488
+ name='passage_retrieval_en',
489
+ path='opencompass/Longbench',
490
+ reader_cfg=dict(
491
+ input_columns=[
492
+ 'context',
493
+ 'input',
494
+ ],
495
+ output_column='answers',
496
+ test_range='[75:100]',
497
+ test_split='test',
498
+ train_split='test'),
499
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
500
+ dict(
501
+ abbr='LongBench_passage_retrieval_zh_3',
502
+ eval_cfg=dict(
503
+ evaluator=dict(
504
+ language='zh',
505
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
506
+ pred_role='BOT'),
507
+ infer_cfg=dict(
508
+ inferencer=dict(
509
+ max_out_len=32,
510
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
511
+ prompt_template=dict(
512
+ template=dict(round=[
513
+ dict(
514
+ prompt=
515
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
516
+ role='HUMAN'),
517
+ ]),
518
+ type=
519
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
520
+ retriever=dict(
521
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
522
+ name='passage_retrieval_zh',
523
+ path='opencompass/Longbench',
524
+ reader_cfg=dict(
525
+ input_columns=[
526
+ 'context',
527
+ 'input',
528
+ ],
529
+ output_column='answers',
530
+ test_range='[75:100]',
531
+ test_split='test',
532
+ train_split='test'),
533
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
534
+ dict(
535
+ abbr='LongBench_passage_count_3',
536
+ eval_cfg=dict(
537
+ evaluator=dict(
538
+ type='opencompass.datasets.LongBenchCountEvaluator'),
539
+ pred_role='BOT'),
540
+ infer_cfg=dict(
541
+ inferencer=dict(
542
+ max_out_len=32,
543
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
544
+ prompt_template=dict(
545
+ template=dict(round=[
546
+ dict(
547
+ prompt=
548
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
549
+ role='HUMAN'),
550
+ ]),
551
+ type=
552
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
553
+ retriever=dict(
554
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
555
+ name='passage_count',
556
+ path='opencompass/Longbench',
557
+ reader_cfg=dict(
558
+ input_columns=[
559
+ 'context',
560
+ 'input',
561
+ ],
562
+ output_column='answers',
563
+ test_range='[75:100]',
564
+ test_split='test',
565
+ train_split='test'),
566
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
567
+ dict(
568
+ abbr='LongBench_trec_3',
569
+ eval_cfg=dict(
570
+ evaluator=dict(
571
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
572
+ ),
573
+ pred_postprocessor=dict(
574
+ type='opencompass.datasets.trec_postprocess'),
575
+ pred_role='BOT'),
576
+ infer_cfg=dict(
577
+ inferencer=dict(
578
+ max_out_len=64,
579
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
580
+ prompt_template=dict(
581
+ template=dict(round=[
582
+ dict(
583
+ prompt=
584
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
585
+ role='HUMAN'),
586
+ ]),
587
+ type=
588
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
589
+ retriever=dict(
590
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
591
+ name='trec',
592
+ path='opencompass/Longbench',
593
+ reader_cfg=dict(
594
+ input_columns=[
595
+ 'context',
596
+ 'input',
597
+ ],
598
+ output_column='all_labels',
599
+ test_range='[75:100]',
600
+ test_split='test',
601
+ train_split='test'),
602
+ type='opencompass.datasets.LongBenchtrecDataset'),
603
+ dict(
604
+ abbr='LongBench_lsht_3',
605
+ eval_cfg=dict(
606
+ evaluator=dict(
607
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
608
+ ),
609
+ pred_postprocessor=dict(
610
+ type='opencompass.datasets.lsht_postprocess'),
611
+ pred_role='BOT'),
612
+ infer_cfg=dict(
613
+ inferencer=dict(
614
+ max_out_len=64,
615
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
616
+ prompt_template=dict(
617
+ template=dict(round=[
618
+ dict(
619
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
620
+ role='HUMAN'),
621
+ ]),
622
+ type=
623
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
624
+ retriever=dict(
625
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
626
+ name='lsht',
627
+ path='opencompass/Longbench',
628
+ reader_cfg=dict(
629
+ input_columns=[
630
+ 'context',
631
+ 'input',
632
+ ],
633
+ output_column='all_labels',
634
+ test_range='[75:100]',
635
+ test_split='test',
636
+ train_split='test'),
637
+ type='opencompass.datasets.LongBenchlshtDataset'),
638
+ dict(
639
+ abbr='LongBench_multi_news_3',
640
+ eval_cfg=dict(
641
+ evaluator=dict(
642
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
643
+ pred_role='BOT'),
644
+ infer_cfg=dict(
645
+ inferencer=dict(
646
+ max_out_len=512,
647
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
648
+ prompt_template=dict(
649
+ template=dict(round=[
650
+ dict(
651
+ prompt=
652
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
653
+ role='HUMAN'),
654
+ ]),
655
+ type=
656
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
657
+ retriever=dict(
658
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
659
+ name='multi_news',
660
+ path='opencompass/Longbench',
661
+ reader_cfg=dict(
662
+ input_columns=[
663
+ 'context',
664
+ ],
665
+ output_column='answers',
666
+ test_range='[75:100]',
667
+ test_split='test',
668
+ train_split='test'),
669
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
670
+ dict(
671
+ abbr='LongBench_samsum_3',
672
+ eval_cfg=dict(
673
+ evaluator=dict(
674
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
675
+ pred_postprocessor=dict(
676
+ type='opencompass.datasets.samsum_postprocess'),
677
+ pred_role='BOT'),
678
+ infer_cfg=dict(
679
+ inferencer=dict(
680
+ max_out_len=128,
681
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
682
+ prompt_template=dict(
683
+ template=dict(round=[
684
+ dict(
685
+ prompt=
686
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
687
+ role='HUMAN'),
688
+ ]),
689
+ type=
690
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
691
+ retriever=dict(
692
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
693
+ name='samsum',
694
+ path='opencompass/Longbench',
695
+ reader_cfg=dict(
696
+ input_columns=[
697
+ 'context',
698
+ 'input',
699
+ ],
700
+ output_column='answers',
701
+ test_range='[75:100]',
702
+ test_split='test',
703
+ train_split='test'),
704
+ type='opencompass.datasets.LongBenchsamsumDataset'),
705
+ dict(
706
+ abbr='LongBench_2wikimqa_3',
707
+ eval_cfg=dict(
708
+ evaluator=dict(
709
+ type='opencompass.datasets.LongBenchF1Evaluator'),
710
+ pred_role='BOT'),
711
+ infer_cfg=dict(
712
+ inferencer=dict(
713
+ max_out_len=32,
714
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
715
+ prompt_template=dict(
716
+ template=dict(round=[
717
+ dict(
718
+ prompt=
719
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
720
+ role='HUMAN'),
721
+ ]),
722
+ type=
723
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
724
+ retriever=dict(
725
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
726
+ name='2wikimqa',
727
+ path='opencompass/Longbench',
728
+ reader_cfg=dict(
729
+ input_columns=[
730
+ 'context',
731
+ 'input',
732
+ ],
733
+ output_column='answers',
734
+ test_range='[75:100]',
735
+ test_split='test',
736
+ train_split='test'),
737
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
738
+ dict(
739
+ abbr='LongBench_hotpotqa_3',
740
+ eval_cfg=dict(
741
+ evaluator=dict(
742
+ type='opencompass.datasets.LongBenchF1Evaluator'),
743
+ pred_role='BOT'),
744
+ infer_cfg=dict(
745
+ inferencer=dict(
746
+ max_out_len=32,
747
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
748
+ prompt_template=dict(
749
+ template=dict(round=[
750
+ dict(
751
+ prompt=
752
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
753
+ role='HUMAN'),
754
+ ]),
755
+ type=
756
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
757
+ retriever=dict(
758
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
759
+ name='hotpotqa',
760
+ path='opencompass/Longbench',
761
+ reader_cfg=dict(
762
+ input_columns=[
763
+ 'context',
764
+ 'input',
765
+ ],
766
+ output_column='answers',
767
+ test_range='[75:100]',
768
+ test_split='test',
769
+ train_split='test'),
770
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
771
+ dict(
772
+ abbr='LongBench_musique_3',
773
+ eval_cfg=dict(
774
+ evaluator=dict(
775
+ type='opencompass.datasets.LongBenchF1Evaluator'),
776
+ pred_role='BOT'),
777
+ infer_cfg=dict(
778
+ inferencer=dict(
779
+ max_out_len=32,
780
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
781
+ prompt_template=dict(
782
+ template=dict(round=[
783
+ dict(
784
+ prompt=
785
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
786
+ role='HUMAN'),
787
+ ]),
788
+ type=
789
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
790
+ retriever=dict(
791
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
792
+ name='musique',
793
+ path='opencompass/Longbench',
794
+ reader_cfg=dict(
795
+ input_columns=[
796
+ 'context',
797
+ 'input',
798
+ ],
799
+ output_column='answers',
800
+ test_range='[75:100]',
801
+ test_split='test',
802
+ train_split='test'),
803
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
804
+ dict(
805
+ abbr='LongBench_multifieldqa_en_3',
806
+ eval_cfg=dict(
807
+ evaluator=dict(
808
+ type='opencompass.datasets.LongBenchF1Evaluator'),
809
+ pred_role='BOT'),
810
+ infer_cfg=dict(
811
+ inferencer=dict(
812
+ max_out_len=64,
813
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
814
+ prompt_template=dict(
815
+ template=dict(round=[
816
+ dict(
817
+ prompt=
818
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
819
+ role='HUMAN'),
820
+ ]),
821
+ type=
822
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
823
+ retriever=dict(
824
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
825
+ name='multifieldqa_en',
826
+ path='opencompass/Longbench',
827
+ reader_cfg=dict(
828
+ input_columns=[
829
+ 'context',
830
+ 'input',
831
+ ],
832
+ output_column='answers',
833
+ test_range='[57:76]',
834
+ test_split='test',
835
+ train_split='test'),
836
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
837
+ dict(
838
+ abbr='LongBench_multifieldqa_zh_3',
839
+ eval_cfg=dict(
840
+ evaluator=dict(
841
+ language='zh',
842
+ type='opencompass.datasets.LongBenchF1Evaluator'),
843
+ pred_role='BOT'),
844
+ infer_cfg=dict(
845
+ inferencer=dict(
846
+ max_out_len=64,
847
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
848
+ prompt_template=dict(
849
+ template=dict(round=[
850
+ dict(
851
+ prompt=
852
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
853
+ role='HUMAN'),
854
+ ]),
855
+ type=
856
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
857
+ retriever=dict(
858
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
859
+ name='multifieldqa_zh',
860
+ path='opencompass/Longbench',
861
+ reader_cfg=dict(
862
+ input_columns=[
863
+ 'context',
864
+ 'input',
865
+ ],
866
+ output_column='answers',
867
+ test_range='[75:100]',
868
+ test_split='test',
869
+ train_split='test'),
870
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
871
+ dict(
872
+ abbr='LongBench_narrativeqa_3',
873
+ eval_cfg=dict(
874
+ evaluator=dict(
875
+ type='opencompass.datasets.LongBenchF1Evaluator'),
876
+ pred_role='BOT'),
877
+ infer_cfg=dict(
878
+ inferencer=dict(
879
+ max_out_len=128,
880
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
881
+ prompt_template=dict(
882
+ template=dict(round=[
883
+ dict(
884
+ prompt=
885
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
886
+ role='HUMAN'),
887
+ ]),
888
+ type=
889
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
890
+ retriever=dict(
891
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
892
+ name='narrativeqa',
893
+ path='opencompass/Longbench',
894
+ reader_cfg=dict(
895
+ input_columns=[
896
+ 'context',
897
+ 'input',
898
+ ],
899
+ output_column='answers',
900
+ test_range='[75:100]',
901
+ test_split='test',
902
+ train_split='test'),
903
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
904
+ dict(
905
+ abbr='LongBench_qasper_3',
906
+ eval_cfg=dict(
907
+ evaluator=dict(
908
+ type='opencompass.datasets.LongBenchF1Evaluator'),
909
+ pred_role='BOT'),
910
+ infer_cfg=dict(
911
+ inferencer=dict(
912
+ max_out_len=32,
913
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
914
+ prompt_template=dict(
915
+ template=dict(round=[
916
+ dict(
917
+ prompt=
918
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
919
+ role='HUMAN'),
920
+ ]),
921
+ type=
922
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
923
+ retriever=dict(
924
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
925
+ name='qasper',
926
+ path='opencompass/Longbench',
927
+ reader_cfg=dict(
928
+ input_columns=[
929
+ 'context',
930
+ 'input',
931
+ ],
932
+ output_column='answers',
933
+ test_range='[75:100]',
934
+ test_split='test',
935
+ train_split='test'),
936
+ type='opencompass.datasets.LongBenchqasperDataset'),
937
+ dict(
938
+ abbr='LongBench_triviaqa_3',
939
+ eval_cfg=dict(
940
+ evaluator=dict(
941
+ type='opencompass.datasets.LongBenchF1Evaluator'),
942
+ pred_postprocessor=dict(
943
+ type='opencompass.datasets.triviaqa_postprocess'),
944
+ pred_role='BOT'),
945
+ infer_cfg=dict(
946
+ inferencer=dict(
947
+ max_out_len=32,
948
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
949
+ prompt_template=dict(
950
+ template=dict(round=[
951
+ dict(
952
+ prompt=
953
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
954
+ role='HUMAN'),
955
+ ]),
956
+ type=
957
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
958
+ retriever=dict(
959
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
960
+ name='triviaqa',
961
+ path='opencompass/Longbench',
962
+ reader_cfg=dict(
963
+ input_columns=[
964
+ 'context',
965
+ 'input',
966
+ ],
967
+ output_column='answers',
968
+ test_range='[75:100]',
969
+ test_split='test',
970
+ train_split='test'),
971
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
972
+ dict(
973
+ abbr='LongBench_gov_report_3',
974
+ eval_cfg=dict(
975
+ evaluator=dict(
976
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
977
+ pred_role='BOT'),
978
+ infer_cfg=dict(
979
+ inferencer=dict(
980
+ max_out_len=512,
981
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
982
+ prompt_template=dict(
983
+ template=dict(round=[
984
+ dict(
985
+ prompt=
986
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
987
+ role='HUMAN'),
988
+ ]),
989
+ type=
990
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
991
+ retriever=dict(
992
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
993
+ name='gov_report',
994
+ path='opencompass/Longbench',
995
+ reader_cfg=dict(
996
+ input_columns=[
997
+ 'context',
998
+ ],
999
+ output_column='answers',
1000
+ test_range='[75:100]',
1001
+ test_split='test',
1002
+ train_split='test'),
1003
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
1004
+ dict(
1005
+ abbr='LongBench_qmsum_3',
1006
+ eval_cfg=dict(
1007
+ evaluator=dict(
1008
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1009
+ pred_role='BOT'),
1010
+ infer_cfg=dict(
1011
+ inferencer=dict(
1012
+ max_out_len=512,
1013
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1014
+ prompt_template=dict(
1015
+ template=dict(round=[
1016
+ dict(
1017
+ prompt=
1018
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
1019
+ role='HUMAN'),
1020
+ ]),
1021
+ type=
1022
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1023
+ retriever=dict(
1024
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1025
+ name='qmsum',
1026
+ path='opencompass/Longbench',
1027
+ reader_cfg=dict(
1028
+ input_columns=[
1029
+ 'context',
1030
+ 'input',
1031
+ ],
1032
+ output_column='answers',
1033
+ test_range='[75:100]',
1034
+ test_split='test',
1035
+ train_split='test'),
1036
+ type='opencompass.datasets.LongBenchqmsumDataset'),
1037
+ dict(
1038
+ abbr='LongBench_vcsum_3',
1039
+ eval_cfg=dict(
1040
+ evaluator=dict(
1041
+ language='zh',
1042
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1043
+ pred_role='BOT'),
1044
+ infer_cfg=dict(
1045
+ inferencer=dict(
1046
+ max_out_len=512,
1047
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1048
+ prompt_template=dict(
1049
+ template=dict(round=[
1050
+ dict(
1051
+ prompt=
1052
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
1053
+ role='HUMAN'),
1054
+ ]),
1055
+ type=
1056
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1057
+ retriever=dict(
1058
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1059
+ name='vcsum',
1060
+ path='opencompass/Longbench',
1061
+ reader_cfg=dict(
1062
+ input_columns=[
1063
+ 'context',
1064
+ ],
1065
+ output_column='answers',
1066
+ test_range='[75:100]',
1067
+ test_split='test',
1068
+ train_split='test'),
1069
+ type='opencompass.datasets.LongBenchvcsumDataset'),
1070
+ dict(
1071
+ abbr='LongBench_dureader_3',
1072
+ eval_cfg=dict(
1073
+ evaluator=dict(
1074
+ language='zh',
1075
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1076
+ pred_role='BOT'),
1077
+ infer_cfg=dict(
1078
+ inferencer=dict(
1079
+ max_out_len=128,
1080
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1081
+ prompt_template=dict(
1082
+ template=dict(round=[
1083
+ dict(
1084
+ prompt=
1085
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
1086
+ role='HUMAN'),
1087
+ ]),
1088
+ type=
1089
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1090
+ retriever=dict(
1091
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1092
+ name='dureader',
1093
+ path='opencompass/Longbench',
1094
+ reader_cfg=dict(
1095
+ input_columns=[
1096
+ 'context',
1097
+ 'input',
1098
+ ],
1099
+ output_column='answers',
1100
+ test_range='[75:100]',
1101
+ test_split='test',
1102
+ train_split='test'),
1103
+ type='opencompass.datasets.LongBenchdureaderDataset'),
1104
+ dict(
1105
+ abbr='LongBench_lcc_3',
1106
+ eval_cfg=dict(
1107
+ evaluator=dict(
1108
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1109
+ pred_role='BOT'),
1110
+ infer_cfg=dict(
1111
+ inferencer=dict(
1112
+ max_out_len=64,
1113
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1114
+ prompt_template=dict(
1115
+ template=dict(round=[
1116
+ dict(
1117
+ prompt=
1118
+ 'Please complete the code given below. \n{context}Next line of code:\n',
1119
+ role='HUMAN'),
1120
+ ]),
1121
+ type=
1122
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1123
+ retriever=dict(
1124
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1125
+ name='lcc',
1126
+ path='opencompass/Longbench',
1127
+ reader_cfg=dict(
1128
+ input_columns=[
1129
+ 'context',
1130
+ ],
1131
+ output_column='answers',
1132
+ test_range='[189:252]',
1133
+ test_split='test',
1134
+ train_split='test'),
1135
+ type='opencompass.datasets.LongBenchlccDataset'),
1136
+ dict(
1137
+ abbr='LongBench_repobench-p_3',
1138
+ eval_cfg=dict(
1139
+ evaluator=dict(
1140
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1141
+ pred_role='BOT'),
1142
+ infer_cfg=dict(
1143
+ inferencer=dict(
1144
+ max_out_len=64,
1145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1146
+ prompt_template=dict(
1147
+ template=dict(round=[
1148
+ dict(
1149
+ prompt=
1150
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
1151
+ role='HUMAN'),
1152
+ ]),
1153
+ type=
1154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1155
+ retriever=dict(
1156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1157
+ name='repobench-p',
1158
+ path='opencompass/Longbench',
1159
+ reader_cfg=dict(
1160
+ input_columns=[
1161
+ 'context',
1162
+ 'input',
1163
+ ],
1164
+ output_column='answers',
1165
+ test_range='[189:252]',
1166
+ test_split='test',
1167
+ train_split='test'),
1168
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
1169
+ dict(
1170
+ abbr='LongBench_passage_retrieval_en_3',
1171
+ eval_cfg=dict(
1172
+ evaluator=dict(
1173
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1174
+ pred_role='BOT'),
1175
+ infer_cfg=dict(
1176
+ inferencer=dict(
1177
+ max_out_len=32,
1178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1179
+ prompt_template=dict(
1180
+ template=dict(round=[
1181
+ dict(
1182
+ prompt=
1183
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
1184
+ role='HUMAN'),
1185
+ ]),
1186
+ type=
1187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1188
+ retriever=dict(
1189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1190
+ name='passage_retrieval_en',
1191
+ path='opencompass/Longbench',
1192
+ reader_cfg=dict(
1193
+ input_columns=[
1194
+ 'context',
1195
+ 'input',
1196
+ ],
1197
+ output_column='answers',
1198
+ test_range='[75:100]',
1199
+ test_split='test',
1200
+ train_split='test'),
1201
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
1202
+ dict(
1203
+ abbr='LongBench_passage_retrieval_zh_3',
1204
+ eval_cfg=dict(
1205
+ evaluator=dict(
1206
+ language='zh',
1207
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1208
+ pred_role='BOT'),
1209
+ infer_cfg=dict(
1210
+ inferencer=dict(
1211
+ max_out_len=32,
1212
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1213
+ prompt_template=dict(
1214
+ template=dict(round=[
1215
+ dict(
1216
+ prompt=
1217
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
1218
+ role='HUMAN'),
1219
+ ]),
1220
+ type=
1221
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1222
+ retriever=dict(
1223
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1224
+ name='passage_retrieval_zh',
1225
+ path='opencompass/Longbench',
1226
+ reader_cfg=dict(
1227
+ input_columns=[
1228
+ 'context',
1229
+ 'input',
1230
+ ],
1231
+ output_column='answers',
1232
+ test_range='[75:100]',
1233
+ test_split='test',
1234
+ train_split='test'),
1235
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
1236
+ dict(
1237
+ abbr='LongBench_passage_count_3',
1238
+ eval_cfg=dict(
1239
+ evaluator=dict(
1240
+ type='opencompass.datasets.LongBenchCountEvaluator'),
1241
+ pred_role='BOT'),
1242
+ infer_cfg=dict(
1243
+ inferencer=dict(
1244
+ max_out_len=32,
1245
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1246
+ prompt_template=dict(
1247
+ template=dict(round=[
1248
+ dict(
1249
+ prompt=
1250
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
1251
+ role='HUMAN'),
1252
+ ]),
1253
+ type=
1254
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1255
+ retriever=dict(
1256
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1257
+ name='passage_count',
1258
+ path='opencompass/Longbench',
1259
+ reader_cfg=dict(
1260
+ input_columns=[
1261
+ 'context',
1262
+ 'input',
1263
+ ],
1264
+ output_column='answers',
1265
+ test_range='[75:100]',
1266
+ test_split='test',
1267
+ train_split='test'),
1268
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
1269
+ dict(
1270
+ abbr='LongBench_trec_3',
1271
+ eval_cfg=dict(
1272
+ evaluator=dict(
1273
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1274
+ ),
1275
+ pred_postprocessor=dict(
1276
+ type='opencompass.datasets.trec_postprocess'),
1277
+ pred_role='BOT'),
1278
+ infer_cfg=dict(
1279
+ inferencer=dict(
1280
+ max_out_len=64,
1281
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1282
+ prompt_template=dict(
1283
+ template=dict(round=[
1284
+ dict(
1285
+ prompt=
1286
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
1287
+ role='HUMAN'),
1288
+ ]),
1289
+ type=
1290
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1291
+ retriever=dict(
1292
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1293
+ name='trec',
1294
+ path='opencompass/Longbench',
1295
+ reader_cfg=dict(
1296
+ input_columns=[
1297
+ 'context',
1298
+ 'input',
1299
+ ],
1300
+ output_column='all_labels',
1301
+ test_range='[75:100]',
1302
+ test_split='test',
1303
+ train_split='test'),
1304
+ type='opencompass.datasets.LongBenchtrecDataset'),
1305
+ dict(
1306
+ abbr='LongBench_lsht_3',
1307
+ eval_cfg=dict(
1308
+ evaluator=dict(
1309
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1310
+ ),
1311
+ pred_postprocessor=dict(
1312
+ type='opencompass.datasets.lsht_postprocess'),
1313
+ pred_role='BOT'),
1314
+ infer_cfg=dict(
1315
+ inferencer=dict(
1316
+ max_out_len=64,
1317
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1318
+ prompt_template=dict(
1319
+ template=dict(round=[
1320
+ dict(
1321
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
1322
+ role='HUMAN'),
1323
+ ]),
1324
+ type=
1325
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1326
+ retriever=dict(
1327
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1328
+ name='lsht',
1329
+ path='opencompass/Longbench',
1330
+ reader_cfg=dict(
1331
+ input_columns=[
1332
+ 'context',
1333
+ 'input',
1334
+ ],
1335
+ output_column='all_labels',
1336
+ test_range='[75:100]',
1337
+ test_split='test',
1338
+ train_split='test'),
1339
+ type='opencompass.datasets.LongBenchlshtDataset'),
1340
+ dict(
1341
+ abbr='LongBench_multi_news_3',
1342
+ eval_cfg=dict(
1343
+ evaluator=dict(
1344
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1345
+ pred_role='BOT'),
1346
+ infer_cfg=dict(
1347
+ inferencer=dict(
1348
+ max_out_len=512,
1349
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1350
+ prompt_template=dict(
1351
+ template=dict(round=[
1352
+ dict(
1353
+ prompt=
1354
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
1355
+ role='HUMAN'),
1356
+ ]),
1357
+ type=
1358
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1359
+ retriever=dict(
1360
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1361
+ name='multi_news',
1362
+ path='opencompass/Longbench',
1363
+ reader_cfg=dict(
1364
+ input_columns=[
1365
+ 'context',
1366
+ ],
1367
+ output_column='answers',
1368
+ test_range='[75:100]',
1369
+ test_split='test',
1370
+ train_split='test'),
1371
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
1372
+ dict(
1373
+ abbr='LongBench_samsum_3',
1374
+ eval_cfg=dict(
1375
+ evaluator=dict(
1376
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1377
+ pred_postprocessor=dict(
1378
+ type='opencompass.datasets.samsum_postprocess'),
1379
+ pred_role='BOT'),
1380
+ infer_cfg=dict(
1381
+ inferencer=dict(
1382
+ max_out_len=128,
1383
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1384
+ prompt_template=dict(
1385
+ template=dict(round=[
1386
+ dict(
1387
+ prompt=
1388
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
1389
+ role='HUMAN'),
1390
+ ]),
1391
+ type=
1392
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1393
+ retriever=dict(
1394
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1395
+ name='samsum',
1396
+ path='opencompass/Longbench',
1397
+ reader_cfg=dict(
1398
+ input_columns=[
1399
+ 'context',
1400
+ 'input',
1401
+ ],
1402
+ output_column='answers',
1403
+ test_range='[75:100]',
1404
+ test_split='test',
1405
+ train_split='test'),
1406
+ type='opencompass.datasets.LongBenchsamsumDataset'),
1407
+ ],
1408
+ ]
1409
+ models = [
1410
+ dict(
1411
+ abbr='delta_net',
1412
+ batch_size=128,
1413
+ max_seq_len=2048,
1414
+ model_kwargs=dict(
1415
+ device_map='auto',
1416
+ torch_dtype='torch.bfloat16',
1417
+ trust_remote_code=True),
1418
+ path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1419
+ run_cfg=dict(num_gpus=1),
1420
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
1421
+ tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1422
+ type='opencompass.models.HuggingFaceBaseModel'),
1423
+ ]
1424
+ work_dir = 'outputs/default/20251127_221150'
tmp/3baffa8c-bc69-4789-aa49-f30266896eb4_params.py ADDED
File without changes
tmp/3bc1afd5-60f6-4b89-9fc0-909218b5c248_params.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_musique',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchF1Evaluator'),
8
+ pred_role='BOT'),
9
+ infer_cfg=dict(
10
+ inferencer=dict(
11
+ max_out_len=32,
12
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
13
+ prompt_template=dict(
14
+ template=dict(round=[
15
+ dict(
16
+ prompt=
17
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
18
+ role='HUMAN'),
19
+ ]),
20
+ type=
21
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
22
+ retriever=dict(
23
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
24
+ name='musique',
25
+ path='opencompass/Longbench',
26
+ reader_cfg=dict(
27
+ input_columns=[
28
+ 'context',
29
+ 'input',
30
+ ],
31
+ output_column='answers',
32
+ test_split='test',
33
+ train_split='test'),
34
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
35
+ ],
36
+ ]
37
+ eval = dict(runner=dict(task=dict(dump_details=True)))
38
+ models = [
39
+ dict(
40
+ abbr='gated_deltanet',
41
+ batch_size=128,
42
+ max_seq_len=2048,
43
+ model_kwargs=dict(
44
+ device_map='auto',
45
+ torch_dtype='torch.bfloat16',
46
+ trust_remote_code=True),
47
+ path='download_model/hgrn2-1.3B-100B',
48
+ run_cfg=dict(num_gpus=1),
49
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
50
+ tokenizer_path='download_model/hgrn2-1.3B-100B',
51
+ type='opencompass.models.HuggingFaceBaseModel'),
52
+ ]
53
+ work_dir = 'outputs/default/20251219_163447'
tmp/401500cf-6431-490c-9e43-14532e24796f_params.py ADDED
@@ -0,0 +1,1424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_2wikimqa_0',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchF1Evaluator'),
8
+ pred_role='BOT'),
9
+ infer_cfg=dict(
10
+ inferencer=dict(
11
+ max_out_len=32,
12
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
13
+ prompt_template=dict(
14
+ template=dict(round=[
15
+ dict(
16
+ prompt=
17
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
18
+ role='HUMAN'),
19
+ ]),
20
+ type=
21
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
22
+ retriever=dict(
23
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
24
+ name='2wikimqa',
25
+ path='opencompass/Longbench',
26
+ reader_cfg=dict(
27
+ input_columns=[
28
+ 'context',
29
+ 'input',
30
+ ],
31
+ output_column='answers',
32
+ test_range='[0:25]',
33
+ test_split='test',
34
+ train_split='test'),
35
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
36
+ dict(
37
+ abbr='LongBench_hotpotqa_0',
38
+ eval_cfg=dict(
39
+ evaluator=dict(
40
+ type='opencompass.datasets.LongBenchF1Evaluator'),
41
+ pred_role='BOT'),
42
+ infer_cfg=dict(
43
+ inferencer=dict(
44
+ max_out_len=32,
45
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
46
+ prompt_template=dict(
47
+ template=dict(round=[
48
+ dict(
49
+ prompt=
50
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
51
+ role='HUMAN'),
52
+ ]),
53
+ type=
54
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
55
+ retriever=dict(
56
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
57
+ name='hotpotqa',
58
+ path='opencompass/Longbench',
59
+ reader_cfg=dict(
60
+ input_columns=[
61
+ 'context',
62
+ 'input',
63
+ ],
64
+ output_column='answers',
65
+ test_range='[0:25]',
66
+ test_split='test',
67
+ train_split='test'),
68
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
69
+ dict(
70
+ abbr='LongBench_musique_0',
71
+ eval_cfg=dict(
72
+ evaluator=dict(
73
+ type='opencompass.datasets.LongBenchF1Evaluator'),
74
+ pred_role='BOT'),
75
+ infer_cfg=dict(
76
+ inferencer=dict(
77
+ max_out_len=32,
78
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
79
+ prompt_template=dict(
80
+ template=dict(round=[
81
+ dict(
82
+ prompt=
83
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
84
+ role='HUMAN'),
85
+ ]),
86
+ type=
87
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
88
+ retriever=dict(
89
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
90
+ name='musique',
91
+ path='opencompass/Longbench',
92
+ reader_cfg=dict(
93
+ input_columns=[
94
+ 'context',
95
+ 'input',
96
+ ],
97
+ output_column='answers',
98
+ test_range='[0:25]',
99
+ test_split='test',
100
+ train_split='test'),
101
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
102
+ dict(
103
+ abbr='LongBench_multifieldqa_en_0',
104
+ eval_cfg=dict(
105
+ evaluator=dict(
106
+ type='opencompass.datasets.LongBenchF1Evaluator'),
107
+ pred_role='BOT'),
108
+ infer_cfg=dict(
109
+ inferencer=dict(
110
+ max_out_len=64,
111
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
112
+ prompt_template=dict(
113
+ template=dict(round=[
114
+ dict(
115
+ prompt=
116
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
117
+ role='HUMAN'),
118
+ ]),
119
+ type=
120
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
121
+ retriever=dict(
122
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
123
+ name='multifieldqa_en',
124
+ path='opencompass/Longbench',
125
+ reader_cfg=dict(
126
+ input_columns=[
127
+ 'context',
128
+ 'input',
129
+ ],
130
+ output_column='answers',
131
+ test_range='[0:19]',
132
+ test_split='test',
133
+ train_split='test'),
134
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
135
+ dict(
136
+ abbr='LongBench_multifieldqa_zh_0',
137
+ eval_cfg=dict(
138
+ evaluator=dict(
139
+ language='zh',
140
+ type='opencompass.datasets.LongBenchF1Evaluator'),
141
+ pred_role='BOT'),
142
+ infer_cfg=dict(
143
+ inferencer=dict(
144
+ max_out_len=64,
145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
146
+ prompt_template=dict(
147
+ template=dict(round=[
148
+ dict(
149
+ prompt=
150
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
151
+ role='HUMAN'),
152
+ ]),
153
+ type=
154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
155
+ retriever=dict(
156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
157
+ name='multifieldqa_zh',
158
+ path='opencompass/Longbench',
159
+ reader_cfg=dict(
160
+ input_columns=[
161
+ 'context',
162
+ 'input',
163
+ ],
164
+ output_column='answers',
165
+ test_range='[0:25]',
166
+ test_split='test',
167
+ train_split='test'),
168
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
169
+ dict(
170
+ abbr='LongBench_narrativeqa_0',
171
+ eval_cfg=dict(
172
+ evaluator=dict(
173
+ type='opencompass.datasets.LongBenchF1Evaluator'),
174
+ pred_role='BOT'),
175
+ infer_cfg=dict(
176
+ inferencer=dict(
177
+ max_out_len=128,
178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
179
+ prompt_template=dict(
180
+ template=dict(round=[
181
+ dict(
182
+ prompt=
183
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
184
+ role='HUMAN'),
185
+ ]),
186
+ type=
187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
188
+ retriever=dict(
189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
190
+ name='narrativeqa',
191
+ path='opencompass/Longbench',
192
+ reader_cfg=dict(
193
+ input_columns=[
194
+ 'context',
195
+ 'input',
196
+ ],
197
+ output_column='answers',
198
+ test_range='[0:25]',
199
+ test_split='test',
200
+ train_split='test'),
201
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
202
+ dict(
203
+ abbr='LongBench_qasper_0',
204
+ eval_cfg=dict(
205
+ evaluator=dict(
206
+ type='opencompass.datasets.LongBenchF1Evaluator'),
207
+ pred_role='BOT'),
208
+ infer_cfg=dict(
209
+ inferencer=dict(
210
+ max_out_len=32,
211
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
212
+ prompt_template=dict(
213
+ template=dict(round=[
214
+ dict(
215
+ prompt=
216
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
217
+ role='HUMAN'),
218
+ ]),
219
+ type=
220
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
221
+ retriever=dict(
222
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
223
+ name='qasper',
224
+ path='opencompass/Longbench',
225
+ reader_cfg=dict(
226
+ input_columns=[
227
+ 'context',
228
+ 'input',
229
+ ],
230
+ output_column='answers',
231
+ test_range='[0:25]',
232
+ test_split='test',
233
+ train_split='test'),
234
+ type='opencompass.datasets.LongBenchqasperDataset'),
235
+ dict(
236
+ abbr='LongBench_triviaqa_0',
237
+ eval_cfg=dict(
238
+ evaluator=dict(
239
+ type='opencompass.datasets.LongBenchF1Evaluator'),
240
+ pred_postprocessor=dict(
241
+ type='opencompass.datasets.triviaqa_postprocess'),
242
+ pred_role='BOT'),
243
+ infer_cfg=dict(
244
+ inferencer=dict(
245
+ max_out_len=32,
246
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
247
+ prompt_template=dict(
248
+ template=dict(round=[
249
+ dict(
250
+ prompt=
251
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
252
+ role='HUMAN'),
253
+ ]),
254
+ type=
255
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
256
+ retriever=dict(
257
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
258
+ name='triviaqa',
259
+ path='opencompass/Longbench',
260
+ reader_cfg=dict(
261
+ input_columns=[
262
+ 'context',
263
+ 'input',
264
+ ],
265
+ output_column='answers',
266
+ test_range='[0:25]',
267
+ test_split='test',
268
+ train_split='test'),
269
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
270
+ dict(
271
+ abbr='LongBench_gov_report_0',
272
+ eval_cfg=dict(
273
+ evaluator=dict(
274
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
275
+ pred_role='BOT'),
276
+ infer_cfg=dict(
277
+ inferencer=dict(
278
+ max_out_len=512,
279
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
280
+ prompt_template=dict(
281
+ template=dict(round=[
282
+ dict(
283
+ prompt=
284
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
285
+ role='HUMAN'),
286
+ ]),
287
+ type=
288
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
289
+ retriever=dict(
290
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
291
+ name='gov_report',
292
+ path='opencompass/Longbench',
293
+ reader_cfg=dict(
294
+ input_columns=[
295
+ 'context',
296
+ ],
297
+ output_column='answers',
298
+ test_range='[0:25]',
299
+ test_split='test',
300
+ train_split='test'),
301
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
302
+ dict(
303
+ abbr='LongBench_qmsum_0',
304
+ eval_cfg=dict(
305
+ evaluator=dict(
306
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
307
+ pred_role='BOT'),
308
+ infer_cfg=dict(
309
+ inferencer=dict(
310
+ max_out_len=512,
311
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
312
+ prompt_template=dict(
313
+ template=dict(round=[
314
+ dict(
315
+ prompt=
316
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
317
+ role='HUMAN'),
318
+ ]),
319
+ type=
320
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
321
+ retriever=dict(
322
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
323
+ name='qmsum',
324
+ path='opencompass/Longbench',
325
+ reader_cfg=dict(
326
+ input_columns=[
327
+ 'context',
328
+ 'input',
329
+ ],
330
+ output_column='answers',
331
+ test_range='[0:25]',
332
+ test_split='test',
333
+ train_split='test'),
334
+ type='opencompass.datasets.LongBenchqmsumDataset'),
335
+ dict(
336
+ abbr='LongBench_vcsum_0',
337
+ eval_cfg=dict(
338
+ evaluator=dict(
339
+ language='zh',
340
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
341
+ pred_role='BOT'),
342
+ infer_cfg=dict(
343
+ inferencer=dict(
344
+ max_out_len=512,
345
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
346
+ prompt_template=dict(
347
+ template=dict(round=[
348
+ dict(
349
+ prompt=
350
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
351
+ role='HUMAN'),
352
+ ]),
353
+ type=
354
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
355
+ retriever=dict(
356
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
357
+ name='vcsum',
358
+ path='opencompass/Longbench',
359
+ reader_cfg=dict(
360
+ input_columns=[
361
+ 'context',
362
+ ],
363
+ output_column='answers',
364
+ test_range='[0:25]',
365
+ test_split='test',
366
+ train_split='test'),
367
+ type='opencompass.datasets.LongBenchvcsumDataset'),
368
+ dict(
369
+ abbr='LongBench_dureader_0',
370
+ eval_cfg=dict(
371
+ evaluator=dict(
372
+ language='zh',
373
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
374
+ pred_role='BOT'),
375
+ infer_cfg=dict(
376
+ inferencer=dict(
377
+ max_out_len=128,
378
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
379
+ prompt_template=dict(
380
+ template=dict(round=[
381
+ dict(
382
+ prompt=
383
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
384
+ role='HUMAN'),
385
+ ]),
386
+ type=
387
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
388
+ retriever=dict(
389
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
390
+ name='dureader',
391
+ path='opencompass/Longbench',
392
+ reader_cfg=dict(
393
+ input_columns=[
394
+ 'context',
395
+ 'input',
396
+ ],
397
+ output_column='answers',
398
+ test_range='[0:25]',
399
+ test_split='test',
400
+ train_split='test'),
401
+ type='opencompass.datasets.LongBenchdureaderDataset'),
402
+ dict(
403
+ abbr='LongBench_lcc_0',
404
+ eval_cfg=dict(
405
+ evaluator=dict(
406
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
407
+ pred_role='BOT'),
408
+ infer_cfg=dict(
409
+ inferencer=dict(
410
+ max_out_len=64,
411
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
412
+ prompt_template=dict(
413
+ template=dict(round=[
414
+ dict(
415
+ prompt=
416
+ 'Please complete the code given below. \n{context}Next line of code:\n',
417
+ role='HUMAN'),
418
+ ]),
419
+ type=
420
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
421
+ retriever=dict(
422
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
423
+ name='lcc',
424
+ path='opencompass/Longbench',
425
+ reader_cfg=dict(
426
+ input_columns=[
427
+ 'context',
428
+ ],
429
+ output_column='answers',
430
+ test_range='[0:63]',
431
+ test_split='test',
432
+ train_split='test'),
433
+ type='opencompass.datasets.LongBenchlccDataset'),
434
+ dict(
435
+ abbr='LongBench_repobench-p_0',
436
+ eval_cfg=dict(
437
+ evaluator=dict(
438
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
439
+ pred_role='BOT'),
440
+ infer_cfg=dict(
441
+ inferencer=dict(
442
+ max_out_len=64,
443
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
444
+ prompt_template=dict(
445
+ template=dict(round=[
446
+ dict(
447
+ prompt=
448
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
449
+ role='HUMAN'),
450
+ ]),
451
+ type=
452
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
453
+ retriever=dict(
454
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
455
+ name='repobench-p',
456
+ path='opencompass/Longbench',
457
+ reader_cfg=dict(
458
+ input_columns=[
459
+ 'context',
460
+ 'input',
461
+ ],
462
+ output_column='answers',
463
+ test_range='[0:63]',
464
+ test_split='test',
465
+ train_split='test'),
466
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
467
+ dict(
468
+ abbr='LongBench_passage_retrieval_en_0',
469
+ eval_cfg=dict(
470
+ evaluator=dict(
471
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
472
+ pred_role='BOT'),
473
+ infer_cfg=dict(
474
+ inferencer=dict(
475
+ max_out_len=32,
476
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
477
+ prompt_template=dict(
478
+ template=dict(round=[
479
+ dict(
480
+ prompt=
481
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
482
+ role='HUMAN'),
483
+ ]),
484
+ type=
485
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
486
+ retriever=dict(
487
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
488
+ name='passage_retrieval_en',
489
+ path='opencompass/Longbench',
490
+ reader_cfg=dict(
491
+ input_columns=[
492
+ 'context',
493
+ 'input',
494
+ ],
495
+ output_column='answers',
496
+ test_range='[0:25]',
497
+ test_split='test',
498
+ train_split='test'),
499
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
500
+ dict(
501
+ abbr='LongBench_passage_retrieval_zh_0',
502
+ eval_cfg=dict(
503
+ evaluator=dict(
504
+ language='zh',
505
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
506
+ pred_role='BOT'),
507
+ infer_cfg=dict(
508
+ inferencer=dict(
509
+ max_out_len=32,
510
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
511
+ prompt_template=dict(
512
+ template=dict(round=[
513
+ dict(
514
+ prompt=
515
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
516
+ role='HUMAN'),
517
+ ]),
518
+ type=
519
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
520
+ retriever=dict(
521
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
522
+ name='passage_retrieval_zh',
523
+ path='opencompass/Longbench',
524
+ reader_cfg=dict(
525
+ input_columns=[
526
+ 'context',
527
+ 'input',
528
+ ],
529
+ output_column='answers',
530
+ test_range='[0:25]',
531
+ test_split='test',
532
+ train_split='test'),
533
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
534
+ dict(
535
+ abbr='LongBench_passage_count_0',
536
+ eval_cfg=dict(
537
+ evaluator=dict(
538
+ type='opencompass.datasets.LongBenchCountEvaluator'),
539
+ pred_role='BOT'),
540
+ infer_cfg=dict(
541
+ inferencer=dict(
542
+ max_out_len=32,
543
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
544
+ prompt_template=dict(
545
+ template=dict(round=[
546
+ dict(
547
+ prompt=
548
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
549
+ role='HUMAN'),
550
+ ]),
551
+ type=
552
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
553
+ retriever=dict(
554
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
555
+ name='passage_count',
556
+ path='opencompass/Longbench',
557
+ reader_cfg=dict(
558
+ input_columns=[
559
+ 'context',
560
+ 'input',
561
+ ],
562
+ output_column='answers',
563
+ test_range='[0:25]',
564
+ test_split='test',
565
+ train_split='test'),
566
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
567
+ dict(
568
+ abbr='LongBench_trec_0',
569
+ eval_cfg=dict(
570
+ evaluator=dict(
571
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
572
+ ),
573
+ pred_postprocessor=dict(
574
+ type='opencompass.datasets.trec_postprocess'),
575
+ pred_role='BOT'),
576
+ infer_cfg=dict(
577
+ inferencer=dict(
578
+ max_out_len=64,
579
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
580
+ prompt_template=dict(
581
+ template=dict(round=[
582
+ dict(
583
+ prompt=
584
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
585
+ role='HUMAN'),
586
+ ]),
587
+ type=
588
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
589
+ retriever=dict(
590
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
591
+ name='trec',
592
+ path='opencompass/Longbench',
593
+ reader_cfg=dict(
594
+ input_columns=[
595
+ 'context',
596
+ 'input',
597
+ ],
598
+ output_column='all_labels',
599
+ test_range='[0:25]',
600
+ test_split='test',
601
+ train_split='test'),
602
+ type='opencompass.datasets.LongBenchtrecDataset'),
603
+ dict(
604
+ abbr='LongBench_lsht_0',
605
+ eval_cfg=dict(
606
+ evaluator=dict(
607
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
608
+ ),
609
+ pred_postprocessor=dict(
610
+ type='opencompass.datasets.lsht_postprocess'),
611
+ pred_role='BOT'),
612
+ infer_cfg=dict(
613
+ inferencer=dict(
614
+ max_out_len=64,
615
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
616
+ prompt_template=dict(
617
+ template=dict(round=[
618
+ dict(
619
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
620
+ role='HUMAN'),
621
+ ]),
622
+ type=
623
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
624
+ retriever=dict(
625
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
626
+ name='lsht',
627
+ path='opencompass/Longbench',
628
+ reader_cfg=dict(
629
+ input_columns=[
630
+ 'context',
631
+ 'input',
632
+ ],
633
+ output_column='all_labels',
634
+ test_range='[0:25]',
635
+ test_split='test',
636
+ train_split='test'),
637
+ type='opencompass.datasets.LongBenchlshtDataset'),
638
+ dict(
639
+ abbr='LongBench_multi_news_0',
640
+ eval_cfg=dict(
641
+ evaluator=dict(
642
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
643
+ pred_role='BOT'),
644
+ infer_cfg=dict(
645
+ inferencer=dict(
646
+ max_out_len=512,
647
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
648
+ prompt_template=dict(
649
+ template=dict(round=[
650
+ dict(
651
+ prompt=
652
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
653
+ role='HUMAN'),
654
+ ]),
655
+ type=
656
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
657
+ retriever=dict(
658
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
659
+ name='multi_news',
660
+ path='opencompass/Longbench',
661
+ reader_cfg=dict(
662
+ input_columns=[
663
+ 'context',
664
+ ],
665
+ output_column='answers',
666
+ test_range='[0:25]',
667
+ test_split='test',
668
+ train_split='test'),
669
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
670
+ dict(
671
+ abbr='LongBench_samsum_0',
672
+ eval_cfg=dict(
673
+ evaluator=dict(
674
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
675
+ pred_postprocessor=dict(
676
+ type='opencompass.datasets.samsum_postprocess'),
677
+ pred_role='BOT'),
678
+ infer_cfg=dict(
679
+ inferencer=dict(
680
+ max_out_len=128,
681
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
682
+ prompt_template=dict(
683
+ template=dict(round=[
684
+ dict(
685
+ prompt=
686
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
687
+ role='HUMAN'),
688
+ ]),
689
+ type=
690
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
691
+ retriever=dict(
692
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
693
+ name='samsum',
694
+ path='opencompass/Longbench',
695
+ reader_cfg=dict(
696
+ input_columns=[
697
+ 'context',
698
+ 'input',
699
+ ],
700
+ output_column='answers',
701
+ test_range='[0:25]',
702
+ test_split='test',
703
+ train_split='test'),
704
+ type='opencompass.datasets.LongBenchsamsumDataset'),
705
+ dict(
706
+ abbr='LongBench_2wikimqa_0',
707
+ eval_cfg=dict(
708
+ evaluator=dict(
709
+ type='opencompass.datasets.LongBenchF1Evaluator'),
710
+ pred_role='BOT'),
711
+ infer_cfg=dict(
712
+ inferencer=dict(
713
+ max_out_len=32,
714
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
715
+ prompt_template=dict(
716
+ template=dict(round=[
717
+ dict(
718
+ prompt=
719
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
720
+ role='HUMAN'),
721
+ ]),
722
+ type=
723
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
724
+ retriever=dict(
725
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
726
+ name='2wikimqa',
727
+ path='opencompass/Longbench',
728
+ reader_cfg=dict(
729
+ input_columns=[
730
+ 'context',
731
+ 'input',
732
+ ],
733
+ output_column='answers',
734
+ test_range='[0:25]',
735
+ test_split='test',
736
+ train_split='test'),
737
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
738
+ dict(
739
+ abbr='LongBench_hotpotqa_0',
740
+ eval_cfg=dict(
741
+ evaluator=dict(
742
+ type='opencompass.datasets.LongBenchF1Evaluator'),
743
+ pred_role='BOT'),
744
+ infer_cfg=dict(
745
+ inferencer=dict(
746
+ max_out_len=32,
747
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
748
+ prompt_template=dict(
749
+ template=dict(round=[
750
+ dict(
751
+ prompt=
752
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
753
+ role='HUMAN'),
754
+ ]),
755
+ type=
756
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
757
+ retriever=dict(
758
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
759
+ name='hotpotqa',
760
+ path='opencompass/Longbench',
761
+ reader_cfg=dict(
762
+ input_columns=[
763
+ 'context',
764
+ 'input',
765
+ ],
766
+ output_column='answers',
767
+ test_range='[0:25]',
768
+ test_split='test',
769
+ train_split='test'),
770
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
771
+ dict(
772
+ abbr='LongBench_musique_0',
773
+ eval_cfg=dict(
774
+ evaluator=dict(
775
+ type='opencompass.datasets.LongBenchF1Evaluator'),
776
+ pred_role='BOT'),
777
+ infer_cfg=dict(
778
+ inferencer=dict(
779
+ max_out_len=32,
780
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
781
+ prompt_template=dict(
782
+ template=dict(round=[
783
+ dict(
784
+ prompt=
785
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
786
+ role='HUMAN'),
787
+ ]),
788
+ type=
789
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
790
+ retriever=dict(
791
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
792
+ name='musique',
793
+ path='opencompass/Longbench',
794
+ reader_cfg=dict(
795
+ input_columns=[
796
+ 'context',
797
+ 'input',
798
+ ],
799
+ output_column='answers',
800
+ test_range='[0:25]',
801
+ test_split='test',
802
+ train_split='test'),
803
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
804
+ dict(
805
+ abbr='LongBench_multifieldqa_en_0',
806
+ eval_cfg=dict(
807
+ evaluator=dict(
808
+ type='opencompass.datasets.LongBenchF1Evaluator'),
809
+ pred_role='BOT'),
810
+ infer_cfg=dict(
811
+ inferencer=dict(
812
+ max_out_len=64,
813
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
814
+ prompt_template=dict(
815
+ template=dict(round=[
816
+ dict(
817
+ prompt=
818
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
819
+ role='HUMAN'),
820
+ ]),
821
+ type=
822
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
823
+ retriever=dict(
824
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
825
+ name='multifieldqa_en',
826
+ path='opencompass/Longbench',
827
+ reader_cfg=dict(
828
+ input_columns=[
829
+ 'context',
830
+ 'input',
831
+ ],
832
+ output_column='answers',
833
+ test_range='[0:19]',
834
+ test_split='test',
835
+ train_split='test'),
836
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
837
+ dict(
838
+ abbr='LongBench_multifieldqa_zh_0',
839
+ eval_cfg=dict(
840
+ evaluator=dict(
841
+ language='zh',
842
+ type='opencompass.datasets.LongBenchF1Evaluator'),
843
+ pred_role='BOT'),
844
+ infer_cfg=dict(
845
+ inferencer=dict(
846
+ max_out_len=64,
847
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
848
+ prompt_template=dict(
849
+ template=dict(round=[
850
+ dict(
851
+ prompt=
852
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
853
+ role='HUMAN'),
854
+ ]),
855
+ type=
856
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
857
+ retriever=dict(
858
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
859
+ name='multifieldqa_zh',
860
+ path='opencompass/Longbench',
861
+ reader_cfg=dict(
862
+ input_columns=[
863
+ 'context',
864
+ 'input',
865
+ ],
866
+ output_column='answers',
867
+ test_range='[0:25]',
868
+ test_split='test',
869
+ train_split='test'),
870
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
871
+ dict(
872
+ abbr='LongBench_narrativeqa_0',
873
+ eval_cfg=dict(
874
+ evaluator=dict(
875
+ type='opencompass.datasets.LongBenchF1Evaluator'),
876
+ pred_role='BOT'),
877
+ infer_cfg=dict(
878
+ inferencer=dict(
879
+ max_out_len=128,
880
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
881
+ prompt_template=dict(
882
+ template=dict(round=[
883
+ dict(
884
+ prompt=
885
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
886
+ role='HUMAN'),
887
+ ]),
888
+ type=
889
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
890
+ retriever=dict(
891
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
892
+ name='narrativeqa',
893
+ path='opencompass/Longbench',
894
+ reader_cfg=dict(
895
+ input_columns=[
896
+ 'context',
897
+ 'input',
898
+ ],
899
+ output_column='answers',
900
+ test_range='[0:25]',
901
+ test_split='test',
902
+ train_split='test'),
903
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
904
+ dict(
905
+ abbr='LongBench_qasper_0',
906
+ eval_cfg=dict(
907
+ evaluator=dict(
908
+ type='opencompass.datasets.LongBenchF1Evaluator'),
909
+ pred_role='BOT'),
910
+ infer_cfg=dict(
911
+ inferencer=dict(
912
+ max_out_len=32,
913
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
914
+ prompt_template=dict(
915
+ template=dict(round=[
916
+ dict(
917
+ prompt=
918
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
919
+ role='HUMAN'),
920
+ ]),
921
+ type=
922
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
923
+ retriever=dict(
924
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
925
+ name='qasper',
926
+ path='opencompass/Longbench',
927
+ reader_cfg=dict(
928
+ input_columns=[
929
+ 'context',
930
+ 'input',
931
+ ],
932
+ output_column='answers',
933
+ test_range='[0:25]',
934
+ test_split='test',
935
+ train_split='test'),
936
+ type='opencompass.datasets.LongBenchqasperDataset'),
937
+ dict(
938
+ abbr='LongBench_triviaqa_0',
939
+ eval_cfg=dict(
940
+ evaluator=dict(
941
+ type='opencompass.datasets.LongBenchF1Evaluator'),
942
+ pred_postprocessor=dict(
943
+ type='opencompass.datasets.triviaqa_postprocess'),
944
+ pred_role='BOT'),
945
+ infer_cfg=dict(
946
+ inferencer=dict(
947
+ max_out_len=32,
948
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
949
+ prompt_template=dict(
950
+ template=dict(round=[
951
+ dict(
952
+ prompt=
953
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
954
+ role='HUMAN'),
955
+ ]),
956
+ type=
957
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
958
+ retriever=dict(
959
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
960
+ name='triviaqa',
961
+ path='opencompass/Longbench',
962
+ reader_cfg=dict(
963
+ input_columns=[
964
+ 'context',
965
+ 'input',
966
+ ],
967
+ output_column='answers',
968
+ test_range='[0:25]',
969
+ test_split='test',
970
+ train_split='test'),
971
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
972
+ dict(
973
+ abbr='LongBench_gov_report_0',
974
+ eval_cfg=dict(
975
+ evaluator=dict(
976
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
977
+ pred_role='BOT'),
978
+ infer_cfg=dict(
979
+ inferencer=dict(
980
+ max_out_len=512,
981
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
982
+ prompt_template=dict(
983
+ template=dict(round=[
984
+ dict(
985
+ prompt=
986
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
987
+ role='HUMAN'),
988
+ ]),
989
+ type=
990
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
991
+ retriever=dict(
992
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
993
+ name='gov_report',
994
+ path='opencompass/Longbench',
995
+ reader_cfg=dict(
996
+ input_columns=[
997
+ 'context',
998
+ ],
999
+ output_column='answers',
1000
+ test_range='[0:25]',
1001
+ test_split='test',
1002
+ train_split='test'),
1003
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
1004
+ dict(
1005
+ abbr='LongBench_qmsum_0',
1006
+ eval_cfg=dict(
1007
+ evaluator=dict(
1008
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1009
+ pred_role='BOT'),
1010
+ infer_cfg=dict(
1011
+ inferencer=dict(
1012
+ max_out_len=512,
1013
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1014
+ prompt_template=dict(
1015
+ template=dict(round=[
1016
+ dict(
1017
+ prompt=
1018
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
1019
+ role='HUMAN'),
1020
+ ]),
1021
+ type=
1022
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1023
+ retriever=dict(
1024
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1025
+ name='qmsum',
1026
+ path='opencompass/Longbench',
1027
+ reader_cfg=dict(
1028
+ input_columns=[
1029
+ 'context',
1030
+ 'input',
1031
+ ],
1032
+ output_column='answers',
1033
+ test_range='[0:25]',
1034
+ test_split='test',
1035
+ train_split='test'),
1036
+ type='opencompass.datasets.LongBenchqmsumDataset'),
1037
+ dict(
1038
+ abbr='LongBench_vcsum_0',
1039
+ eval_cfg=dict(
1040
+ evaluator=dict(
1041
+ language='zh',
1042
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1043
+ pred_role='BOT'),
1044
+ infer_cfg=dict(
1045
+ inferencer=dict(
1046
+ max_out_len=512,
1047
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1048
+ prompt_template=dict(
1049
+ template=dict(round=[
1050
+ dict(
1051
+ prompt=
1052
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
1053
+ role='HUMAN'),
1054
+ ]),
1055
+ type=
1056
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1057
+ retriever=dict(
1058
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1059
+ name='vcsum',
1060
+ path='opencompass/Longbench',
1061
+ reader_cfg=dict(
1062
+ input_columns=[
1063
+ 'context',
1064
+ ],
1065
+ output_column='answers',
1066
+ test_range='[0:25]',
1067
+ test_split='test',
1068
+ train_split='test'),
1069
+ type='opencompass.datasets.LongBenchvcsumDataset'),
1070
+ dict(
1071
+ abbr='LongBench_dureader_0',
1072
+ eval_cfg=dict(
1073
+ evaluator=dict(
1074
+ language='zh',
1075
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1076
+ pred_role='BOT'),
1077
+ infer_cfg=dict(
1078
+ inferencer=dict(
1079
+ max_out_len=128,
1080
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1081
+ prompt_template=dict(
1082
+ template=dict(round=[
1083
+ dict(
1084
+ prompt=
1085
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
1086
+ role='HUMAN'),
1087
+ ]),
1088
+ type=
1089
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1090
+ retriever=dict(
1091
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1092
+ name='dureader',
1093
+ path='opencompass/Longbench',
1094
+ reader_cfg=dict(
1095
+ input_columns=[
1096
+ 'context',
1097
+ 'input',
1098
+ ],
1099
+ output_column='answers',
1100
+ test_range='[0:25]',
1101
+ test_split='test',
1102
+ train_split='test'),
1103
+ type='opencompass.datasets.LongBenchdureaderDataset'),
1104
+ dict(
1105
+ abbr='LongBench_lcc_0',
1106
+ eval_cfg=dict(
1107
+ evaluator=dict(
1108
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1109
+ pred_role='BOT'),
1110
+ infer_cfg=dict(
1111
+ inferencer=dict(
1112
+ max_out_len=64,
1113
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1114
+ prompt_template=dict(
1115
+ template=dict(round=[
1116
+ dict(
1117
+ prompt=
1118
+ 'Please complete the code given below. \n{context}Next line of code:\n',
1119
+ role='HUMAN'),
1120
+ ]),
1121
+ type=
1122
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1123
+ retriever=dict(
1124
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1125
+ name='lcc',
1126
+ path='opencompass/Longbench',
1127
+ reader_cfg=dict(
1128
+ input_columns=[
1129
+ 'context',
1130
+ ],
1131
+ output_column='answers',
1132
+ test_range='[0:63]',
1133
+ test_split='test',
1134
+ train_split='test'),
1135
+ type='opencompass.datasets.LongBenchlccDataset'),
1136
+ dict(
1137
+ abbr='LongBench_repobench-p_0',
1138
+ eval_cfg=dict(
1139
+ evaluator=dict(
1140
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1141
+ pred_role='BOT'),
1142
+ infer_cfg=dict(
1143
+ inferencer=dict(
1144
+ max_out_len=64,
1145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1146
+ prompt_template=dict(
1147
+ template=dict(round=[
1148
+ dict(
1149
+ prompt=
1150
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
1151
+ role='HUMAN'),
1152
+ ]),
1153
+ type=
1154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1155
+ retriever=dict(
1156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1157
+ name='repobench-p',
1158
+ path='opencompass/Longbench',
1159
+ reader_cfg=dict(
1160
+ input_columns=[
1161
+ 'context',
1162
+ 'input',
1163
+ ],
1164
+ output_column='answers',
1165
+ test_range='[0:63]',
1166
+ test_split='test',
1167
+ train_split='test'),
1168
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
1169
+ dict(
1170
+ abbr='LongBench_passage_retrieval_en_0',
1171
+ eval_cfg=dict(
1172
+ evaluator=dict(
1173
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1174
+ pred_role='BOT'),
1175
+ infer_cfg=dict(
1176
+ inferencer=dict(
1177
+ max_out_len=32,
1178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1179
+ prompt_template=dict(
1180
+ template=dict(round=[
1181
+ dict(
1182
+ prompt=
1183
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
1184
+ role='HUMAN'),
1185
+ ]),
1186
+ type=
1187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1188
+ retriever=dict(
1189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1190
+ name='passage_retrieval_en',
1191
+ path='opencompass/Longbench',
1192
+ reader_cfg=dict(
1193
+ input_columns=[
1194
+ 'context',
1195
+ 'input',
1196
+ ],
1197
+ output_column='answers',
1198
+ test_range='[0:25]',
1199
+ test_split='test',
1200
+ train_split='test'),
1201
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
1202
+ dict(
1203
+ abbr='LongBench_passage_retrieval_zh_0',
1204
+ eval_cfg=dict(
1205
+ evaluator=dict(
1206
+ language='zh',
1207
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1208
+ pred_role='BOT'),
1209
+ infer_cfg=dict(
1210
+ inferencer=dict(
1211
+ max_out_len=32,
1212
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1213
+ prompt_template=dict(
1214
+ template=dict(round=[
1215
+ dict(
1216
+ prompt=
1217
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
1218
+ role='HUMAN'),
1219
+ ]),
1220
+ type=
1221
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1222
+ retriever=dict(
1223
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1224
+ name='passage_retrieval_zh',
1225
+ path='opencompass/Longbench',
1226
+ reader_cfg=dict(
1227
+ input_columns=[
1228
+ 'context',
1229
+ 'input',
1230
+ ],
1231
+ output_column='answers',
1232
+ test_range='[0:25]',
1233
+ test_split='test',
1234
+ train_split='test'),
1235
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
1236
+ dict(
1237
+ abbr='LongBench_passage_count_0',
1238
+ eval_cfg=dict(
1239
+ evaluator=dict(
1240
+ type='opencompass.datasets.LongBenchCountEvaluator'),
1241
+ pred_role='BOT'),
1242
+ infer_cfg=dict(
1243
+ inferencer=dict(
1244
+ max_out_len=32,
1245
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1246
+ prompt_template=dict(
1247
+ template=dict(round=[
1248
+ dict(
1249
+ prompt=
1250
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
1251
+ role='HUMAN'),
1252
+ ]),
1253
+ type=
1254
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1255
+ retriever=dict(
1256
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1257
+ name='passage_count',
1258
+ path='opencompass/Longbench',
1259
+ reader_cfg=dict(
1260
+ input_columns=[
1261
+ 'context',
1262
+ 'input',
1263
+ ],
1264
+ output_column='answers',
1265
+ test_range='[0:25]',
1266
+ test_split='test',
1267
+ train_split='test'),
1268
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
1269
+ dict(
1270
+ abbr='LongBench_trec_0',
1271
+ eval_cfg=dict(
1272
+ evaluator=dict(
1273
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1274
+ ),
1275
+ pred_postprocessor=dict(
1276
+ type='opencompass.datasets.trec_postprocess'),
1277
+ pred_role='BOT'),
1278
+ infer_cfg=dict(
1279
+ inferencer=dict(
1280
+ max_out_len=64,
1281
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1282
+ prompt_template=dict(
1283
+ template=dict(round=[
1284
+ dict(
1285
+ prompt=
1286
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
1287
+ role='HUMAN'),
1288
+ ]),
1289
+ type=
1290
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1291
+ retriever=dict(
1292
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1293
+ name='trec',
1294
+ path='opencompass/Longbench',
1295
+ reader_cfg=dict(
1296
+ input_columns=[
1297
+ 'context',
1298
+ 'input',
1299
+ ],
1300
+ output_column='all_labels',
1301
+ test_range='[0:25]',
1302
+ test_split='test',
1303
+ train_split='test'),
1304
+ type='opencompass.datasets.LongBenchtrecDataset'),
1305
+ dict(
1306
+ abbr='LongBench_lsht_0',
1307
+ eval_cfg=dict(
1308
+ evaluator=dict(
1309
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1310
+ ),
1311
+ pred_postprocessor=dict(
1312
+ type='opencompass.datasets.lsht_postprocess'),
1313
+ pred_role='BOT'),
1314
+ infer_cfg=dict(
1315
+ inferencer=dict(
1316
+ max_out_len=64,
1317
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1318
+ prompt_template=dict(
1319
+ template=dict(round=[
1320
+ dict(
1321
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
1322
+ role='HUMAN'),
1323
+ ]),
1324
+ type=
1325
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1326
+ retriever=dict(
1327
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1328
+ name='lsht',
1329
+ path='opencompass/Longbench',
1330
+ reader_cfg=dict(
1331
+ input_columns=[
1332
+ 'context',
1333
+ 'input',
1334
+ ],
1335
+ output_column='all_labels',
1336
+ test_range='[0:25]',
1337
+ test_split='test',
1338
+ train_split='test'),
1339
+ type='opencompass.datasets.LongBenchlshtDataset'),
1340
+ dict(
1341
+ abbr='LongBench_multi_news_0',
1342
+ eval_cfg=dict(
1343
+ evaluator=dict(
1344
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1345
+ pred_role='BOT'),
1346
+ infer_cfg=dict(
1347
+ inferencer=dict(
1348
+ max_out_len=512,
1349
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1350
+ prompt_template=dict(
1351
+ template=dict(round=[
1352
+ dict(
1353
+ prompt=
1354
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
1355
+ role='HUMAN'),
1356
+ ]),
1357
+ type=
1358
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1359
+ retriever=dict(
1360
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1361
+ name='multi_news',
1362
+ path='opencompass/Longbench',
1363
+ reader_cfg=dict(
1364
+ input_columns=[
1365
+ 'context',
1366
+ ],
1367
+ output_column='answers',
1368
+ test_range='[0:25]',
1369
+ test_split='test',
1370
+ train_split='test'),
1371
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
1372
+ dict(
1373
+ abbr='LongBench_samsum_0',
1374
+ eval_cfg=dict(
1375
+ evaluator=dict(
1376
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1377
+ pred_postprocessor=dict(
1378
+ type='opencompass.datasets.samsum_postprocess'),
1379
+ pred_role='BOT'),
1380
+ infer_cfg=dict(
1381
+ inferencer=dict(
1382
+ max_out_len=128,
1383
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1384
+ prompt_template=dict(
1385
+ template=dict(round=[
1386
+ dict(
1387
+ prompt=
1388
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
1389
+ role='HUMAN'),
1390
+ ]),
1391
+ type=
1392
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1393
+ retriever=dict(
1394
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1395
+ name='samsum',
1396
+ path='opencompass/Longbench',
1397
+ reader_cfg=dict(
1398
+ input_columns=[
1399
+ 'context',
1400
+ 'input',
1401
+ ],
1402
+ output_column='answers',
1403
+ test_range='[0:25]',
1404
+ test_split='test',
1405
+ train_split='test'),
1406
+ type='opencompass.datasets.LongBenchsamsumDataset'),
1407
+ ],
1408
+ ]
1409
+ models = [
1410
+ dict(
1411
+ abbr='delta_net',
1412
+ batch_size=128,
1413
+ max_seq_len=2048,
1414
+ model_kwargs=dict(
1415
+ device_map='auto',
1416
+ torch_dtype='torch.bfloat16',
1417
+ trust_remote_code=True),
1418
+ path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1419
+ run_cfg=dict(num_gpus=1),
1420
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
1421
+ tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1422
+ type='opencompass.models.HuggingFaceBaseModel'),
1423
+ ]
1424
+ work_dir = 'outputs/default/20251127_221150'