msj19 commited on
Commit
8082566
·
verified ·
1 Parent(s): dc367ce

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. examples/eval_academic_leaderboard_202412.py +129 -0
  2. examples/eval_academic_leaderboard_202502.py +137 -0
  3. examples/eval_babilong.py +51 -0
  4. examples/eval_bench_intern_s1.py +169 -0
  5. examples/eval_cascade_evaluator.py +130 -0
  6. examples/eval_chat_agent.py +67 -0
  7. examples/eval_chat_demo.py +14 -0
  8. examples/eval_chat_last.py +35 -0
  9. examples/eval_chatml_datasets.py +51 -0
  10. examples/eval_chembench.py +23 -0
  11. examples/eval_chinese_simpleqa.py +73 -0
  12. examples/eval_cibench.py +154 -0
  13. examples/eval_claude.py +19 -0
  14. examples/eval_codeagent.py +52 -0
  15. examples/eval_codebench_full.py +155 -0
  16. examples/eval_compassarena_subjectivebench_bradleyterry.py +119 -0
  17. examples/eval_contamination.py +21 -0
  18. examples/eval_corebench_2409_longcontext.py +127 -0
  19. examples/eval_corebench_2409_subjective.py +123 -0
  20. examples/eval_edgellm_demo.py +65 -0
  21. examples/eval_gpt3.5.py +38 -0
  22. examples/eval_hellobench.py +106 -0
  23. examples/eval_internlm2_chat_keyset.py +46 -0
  24. examples/eval_internlm2_keyset.py +24 -0
  25. examples/eval_internlm3_math500_thinking.py +120 -0
  26. examples/eval_internlm_chat_lmdeploy_apiserver.py +58 -0
  27. examples/eval_internlm_flames_chat.py +116 -0
  28. examples/eval_internlm_lmdeploy_apiserver.py +43 -0
  29. examples/eval_internlm_math_chat.py +17 -0
  30. examples/eval_lightllm.py +52 -0
  31. examples/eval_math_llm_judge_internal.py +43 -0
  32. examples/eval_mathbench.py +41 -0
  33. examples/eval_modelscope_datasets.py +112 -0
  34. examples/eval_qwen_7b.py +58 -0
  35. examples/eval_ruler_fix_tokenizer.py +38 -0
  36. examples/eval_subjective_alpacaeval_official.py +72 -0
  37. requirements/vllm.txt +1 -0
  38. tmp/08b1e522-33ea-430a-ba78-4d273bf09a88_params.py +1424 -0
  39. tmp/0954e290-fcd0-400c-8c58-f14a577dc5e4_params.py +1424 -0
  40. tmp/0985e09b-75af-404f-ac0c-079c3aa085fb_params.py +0 -0
  41. tmp/09d7374d-16f6-44e6-a2fa-f4925f8fb3fc_params.py +56 -0
  42. tmp/0a5aa083-12c4-41a8-92db-57a728f50ed5_params.py +0 -0
  43. tmp/0bd141af-ea86-420f-b26c-b2890fc57de2_params.py +56 -0
  44. tmp/0c3d2c0a-49a1-40b1-b0b6-3d32b7381062_params.py +1420 -0
  45. tmp/0d03fed5-a949-4dc0-815b-cf2f740d6181_params.py +53 -0
  46. tmp/0d2ff363-9d6a-489c-b18d-e978d436a065_params.py +0 -0
  47. tmp/10481e04-ca08-4f83-972f-e8fccc958b91_params.py +61 -0
  48. tmp/104a1807-a194-4864-99ea-1a9fe1a47bac_params.py +0 -0
  49. tmp/11308d03-3ab0-43b0-9f06-64b71c4140c1_params.py +55 -0
  50. tmp/1405e46f-8be4-462d-a794-3b47ef9839c2_params.py +1424 -0
examples/eval_academic_leaderboard_202412.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""OpenCompass config: academic leaderboard evaluation (2024-12 edition).

Executed as an mmengine config module; the framework picks up the
conventional top-level names `datasets`, `summarizer`, `models`,
`infer`, `eval` and `work_dir`.
"""
import os.path as osp  # NOTE(review): appears unused in this config — confirm before removing

from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner, VOLCRunner  # VOLCRunner kept for optional cloud runs
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

#######################################################################
#                     PART 0  Essential Configs                       #
#######################################################################
with read_base():
    # Datasets Part
    # Knowledge
    # Math
    from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
        aime2024_datasets
    from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import \
        bbh_datasets
    # General Reasoning
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
        gpqa_datasets
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
        humaneval_datasets
    # Instruction Following
    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
        ifeval_datasets
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
        LCBCodeGeneration_dataset
    from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \
        math_datasets
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
        mmlu_pro_datasets
    # Model List
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model
    # Summary Groups
    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups

#######################################################################
#                     PART 1  Datasets List                           #
#######################################################################
# datasets list for evaluation
# Only take LCB generation for evaluation.
# Collects every `*_datasets` list imported above by name convention,
# so the variable names of the imports are load-bearing.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
               []) + [LCBCodeGeneration_dataset]

#######################################################################
#                     PART 2  Dataset Summarizer                      #
#######################################################################

# 'core_average' aggregates one headline metric per benchmark.
core_summary_groups = [
    {
        'name':
        'core_average',
        'subsets': [
            ['IFEval', 'Prompt-level-strict-accuracy'],
            ['bbh', 'naive_average'],
            ['math_prm800k_500', 'accuracy'],
            ['aime2024', 'accuracy'],
            ['GPQA_diamond', 'accuracy'],
            ['mmlu_pro', 'naive_average'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['lcb_code_generation', 'pass@1'],
        ],
    },
]

# Report layout: '' entries render as blank separator rows, bare strings
# as section headers, [abbr, metric] pairs as metric rows.
summarizer = dict(
    dataset_abbrs=[
        ['core_average', 'naive_average'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        '',
        'Math Calculation',
        ['math_prm800k_500', 'accuracy'],
        ['aime2024', 'accuracy'],
        '',
        'Knowledge',
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['lcb_code_generation', 'pass@1'],
    ],
    # Merge every `*_summary_groups` list defined above (again by name).
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

#######################################################################
#                     PART 3  Models List                             #
#######################################################################

# Collect every `*_model` list imported above.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

#######################################################################
#                     PART 4  Inference/Evaluation Configuration      #
#######################################################################

# Local Runner
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask),
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)

#######################################################################
#                     PART 5  Utils Configuration                     #
#######################################################################
work_dir = './outputs/oc_academic_202412'
examples/eval_academic_leaderboard_202502.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# flake8: noqa
"""OpenCompass config: academic leaderboard evaluation (2025-02 edition).

Same structure as the 2024-12 leaderboard config, but with LLM-as-judge
dataset variants and a larger generation budget per sample.
"""

from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner, VOLCRunner  # VOLCRunner kept for optional cloud runs
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

#######################################################################
#                     PART 0  Essential Configs                       #
#######################################################################
with read_base():
    # Datasets Part
    # Knowledge
    # Math
    from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \
        aime2024_datasets
    from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \
        bbh_datasets
    # General Reasoning
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
        gpqa_datasets
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
        humaneval_datasets
    # Instruction Following
    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
        ifeval_datasets
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
        LCBCodeGeneration_dataset
    from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \
        math_datasets
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
        mmlu_pro_datasets
    # Model List
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model
    # Summary Groups
    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups

#######################################################################
#                     PART 1  Datasets List                           #
#######################################################################
# datasets list for evaluation
# Only take LCB generation for evaluation.
# Collects every `*_datasets` list imported above by name convention.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
               []) + [LCBCodeGeneration_dataset]

# LLM judge config: using LLM to evaluate predictions.
# Left empty here — fill in the judge model before running; an empty
# dict relies on the dataset configs' own defaults.
judge_cfg = dict()
for dataset in datasets:
    # Allow long chain-of-thought generations.
    dataset['infer_cfg']['inferencer']['max_out_len'] = 32768
    # Only override judge_cfg where the evaluator actually declares one.
    if 'judge_cfg' in dataset['eval_cfg']['evaluator']:
        dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg


#######################################################################
#                     PART 2  Dataset Summarizer                      #
#######################################################################

# 'core_average' aggregates one headline metric per benchmark.
core_summary_groups = [
    {
        'name':
        'core_average',
        'subsets': [
            ['IFEval', 'Prompt-level-strict-accuracy'],
            ['bbh', 'naive_average'],
            ['math_prm800k_500', 'accuracy'],
            ['aime2024', 'accuracy'],
            ['GPQA_diamond', 'accuracy'],
            ['mmlu_pro', 'naive_average'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['lcb_code_generation', 'pass@1'],
        ],
    },
]

# Report layout: '' entries are separators, bare strings section headers.
summarizer = dict(
    dataset_abbrs=[
        ['core_average', 'naive_average'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        '',
        'Math Calculation',
        ['math_prm800k_500', 'accuracy'],
        ['aime2024', 'accuracy'],
        '',
        'Knowledge',
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['lcb_code_generation', 'pass@1'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

#######################################################################
#                     PART 3  Models List                             #
#######################################################################

# Collect every `*_model` list imported above.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

#######################################################################
#                     PART 4  Inference/Evaluation Configuration      #
#######################################################################

# Local Runner
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask),
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)

#######################################################################
#                     PART 5  Utils Configuration                     #
#######################################################################
work_dir = './outputs/oc_academic_202502'
examples/eval_babilong.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""OpenCompass config: BABILong long-context benchmark (0k–256k)."""
from mmengine.config import read_base

with read_base():
    # Models
    # Datasets
    from opencompass.configs.datasets.babilong.babilong_0k_gen import \
        babiLong_0k_datasets
    from opencompass.configs.datasets.babilong.babilong_4k_gen import \
        babiLong_4k_datasets
    from opencompass.configs.datasets.babilong.babilong_16k_gen import \
        babiLong_16k_datasets
    from opencompass.configs.datasets.babilong.babilong_32k_gen import \
        babiLong_32k_datasets
    from opencompass.configs.datasets.babilong.babilong_128k_gen import \
        babiLong_128k_datasets
    from opencompass.configs.datasets.babilong.babilong_256k_gen import \
        babiLong_256k_datasets
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model
    from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import \
        models as lmdeploy_ministral_8b_instruct_2410_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as lmdeploy_qwen2_5_7b_instruct_model
    from opencompass.configs.summarizers.groups.babilong import \
        babilong_summary_groups

# Collect every `*_datasets` list imported above by name convention.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# Collect every `*_model` list, then widen each model's context window to
# 1M tokens and spread it over 4 GPUs for the long-context splits.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
    model['engine_config']['session_len'] = 1024 * 1024
    model['max_seq_len'] = 1024 * 1024
    model['engine_config']['tp'] = 4  # tensor parallelism degree
    model['run_cfg']['num_gpus'] = 4

summarizer = dict(
    dataset_abbrs=[
        'babilong_0k',
        'babilong_4k',
        'babilong_16k',
        'babilong_32k',
        'babilong_128k',
        'babilong_256k',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

work_dir = './outputs/babilong'
examples/eval_bench_intern_s1.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# flake8: noqa
"""OpenCompass config: Intern-S1 science/academic benchmark suite.

Covers math (AIME 2025), reasoning (GPQA), knowledge (MMLU-Pro),
instruction following (IFEval) and scientific benchmarks (SmolInstruct,
ChemBench, Matbench, ProteinLMBench), many scored by an LLM judge.
"""

from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask


#######################################################################
#                     PART 0  Essential Configs                       #
#######################################################################
with read_base():
    # Datasets
    from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
    from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_gen_772ea0 import (
        gpqa_datasets,
    )
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import (
        mmlu_pro_datasets,
    )
    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
        ifeval_datasets,
    )
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import (
        smolinstruct_datasets_0shot_instruct as smolinstruct_datasets,
    )
    from opencompass.configs.datasets.ChemBench.ChemBench_llmjudge_gen_c584cf import (
        chembench_datasets,
    )
    from opencompass.configs.datasets.matbench.matbench_llm_judge_gen_0e9276 import (
        matbench_datasets,
    )
    from opencompass.configs.datasets.ProteinLMBench.ProteinLMBench_llmjudge_gen_a67965 import (
        proteinlmbench_datasets,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.mmlu_pro import (
        mmlu_pro_summary_groups,
    )

    # Models
    from opencompass.configs.models.interns1.intern_s1 import \
        models as interns1_model

#######################################################################
#                     PART 1  Datasets List                           #
#######################################################################
# datasets list for evaluation
# Collects every `*_datasets` list imported above by name convention.

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
               [])

# LLM judge config: using LLM to evaluate predictions.
# Left empty here — fill in the judge model before running.
judge_cfg = dict()

for item in datasets:
    # Large output budget for long reasoning traces.
    item['infer_cfg']['inferencer']['max_out_len'] = 65536
    # Plain LLM-judge evaluators carry judge_cfg at the top level...
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
    # ...cascade evaluators nest it under their llm_evaluator.
    if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys() and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg


#######################################################################
#                     PART 2  Dataset Summarizer                      #
#######################################################################

# Merge every `*_summary_groups` list imported above.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

# Add an aggregate group averaging the ChemBench sub-tasks.
summary_groups.extend(
    [
        {
            'name': 'ChemBench',
            'subsets': [
                'ChemBench_Name_Conversion',
                'ChemBench_Property_Prediction',
                'ChemBench_Mol2caption',
                'ChemBench_Caption2mol',
                'ChemBench_Product_Prediction',
                'ChemBench_Retrosynthesis',
                'ChemBench_Yield_Prediction',
                'ChemBench_Temperature_Prediction',
            ],
        },
    ]
)

# Report layout: '' entries are separators, bare strings section headers.
summarizer = dict(
    dataset_abbrs=[
        'Knowledge',
        ['mmlu_pro', 'accuracy'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['GPQA_diamond', 'accuracy'],
        '',
        'Math Calculation',
        ['aime2025', 'accuracy'],
        '',
        'Academic',
        ['ChemBench', 'naive_average'],
        ['ProteinLMBench', 'accuracy'],
        '',
        'SmolInstruct',
        ['NC-I2F-0shot-instruct', 'score'],
        ['NC-I2S-0shot-instruct', 'score'],
        ['NC-S2F-0shot-instruct', 'score'],
        ['NC-S2I-0shot-instruct', 'score'],
        ['PP-ESOL-0shot-instruct', 'score'],
        ['PP-Lipo-0shot-instruct', 'score'],
        ['PP-BBBP-0shot-instruct', 'accuracy'],
        ['PP-ClinTox-0shot-instruct', 'accuracy'],
        ['PP-HIV-0shot-instruct', 'accuracy'],
        ['PP-SIDER-0shot-instruct', 'accuracy'],
        ['MC-0shot-instruct', 'score'],
        ['MG-0shot-instruct', 'score'],
        ['FS-0shot-instruct', 'score'],
        ['RS-0shot-instruct', 'score'],
        '',
        ['matbench_expt_gap', 'mae'],
        ['matbench_steels', 'mae'],
        ['matbench_expt_is_metal', 'accuracy'],
        ['matbench_glass', 'accuracy'],
        '',
    ],
    summary_groups=summary_groups,
)

#######################################################################
#                     PART 3  Models List                             #
#######################################################################

# Collect every `*_model` list imported above.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

#######################################################################
#                     PART 4  Inference/Evaluation Configuration      #
#######################################################################

# infer with local runner
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask),
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)

#######################################################################
#                     PART 5  Utils Configuration                     #
#######################################################################

work_dir = './outputs/oc_bench_intern_s1'
examples/eval_cascade_evaluator.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""OpenCompass config: MATH-500 scored by a cascade evaluator.

The cascade first applies a rule-based math verifier; samples it marks
wrong are re-judged by an LLM grader (parallel=False → sequential).
"""
from mmengine.config import read_base

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
# NOTE(review): the three imports below duplicate the three above —
# harmless, but could be removed.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import (
    GenericLLMEvaluator,
    CascadeEvaluator,
    MATHVerifyEvaluator,
)
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import (
    MATHDataset,
    math_postprocess_v2,
    normalize_final_answer,
)
#######################################################################
#                     PART 0  Essential Configs                       #
#######################################################################

with read_base():
    # Datasets, Summarizer
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )

# Dataset reader: 'problem' is the prompt column, 'solution' the gold answer.
reader_cfg = dict(input_columns=['problem'], output_column='solution')

# Zero-shot generation with a boxed-answer instruction.
infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

########################## Evaluator #################################
# Grading prompt for the LLM judge. {problem}/{solution}/{prediction}
# are filled in per sample by the evaluator.
# NOTE(review): interior indentation of this runtime string was lost in
# extraction — reproduced flush-left; confirm against the upstream file.
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

# LLM-as-judge evaluator; judge_cfg left empty — fill in before running.
llm_judge_evaluator = dict(
    type=GenericLLMEvaluator,
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                )
            ],
            round=[
                dict(role='HUMAN', prompt=GRADER_TEMPLATE),
            ],
        ),
    ),
    dataset_cfg=dict(
        type=MATHDataset,
        path='opencompass/math',
        file_name='test_prm800k_500.json',
    ),
    judge_cfg=dict(),
)

# Rule-based verifier; cascade falls back to the LLM judge when it fails.
rule_evaluator = dict(type=MATHVerifyEvaluator)
cascade_evaluator = dict(type=CascadeEvaluator,
                         llm_evaluator=llm_judge_evaluator,
                         rule_evaluator=rule_evaluator,
                         parallel=False
                         )
##########################           #################################
eval_cfg = dict()

# Alternative evaluators kept for quick switching:
# eval_cfg['evaluator'] = rule_evaluator
# eval_cfg['evaluator'] = llm_judge_evaluator
eval_cfg['evaluator'] = cascade_evaluator

math_datasets = [
    dict(
        abbr='math_prm800k_500',
        type=MATHDataset,
        path='opencompass/math',
        file_name='test_prm800k_500.json',
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
    )
]


datasets = math_datasets
models = lmdeploy_qwen2_5_7b_instruct_model


work_dir = 'math_prm800k_500_cascade_evaluator'
examples/eval_chat_agent.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""OpenCompass config: GPT-3.5 ReAct agent on math datasets.

Wraps OpenAI GPT-3.5 in a lagent ReAct loop with a Python interpreter
tool for GSM8K / MATH / MathBench agent variants.
"""
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base

from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models.lagent import LagentAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.gsm8k.gsm8k_agent_gen_be1606 import \
        gsm8k_datasets
    from opencompass.configs.datasets.math.math_agent_gen_af2293 import \
        math_datasets
    from opencompass.configs.datasets.MathBench.mathbench_agent_gen_568903 import \
        mathbench_agent_datasets
    from opencompass.configs.summarizers.math_agent import summarizer

datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_agent_datasets

# Tool-use instructions given to the agent (ReAct call protocol).
# NOTE(review): indentation inside the code example was lost in
# extraction — restored to 4 spaces; confirm against the upstream file.
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
```
def solution():
    variable_names_with_real_meaning = func(variable)
    return variable_names_with_real_meaning
```"""

# Markers the ReAct loop uses to parse tool calls out of model output.
protocol = dict(
    type=ReActProtocol,
    action=dict(role='ACTION', begin='Tool:', end='\n'),
    action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
    finish=dict(role='FINISH', begin='FinalAnswer:', end='\n'),
    call_protocol=system_prompt,
)

models = [
    dict(
        abbr='gpt-3.5-react',
        type=LagentAgent,
        agent_type=ReAct,
        max_turn=3,  # at most 3 think/act rounds per question
        llm=dict(
            type=OpenAI,
            path='gpt-3.5-turbo',
            key='ENV',  # API key read from environment
            query_per_second=1,
            max_seq_len=4096,
        ),
        actions=[
            dict(type=PythonInterpreter),
        ],
        protocol=protocol,
        batch_size=1,
    ),
]

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=1000),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)
examples/eval_chat_demo.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""OpenCompass config: minimal chat-model demo (GSM8K + MATH subsets)."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
        gsm8k_datasets
    from opencompass.configs.datasets.demo.demo_math_chat_gen import \
        math_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_1_8b import \
        models as hf_internlm2_chat_1_8b_models
    from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
        models as hf_qwen2_1_5b_instruct_models

# `datasets` and `models` are the names OpenCompass reads by convention.
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_instruct_models + hf_internlm2_chat_1_8b_models
examples/eval_chat_last.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""OpenCompass config: GPT-3.5 on GSM8K using ChatInferencer."""
from mmengine.config import read_base

from opencompass.models.openai_api import OpenAI
from opencompass.openicl import ChatInferencer
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets as datasets

models = [
    dict(
        abbr='gpt-3.5',
        type=OpenAI,
        path='gpt-3.5-turbo',
        key='ENV',  # API key read from environment
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

for dataset in datasets:
    # Use ChatInferencer instead of GenInferencer
    dataset['infer_cfg']['inferencer'] = dict(type=ChatInferencer)

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=1000),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)
examples/eval_chatml_datasets.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ from mmengine.config import read_base
4
+
5
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
6
+ from opencompass.runners import LocalRunner
7
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
8
+
9
+ #######################################################################
10
+ # PART 0 Essential Configs #
11
+ #######################################################################
12
+ with read_base():
13
+
14
+ # Models (add your models here)
15
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
16
+ models as hf_internlm2_5_7b_chat_model
17
+
18
+ # Datasets
19
+ from opencompass.configs.chatml_datasets.MaScQA.MaScQA_gen import datasets as MaScQA_chatml
20
+ from opencompass.configs.chatml_datasets.CPsyExam.CPsyExam_gen import datasets as CPsyExam_chatml
21
+
22
+
23
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
24
+
25
+ chatml_datasets = sum(
26
+ (v for k, v in locals().items() if k.endswith('_chatml')),
27
+ [],
28
+ )
29
+
30
+ # Your Judge Model Configs Here
31
+ judge_cfg = dict()
32
+
33
+ for dataset in chatml_datasets:
34
+ if dataset['evaluator']['type'] == 'llm_evaluator':
35
+ dataset['evaluator']['judge_cfg'] = judge_cfg
36
+ if dataset['evaluator']['type'] == 'cascade_evaluator':
37
+ dataset['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
38
+
39
+ infer = dict(
40
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
41
+ runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
42
+ )
43
+
44
+ eval = dict(
45
+ partitioner=dict(type=NaivePartitioner, n=8),
46
+ runner=dict(
47
+ type=LocalRunner, task=dict(type=OpenICLEvalTask), max_num_workers=32
48
+ ),
49
+ )
50
+
51
+ work_dir = 'outputs/ChatML_Datasets'
examples/eval_chembench.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.ChemBench.ChemBench_gen import \
5
+ chembench_datasets
6
+ from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
7
+ models
8
+
9
+ datasets = [*chembench_datasets]
10
+ models = [*models]
11
+ '''
12
+ dataset version metric mode mistral-7b-instruct-v0.2-hf
13
+ -------------------------------- --------- -------- ------ -----------------------------
14
+ ChemBench_Name_Conversion d4e6a1 accuracy gen 45.43
15
+ ChemBench_Property_Prediction d4e6a1 accuracy gen 47.11
16
+ ChemBench_Mol2caption d4e6a1 accuracy gen 64.21
17
+ ChemBench_Caption2mol d4e6a1 accuracy gen 35.38
18
+ ChemBench_Product_Prediction d4e6a1 accuracy gen 38.67
19
+ ChemBench_Retrosynthesis d4e6a1 accuracy gen 27
20
+ ChemBench_Yield_Prediction d4e6a1 accuracy gen 27
21
+ ChemBench_Temperature_Prediction d4e6a1 accuracy gen 26.73
22
+ ChemBench_Solvent_Prediction d4e6a1 accuracy gen 32.67
23
+ '''
examples/eval_chinese_simpleqa.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import csimpleqa_datasets
5
+
6
+ from opencompass.models import HuggingFacewithChatTemplate
7
+ from opencompass.models.openai_api import OpenAI
8
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
9
+ from opencompass.runners import LocalRunner
10
+ from opencompass.summarizers import DefaultSubjectiveSummarizer
11
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
12
+
13
+ # -------------Inference Stage ----------------------------------------
14
+ models = [
15
+ dict(
16
+ type=HuggingFacewithChatTemplate,
17
+ abbr='Qwen2.5-1.5B-Instruct',
18
+ path='Qwen/Qwen2.5-1.5B-Instruct',
19
+ model_kwargs=dict(
20
+ device_map='auto',
21
+ trust_remote_code=True,
22
+ ),
23
+ tokenizer_kwargs=dict(
24
+ padding_side='left',
25
+ truncation_side='left',
26
+ trust_remote_code=True,
27
+ ),
28
+ generation_kwargs=dict(do_sample=True, ),
29
+ max_out_len=200,
30
+ max_seq_len=4096,
31
+ batch_size=8,
32
+ run_cfg=dict(num_gpus=1, num_procs=1),
33
+ )
34
+ ]
35
+
36
+ datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
37
+ summarizer = dict(type=DefaultSubjectiveSummarizer)
38
+
39
+ # -------------Evalation Stage ----------------------------------------
40
+
41
+ ## ------------- JudgeLLM Configuration
42
+
43
+ api_meta_template = dict(round=[
44
+ dict(role='SYSTEM', api_role='SYSTEM'),
45
+ dict(role='HUMAN', api_role='HUMAN'),
46
+ dict(role='BOT', api_role='BOT', generate=True),
47
+ ])
48
+ judge_models = [
49
+ dict(
50
+ # GPT4o
51
+ abbr='gpt-4o-0513-global',
52
+ type=OpenAI,
53
+ # gpt-4o
54
+ path='gpt-4o-0513-global',
55
+ key='xxx', # provide OPENAI_API_KEY
56
+ meta_template=api_meta_template,
57
+ query_per_second=16,
58
+ max_out_len=1000,
59
+ batch_size=8,
60
+ retry=3)
61
+ ]
62
+
63
+ ## ------------- Evaluation Configuration
64
+ eval = dict(
65
+ partitioner=dict(type=SubjectiveNaivePartitioner,
66
+ models=models,
67
+ judge_models=judge_models),
68
+ runner=dict(type=LocalRunner,
69
+ max_num_workers=16,
70
+ task=dict(type=SubjectiveEvalTask)),
71
+ )
72
+
73
+ work_dir = 'outputs/chinese_simpleqa/'
examples/eval_cibench.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+
3
+ from lagent import ReAct
4
+ from lagent.agents.react import ReActProtocol
5
+ from mmengine.config import read_base
6
+
7
+ from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
8
+ from opencompass.lagent.actions.python_interpreter import PythonInterpreter
9
+ from opencompass.lagent.agents.react import CIReAct
10
+ from opencompass.models import HuggingFaceCausalLM
11
+ from opencompass.models.lagent import CodeAgent, LagentAgent
12
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner
13
+ from opencompass.runners import LocalRunner, SlurmSequentialRunner
14
+ from opencompass.tasks import OpenICLInferTask
15
+
16
+ with read_base():
17
+ # Note that it might occur cuda OOM error for hf model
18
+ from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
19
+ cibench_datasets as cibench_datasets_generation
20
+ from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
21
+ cibench_datasets as cibench_datasets_template
22
+ from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
23
+ models as lmdeploy_llama3_8b_instruct_model
24
+ from opencompass.configs.summarizers.cibench import summarizer
25
+
26
+ # Oracle mode for analysis
27
+ # from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
28
+ # from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
29
+
30
+ datasets = []
31
+ datasets += cibench_datasets_template
32
+ datasets += cibench_datasets_generation
33
+ # datasets += cibench_datasets_template_oracle
34
+ # datasets += cibench_datasets_generation_oracle
35
+
36
+ _origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
37
+ [])
38
+
39
+ FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
40
+
41
+ FEWSHOT_INSTRUCTION = """\
42
+ You are an assistant who can utilize external tools.
43
+ {tool_description}
44
+ To use a tool, please response with the following format:
45
+ ```
46
+ {thought} Think what you need to solve, do you need to use tools?
47
+ {action} The tool name, should be one of [{action_names}].
48
+ {action_input} The input to the tool that you want to use.
49
+ ```
50
+ The tool will give you response after your response using the following format:
51
+ ```
52
+ {response} the results after call the tool.
53
+ ```
54
+ Therefore DO NOT generate tool response by yourself.
55
+
56
+ Also please follow the guidelines:
57
+ 1. Always use code interpreter to solve the problem.
58
+ 2. The generated codes should always in a markdown code block format.
59
+ 3. The generated codes will be executed in an ipython manner and the results will be cached.
60
+ 4. Your responded code should always be simple and only solves the problem in current step.
61
+
62
+ For example:
63
+
64
+ File url: `xxxx`
65
+ ### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
66
+
67
+ {thought} We should use `pandas` to solve this step.
68
+ {action} IPythonInterpreter
69
+ {action_input} ```python
70
+ import pandas as pd
71
+ url = "xxxx"
72
+ data = pd.read_csv(url)
73
+ ```
74
+ {response} The code is succeed without any outputs.
75
+
76
+ Let us begin from here!
77
+ """
78
+
79
+ IPYTHON_INTERPRETER_DESCRIPTION = '''\
80
+ It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
81
+
82
+ actions = [
83
+ dict(type=IPythonInterpreter,
84
+ user_data_dir='./data/cibench_dataset/datasources',
85
+ description=IPYTHON_INTERPRETER_DESCRIPTION)
86
+ ]
87
+ protocol = dict(
88
+ type=ReActProtocol,
89
+ call_protocol=FEWSHOT_INSTRUCTION,
90
+ force_stop=FORCE_STOP_PROMPT_EN,
91
+ finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
92
+ )
93
+
94
+ work_dir = './outputs/cibench/'
95
+
96
+ _agent_models = []
97
+ for m in _origin_models:
98
+ m = deepcopy(m)
99
+ if 'meta_template' in m and 'round' in m['meta_template']:
100
+ round = m['meta_template']['round']
101
+ if all(r['role'].upper() != 'SYSTEM'
102
+ for r in round): # no system round
103
+ if not any('api_role' in r for r in round):
104
+ m['meta_template']['round'].append(
105
+ dict(role='system', begin='System response:', end='\n'))
106
+ else:
107
+ m['meta_template']['round'].append(
108
+ dict(role='system', api_role='SYSTEM'))
109
+ print(
110
+ f'WARNING: adding SYSTEM round in meta_template for {m.get("abbr", None)}'
111
+ )
112
+ _agent_models.append(m)
113
+
114
+ protocol = dict(
115
+ type=ReActProtocol,
116
+ call_protocol=FEWSHOT_INSTRUCTION,
117
+ force_stop=FORCE_STOP_PROMPT_EN,
118
+ finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
119
+ )
120
+
121
+ models = []
122
+ for m in _agent_models:
123
+ m = deepcopy(m)
124
+ origin_abbr = m.pop('abbr')
125
+ abbr = origin_abbr
126
+ m.pop('batch_size', None)
127
+ m.pop('max_out_len', None)
128
+ m.pop('max_seq_len', None)
129
+ run_cfg = m.pop('run_cfg', {})
130
+
131
+ agent_model = dict(
132
+ abbr=abbr,
133
+ summarizer_abbr=origin_abbr,
134
+ type=CodeAgent,
135
+ agent_type=CIReAct,
136
+ max_turn=3,
137
+ llm=m,
138
+ actions=[
139
+ dict(type=IPythonInterpreter,
140
+ user_data_dir='./data/cibench_dataset/datasources',
141
+ description=IPYTHON_INTERPRETER_DESCRIPTION)
142
+ ],
143
+ protocol=protocol,
144
+ batch_size=1,
145
+ run_cfg=run_cfg,
146
+ )
147
+ models.append(agent_model)
148
+
149
+ infer = dict(
150
+ partitioner=dict(type=NaivePartitioner),
151
+ runner=dict(type=LocalRunner,
152
+ max_num_workers=4,
153
+ task=dict(type=OpenICLInferTask)),
154
+ )
examples/eval_claude.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.partitioners import NaivePartitioner
4
+ from opencompass.runners import LocalRunner
5
+ from opencompass.tasks import OpenICLInferTask
6
+
7
+ with read_base():
8
+ # choose a list of datasets
9
+ from opencompass.configs.datasets.collections.chat_medium import datasets
10
+ from opencompass.configs.models.claude.claude import models
11
+ # and output the results in a choosen format
12
+ from opencompass.configs.summarizers.medium import summarizer
13
+
14
+ infer = dict(
15
+ partitioner=dict(type=NaivePartitioner),
16
+ runner=dict(type=LocalRunner,
17
+ max_num_workers=8,
18
+ task=dict(type=OpenICLInferTask)),
19
+ )
examples/eval_codeagent.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.models import HuggingFaceCausalLM, OpenAI
4
+ from opencompass.models.lagent import CodeAgent
5
+ from opencompass.partitioners import SizePartitioner
6
+ from opencompass.runners import LocalRunner
7
+ from opencompass.tasks import OpenICLInferTask
8
+
9
+ with read_base():
10
+ from opencompass.configs.datasets.gsm8k.gsm8k_gen_57b0b1 import \
11
+ gsm8k_datasets
12
+ from opencompass.configs.datasets.math.math_gen_943d32 import math_datasets
13
+
14
+ datasets = []
15
+ datasets += gsm8k_datasets
16
+ datasets += math_datasets
17
+
18
+ models = [
19
+ dict(abbr='gpt-3.5-react',
20
+ type=CodeAgent,
21
+ llm=dict(
22
+ type=OpenAI,
23
+ path='gpt-3.5-turbo',
24
+ key='ENV',
25
+ query_per_second=1,
26
+ max_seq_len=4096,
27
+ ),
28
+ batch_size=8),
29
+ dict(abbr='WizardCoder-Python-13B-V1.0-react',
30
+ type=CodeAgent,
31
+ llm=dict(
32
+ type=HuggingFaceCausalLM,
33
+ path='WizardLM/WizardCoder-Python-13B-V1.0',
34
+ tokenizer_path='WizardLM/WizardCoder-Python-13B-V1.0',
35
+ tokenizer_kwargs=dict(
36
+ padding_side='left',
37
+ truncation_side='left',
38
+ trust_remote_code=True,
39
+ ),
40
+ max_seq_len=2048,
41
+ model_kwargs=dict(trust_remote_code=True, device_map='auto'),
42
+ ),
43
+ batch_size=8,
44
+ run_cfg=dict(num_gpus=2, num_procs=1)),
45
+ ]
46
+
47
+ infer = dict(
48
+ partitioner=dict(type=SizePartitioner, max_task_size=40000),
49
+ runner=dict(type=LocalRunner,
50
+ max_num_workers=16,
51
+ task=dict(type=OpenICLInferTask)),
52
+ )
examples/eval_codebench_full.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This config is used to test all the code benchmarks
2
+ from mmengine.config import read_base
3
+ import os.path as osp
4
+ from opencompass.runners import LocalRunner, VOLCRunner
5
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
6
+ from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
7
+
8
+ with read_base():
9
+ # Datasets Part
10
+ # bigcodebench
11
+ from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen import (
12
+ bigcodebench_full_instruct_datasets
13
+ )
14
+ from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen import (
15
+ bigcodebench_hard_instruct_datasets
16
+ )
17
+ # livecodebench code generation lite v5
18
+ from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen_a4f90b import (
19
+ LCB_datasets
20
+ )
21
+ # huamneval series
22
+ from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
23
+ humaneval_datasets
24
+ )
25
+ from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
26
+ humanevalpro_datasets
27
+ )
28
+ from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import (
29
+ humanevalx_datasets
30
+ )
31
+ from opencompass.configs.datasets.humaneval_plus.humaneval_plus_gen import (
32
+ humaneval_plus_datasets
33
+ )
34
+ # mbpp series
35
+ from opencompass.configs.datasets.mbpp.mbpp_gen import (
36
+ mbpp_datasets
37
+ )
38
+ from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
39
+ mbpppro_datasets
40
+ )
41
+ # multipl-e
42
+ from opencompass.configs.datasets.multipl_e.multiple_gen import (
43
+ multiple_datasets
44
+ )
45
+ # ds1000
46
+ from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
47
+ ds1000_datasets
48
+ )
49
+
50
+ # Models Part
51
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
52
+ models as lmdeploy_qwen2_5_7b_instruct_model,
53
+ )
54
+
55
+ # Summary Groups
56
+ from opencompass.configs.summarizers.groups.ds1000 import (
57
+ ds1000_summary_groups,
58
+ )
59
+ from opencompass.configs.summarizers.groups.multipl_e import (
60
+ multiple_summary_groups,
61
+ )
62
+ from opencompass.configs.summarizers.groups.humanevalx import (
63
+ humanevalx_summary_groups,
64
+ )
65
+
66
+ # models config
67
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
68
+
69
+ for model in models:
70
+ model['max_seq_len'] = 16384
71
+ model['max_out_len'] = 8192
72
+
73
+ # datasets config
74
+ datasets = sum(
75
+ (v for k, v in locals().items() if k.endswith('_datasets')),
76
+ [],
77
+ )
78
+
79
+ for item in humanevalx_datasets:
80
+ item['eval_cfg']['evaluator'][
81
+ 'ip_address'
82
+ ] = 'codeeval.opencompass.org.cn/humanevalx'
83
+ item['eval_cfg']['evaluator']['port'] = ''
84
+ for item in ds1000_datasets:
85
+ item['eval_cfg']['evaluator'][
86
+ 'ip_address'
87
+ ] = 'codeeval.opencompass.org.cn/ds1000'
88
+ item['eval_cfg']['evaluator']['port'] = ''
89
+
90
+
91
+ for dataset in datasets:
92
+ dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
93
+
94
+
95
+ # summary
96
+ summary_groups = sum(
97
+ [v for k, v in locals().items() if k.endswith('_summary_groups')], []
98
+ )
99
+ summary_groups.append(
100
+ {'name': 'humanevalx',
101
+ 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']}
102
+ )
103
+ summarizer = dict(
104
+ dataset_abbrs = [
105
+ ['bigcodebench_hard_instruct', 'pass@1'],
106
+ ['bigcodebench_full_instruct', 'pass@1'],
107
+ ['lcb_code_generation', 'pass@1'],
108
+ ['openai_humaneval', 'humaneval_pass@1'],
109
+ ['mbpp', 'score'],
110
+ ['humaneval_pro', 'pass@1'],
111
+ ['mbpp_pro', 'pass@1'],
112
+ ['humaneval_plus', 'humaneval_plus_pass@1'],
113
+ ['multiple', 'naive_average'],
114
+ ['humanevalx', 'naive_average'],
115
+ ['ds1000', 'naive_average'],
116
+ '',
117
+ 'humanevalx-python',
118
+ 'humanevalx-cpp',
119
+ 'humanevalx-java',
120
+ 'humanevalx-js',
121
+ '',
122
+ 'ds1000_Pandas',
123
+ 'ds1000_Numpy',
124
+ 'ds1000_Tensorflow',
125
+ 'ds1000_Scipy',
126
+ 'ds1000_Sklearn',
127
+ 'ds1000_Pytorch',
128
+ 'ds1000_Matplotlib',
129
+ '',
130
+ 'humaneval-multiple-cpp',
131
+ 'humaneval-multiple-cs',
132
+ 'humaneval-multiple-go',
133
+ 'humaneval-multiple-java',
134
+ 'humaneval-multiple-rb',
135
+ 'humaneval-multiple-js',
136
+ 'humaneval-multiple-php',
137
+ 'humaneval-multiple-r',
138
+ 'humaneval-multiple-rs',
139
+ 'humaneval-multiple-sh',
140
+ '',
141
+ 'mbpp-multiple-cpp',
142
+ 'mbpp-multiple-cs',
143
+ 'mbpp-multiple-go',
144
+ 'mbpp-multiple-java',
145
+ 'mbpp-multiple-rb',
146
+ 'mbpp-multiple-js',
147
+ 'mbpp-multiple-php',
148
+ 'mbpp-multiple-r',
149
+ 'mbpp-multiple-rs',
150
+ 'mbpp-multiple-sh'
151
+ ],
152
+ summary_groups=summary_groups,
153
+ )
154
+
155
+ work_dir = 'outputs/code'
examples/eval_compassarena_subjectivebench_bradleyterry.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.singleturn.pairwise_bt_judge import (
5
+ compassarena_subjectivebench_bradleyterry_singleturn_datasets, )
6
+ from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.multiturn.pairwise_bt_judge import (
7
+ compassarena_subjectivebench_bradleyterry_multiturn_datasets, )
8
+
9
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
10
+ models as lmdeploy_internlm2_5_7b_chat, )
11
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
12
+ models as lmdeploy_internlm2_5_20b_chat, )
13
+ from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
14
+ models as lmdeploy_llama3_1_8b_instruct, )
15
+ from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import (
16
+ models as lmdeploy_llama3_1_70b_instruct, )
17
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import (
18
+ models as lmdeploy_qwen2_5_0_5b_instruct, )
19
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b_instruct import (
20
+ models as lmdeploy_qwen2_5_1_5b_instruct, )
21
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import (
22
+ models as lmdeploy_qwen2_5_3b_instruct, )
23
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
24
+ models as lmdeploy_qwen2_5_7b_instruct, )
25
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
26
+ models as lmdeploy_qwen2_5_14b_instruct, )
27
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
28
+ models as lmdeploy_qwen2_5_32b_instruct, )
29
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import (
30
+ models as lmdeploy_qwen2_5_72b_instruct, )
31
+ from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
32
+ models as lmdeploy_qwen2_7b_instruct, )
33
+
34
+ from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
35
+ HuggingFaceChatGLM3, OpenAI,
36
+ TurboMindModelwithChatTemplate)
37
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner
38
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
39
+ from opencompass.partitioners.sub_num_worker import \
40
+ SubjectiveNumWorkerPartitioner
41
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
42
+ from opencompass.runners import LocalRunner, SlurmSequentialRunner
43
+ from opencompass.summarizers import CompassArenaBradleyTerrySummarizer
44
+ from opencompass.tasks import OpenICLInferTask
45
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
46
+
47
+ api_meta_template = dict(round=[
48
+ dict(role='HUMAN', api_role='HUMAN'),
49
+ dict(role='BOT', api_role='BOT', generate=True),
50
+ ])
51
+
52
+ # -------------Inference Stage ----------------------------------------
53
+ models = [
54
+ *lmdeploy_qwen2_5_14b_instruct,
55
+ *lmdeploy_qwen2_5_32b_instruct,
56
+ *lmdeploy_qwen2_5_7b_instruct,
57
+ *lmdeploy_qwen2_7b_instruct,
58
+ ]
59
+
60
+ datasets = [
61
+ *compassarena_subjectivebench_bradleyterry_singleturn_datasets,
62
+ *compassarena_subjectivebench_bradleyterry_multiturn_datasets,
63
+ ]
64
+
65
+ infer = dict(
66
+ partitioner=dict(type=NaivePartitioner),
67
+ runner=dict(type=LocalRunner,
68
+ max_num_workers=16,
69
+ task=dict(type=OpenICLInferTask)),
70
+ )
71
+ # -------------Evalation Stage ----------------------------------------
72
+
73
+ ## ------------- JudgeLLM Configuration
74
+ judge_models = [
75
+ dict(
76
+ type=TurboMindModelwithChatTemplate,
77
+ abbr='CompassJudger-1-32B-Instruct',
78
+ path='opencompass/CompassJudger-1-32B-Instruct',
79
+ engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
80
+ gen_config=dict(top_k=1,
81
+ temperature=1e-6,
82
+ top_p=0.9,
83
+ max_new_tokens=2048),
84
+ max_seq_len=16384,
85
+ max_out_len=2048,
86
+ batch_size=16,
87
+ run_cfg=dict(num_gpus=4),
88
+ )
89
+ ]
90
+
91
+ ## ------------- Evaluation Configuration
92
+ eval = dict(
93
+ partitioner=dict(
94
+ type=SubjectiveNaivePartitioner,
95
+ models=models,
96
+ judge_models=judge_models,
97
+ ),
98
+ runner=dict(type=LocalRunner,
99
+ max_num_workers=16,
100
+ task=dict(type=SubjectiveEvalTask)),
101
+ )
102
+
103
+ ## ------------- Summary Configuration
104
+ # This step fits a Bradley-Terry model (statistical model) with an option
105
+ # to include style features and control variables based on groups
106
+ # (group variables must be available in the input dataset for each observation).
107
+ summarizer = dict(
108
+ type=CompassArenaBradleyTerrySummarizer,
109
+ rating_system='bradleyterry',
110
+ report_pred_win_rates=True,
111
+ num_bootstrap=100,
112
+ num_cpu=None,
113
+ with_control_vars=True,
114
+ normalize_style_features=False,
115
+ odds_ratio=True,
116
+ groups=['difficulty', 'category'],
117
+ )
118
+
119
+ work_dir = 'outputs/compassarena_subjectivebench_bradleyterry/'
examples/eval_contamination.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.ARC_c.ARC_c_clean_ppl import \
5
+ ARC_c_datasets
6
+ from opencompass.configs.datasets.ceval.ceval_clean_ppl import \
7
+ ceval_datasets
8
+ from opencompass.configs.datasets.hellaswag.hellaswag_clean_ppl import \
9
+ hellaswag_datasets
10
+ from opencompass.configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets
11
+ from opencompass.configs.models.hf_llama.hf_llama2_7b import \
12
+ models as hf_llama2_7b_model
13
+ from opencompass.configs.models.qwen.hf_qwen_7b import \
14
+ models as hf_qwen_7b_model
15
+ from opencompass.configs.models.yi.hf_yi_6b import models as hf_yi_6b_model
16
+ from opencompass.configs.summarizers.contamination import summarizer
17
+
18
+ datasets = [
19
+ *ceval_datasets, *mmlu_datasets, *hellaswag_datasets, *ARC_c_datasets
20
+ ]
21
+ models = [*hf_yi_6b_model, *hf_qwen_7b_model, *hf_llama2_7b_model]
examples/eval_corebench_2409_longcontext.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path as osp
2
+ from copy import deepcopy
3
+
4
+ from mmengine.config import read_base
5
+
6
+ from opencompass.models import (HuggingFacewithChatTemplate,
7
+ TurboMindModelwithChatTemplate)
8
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
9
+ from opencompass.runners import DLCRunner, LocalRunner
10
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
11
+
12
+ #######################################################################
13
+ # PART 0 Essential Configs #
14
+ #######################################################################
15
+ with read_base():
16
+ from opencompass.configs.datasets.longbench.longbench import \
17
+ longbench_datasets
18
+ from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \
19
+ needlebench_datasets as needlebench_8k_datasets
20
+ from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
21
+ needlebench_datasets as needlebench_32k_datasets
22
+ from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
23
+ needlebench_datasets as needlebench_128k_datasets
24
+ from opencompass.configs.datasets.ruler.ruler_8k_gen import \
25
+ ruler_datasets as ruler_8k_datasets
26
+ from opencompass.configs.datasets.ruler.ruler_32k_gen import \
27
+ ruler_datasets as ruler_32k_datasets
28
+ from opencompass.configs.datasets.ruler.ruler_128k_gen import \
29
+ ruler_datasets as ruler_128k_datasets
30
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
31
+ models as lmdeploy_internlm2_5_7b_1m_chat_model
32
+ from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
33
+ models as llama3_1_8b_instruct_model
34
+ # Instruct models
35
+ from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
36
+ models as lmdeploy_qwen2_7b_instruct_model
37
+ # Summary Groups
38
+ from opencompass.configs.summarizers.groups.longbench import \
39
+ longbench_summary_groups
40
+ from opencompass.configs.summarizers.groups.ruler import \
41
+ ruler_summary_groups
42
+ from opencompass.configs.summarizers.needlebench import (
43
+ needlebench_8k_summarizer, needlebench_32k_summarizer,
44
+ needlebench_128k_summarizer)
45
+
46
+ #######################################################################
47
+ # PART 1 Datasets List #
48
+ #######################################################################
49
+ # datasets list for evaluation
50
+ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
51
+
52
+ #######################################################################
53
+ # PART 2 Datset Summarizer #
54
+ #######################################################################
55
+ needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups']
56
+ needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups']
57
+ needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups']
58
+
59
+ # Instruct models summarizer
60
+ summarizer = dict(
61
+ dataset_abbrs=[
62
+ ['ruler_8k', 'naive_average'],
63
+ ['ruler_32k', 'naive_average'],
64
+ ['ruler_128k', 'naive_average'],
65
+ ['NeedleBench-Overall-Score-8K', 'weighted_average'],
66
+ ['NeedleBench-Overall-Score-32K', 'weighted_average'],
67
+ ['NeedleBench-Overall-Score-128K', 'weighted_average'],
68
+ ['longbench', 'naive_average'],
69
+ ['longbench_zh', 'naive_average'],
70
+ ['longbench_en', 'naive_average'],
71
+ '',
72
+ 'longbench_single-document-qa',
73
+ 'longbench_multi-document-qa',
74
+ 'longbench_summarization',
75
+ 'longbench_few-shot-learning',
76
+ 'longbench_synthetic-tasks',
77
+ 'longbench_code-completion',
78
+ ],
79
+ summary_groups=sum(
80
+ [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
81
+ )
82
+
83
+ #######################################################################
84
+ # PART 3 Models List #
85
+ #######################################################################
86
+
87
+ lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576
88
+ lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576
89
+ lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4
90
+ lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
91
+ lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4
92
+
93
+ llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576
94
+ llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576
95
+ llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4
96
+ llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
97
+ llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4
98
+
99
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
100
+
101
+ #######################################################################
102
+ # PART 4 Inference/Evaluation Configuaration #
103
+ #######################################################################
104
+
105
+ # Local Runner
106
+ infer = dict(
107
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
108
+ runner=dict(
109
+ type=LocalRunner,
110
+ max_num_workers=16,
111
+ retry=0, # Modify if needed
112
+ task=dict(type=OpenICLInferTask)),
113
+ )
114
+
115
+ # eval with local runner
116
+ eval = dict(
117
+ partitioner=dict(type=NaivePartitioner, n=10),
118
+ runner=dict(type=LocalRunner,
119
+ max_num_workers=16,
120
+ task=dict(type=OpenICLEvalTask)),
121
+ )
122
+
123
+ #######################################################################
124
+ # PART 5 Utils Configuaration #
125
+ #######################################################################
126
+ base_exp_dir = 'outputs/corebench/'
127
+ work_dir = osp.join(base_exp_dir, 'long_context')
examples/eval_corebench_2409_subjective.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os.path as osp
from copy import deepcopy

from mmengine.config import read_base

from opencompass.models import (HuggingFacewithChatTemplate,
                                TurboMindModelwithChatTemplate)
from opencompass.models.openai_api import OpenAI, OpenAISDK
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

#######################################################################
#                     PART 0  Essential Configs                       #
#######################################################################
with read_base():
    # Datasets part
    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm import \
        alignbench_datasets
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
        arenahard_datasets
    from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import \
        mtbench_datasets

    # Summarizer
    # Model list (uncomment to pull in extra models under test)
    # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model

#######################################################################
#                     PART 1  Datasets List                           #
#######################################################################
# Flatten every '*_datasets' list imported above into a single list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

#######################################################################
#                     PART 2  Dataset Summarizer                      #
#######################################################################
summarizer = dict(type=SubjectiveSummarizer, function='subjective')

#######################################################################
#                     PART 3  Models List                             #
#######################################################################
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='internlm2_5-7b-chat-turbomind',
        path='internlm/internlm2_5-7b-chat',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=40,
                        temperature=1.0,
                        top_p=0.9,
                        max_new_tokens=4096),
        max_seq_len=16384,
        max_out_len=4096,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]

# Append any '*_model' lists brought in via read_base above.
models = sum([v for k, v in locals().items() if k.endswith('_model')], models)

#######################################################################
#               PART 4  Inference/Evaluation Configuration            #
#######################################################################

# Inference with the local runner.
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask)),
)

# JudgeLLM
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

judge_models = [
    dict(
        type=OpenAISDK,
        abbr='gpt-4o-2024-08-06',
        path='gpt-4o-2024-08-06',
        # openai_api_base=
        # 'http://10.140.1.86:10001/v1',  # Change to your own url if needed.
        key='YOUR_API_KEY',
        retry=10,
        meta_template=api_meta_template,
        rpm_verbose=True,
        query_per_second=1,
        max_out_len=4096,
        max_seq_len=16384,
        batch_size=16,
        temperature=0.01,
        tokenizer_path='gpt-4o-2024-08-06')
]

# Evaluation with the local runner.
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)

#######################################################################
#                     PART 5  Utils Configuration                     #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'chat_subjective')
examples/eval_edgellm_demo.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

with read_base():
    # datasets
    from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
    from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import \
        commonsenseqa_datasets
    from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import \
        chid_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen import \
        humaneval_datasets
    from opencompass.configs.datasets.longbench.longbench import \
        longbench_datasets
    from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import \
        truthfulqa_datasets
    # models
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b_model
    from opencompass.configs.models.others.hf_phi_2 import \
        models as hf_phi_2_model
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b_model

# Flatten every imported '*_datasets' list (plus any pre-existing
# 'datasets' variable) and every '*_model' list.
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/edgellm/'

# Reference results recorded from previous runs:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset                                       version    metric            mode    phi-2_hf
# -------------------------------------------   ---------  ----------------  ------  ----------
# commonsense_qa                                c946f2     accuracy          gen     65.19
# openai_humaneval                              8e312c     humaneval_pass@1  gen     30.49
# truthful_qa                                   5ddc62     rouge_max         gen     0.08
# truthful_qa                                   5ddc62     rouge_diff        gen     -0.00
# truthful_qa                                   5ddc62     rouge_acc         gen     0.41
# gsm8k                                         1d7fe4     accuracy          gen     62.40
# chid-dev                                      211ee7     accuracy          gen     12.87
# chid-test                                     211ee7     accuracy          gen     14.34
# bbh                                           -          naive_average     gen     59.50

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset                                       version    metric            mode    Meta-Llama-3-8B_hf
# -------------------------------------------   ---------  ----------------  ------  --------------------
# commonsense_qa                                c946f2     accuracy          gen     70.11
# openai_humaneval                              8e312c     humaneval_pass@1  gen     26.22
# truthful_qa                                   5ddc62     rouge_max         gen     0.07
# truthful_qa                                   5ddc62     rouge_diff        gen     -0.01
# truthful_qa                                   5ddc62     rouge_acc         gen     0.41
# gsm8k                                         1d7fe4     accuracy          gen     55.80
# chid-dev                                      211ee7     accuracy          gen     40.59
# chid-test                                     211ee7     accuracy          gen     36.66
# bbh                                           -          naive_average     gen     61.62
# 20240816_060452
# tabulate format
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset         version    metric      mode    qwen2-7b-hf
# --------------  ---------  ----------  ------  -------------
# commonsense_qa  734a22     accuracy    gen     65.19
# truthful_qa     5ddc62     rouge_max   gen     0.08
# truthful_qa     5ddc62     rouge_diff  gen     -0.02
# truthful_qa     5ddc62     rouge_acc   gen     0.44
examples/eval_gpt3.5.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

from opencompass.models import OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.collections.chat_medium import datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

models = [
    dict(
        abbr='GPT-3.5-turbo-0613',
        type=OpenAI,
        path='gpt-3.5-turbo-0613',
        # With 'ENV' the key is read from $OPENAI_API_KEY, but you can
        # write down your key here as well.
        key='ENV',
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)
examples/eval_hellobench.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.hellobench.hellobench import hellobench_datasets

from opencompass.models import HuggingFacewithChatTemplate, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

# ------------- Inference Stage ---------------------------------------
# Subjective evaluation usually samples from the model.  Make sure every
# model under test uses the same generation parameters (e.g. if you set
# temperature=0.8, set it to 0.8 for all models).
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='glm-4-9b-chat-hf',
        path='THUDM/glm-4-9b-chat',
        max_out_len=16384,
        generation_kwargs=dict(
            temperature=0.8,
            # For subjective evaluation we suggest enabling sampling at
            # inference time.
            do_sample=True,
        ),
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        batch_size=1,
        run_cfg=dict(num_gpus=2, num_procs=1),
        stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
    )
]

datasets = [*hellobench_datasets]  # add datasets you want

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)
# ------------- Evaluation Stage --------------------------------------

# ------------- JudgeLLM Configuration
# we recommend using gpt4o-mini as the judge model

# To use an open-source LLM as the judge model instead, uncomment:
# judge_models = [
#     dict(
#         type=HuggingFacewithChatTemplate,
#         abbr='glm-4-9b-chat-hf',
#         path='THUDM/glm-4-9b-chat',
#         max_out_len=16384,
#         generation_kwargs=dict(
#             temperature=0.8,
#             do_sample=True,
#         ),
#         model_kwargs=dict(
#             device_map='auto',
#             trust_remote_code=True,
#         ),
#         batch_size=1,
#         run_cfg=dict(num_gpus=2, num_procs=1),
#         stop_words=['<|endoftext|>', '<|user|>', '<|observation|>'],
#     )
# ]

judge_models = [
    dict(
        abbr='GPT4o',
        type=OpenAI,
        path='gpt-4o',
        # The key will be obtained from $OPENAI_API_KEY, but you can
        # write down your key here as well.
        key='xxxx',
        meta_template=api_meta_template,
        query_per_second=16,
        max_out_len=4096,
        batch_size=1,
        temperature=0.8,
        seed=42,
    )
]

# ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=DefaultSubjectiveSummarizer)
work_dir = 'outputs/hellobench/'
examples/eval_internlm2_chat_keyset.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from copy import deepcopy

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.agieval.agieval_gen_64afd3 import \
        agieval_datasets
    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.math.math_evaluatorv2_gen_cecb31 import \
        math_datasets
    from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import \
        sanitized_mbpp_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models as hf_internlm2_chat_7b_model
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \
        models as hf_internlm2_chat_20b_model
    from opencompass.configs.summarizers.internlm2_keyset import summarizer

work_dir = './outputs/internlm2-chat-keyset/'

_origin_datasets = sum(
    [v for k, v in locals().items() if k.endswith('_datasets')], [])
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
                     [])

_vanilla_datasets = [deepcopy(d) for d in _origin_datasets]

# Evaluate without a system prompt: drop any SYSTEM round from each
# model's meta template (deep-copied so the imported configs are intact).
_vanilla_models = []
for _cfg in _origin_models:
    _cfg = deepcopy(_cfg)
    if 'meta_template' in _cfg and 'round' in _cfg['meta_template']:
        rounds = _cfg['meta_template']['round']
        if any(r['role'] == 'SYSTEM' for r in rounds):
            print(
                f'WARNING: remove SYSTEM round in meta_template for {_cfg.get("abbr", None)}'
            )
            _cfg['meta_template']['round'] = [
                r for r in rounds if r['role'] != 'SYSTEM'
            ]
    _vanilla_models.append(_cfg)

datasets = _vanilla_datasets
models = _vanilla_models
examples/eval_internlm2_keyset.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.agieval.agieval_mixed_713d14 import \
        agieval_datasets
    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
        humaneval_datasets
    from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
    from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import \
        sanitized_mbpp_datasets
    from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
        models as hf_internlm2_7b_model
    from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
        models as hf_internlm2_20b_model
    from opencompass.configs.summarizers.internlm2_keyset import summarizer

work_dir = './outputs/internlm2-keyset/'

# Gather every imported '*_datasets' and '*_model' list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum((v for k, v in locals().items() if k.endswith('_model')), [])
examples/eval_internlm3_math500_thinking.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# To run this example, you need to do the following steps:
# 1. Install latest opencompass
# 2. Start a local server with Qwen2.5-72B-Instruct as LLMJudge server (i.e. using vLLM or LMDeploy)
# 3. Change the judge_cfg openai_api_base to your corresponding local server address
# 4. Start this evaluation by running 'opencompass eval_internlm3_math500_thinking.py'
from mmengine.config import read_base

from opencompass.models import OpenAISDK, VLLMwithChatTemplate

with read_base():
    from opencompass.configs.datasets.math.math_prm800k_500_0shot_nocot_genericllmeval_gen_63a000 import (
        math_datasets,
    )

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

# Judge model served behind an OpenAI-compatible endpoint.
judge_cfg = dict(
    abbr='qwen2-5-72b-instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-72B-Instruct',
    key='YOUR_API_KEY',
    openai_api_base=[
        'http://172.30.56.81:23333/v1/',  ### Change to your own server
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=16,
    temperature=0.001,
    max_seq_len=32768,
    max_completion_tokens=32768,
)

datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)
# set max_out_len for inference and plug the judge into each dataset
for item in datasets:
    item['infer_cfg']['inferencer']['max_out_len'] = 16384
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg

reasoning_chat_template = """You are an expert mathematician with extensive experience in mathematical competitions. You approach problems through systematic thinking and rigorous reasoning. When solving problems, follow these thought processes:
## Deep Understanding
Take time to fully comprehend the problem before attempting a solution. Consider:
- What is the real question being asked?
- What are the given conditions and what do they tell us?
- Are there any special restrictions or assumptions?
- Which information is crucial and which is supplementary?
## Multi-angle Analysis
Before solving, conduct thorough analysis:
- What mathematical concepts and properties are involved?
- Can you recall similar classic problems or solution methods?
- Would diagrams or tables help visualize the problem?
- Are there special cases that need separate consideration?
## Systematic Thinking
Plan your solution path:
- Propose multiple possible approaches
- Analyze the feasibility and merits of each method
- Choose the most appropriate method and explain why
- Break complex problems into smaller, manageable steps
## Rigorous Proof
During the solution process:
- Provide solid justification for each step
- Include detailed proofs for key conclusions
- Pay attention to logical connections
- Be vigilant about potential oversights
## Repeated Verification
After completing your solution:
- Verify your results satisfy all conditions
- Check for overlooked special cases
- Consider if the solution can be optimized or simplified
- Review your reasoning process
Remember:
1. Take time to think thoroughly rather than rushing to an answer
2. Rigorously prove each key conclusion
3. Keep an open mind and try different approaches
4. Summarize valuable problem-solving methods
5. Maintain healthy skepticism and verify multiple times
Your response should reflect deep mathematical understanding and precise logical thinking, making your solution path and reasoning clear to others.
When you're ready, present your complete solution with:
- Clear problem understanding
- Detailed solution process
- Key insights
- Thorough verification
Focus on clear, logical progression of ideas and thorough explanation of your mathematical reasoning. Provide answers in the same language as the user asking the question, repeat the final answer using a '\\boxed{}' without any units, you have [[8192]] tokens to complete the answer.
"""

reasoning_meta_template = dict(
    begin=dict(
        role='SYSTEM', api_role='SYSTEM', prompt=reasoning_chat_template
    ),
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        # XXX: all system roles are mapped to human in purpose
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='internlm3-8b-instruct-vllm',
        path='internlm/internlm3-8b-instruct',
        model_kwargs=dict(tensor_parallel_size=1),
        generation_kwargs=dict(do_sample=False),  # greedy
        max_seq_len=32768,
        max_out_len=16384,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
        meta_template=reasoning_meta_template,
    )
]

# NOTE(review): this re-assignment keeps only the MATH500 dataset; the
# earlier sum() already contained the same (mutated) dataset objects.
datasets = math_datasets
examples/eval_internlm_chat_lmdeploy_apiserver.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

from opencompass.models.turbomind_api import TurboMindAPIModel

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
        crowspairs_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \
        WSC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# InternLM(1) chat markup for the api server.
meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
                     eos_token_id=103028)

internlm_chat_20b = dict(
    type=TurboMindAPIModel,
    abbr='internlm-chat-20b-turbomind',
    api_addr='http://0.0.0.0:23333',
    api_key='internlm-chat-20b',  # api_key
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    meta_template=meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)

# Alternative 7B configuration; swap it into `models` below to use it.
internlm_chat_7b = dict(
    type=TurboMindAPIModel,
    abbr='internlm-chat-7b-turbomind',
    api_addr='http://0.0.0.0:23333',
    # NOTE(review): 'interlm-chat-7b' looks like a typo for
    # 'internlm-chat-7b' — confirm against the served model name.
    api_key='interlm-chat-7b',  # api_key
    max_out_len=100,
    max_seq_len=2048,
    batch_size=16,
    meta_template=meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)

models = [internlm_chat_20b]
examples/eval_internlm_flames_chat.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import FlamesSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# ------------- Inference Stage ---------------------------------------

with read_base():
    from opencompass.configs.datasets.flames.flames_gen import flames_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models

datasets = [*flames_datasets]

# InternLM2 chat markup.
_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
    dict(role='BOT',
         begin='<|im_start|>assistant\n',
         end='<|im_end|>\n',
         generate=True),
], )

# Replaces the 'models' imported above with an explicit configuration.
models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='internlm2-chat-7b-hf',
        path='internlm/internlm2-chat-7b',
        tokenizer_path='internlm/internlm2-chat-7b',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
        generation_kwargs={
            'eos_token_id': [2, 92542],
            'do_sample': True
        },
        batch_padding=True,
    )
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)

# ------------- Evaluation Stage --------------------------------------

# ------------- JudgeLLM Configuration --------------------------------
internlm1_chat_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )

judge_models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='flames-scorer',
        path='CaasiHUANG/flames-scorer',
        tokenizer_path='CaasiHUANG/flames-scorer',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        generation_kwargs={'do_sample': True},
        max_out_len=512,
        max_seq_len=4096,
        batch_size=8,
        meta_template=internlm1_chat_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<eoa>',
    )
]

# ------------- Evaluation Configuration ------------------------------
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=FlamesSummarizer, judge_type='general')

work_dir = 'outputs/flames/'
examples/eval_internlm_lmdeploy_apiserver.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

from opencompass.models.turbomind_api import TurboMindAPIModel

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

internlm_chat_20b = dict(
    type=TurboMindAPIModel,
    abbr='internlm-chat-20b-turbomind',
    api_addr='http://0.0.0.0:23333',
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    run_cfg=dict(num_gpus=1, num_procs=1),
)

# Alternative 7B configuration; swap it into `models` below to use it.
internlm_chat_7b = dict(
    type=TurboMindAPIModel,
    abbr='internlm-chat-7b-turbomind',
    api_addr='http://0.0.0.0:23333',
    max_out_len=100,
    max_seq_len=2048,
    batch_size=16,
    run_cfg=dict(num_gpus=1, num_procs=1),
)

models = [internlm_chat_20b]
examples/eval_internlm_math_chat.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

from opencompass.models.huggingface import HuggingFaceCausalLM

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.math.math_gen_736506 import math_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_math_7b import \
        models as internlm_math_chat_7b_models
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_math_20b import \
        models as internlm_math_chat_20b_models

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Eval MATH and GSM8K for both InternLM-Math-Chat 7B and 20B.
datasets = [*math_datasets, *gsm8k_datasets]
models = [*internlm_math_chat_7b_models, *internlm_math_chat_20b_models]
examples/eval_lightllm.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from mmengine.config import read_base

from opencompass.models import LightllmAPI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
        humaneval_datasets
    from opencompass.configs.summarizers.leaderboard import summarizer

datasets = [*humaneval_datasets]
'''
# Prompt template for InternLM2-Chat
# https://github.com/InternLM/InternLM/blob/main/chat/chat_format.md

_meta_template = dict(
    begin='<|im_start|>system\nYou are InternLM2-Chat, a harmless AI assistant<|im_end|>\n',
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
    ]
)
'''

# No chat markup by default; see the template above for InternLM2-Chat.
_meta_template = None

models = [
    dict(
        abbr='LightllmAPI',
        type=LightllmAPI,
        url='http://localhost:1030/generate',
        meta_template=_meta_template,
        batch_size=32,
        max_workers_per_task=128,
        rate_per_worker=1024,
        retry=4,
        generation_kwargs=dict(do_sample=False,
                               ignore_eos=False,
                               max_new_tokens=1024),
    ),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=OpenICLInferTask),
    ),
)
examples/eval_math_llm_judge_internal.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.math.math_0shot_llm_judge_v2_gen_31d777 import \
5
+ math_datasets
6
+ # Select a model of interest
7
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
8
+ models as qwen2_5_72b_instruct_model
9
+
10
+ eval_model_name = 'eval_model_name'
11
+ postprocessor_model_name = 'postprocessor_model_name'
12
+ eval_model_urls = ['http://0.0.0.0:23333/v1']
13
+ postprocessor_model_urls = ['http://0.0.0.0:23333/v1']
14
+
15
+ datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
16
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
17
+
18
+ for dataset in datasets:
19
+ dataset['eval_cfg']['evaluator']['model_name'] = eval_model_name
20
+ dataset['eval_cfg']['evaluator']['url'] = eval_model_urls
21
+ dataset['eval_cfg']['evaluator']['post_url'] = postprocessor_model_urls
22
+ dataset['eval_cfg']['evaluator'][
23
+ 'post_model_name'] = postprocessor_model_name
24
+
25
+ # -------------Inference Stage ----------------------------------------
26
+
27
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
28
+ from opencompass.runners import LocalRunner
29
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
30
+
31
+ infer = dict(
32
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
33
+ runner=dict(type=LocalRunner,
34
+ max_num_workers=8,
35
+ task=dict(type=OpenICLInferTask)),
36
+ )
37
+
38
+ eval = dict(
39
+ partitioner=dict(type=NaivePartitioner, n=10),
40
+ runner=dict(type=LocalRunner,
41
+ max_num_workers=256,
42
+ task=dict(type=OpenICLEvalTask)),
43
+ )
examples/eval_mathbench.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+
5
+ # Import models
6
+ # Import datasets
7
+ from opencompass.configs.datasets.MathBench.mathbench_gen import \
8
+ mathbench_datasets
9
+ from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
10
+ models as internlm2_chat_7b_model
11
+ from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
12
+ models as llama3_8b_instruct_model
13
+ # Import summarizers for display results
14
+ from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
15
+ summarizer # Grouped results for MathBench-A and MathBench-T separately
16
+
17
+ # from opencompass.configs.summarizers.mathbench_v1 import summarizer # Detailed results for every sub-dataset
18
+ # from opencompass.configs.summarizers.groups.mathbench_v1_2024_lang import summarizer # Grouped results for bilingual results
19
+
20
+ datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
21
+ models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
22
+
23
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
24
+ from opencompass.runners import LocalRunner
25
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
26
+
27
+ eval = dict(
28
+ partitioner=dict(type=NaivePartitioner, n=8),
29
+ runner=dict(type=LocalRunner,
30
+ max_num_workers=256,
31
+ task=dict(type=OpenICLEvalTask)),
32
+ )
33
+
34
+ infer = dict(
35
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
36
+ runner=dict(type=LocalRunner,
37
+ max_num_workers=256,
38
+ task=dict(type=OpenICLInferTask)),
39
+ )
40
+
41
+ work_dir = './outputs/mathbench_results'
examples/eval_modelscope_datasets.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # export DATASET_SOURCE='ModelScope' # before run this script
2
+ from datasets import Dataset, DatasetDict
3
+ from mmengine.config import read_base
4
+ from tqdm import tqdm
5
+
6
+ with read_base():
7
+ from opencompass.configs.datasets.agieval.agieval_gen import \
8
+ agieval_datasets as agieval_v2_datasets # ok
9
+ from opencompass.configs.datasets.agieval.agieval_gen_a0c741 import \
10
+ agieval_datasets as agieval_v1_datasets # ok
11
+ from opencompass.configs.datasets.ARC_c.ARC_c_clean_ppl import \
12
+ ARC_c_datasets as ARC_c_clean_datasets # ok
13
+ from opencompass.configs.datasets.ARC_c.ARC_c_gen import \
14
+ ARC_c_datasets # ok
15
+ from opencompass.configs.datasets.ARC_e.ARC_e_gen import \
16
+ ARC_e_datasets # ok
17
+ from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
18
+ from opencompass.configs.datasets.ceval.ceval_clean_ppl import \
19
+ ceval_datasets as ceval_clean_datasets # ok
20
+ from opencompass.configs.datasets.ceval.ceval_gen import \
21
+ ceval_datasets # ok
22
+ from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen import \
23
+ afqmc_datasets # ok
24
+ from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import \
25
+ cmnli_datasets # ok
26
+ from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import \
27
+ cmnli_datasets as cmnli_ppl_datasets # ok
28
+ from opencompass.configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import \
29
+ ocnli_datasets # ok
30
+ from opencompass.configs.datasets.cmmlu.cmmlu_gen import \
31
+ cmmlu_datasets # ok
32
+ from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen import \
33
+ commonsenseqa_datasets # 额外处理gpt
34
+ from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen import \
35
+ GaokaoBench_datasets # ok
36
+ from opencompass.configs.datasets.GaokaoBench.GaokaoBench_mixed import \
37
+ GaokaoBench_datasets as GaokaoBench_mixed_datasets # ok
38
+ from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
39
+ GaokaoBench_datasets as GaokaoBench_no_subjective_datasets # ok
40
+ from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
41
+ gsm8k_datasets # ok
42
+ from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
43
+ hellaswag_datasets as hellaswag_ice_datasets # ok
44
+ from opencompass.configs.datasets.hellaswag.hellaswag_clean_ppl import \
45
+ hellaswag_datasets as hellaswag_clean_datasets # ok
46
+ from opencompass.configs.datasets.hellaswag.hellaswag_gen import \
47
+ hellaswag_datasets as hellaswag_v2_datasets # ok
48
+ from opencompass.configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import \
49
+ hellaswag_datasets as hellaswag_v1_datasets # ok
50
+ from opencompass.configs.datasets.hellaswag.hellaswag_ppl_a6e128 import \
51
+ hellaswag_datasets as hellaswag_v3_datasets # ok
52
+ from opencompass.configs.datasets.humaneval.humaneval_gen import \
53
+ humaneval_datasets # ok
54
+ from opencompass.configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import \
55
+ humaneval_datasets as humaneval_repeat10_datasets # ok
56
+ from opencompass.configs.datasets.lambada.lambada_gen import \
57
+ lambada_datasets # ok
58
+ from opencompass.configs.datasets.lcsts.lcsts_gen import \
59
+ lcsts_datasets # ok
60
+ from opencompass.configs.datasets.math.math_gen import math_datasets # ok
61
+ from opencompass.configs.datasets.mbpp.mbpp_gen import \
62
+ mbpp_datasets as mbpp_v1_datasets # ok
63
+ from opencompass.configs.datasets.mbpp.mbpp_passk_gen_830460 import \
64
+ mbpp_datasets as mbpp_v2_datasets # ok
65
+ from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import \
66
+ sanitized_mbpp_datasets # ok
67
+ from opencompass.configs.datasets.mmlu.mmlu_clean_ppl import \
68
+ mmlu_datasets as mmlu_clean_datasets # ok
69
+ from opencompass.configs.datasets.mmlu.mmlu_gen import mmlu_datasets # ok
70
+ from opencompass.configs.datasets.nq.nq_gen import nq_datasets # ok
71
+ from opencompass.configs.datasets.obqa.obqa_gen import obqa_datasets # ok
72
+ from opencompass.configs.datasets.obqa.obqa_ppl_6aac9e import \
73
+ obqa_datasets as obqa_ppl_datasets # ok
74
+ from opencompass.configs.datasets.piqa.piqa_gen import \
75
+ piqa_datasets as piqa_v2_datasets # ok
76
+ from opencompass.configs.datasets.piqa.piqa_ppl import \
77
+ piqa_datasets as piqa_v1_datasets # ok
78
+ from opencompass.configs.datasets.piqa.piqa_ppl_0cfff2 import \
79
+ piqa_datasets as piqa_v3_datasets # ok
80
+ from opencompass.configs.datasets.race.race_ppl import race_datasets # ok
81
+ from opencompass.configs.datasets.siqa.siqa_gen import \
82
+ siqa_datasets as siqa_v2_datasets # ok
83
+ from opencompass.configs.datasets.siqa.siqa_gen_18632c import \
84
+ siqa_datasets as siqa_v3_datasets # ok
85
+ from opencompass.configs.datasets.siqa.siqa_ppl_42bc6e import \
86
+ siqa_datasets as siqa_ppl_datasets # ok
87
+ from opencompass.configs.datasets.storycloze.storycloze_gen import \
88
+ storycloze_datasets # ok
89
+ from opencompass.configs.datasets.storycloze.storycloze_ppl import \
90
+ storycloze_datasets as storycloze_ppl_datasets # ok
91
+ from opencompass.configs.datasets.strategyqa.strategyqa_gen import \
92
+ strategyqa_datasets
93
+ from opencompass.configs.datasets.summedits.summedits_gen import \
94
+ summedits_datasets as summedits_v2_datasets # ok
95
+ from opencompass.configs.datasets.triviaqa.triviaqa_gen import \
96
+ triviaqa_datasets # ok
97
+ from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
98
+ triviaqa_datasets as triviaqa_wiki_1shot_datasets # ok
99
+ from opencompass.configs.datasets.tydiqa.tydiqa_gen import \
100
+ tydiqa_datasets # ok
101
+ from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
102
+ winogrande_datasets as winogrande_5shot_ll_datasets # ok
103
+ from opencompass.configs.datasets.winogrande.winogrande_gen import \
104
+ winogrande_datasets
105
+ from opencompass.configs.datasets.winogrande.winogrande_ll import \
106
+ winogrande_datasets as winogrande_ll_datasets # ok
107
+ from opencompass.configs.datasets.Xsum.Xsum_gen import Xsum_datasets
108
+ from opencompass.configs.models.opt.hf_opt_125m import models
109
+
110
+ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
111
+ for d in datasets:
112
+ d['reader_cfg'].update({'train_range': '[0:5]', 'test_range': '[0:5]'})
examples/eval_qwen_7b.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.collections.leaderboard.qwen import \
5
+ datasets
6
+ from opencompass.configs.models.qwen.hf_qwen_7b import models
7
+ from opencompass.configs.summarizers.leaderboard import summarizer
8
+ '''
9
+ dataset version metric mode qwen-7b-hf
10
+ -------------------------------------- --------- ---------------- ------ ------------
11
+ --------- 考试 Exam --------- - - - -
12
+ ceval - naive_average ppl 58.65
13
+ agieval - naive_average mixed 40.49
14
+ mmlu - naive_average ppl 57.78
15
+ cmmlu - naive_average ppl 58.57
16
+ GaokaoBench - weighted_average mixed 51.76
17
+ ARC-c 72cf91 accuracy gen 83.73
18
+ ARC-e 72cf91 accuracy gen 90.65
19
+ --------- 语言 Language --------- - - - -
20
+ WiC ce62e6 accuracy ppl 51.10
21
+ chid-dev 25f3d3 accuracy ppl 86.63
22
+ afqmc-dev cc328c accuracy ppl 69.00
23
+ WSC 678cb5 accuracy ppl 63.46
24
+ tydiqa-goldp - naive_average gen 19.98
25
+ flores_100 - naive_average gen 3.20
26
+ --------- 知识 Knowledge --------- - - - -
27
+ BoolQ 463fee accuracy ppl 83.00
28
+ commonsense_qa 0d8e25 accuracy ppl 67.49
29
+ triviaqa b6904f score gen 40.45
30
+ nq b6904f score gen 14.16
31
+ --------- 理解 Understanding --------- - - - -
32
+ C3 e6778d accuracy gen 75.29
33
+ race-middle 73bdec accuracy ppl 90.53
34
+ race-high 73bdec accuracy ppl 87.71
35
+ openbookqa_fact fa871c accuracy gen 92.20
36
+ csl_dev 3c4211 accuracy ppl 56.25
37
+ lcsts 0b3969 rouge1 gen 12.38
38
+ Xsum 207e69 rouge1 gen 36.00
39
+ eprstmt-dev 101429 accuracy gen 89.38
40
+ lambada de1af2 accuracy gen 67.88
41
+ --------- 推理 Reasoning --------- - - - -
42
+ cmnli 15e783 accuracy ppl 54.85
43
+ ocnli 1471e7 accuracy gen 42.34
44
+ AX_b 793c72 accuracy gen 58.61
45
+ AX_g c4c886 accuracy gen 69.10
46
+ RTE c4c886 accuracy gen 57.76
47
+ COPA 59f42c accuracy gen 88.00
48
+ ReCoRD 3e0689 score gen 27.78
49
+ hellaswag 06a1e2 accuracy gen 92.47
50
+ piqa 24369d accuracy gen 78.02
51
+ siqa ea30d1 accuracy ppl 75.03
52
+ math 2c0b9e accuracy gen 11.06
53
+ gsm8k 4c7f6e accuracy gen 50.87
54
+ drop 53a0a7 score gen 44.95
55
+ openai_humaneval dd0dff humaneval_pass@1 gen 23.78
56
+ mbpp 60ca11 score gen 31.20
57
+ bbh - naive_average gen 40.03
58
+ '''
examples/eval_ruler_fix_tokenizer.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
4
+ from opencompass.runners import LocalRunner
5
+ from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
6
+
7
+ with read_base():
8
+ from opencompass.configs.datasets.ruler.ruler_combined_gen import \
9
+ ruler_combined_datasets
10
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
11
+ models as internlm2_5_7b_chat_1m
12
+ from opencompass.configs.summarizers.groups.ruler import \
13
+ ruler_summary_groups
14
+
15
+ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
16
+ models = internlm2_5_7b_chat_1m
17
+ work_dir = './outputs/ruler'
18
+
19
+ infer = dict(
20
+ partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
21
+ runner=dict(type=LocalRunner,
22
+ max_num_workers=16,
23
+ task=dict(type=OpenICLInferTask),
24
+ retry=5),
25
+ )
26
+
27
+ eval = dict(
28
+ partitioner=dict(type=NaivePartitioner),
29
+ runner=dict(type=LocalRunner,
30
+ max_num_workers=32,
31
+ task=dict(type=OpenICLEvalTask)),
32
+ )
33
+
34
+ summarizer = dict(
35
+ dataset_abbrs=['ruler_4k', 'ruler_8k', 'ruler_16k', 'ruler_32k'],
36
+ summary_groups=sum(
37
+ [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
38
+ )
examples/eval_subjective_alpacaeval_official.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
5
+
6
+ from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
7
+ HuggingFaceChatGLM3)
8
+ from opencompass.models.openai_api import OpenAI
9
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner
10
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
11
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
12
+ from opencompass.runners import LocalRunner, SlurmSequentialRunner
13
+ from opencompass.summarizers import AlpacaSummarizer
14
+ from opencompass.tasks import OpenICLInferTask
15
+ from opencompass.tasks.outer_eval.alpacaeval import AlpacaEvalTask
16
+
17
+ api_meta_template = dict(
18
+ round=[
19
+ dict(role='HUMAN', api_role='HUMAN'),
20
+ dict(role='BOT', api_role='BOT', generate=True),
21
+ ],
22
+ reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
23
+ )
24
+
25
+ # To run this config, please ensure you have successfully installed `alpaca-eval==0.6` and `scikit-learn==1.5`
26
+
27
+ # -------------Inference Stage ----------------------------------------
28
+
29
+ # For subjective evaluation, we often set do sample for models
30
+ models = [
31
+ dict(
32
+ type=HuggingFaceChatGLM3,
33
+ abbr='chatglm3-6b',
34
+ path='THUDM/chatglm3-6b',
35
+ tokenizer_path='THUDM/chatglm3-6b',
36
+ model_kwargs=dict(
37
+ device_map='auto',
38
+ trust_remote_code=True,
39
+ ),
40
+ tokenizer_kwargs=dict(
41
+ padding_side='left',
42
+ truncation_side='left',
43
+ trust_remote_code=True,
44
+ ),
45
+ generation_kwargs=dict(do_sample=True, ),
46
+ meta_template=api_meta_template,
47
+ max_out_len=2048,
48
+ max_seq_len=4096,
49
+ batch_size=1,
50
+ run_cfg=dict(num_gpus=1, num_procs=1),
51
+ )
52
+ ]
53
+
54
+ datasets = [*alpacav2]
55
+
56
+ # -------------Evaluation Stage ----------------------------------------
57
+
58
+ ## ------------- JudgeLLM Configuration
59
+ gpt4_judge = dict(
60
+ abbr='GPT4-Turbo',
61
+ path='gpt-4-1106-preview',
62
+ key=
63
+ '', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
64
+ config='weighted_alpaca_eval_gpt4_turbo')
65
+ ## ------------- Evaluation Configuration
66
+ eval = dict(partitioner=dict(type=NaivePartitioner),
67
+ runner=dict(
68
+ type=LocalRunner,
69
+ max_num_workers=256,
70
+ task=dict(type=AlpacaEvalTask, judge_cfg=gpt4_judge),
71
+ ))
72
+ work_dir = 'outputs/alpaca/'
requirements/vllm.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ vllm
tmp/08b1e522-33ea-430a-ba78-4d273bf09a88_params.py ADDED
@@ -0,0 +1,1424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_2wikimqa_5',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchF1Evaluator'),
8
+ pred_role='BOT'),
9
+ infer_cfg=dict(
10
+ inferencer=dict(
11
+ max_out_len=32,
12
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
13
+ prompt_template=dict(
14
+ template=dict(round=[
15
+ dict(
16
+ prompt=
17
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
18
+ role='HUMAN'),
19
+ ]),
20
+ type=
21
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
22
+ retriever=dict(
23
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
24
+ name='2wikimqa',
25
+ path='opencompass/Longbench',
26
+ reader_cfg=dict(
27
+ input_columns=[
28
+ 'context',
29
+ 'input',
30
+ ],
31
+ output_column='answers',
32
+ test_range='[125:150]',
33
+ test_split='test',
34
+ train_split='test'),
35
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
36
+ dict(
37
+ abbr='LongBench_hotpotqa_5',
38
+ eval_cfg=dict(
39
+ evaluator=dict(
40
+ type='opencompass.datasets.LongBenchF1Evaluator'),
41
+ pred_role='BOT'),
42
+ infer_cfg=dict(
43
+ inferencer=dict(
44
+ max_out_len=32,
45
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
46
+ prompt_template=dict(
47
+ template=dict(round=[
48
+ dict(
49
+ prompt=
50
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
51
+ role='HUMAN'),
52
+ ]),
53
+ type=
54
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
55
+ retriever=dict(
56
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
57
+ name='hotpotqa',
58
+ path='opencompass/Longbench',
59
+ reader_cfg=dict(
60
+ input_columns=[
61
+ 'context',
62
+ 'input',
63
+ ],
64
+ output_column='answers',
65
+ test_range='[125:150]',
66
+ test_split='test',
67
+ train_split='test'),
68
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
69
+ dict(
70
+ abbr='LongBench_musique_5',
71
+ eval_cfg=dict(
72
+ evaluator=dict(
73
+ type='opencompass.datasets.LongBenchF1Evaluator'),
74
+ pred_role='BOT'),
75
+ infer_cfg=dict(
76
+ inferencer=dict(
77
+ max_out_len=32,
78
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
79
+ prompt_template=dict(
80
+ template=dict(round=[
81
+ dict(
82
+ prompt=
83
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
84
+ role='HUMAN'),
85
+ ]),
86
+ type=
87
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
88
+ retriever=dict(
89
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
90
+ name='musique',
91
+ path='opencompass/Longbench',
92
+ reader_cfg=dict(
93
+ input_columns=[
94
+ 'context',
95
+ 'input',
96
+ ],
97
+ output_column='answers',
98
+ test_range='[125:150]',
99
+ test_split='test',
100
+ train_split='test'),
101
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
102
+ dict(
103
+ abbr='LongBench_multifieldqa_en_5',
104
+ eval_cfg=dict(
105
+ evaluator=dict(
106
+ type='opencompass.datasets.LongBenchF1Evaluator'),
107
+ pred_role='BOT'),
108
+ infer_cfg=dict(
109
+ inferencer=dict(
110
+ max_out_len=64,
111
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
112
+ prompt_template=dict(
113
+ template=dict(round=[
114
+ dict(
115
+ prompt=
116
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
117
+ role='HUMAN'),
118
+ ]),
119
+ type=
120
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
121
+ retriever=dict(
122
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
123
+ name='multifieldqa_en',
124
+ path='opencompass/Longbench',
125
+ reader_cfg=dict(
126
+ input_columns=[
127
+ 'context',
128
+ 'input',
129
+ ],
130
+ output_column='answers',
131
+ test_range='[95:114]',
132
+ test_split='test',
133
+ train_split='test'),
134
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
135
+ dict(
136
+ abbr='LongBench_multifieldqa_zh_5',
137
+ eval_cfg=dict(
138
+ evaluator=dict(
139
+ language='zh',
140
+ type='opencompass.datasets.LongBenchF1Evaluator'),
141
+ pred_role='BOT'),
142
+ infer_cfg=dict(
143
+ inferencer=dict(
144
+ max_out_len=64,
145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
146
+ prompt_template=dict(
147
+ template=dict(round=[
148
+ dict(
149
+ prompt=
150
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
151
+ role='HUMAN'),
152
+ ]),
153
+ type=
154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
155
+ retriever=dict(
156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
157
+ name='multifieldqa_zh',
158
+ path='opencompass/Longbench',
159
+ reader_cfg=dict(
160
+ input_columns=[
161
+ 'context',
162
+ 'input',
163
+ ],
164
+ output_column='answers',
165
+ test_range='[125:150]',
166
+ test_split='test',
167
+ train_split='test'),
168
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
169
+ dict(
170
+ abbr='LongBench_narrativeqa_5',
171
+ eval_cfg=dict(
172
+ evaluator=dict(
173
+ type='opencompass.datasets.LongBenchF1Evaluator'),
174
+ pred_role='BOT'),
175
+ infer_cfg=dict(
176
+ inferencer=dict(
177
+ max_out_len=128,
178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
179
+ prompt_template=dict(
180
+ template=dict(round=[
181
+ dict(
182
+ prompt=
183
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
184
+ role='HUMAN'),
185
+ ]),
186
+ type=
187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
188
+ retriever=dict(
189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
190
+ name='narrativeqa',
191
+ path='opencompass/Longbench',
192
+ reader_cfg=dict(
193
+ input_columns=[
194
+ 'context',
195
+ 'input',
196
+ ],
197
+ output_column='answers',
198
+ test_range='[125:150]',
199
+ test_split='test',
200
+ train_split='test'),
201
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
202
+ dict(
203
+ abbr='LongBench_qasper_5',
204
+ eval_cfg=dict(
205
+ evaluator=dict(
206
+ type='opencompass.datasets.LongBenchF1Evaluator'),
207
+ pred_role='BOT'),
208
+ infer_cfg=dict(
209
+ inferencer=dict(
210
+ max_out_len=32,
211
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
212
+ prompt_template=dict(
213
+ template=dict(round=[
214
+ dict(
215
+ prompt=
216
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
217
+ role='HUMAN'),
218
+ ]),
219
+ type=
220
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
221
+ retriever=dict(
222
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
223
+ name='qasper',
224
+ path='opencompass/Longbench',
225
+ reader_cfg=dict(
226
+ input_columns=[
227
+ 'context',
228
+ 'input',
229
+ ],
230
+ output_column='answers',
231
+ test_range='[125:150]',
232
+ test_split='test',
233
+ train_split='test'),
234
+ type='opencompass.datasets.LongBenchqasperDataset'),
235
+ dict(
236
+ abbr='LongBench_triviaqa_5',
237
+ eval_cfg=dict(
238
+ evaluator=dict(
239
+ type='opencompass.datasets.LongBenchF1Evaluator'),
240
+ pred_postprocessor=dict(
241
+ type='opencompass.datasets.triviaqa_postprocess'),
242
+ pred_role='BOT'),
243
+ infer_cfg=dict(
244
+ inferencer=dict(
245
+ max_out_len=32,
246
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
247
+ prompt_template=dict(
248
+ template=dict(round=[
249
+ dict(
250
+ prompt=
251
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
252
+ role='HUMAN'),
253
+ ]),
254
+ type=
255
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
256
+ retriever=dict(
257
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
258
+ name='triviaqa',
259
+ path='opencompass/Longbench',
260
+ reader_cfg=dict(
261
+ input_columns=[
262
+ 'context',
263
+ 'input',
264
+ ],
265
+ output_column='answers',
266
+ test_range='[125:150]',
267
+ test_split='test',
268
+ train_split='test'),
269
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
270
+ dict(
271
+ abbr='LongBench_gov_report_5',
272
+ eval_cfg=dict(
273
+ evaluator=dict(
274
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
275
+ pred_role='BOT'),
276
+ infer_cfg=dict(
277
+ inferencer=dict(
278
+ max_out_len=512,
279
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
280
+ prompt_template=dict(
281
+ template=dict(round=[
282
+ dict(
283
+ prompt=
284
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
285
+ role='HUMAN'),
286
+ ]),
287
+ type=
288
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
289
+ retriever=dict(
290
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
291
+ name='gov_report',
292
+ path='opencompass/Longbench',
293
+ reader_cfg=dict(
294
+ input_columns=[
295
+ 'context',
296
+ ],
297
+ output_column='answers',
298
+ test_range='[125:150]',
299
+ test_split='test',
300
+ train_split='test'),
301
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
302
+ dict(
303
+ abbr='LongBench_qmsum_5',
304
+ eval_cfg=dict(
305
+ evaluator=dict(
306
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
307
+ pred_role='BOT'),
308
+ infer_cfg=dict(
309
+ inferencer=dict(
310
+ max_out_len=512,
311
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
312
+ prompt_template=dict(
313
+ template=dict(round=[
314
+ dict(
315
+ prompt=
316
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
317
+ role='HUMAN'),
318
+ ]),
319
+ type=
320
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
321
+ retriever=dict(
322
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
323
+ name='qmsum',
324
+ path='opencompass/Longbench',
325
+ reader_cfg=dict(
326
+ input_columns=[
327
+ 'context',
328
+ 'input',
329
+ ],
330
+ output_column='answers',
331
+ test_range='[125:150]',
332
+ test_split='test',
333
+ train_split='test'),
334
+ type='opencompass.datasets.LongBenchqmsumDataset'),
335
+ dict(
336
+ abbr='LongBench_vcsum_5',
337
+ eval_cfg=dict(
338
+ evaluator=dict(
339
+ language='zh',
340
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
341
+ pred_role='BOT'),
342
+ infer_cfg=dict(
343
+ inferencer=dict(
344
+ max_out_len=512,
345
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
346
+ prompt_template=dict(
347
+ template=dict(round=[
348
+ dict(
349
+ prompt=
350
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
351
+ role='HUMAN'),
352
+ ]),
353
+ type=
354
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
355
+ retriever=dict(
356
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
357
+ name='vcsum',
358
+ path='opencompass/Longbench',
359
+ reader_cfg=dict(
360
+ input_columns=[
361
+ 'context',
362
+ ],
363
+ output_column='answers',
364
+ test_range='[125:150]',
365
+ test_split='test',
366
+ train_split='test'),
367
+ type='opencompass.datasets.LongBenchvcsumDataset'),
368
+ dict(
369
+ abbr='LongBench_dureader_5',
370
+ eval_cfg=dict(
371
+ evaluator=dict(
372
+ language='zh',
373
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
374
+ pred_role='BOT'),
375
+ infer_cfg=dict(
376
+ inferencer=dict(
377
+ max_out_len=128,
378
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
379
+ prompt_template=dict(
380
+ template=dict(round=[
381
+ dict(
382
+ prompt=
383
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
384
+ role='HUMAN'),
385
+ ]),
386
+ type=
387
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
388
+ retriever=dict(
389
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
390
+ name='dureader',
391
+ path='opencompass/Longbench',
392
+ reader_cfg=dict(
393
+ input_columns=[
394
+ 'context',
395
+ 'input',
396
+ ],
397
+ output_column='answers',
398
+ test_range='[125:150]',
399
+ test_split='test',
400
+ train_split='test'),
401
+ type='opencompass.datasets.LongBenchdureaderDataset'),
402
+ dict(
403
+ abbr='LongBench_lcc_5',
404
+ eval_cfg=dict(
405
+ evaluator=dict(
406
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
407
+ pred_role='BOT'),
408
+ infer_cfg=dict(
409
+ inferencer=dict(
410
+ max_out_len=64,
411
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
412
+ prompt_template=dict(
413
+ template=dict(round=[
414
+ dict(
415
+ prompt=
416
+ 'Please complete the code given below. \n{context}Next line of code:\n',
417
+ role='HUMAN'),
418
+ ]),
419
+ type=
420
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
421
+ retriever=dict(
422
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
423
+ name='lcc',
424
+ path='opencompass/Longbench',
425
+ reader_cfg=dict(
426
+ input_columns=[
427
+ 'context',
428
+ ],
429
+ output_column='answers',
430
+ test_range='[315:378]',
431
+ test_split='test',
432
+ train_split='test'),
433
+ type='opencompass.datasets.LongBenchlccDataset'),
434
+ dict(
435
+ abbr='LongBench_repobench-p_5',
436
+ eval_cfg=dict(
437
+ evaluator=dict(
438
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
439
+ pred_role='BOT'),
440
+ infer_cfg=dict(
441
+ inferencer=dict(
442
+ max_out_len=64,
443
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
444
+ prompt_template=dict(
445
+ template=dict(round=[
446
+ dict(
447
+ prompt=
448
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
449
+ role='HUMAN'),
450
+ ]),
451
+ type=
452
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
453
+ retriever=dict(
454
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
455
+ name='repobench-p',
456
+ path='opencompass/Longbench',
457
+ reader_cfg=dict(
458
+ input_columns=[
459
+ 'context',
460
+ 'input',
461
+ ],
462
+ output_column='answers',
463
+ test_range='[315:378]',
464
+ test_split='test',
465
+ train_split='test'),
466
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
467
+ dict(
468
+ abbr='LongBench_passage_retrieval_en_5',
469
+ eval_cfg=dict(
470
+ evaluator=dict(
471
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
472
+ pred_role='BOT'),
473
+ infer_cfg=dict(
474
+ inferencer=dict(
475
+ max_out_len=32,
476
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
477
+ prompt_template=dict(
478
+ template=dict(round=[
479
+ dict(
480
+ prompt=
481
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
482
+ role='HUMAN'),
483
+ ]),
484
+ type=
485
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
486
+ retriever=dict(
487
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
488
+ name='passage_retrieval_en',
489
+ path='opencompass/Longbench',
490
+ reader_cfg=dict(
491
+ input_columns=[
492
+ 'context',
493
+ 'input',
494
+ ],
495
+ output_column='answers',
496
+ test_range='[125:150]',
497
+ test_split='test',
498
+ train_split='test'),
499
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
500
+ dict(
501
+ abbr='LongBench_passage_retrieval_zh_5',
502
+ eval_cfg=dict(
503
+ evaluator=dict(
504
+ language='zh',
505
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
506
+ pred_role='BOT'),
507
+ infer_cfg=dict(
508
+ inferencer=dict(
509
+ max_out_len=32,
510
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
511
+ prompt_template=dict(
512
+ template=dict(round=[
513
+ dict(
514
+ prompt=
515
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
516
+ role='HUMAN'),
517
+ ]),
518
+ type=
519
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
520
+ retriever=dict(
521
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
522
+ name='passage_retrieval_zh',
523
+ path='opencompass/Longbench',
524
+ reader_cfg=dict(
525
+ input_columns=[
526
+ 'context',
527
+ 'input',
528
+ ],
529
+ output_column='answers',
530
+ test_range='[125:150]',
531
+ test_split='test',
532
+ train_split='test'),
533
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
534
+ dict(
535
+ abbr='LongBench_passage_count_5',
536
+ eval_cfg=dict(
537
+ evaluator=dict(
538
+ type='opencompass.datasets.LongBenchCountEvaluator'),
539
+ pred_role='BOT'),
540
+ infer_cfg=dict(
541
+ inferencer=dict(
542
+ max_out_len=32,
543
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
544
+ prompt_template=dict(
545
+ template=dict(round=[
546
+ dict(
547
+ prompt=
548
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
549
+ role='HUMAN'),
550
+ ]),
551
+ type=
552
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
553
+ retriever=dict(
554
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
555
+ name='passage_count',
556
+ path='opencompass/Longbench',
557
+ reader_cfg=dict(
558
+ input_columns=[
559
+ 'context',
560
+ 'input',
561
+ ],
562
+ output_column='answers',
563
+ test_range='[125:150]',
564
+ test_split='test',
565
+ train_split='test'),
566
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
567
+ dict(
568
+ abbr='LongBench_trec_5',
569
+ eval_cfg=dict(
570
+ evaluator=dict(
571
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
572
+ ),
573
+ pred_postprocessor=dict(
574
+ type='opencompass.datasets.trec_postprocess'),
575
+ pred_role='BOT'),
576
+ infer_cfg=dict(
577
+ inferencer=dict(
578
+ max_out_len=64,
579
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
580
+ prompt_template=dict(
581
+ template=dict(round=[
582
+ dict(
583
+ prompt=
584
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
585
+ role='HUMAN'),
586
+ ]),
587
+ type=
588
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
589
+ retriever=dict(
590
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
591
+ name='trec',
592
+ path='opencompass/Longbench',
593
+ reader_cfg=dict(
594
+ input_columns=[
595
+ 'context',
596
+ 'input',
597
+ ],
598
+ output_column='all_labels',
599
+ test_range='[125:150]',
600
+ test_split='test',
601
+ train_split='test'),
602
+ type='opencompass.datasets.LongBenchtrecDataset'),
603
+ dict(
604
+ abbr='LongBench_lsht_5',
605
+ eval_cfg=dict(
606
+ evaluator=dict(
607
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
608
+ ),
609
+ pred_postprocessor=dict(
610
+ type='opencompass.datasets.lsht_postprocess'),
611
+ pred_role='BOT'),
612
+ infer_cfg=dict(
613
+ inferencer=dict(
614
+ max_out_len=64,
615
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
616
+ prompt_template=dict(
617
+ template=dict(round=[
618
+ dict(
619
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
620
+ role='HUMAN'),
621
+ ]),
622
+ type=
623
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
624
+ retriever=dict(
625
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
626
+ name='lsht',
627
+ path='opencompass/Longbench',
628
+ reader_cfg=dict(
629
+ input_columns=[
630
+ 'context',
631
+ 'input',
632
+ ],
633
+ output_column='all_labels',
634
+ test_range='[125:150]',
635
+ test_split='test',
636
+ train_split='test'),
637
+ type='opencompass.datasets.LongBenchlshtDataset'),
638
+ dict(
639
+ abbr='LongBench_multi_news_5',
640
+ eval_cfg=dict(
641
+ evaluator=dict(
642
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
643
+ pred_role='BOT'),
644
+ infer_cfg=dict(
645
+ inferencer=dict(
646
+ max_out_len=512,
647
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
648
+ prompt_template=dict(
649
+ template=dict(round=[
650
+ dict(
651
+ prompt=
652
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
653
+ role='HUMAN'),
654
+ ]),
655
+ type=
656
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
657
+ retriever=dict(
658
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
659
+ name='multi_news',
660
+ path='opencompass/Longbench',
661
+ reader_cfg=dict(
662
+ input_columns=[
663
+ 'context',
664
+ ],
665
+ output_column='answers',
666
+ test_range='[125:150]',
667
+ test_split='test',
668
+ train_split='test'),
669
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
670
+ dict(
671
+ abbr='LongBench_samsum_5',
672
+ eval_cfg=dict(
673
+ evaluator=dict(
674
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
675
+ pred_postprocessor=dict(
676
+ type='opencompass.datasets.samsum_postprocess'),
677
+ pred_role='BOT'),
678
+ infer_cfg=dict(
679
+ inferencer=dict(
680
+ max_out_len=128,
681
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
682
+ prompt_template=dict(
683
+ template=dict(round=[
684
+ dict(
685
+ prompt=
686
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
687
+ role='HUMAN'),
688
+ ]),
689
+ type=
690
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
691
+ retriever=dict(
692
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
693
+ name='samsum',
694
+ path='opencompass/Longbench',
695
+ reader_cfg=dict(
696
+ input_columns=[
697
+ 'context',
698
+ 'input',
699
+ ],
700
+ output_column='answers',
701
+ test_range='[125:150]',
702
+ test_split='test',
703
+ train_split='test'),
704
+ type='opencompass.datasets.LongBenchsamsumDataset'),
705
+ dict(
706
+ abbr='LongBench_2wikimqa_5',
707
+ eval_cfg=dict(
708
+ evaluator=dict(
709
+ type='opencompass.datasets.LongBenchF1Evaluator'),
710
+ pred_role='BOT'),
711
+ infer_cfg=dict(
712
+ inferencer=dict(
713
+ max_out_len=32,
714
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
715
+ prompt_template=dict(
716
+ template=dict(round=[
717
+ dict(
718
+ prompt=
719
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
720
+ role='HUMAN'),
721
+ ]),
722
+ type=
723
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
724
+ retriever=dict(
725
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
726
+ name='2wikimqa',
727
+ path='opencompass/Longbench',
728
+ reader_cfg=dict(
729
+ input_columns=[
730
+ 'context',
731
+ 'input',
732
+ ],
733
+ output_column='answers',
734
+ test_range='[125:150]',
735
+ test_split='test',
736
+ train_split='test'),
737
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
738
+ dict(
739
+ abbr='LongBench_hotpotqa_5',
740
+ eval_cfg=dict(
741
+ evaluator=dict(
742
+ type='opencompass.datasets.LongBenchF1Evaluator'),
743
+ pred_role='BOT'),
744
+ infer_cfg=dict(
745
+ inferencer=dict(
746
+ max_out_len=32,
747
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
748
+ prompt_template=dict(
749
+ template=dict(round=[
750
+ dict(
751
+ prompt=
752
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
753
+ role='HUMAN'),
754
+ ]),
755
+ type=
756
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
757
+ retriever=dict(
758
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
759
+ name='hotpotqa',
760
+ path='opencompass/Longbench',
761
+ reader_cfg=dict(
762
+ input_columns=[
763
+ 'context',
764
+ 'input',
765
+ ],
766
+ output_column='answers',
767
+ test_range='[125:150]',
768
+ test_split='test',
769
+ train_split='test'),
770
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
771
+ dict(
772
+ abbr='LongBench_musique_5',
773
+ eval_cfg=dict(
774
+ evaluator=dict(
775
+ type='opencompass.datasets.LongBenchF1Evaluator'),
776
+ pred_role='BOT'),
777
+ infer_cfg=dict(
778
+ inferencer=dict(
779
+ max_out_len=32,
780
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
781
+ prompt_template=dict(
782
+ template=dict(round=[
783
+ dict(
784
+ prompt=
785
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
786
+ role='HUMAN'),
787
+ ]),
788
+ type=
789
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
790
+ retriever=dict(
791
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
792
+ name='musique',
793
+ path='opencompass/Longbench',
794
+ reader_cfg=dict(
795
+ input_columns=[
796
+ 'context',
797
+ 'input',
798
+ ],
799
+ output_column='answers',
800
+ test_range='[125:150]',
801
+ test_split='test',
802
+ train_split='test'),
803
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
804
+ dict(
805
+ abbr='LongBench_multifieldqa_en_5',
806
+ eval_cfg=dict(
807
+ evaluator=dict(
808
+ type='opencompass.datasets.LongBenchF1Evaluator'),
809
+ pred_role='BOT'),
810
+ infer_cfg=dict(
811
+ inferencer=dict(
812
+ max_out_len=64,
813
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
814
+ prompt_template=dict(
815
+ template=dict(round=[
816
+ dict(
817
+ prompt=
818
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
819
+ role='HUMAN'),
820
+ ]),
821
+ type=
822
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
823
+ retriever=dict(
824
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
825
+ name='multifieldqa_en',
826
+ path='opencompass/Longbench',
827
+ reader_cfg=dict(
828
+ input_columns=[
829
+ 'context',
830
+ 'input',
831
+ ],
832
+ output_column='answers',
833
+ test_range='[95:114]',
834
+ test_split='test',
835
+ train_split='test'),
836
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
837
+ dict(
838
+ abbr='LongBench_multifieldqa_zh_5',
839
+ eval_cfg=dict(
840
+ evaluator=dict(
841
+ language='zh',
842
+ type='opencompass.datasets.LongBenchF1Evaluator'),
843
+ pred_role='BOT'),
844
+ infer_cfg=dict(
845
+ inferencer=dict(
846
+ max_out_len=64,
847
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
848
+ prompt_template=dict(
849
+ template=dict(round=[
850
+ dict(
851
+ prompt=
852
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
853
+ role='HUMAN'),
854
+ ]),
855
+ type=
856
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
857
+ retriever=dict(
858
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
859
+ name='multifieldqa_zh',
860
+ path='opencompass/Longbench',
861
+ reader_cfg=dict(
862
+ input_columns=[
863
+ 'context',
864
+ 'input',
865
+ ],
866
+ output_column='answers',
867
+ test_range='[125:150]',
868
+ test_split='test',
869
+ train_split='test'),
870
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
871
+ dict(
872
+ abbr='LongBench_narrativeqa_5',
873
+ eval_cfg=dict(
874
+ evaluator=dict(
875
+ type='opencompass.datasets.LongBenchF1Evaluator'),
876
+ pred_role='BOT'),
877
+ infer_cfg=dict(
878
+ inferencer=dict(
879
+ max_out_len=128,
880
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
881
+ prompt_template=dict(
882
+ template=dict(round=[
883
+ dict(
884
+ prompt=
885
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
886
+ role='HUMAN'),
887
+ ]),
888
+ type=
889
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
890
+ retriever=dict(
891
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
892
+ name='narrativeqa',
893
+ path='opencompass/Longbench',
894
+ reader_cfg=dict(
895
+ input_columns=[
896
+ 'context',
897
+ 'input',
898
+ ],
899
+ output_column='answers',
900
+ test_range='[125:150]',
901
+ test_split='test',
902
+ train_split='test'),
903
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
904
+ dict(
905
+ abbr='LongBench_qasper_5',
906
+ eval_cfg=dict(
907
+ evaluator=dict(
908
+ type='opencompass.datasets.LongBenchF1Evaluator'),
909
+ pred_role='BOT'),
910
+ infer_cfg=dict(
911
+ inferencer=dict(
912
+ max_out_len=32,
913
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
914
+ prompt_template=dict(
915
+ template=dict(round=[
916
+ dict(
917
+ prompt=
918
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
919
+ role='HUMAN'),
920
+ ]),
921
+ type=
922
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
923
+ retriever=dict(
924
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
925
+ name='qasper',
926
+ path='opencompass/Longbench',
927
+ reader_cfg=dict(
928
+ input_columns=[
929
+ 'context',
930
+ 'input',
931
+ ],
932
+ output_column='answers',
933
+ test_range='[125:150]',
934
+ test_split='test',
935
+ train_split='test'),
936
+ type='opencompass.datasets.LongBenchqasperDataset'),
937
+ dict(
938
+ abbr='LongBench_triviaqa_5',
939
+ eval_cfg=dict(
940
+ evaluator=dict(
941
+ type='opencompass.datasets.LongBenchF1Evaluator'),
942
+ pred_postprocessor=dict(
943
+ type='opencompass.datasets.triviaqa_postprocess'),
944
+ pred_role='BOT'),
945
+ infer_cfg=dict(
946
+ inferencer=dict(
947
+ max_out_len=32,
948
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
949
+ prompt_template=dict(
950
+ template=dict(round=[
951
+ dict(
952
+ prompt=
953
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
954
+ role='HUMAN'),
955
+ ]),
956
+ type=
957
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
958
+ retriever=dict(
959
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
960
+ name='triviaqa',
961
+ path='opencompass/Longbench',
962
+ reader_cfg=dict(
963
+ input_columns=[
964
+ 'context',
965
+ 'input',
966
+ ],
967
+ output_column='answers',
968
+ test_range='[125:150]',
969
+ test_split='test',
970
+ train_split='test'),
971
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
972
+ dict(
973
+ abbr='LongBench_gov_report_5',
974
+ eval_cfg=dict(
975
+ evaluator=dict(
976
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
977
+ pred_role='BOT'),
978
+ infer_cfg=dict(
979
+ inferencer=dict(
980
+ max_out_len=512,
981
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
982
+ prompt_template=dict(
983
+ template=dict(round=[
984
+ dict(
985
+ prompt=
986
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
987
+ role='HUMAN'),
988
+ ]),
989
+ type=
990
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
991
+ retriever=dict(
992
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
993
+ name='gov_report',
994
+ path='opencompass/Longbench',
995
+ reader_cfg=dict(
996
+ input_columns=[
997
+ 'context',
998
+ ],
999
+ output_column='answers',
1000
+ test_range='[125:150]',
1001
+ test_split='test',
1002
+ train_split='test'),
1003
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
1004
+ dict(
1005
+ abbr='LongBench_qmsum_5',
1006
+ eval_cfg=dict(
1007
+ evaluator=dict(
1008
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1009
+ pred_role='BOT'),
1010
+ infer_cfg=dict(
1011
+ inferencer=dict(
1012
+ max_out_len=512,
1013
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1014
+ prompt_template=dict(
1015
+ template=dict(round=[
1016
+ dict(
1017
+ prompt=
1018
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
1019
+ role='HUMAN'),
1020
+ ]),
1021
+ type=
1022
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1023
+ retriever=dict(
1024
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1025
+ name='qmsum',
1026
+ path='opencompass/Longbench',
1027
+ reader_cfg=dict(
1028
+ input_columns=[
1029
+ 'context',
1030
+ 'input',
1031
+ ],
1032
+ output_column='answers',
1033
+ test_range='[125:150]',
1034
+ test_split='test',
1035
+ train_split='test'),
1036
+ type='opencompass.datasets.LongBenchqmsumDataset'),
1037
+ dict(
1038
+ abbr='LongBench_vcsum_5',
1039
+ eval_cfg=dict(
1040
+ evaluator=dict(
1041
+ language='zh',
1042
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1043
+ pred_role='BOT'),
1044
+ infer_cfg=dict(
1045
+ inferencer=dict(
1046
+ max_out_len=512,
1047
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1048
+ prompt_template=dict(
1049
+ template=dict(round=[
1050
+ dict(
1051
+ prompt=
1052
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
1053
+ role='HUMAN'),
1054
+ ]),
1055
+ type=
1056
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1057
+ retriever=dict(
1058
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1059
+ name='vcsum',
1060
+ path='opencompass/Longbench',
1061
+ reader_cfg=dict(
1062
+ input_columns=[
1063
+ 'context',
1064
+ ],
1065
+ output_column='answers',
1066
+ test_range='[125:150]',
1067
+ test_split='test',
1068
+ train_split='test'),
1069
+ type='opencompass.datasets.LongBenchvcsumDataset'),
1070
+ dict(
1071
+ abbr='LongBench_dureader_5',
1072
+ eval_cfg=dict(
1073
+ evaluator=dict(
1074
+ language='zh',
1075
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1076
+ pred_role='BOT'),
1077
+ infer_cfg=dict(
1078
+ inferencer=dict(
1079
+ max_out_len=128,
1080
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1081
+ prompt_template=dict(
1082
+ template=dict(round=[
1083
+ dict(
1084
+ prompt=
1085
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
1086
+ role='HUMAN'),
1087
+ ]),
1088
+ type=
1089
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1090
+ retriever=dict(
1091
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1092
+ name='dureader',
1093
+ path='opencompass/Longbench',
1094
+ reader_cfg=dict(
1095
+ input_columns=[
1096
+ 'context',
1097
+ 'input',
1098
+ ],
1099
+ output_column='answers',
1100
+ test_range='[125:150]',
1101
+ test_split='test',
1102
+ train_split='test'),
1103
+ type='opencompass.datasets.LongBenchdureaderDataset'),
1104
+ dict(
1105
+ abbr='LongBench_lcc_5',
1106
+ eval_cfg=dict(
1107
+ evaluator=dict(
1108
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1109
+ pred_role='BOT'),
1110
+ infer_cfg=dict(
1111
+ inferencer=dict(
1112
+ max_out_len=64,
1113
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1114
+ prompt_template=dict(
1115
+ template=dict(round=[
1116
+ dict(
1117
+ prompt=
1118
+ 'Please complete the code given below. \n{context}Next line of code:\n',
1119
+ role='HUMAN'),
1120
+ ]),
1121
+ type=
1122
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1123
+ retriever=dict(
1124
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1125
+ name='lcc',
1126
+ path='opencompass/Longbench',
1127
+ reader_cfg=dict(
1128
+ input_columns=[
1129
+ 'context',
1130
+ ],
1131
+ output_column='answers',
1132
+ test_range='[315:378]',
1133
+ test_split='test',
1134
+ train_split='test'),
1135
+ type='opencompass.datasets.LongBenchlccDataset'),
1136
+ dict(
1137
+ abbr='LongBench_repobench-p_5',
1138
+ eval_cfg=dict(
1139
+ evaluator=dict(
1140
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1141
+ pred_role='BOT'),
1142
+ infer_cfg=dict(
1143
+ inferencer=dict(
1144
+ max_out_len=64,
1145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1146
+ prompt_template=dict(
1147
+ template=dict(round=[
1148
+ dict(
1149
+ prompt=
1150
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
1151
+ role='HUMAN'),
1152
+ ]),
1153
+ type=
1154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1155
+ retriever=dict(
1156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1157
+ name='repobench-p',
1158
+ path='opencompass/Longbench',
1159
+ reader_cfg=dict(
1160
+ input_columns=[
1161
+ 'context',
1162
+ 'input',
1163
+ ],
1164
+ output_column='answers',
1165
+ test_range='[315:378]',
1166
+ test_split='test',
1167
+ train_split='test'),
1168
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
1169
+ dict(
1170
+ abbr='LongBench_passage_retrieval_en_5',
1171
+ eval_cfg=dict(
1172
+ evaluator=dict(
1173
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1174
+ pred_role='BOT'),
1175
+ infer_cfg=dict(
1176
+ inferencer=dict(
1177
+ max_out_len=32,
1178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1179
+ prompt_template=dict(
1180
+ template=dict(round=[
1181
+ dict(
1182
+ prompt=
1183
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
1184
+ role='HUMAN'),
1185
+ ]),
1186
+ type=
1187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1188
+ retriever=dict(
1189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1190
+ name='passage_retrieval_en',
1191
+ path='opencompass/Longbench',
1192
+ reader_cfg=dict(
1193
+ input_columns=[
1194
+ 'context',
1195
+ 'input',
1196
+ ],
1197
+ output_column='answers',
1198
+ test_range='[125:150]',
1199
+ test_split='test',
1200
+ train_split='test'),
1201
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
1202
+ dict(
1203
+ abbr='LongBench_passage_retrieval_zh_5',
1204
+ eval_cfg=dict(
1205
+ evaluator=dict(
1206
+ language='zh',
1207
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1208
+ pred_role='BOT'),
1209
+ infer_cfg=dict(
1210
+ inferencer=dict(
1211
+ max_out_len=32,
1212
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1213
+ prompt_template=dict(
1214
+ template=dict(round=[
1215
+ dict(
1216
+ prompt=
1217
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
1218
+ role='HUMAN'),
1219
+ ]),
1220
+ type=
1221
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1222
+ retriever=dict(
1223
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1224
+ name='passage_retrieval_zh',
1225
+ path='opencompass/Longbench',
1226
+ reader_cfg=dict(
1227
+ input_columns=[
1228
+ 'context',
1229
+ 'input',
1230
+ ],
1231
+ output_column='answers',
1232
+ test_range='[125:150]',
1233
+ test_split='test',
1234
+ train_split='test'),
1235
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
1236
+ dict(
1237
+ abbr='LongBench_passage_count_5',
1238
+ eval_cfg=dict(
1239
+ evaluator=dict(
1240
+ type='opencompass.datasets.LongBenchCountEvaluator'),
1241
+ pred_role='BOT'),
1242
+ infer_cfg=dict(
1243
+ inferencer=dict(
1244
+ max_out_len=32,
1245
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1246
+ prompt_template=dict(
1247
+ template=dict(round=[
1248
+ dict(
1249
+ prompt=
1250
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
1251
+ role='HUMAN'),
1252
+ ]),
1253
+ type=
1254
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1255
+ retriever=dict(
1256
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1257
+ name='passage_count',
1258
+ path='opencompass/Longbench',
1259
+ reader_cfg=dict(
1260
+ input_columns=[
1261
+ 'context',
1262
+ 'input',
1263
+ ],
1264
+ output_column='answers',
1265
+ test_range='[125:150]',
1266
+ test_split='test',
1267
+ train_split='test'),
1268
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
1269
+ dict(
1270
+ abbr='LongBench_trec_5',
1271
+ eval_cfg=dict(
1272
+ evaluator=dict(
1273
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1274
+ ),
1275
+ pred_postprocessor=dict(
1276
+ type='opencompass.datasets.trec_postprocess'),
1277
+ pred_role='BOT'),
1278
+ infer_cfg=dict(
1279
+ inferencer=dict(
1280
+ max_out_len=64,
1281
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1282
+ prompt_template=dict(
1283
+ template=dict(round=[
1284
+ dict(
1285
+ prompt=
1286
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
1287
+ role='HUMAN'),
1288
+ ]),
1289
+ type=
1290
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1291
+ retriever=dict(
1292
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1293
+ name='trec',
1294
+ path='opencompass/Longbench',
1295
+ reader_cfg=dict(
1296
+ input_columns=[
1297
+ 'context',
1298
+ 'input',
1299
+ ],
1300
+ output_column='all_labels',
1301
+ test_range='[125:150]',
1302
+ test_split='test',
1303
+ train_split='test'),
1304
+ type='opencompass.datasets.LongBenchtrecDataset'),
1305
+ dict(
1306
+ abbr='LongBench_lsht_5',
1307
+ eval_cfg=dict(
1308
+ evaluator=dict(
1309
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1310
+ ),
1311
+ pred_postprocessor=dict(
1312
+ type='opencompass.datasets.lsht_postprocess'),
1313
+ pred_role='BOT'),
1314
+ infer_cfg=dict(
1315
+ inferencer=dict(
1316
+ max_out_len=64,
1317
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1318
+ prompt_template=dict(
1319
+ template=dict(round=[
1320
+ dict(
1321
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
1322
+ role='HUMAN'),
1323
+ ]),
1324
+ type=
1325
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1326
+ retriever=dict(
1327
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1328
+ name='lsht',
1329
+ path='opencompass/Longbench',
1330
+ reader_cfg=dict(
1331
+ input_columns=[
1332
+ 'context',
1333
+ 'input',
1334
+ ],
1335
+ output_column='all_labels',
1336
+ test_range='[125:150]',
1337
+ test_split='test',
1338
+ train_split='test'),
1339
+ type='opencompass.datasets.LongBenchlshtDataset'),
1340
+ dict(
1341
+ abbr='LongBench_multi_news_5',
1342
+ eval_cfg=dict(
1343
+ evaluator=dict(
1344
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1345
+ pred_role='BOT'),
1346
+ infer_cfg=dict(
1347
+ inferencer=dict(
1348
+ max_out_len=512,
1349
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1350
+ prompt_template=dict(
1351
+ template=dict(round=[
1352
+ dict(
1353
+ prompt=
1354
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
1355
+ role='HUMAN'),
1356
+ ]),
1357
+ type=
1358
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1359
+ retriever=dict(
1360
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1361
+ name='multi_news',
1362
+ path='opencompass/Longbench',
1363
+ reader_cfg=dict(
1364
+ input_columns=[
1365
+ 'context',
1366
+ ],
1367
+ output_column='answers',
1368
+ test_range='[125:150]',
1369
+ test_split='test',
1370
+ train_split='test'),
1371
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
1372
+ dict(
1373
+ abbr='LongBench_samsum_5',
1374
+ eval_cfg=dict(
1375
+ evaluator=dict(
1376
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1377
+ pred_postprocessor=dict(
1378
+ type='opencompass.datasets.samsum_postprocess'),
1379
+ pred_role='BOT'),
1380
+ infer_cfg=dict(
1381
+ inferencer=dict(
1382
+ max_out_len=128,
1383
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1384
+ prompt_template=dict(
1385
+ template=dict(round=[
1386
+ dict(
1387
+ prompt=
1388
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
1389
+ role='HUMAN'),
1390
+ ]),
1391
+ type=
1392
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1393
+ retriever=dict(
1394
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1395
+ name='samsum',
1396
+ path='opencompass/Longbench',
1397
+ reader_cfg=dict(
1398
+ input_columns=[
1399
+ 'context',
1400
+ 'input',
1401
+ ],
1402
+ output_column='answers',
1403
+ test_range='[125:150]',
1404
+ test_split='test',
1405
+ train_split='test'),
1406
+ type='opencompass.datasets.LongBenchsamsumDataset'),
1407
+ ],
1408
+ ]
1409
+ models = [
1410
+ dict(
1411
+ abbr='delta_net',
1412
+ batch_size=128,
1413
+ max_seq_len=2048,
1414
+ model_kwargs=dict(
1415
+ device_map='auto',
1416
+ torch_dtype='torch.bfloat16',
1417
+ trust_remote_code=True),
1418
+ path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1419
+ run_cfg=dict(num_gpus=1),
1420
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
1421
+ tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1422
+ type='opencompass.models.HuggingFaceBaseModel'),
1423
+ ]
1424
+ work_dir = 'outputs/default/20251127_202918'
tmp/0954e290-fcd0-400c-8c58-f14a577dc5e4_params.py ADDED
@@ -0,0 +1,1424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_2wikimqa_0',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchF1Evaluator'),
8
+ pred_role='BOT'),
9
+ infer_cfg=dict(
10
+ inferencer=dict(
11
+ max_out_len=32,
12
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
13
+ prompt_template=dict(
14
+ template=dict(round=[
15
+ dict(
16
+ prompt=
17
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
18
+ role='HUMAN'),
19
+ ]),
20
+ type=
21
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
22
+ retriever=dict(
23
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
24
+ name='2wikimqa',
25
+ path='opencompass/Longbench',
26
+ reader_cfg=dict(
27
+ input_columns=[
28
+ 'context',
29
+ 'input',
30
+ ],
31
+ output_column='answers',
32
+ test_range='[0:25]',
33
+ test_split='test',
34
+ train_split='test'),
35
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
36
+ dict(
37
+ abbr='LongBench_hotpotqa_0',
38
+ eval_cfg=dict(
39
+ evaluator=dict(
40
+ type='opencompass.datasets.LongBenchF1Evaluator'),
41
+ pred_role='BOT'),
42
+ infer_cfg=dict(
43
+ inferencer=dict(
44
+ max_out_len=32,
45
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
46
+ prompt_template=dict(
47
+ template=dict(round=[
48
+ dict(
49
+ prompt=
50
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
51
+ role='HUMAN'),
52
+ ]),
53
+ type=
54
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
55
+ retriever=dict(
56
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
57
+ name='hotpotqa',
58
+ path='opencompass/Longbench',
59
+ reader_cfg=dict(
60
+ input_columns=[
61
+ 'context',
62
+ 'input',
63
+ ],
64
+ output_column='answers',
65
+ test_range='[0:25]',
66
+ test_split='test',
67
+ train_split='test'),
68
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
69
+ dict(
70
+ abbr='LongBench_musique_0',
71
+ eval_cfg=dict(
72
+ evaluator=dict(
73
+ type='opencompass.datasets.LongBenchF1Evaluator'),
74
+ pred_role='BOT'),
75
+ infer_cfg=dict(
76
+ inferencer=dict(
77
+ max_out_len=32,
78
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
79
+ prompt_template=dict(
80
+ template=dict(round=[
81
+ dict(
82
+ prompt=
83
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
84
+ role='HUMAN'),
85
+ ]),
86
+ type=
87
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
88
+ retriever=dict(
89
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
90
+ name='musique',
91
+ path='opencompass/Longbench',
92
+ reader_cfg=dict(
93
+ input_columns=[
94
+ 'context',
95
+ 'input',
96
+ ],
97
+ output_column='answers',
98
+ test_range='[0:25]',
99
+ test_split='test',
100
+ train_split='test'),
101
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
102
+ dict(
103
+ abbr='LongBench_multifieldqa_en_0',
104
+ eval_cfg=dict(
105
+ evaluator=dict(
106
+ type='opencompass.datasets.LongBenchF1Evaluator'),
107
+ pred_role='BOT'),
108
+ infer_cfg=dict(
109
+ inferencer=dict(
110
+ max_out_len=64,
111
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
112
+ prompt_template=dict(
113
+ template=dict(round=[
114
+ dict(
115
+ prompt=
116
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
117
+ role='HUMAN'),
118
+ ]),
119
+ type=
120
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
121
+ retriever=dict(
122
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
123
+ name='multifieldqa_en',
124
+ path='opencompass/Longbench',
125
+ reader_cfg=dict(
126
+ input_columns=[
127
+ 'context',
128
+ 'input',
129
+ ],
130
+ output_column='answers',
131
+ test_range='[0:19]',
132
+ test_split='test',
133
+ train_split='test'),
134
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
135
+ dict(
136
+ abbr='LongBench_multifieldqa_zh_0',
137
+ eval_cfg=dict(
138
+ evaluator=dict(
139
+ language='zh',
140
+ type='opencompass.datasets.LongBenchF1Evaluator'),
141
+ pred_role='BOT'),
142
+ infer_cfg=dict(
143
+ inferencer=dict(
144
+ max_out_len=64,
145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
146
+ prompt_template=dict(
147
+ template=dict(round=[
148
+ dict(
149
+ prompt=
150
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
151
+ role='HUMAN'),
152
+ ]),
153
+ type=
154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
155
+ retriever=dict(
156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
157
+ name='multifieldqa_zh',
158
+ path='opencompass/Longbench',
159
+ reader_cfg=dict(
160
+ input_columns=[
161
+ 'context',
162
+ 'input',
163
+ ],
164
+ output_column='answers',
165
+ test_range='[0:25]',
166
+ test_split='test',
167
+ train_split='test'),
168
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
169
+ dict(
170
+ abbr='LongBench_narrativeqa_0',
171
+ eval_cfg=dict(
172
+ evaluator=dict(
173
+ type='opencompass.datasets.LongBenchF1Evaluator'),
174
+ pred_role='BOT'),
175
+ infer_cfg=dict(
176
+ inferencer=dict(
177
+ max_out_len=128,
178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
179
+ prompt_template=dict(
180
+ template=dict(round=[
181
+ dict(
182
+ prompt=
183
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
184
+ role='HUMAN'),
185
+ ]),
186
+ type=
187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
188
+ retriever=dict(
189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
190
+ name='narrativeqa',
191
+ path='opencompass/Longbench',
192
+ reader_cfg=dict(
193
+ input_columns=[
194
+ 'context',
195
+ 'input',
196
+ ],
197
+ output_column='answers',
198
+ test_range='[0:25]',
199
+ test_split='test',
200
+ train_split='test'),
201
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
202
+ dict(
203
+ abbr='LongBench_qasper_0',
204
+ eval_cfg=dict(
205
+ evaluator=dict(
206
+ type='opencompass.datasets.LongBenchF1Evaluator'),
207
+ pred_role='BOT'),
208
+ infer_cfg=dict(
209
+ inferencer=dict(
210
+ max_out_len=32,
211
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
212
+ prompt_template=dict(
213
+ template=dict(round=[
214
+ dict(
215
+ prompt=
216
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
217
+ role='HUMAN'),
218
+ ]),
219
+ type=
220
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
221
+ retriever=dict(
222
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
223
+ name='qasper',
224
+ path='opencompass/Longbench',
225
+ reader_cfg=dict(
226
+ input_columns=[
227
+ 'context',
228
+ 'input',
229
+ ],
230
+ output_column='answers',
231
+ test_range='[0:25]',
232
+ test_split='test',
233
+ train_split='test'),
234
+ type='opencompass.datasets.LongBenchqasperDataset'),
235
+ dict(
236
+ abbr='LongBench_triviaqa_0',
237
+ eval_cfg=dict(
238
+ evaluator=dict(
239
+ type='opencompass.datasets.LongBenchF1Evaluator'),
240
+ pred_postprocessor=dict(
241
+ type='opencompass.datasets.triviaqa_postprocess'),
242
+ pred_role='BOT'),
243
+ infer_cfg=dict(
244
+ inferencer=dict(
245
+ max_out_len=32,
246
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
247
+ prompt_template=dict(
248
+ template=dict(round=[
249
+ dict(
250
+ prompt=
251
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
252
+ role='HUMAN'),
253
+ ]),
254
+ type=
255
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
256
+ retriever=dict(
257
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
258
+ name='triviaqa',
259
+ path='opencompass/Longbench',
260
+ reader_cfg=dict(
261
+ input_columns=[
262
+ 'context',
263
+ 'input',
264
+ ],
265
+ output_column='answers',
266
+ test_range='[0:25]',
267
+ test_split='test',
268
+ train_split='test'),
269
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
270
+ dict(
271
+ abbr='LongBench_gov_report_0',
272
+ eval_cfg=dict(
273
+ evaluator=dict(
274
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
275
+ pred_role='BOT'),
276
+ infer_cfg=dict(
277
+ inferencer=dict(
278
+ max_out_len=512,
279
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
280
+ prompt_template=dict(
281
+ template=dict(round=[
282
+ dict(
283
+ prompt=
284
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
285
+ role='HUMAN'),
286
+ ]),
287
+ type=
288
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
289
+ retriever=dict(
290
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
291
+ name='gov_report',
292
+ path='opencompass/Longbench',
293
+ reader_cfg=dict(
294
+ input_columns=[
295
+ 'context',
296
+ ],
297
+ output_column='answers',
298
+ test_range='[0:25]',
299
+ test_split='test',
300
+ train_split='test'),
301
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
302
+ dict(
303
+ abbr='LongBench_qmsum_0',
304
+ eval_cfg=dict(
305
+ evaluator=dict(
306
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
307
+ pred_role='BOT'),
308
+ infer_cfg=dict(
309
+ inferencer=dict(
310
+ max_out_len=512,
311
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
312
+ prompt_template=dict(
313
+ template=dict(round=[
314
+ dict(
315
+ prompt=
316
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
317
+ role='HUMAN'),
318
+ ]),
319
+ type=
320
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
321
+ retriever=dict(
322
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
323
+ name='qmsum',
324
+ path='opencompass/Longbench',
325
+ reader_cfg=dict(
326
+ input_columns=[
327
+ 'context',
328
+ 'input',
329
+ ],
330
+ output_column='answers',
331
+ test_range='[0:25]',
332
+ test_split='test',
333
+ train_split='test'),
334
+ type='opencompass.datasets.LongBenchqmsumDataset'),
335
+ dict(
336
+ abbr='LongBench_vcsum_0',
337
+ eval_cfg=dict(
338
+ evaluator=dict(
339
+ language='zh',
340
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
341
+ pred_role='BOT'),
342
+ infer_cfg=dict(
343
+ inferencer=dict(
344
+ max_out_len=512,
345
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
346
+ prompt_template=dict(
347
+ template=dict(round=[
348
+ dict(
349
+ prompt=
350
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
351
+ role='HUMAN'),
352
+ ]),
353
+ type=
354
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
355
+ retriever=dict(
356
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
357
+ name='vcsum',
358
+ path='opencompass/Longbench',
359
+ reader_cfg=dict(
360
+ input_columns=[
361
+ 'context',
362
+ ],
363
+ output_column='answers',
364
+ test_range='[0:25]',
365
+ test_split='test',
366
+ train_split='test'),
367
+ type='opencompass.datasets.LongBenchvcsumDataset'),
368
+ dict(
369
+ abbr='LongBench_dureader_0',
370
+ eval_cfg=dict(
371
+ evaluator=dict(
372
+ language='zh',
373
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
374
+ pred_role='BOT'),
375
+ infer_cfg=dict(
376
+ inferencer=dict(
377
+ max_out_len=128,
378
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
379
+ prompt_template=dict(
380
+ template=dict(round=[
381
+ dict(
382
+ prompt=
383
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
384
+ role='HUMAN'),
385
+ ]),
386
+ type=
387
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
388
+ retriever=dict(
389
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
390
+ name='dureader',
391
+ path='opencompass/Longbench',
392
+ reader_cfg=dict(
393
+ input_columns=[
394
+ 'context',
395
+ 'input',
396
+ ],
397
+ output_column='answers',
398
+ test_range='[0:25]',
399
+ test_split='test',
400
+ train_split='test'),
401
+ type='opencompass.datasets.LongBenchdureaderDataset'),
402
+ dict(
403
+ abbr='LongBench_lcc_0',
404
+ eval_cfg=dict(
405
+ evaluator=dict(
406
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
407
+ pred_role='BOT'),
408
+ infer_cfg=dict(
409
+ inferencer=dict(
410
+ max_out_len=64,
411
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
412
+ prompt_template=dict(
413
+ template=dict(round=[
414
+ dict(
415
+ prompt=
416
+ 'Please complete the code given below. \n{context}Next line of code:\n',
417
+ role='HUMAN'),
418
+ ]),
419
+ type=
420
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
421
+ retriever=dict(
422
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
423
+ name='lcc',
424
+ path='opencompass/Longbench',
425
+ reader_cfg=dict(
426
+ input_columns=[
427
+ 'context',
428
+ ],
429
+ output_column='answers',
430
+ test_range='[0:63]',
431
+ test_split='test',
432
+ train_split='test'),
433
+ type='opencompass.datasets.LongBenchlccDataset'),
434
+ dict(
435
+ abbr='LongBench_repobench-p_0',
436
+ eval_cfg=dict(
437
+ evaluator=dict(
438
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
439
+ pred_role='BOT'),
440
+ infer_cfg=dict(
441
+ inferencer=dict(
442
+ max_out_len=64,
443
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
444
+ prompt_template=dict(
445
+ template=dict(round=[
446
+ dict(
447
+ prompt=
448
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
449
+ role='HUMAN'),
450
+ ]),
451
+ type=
452
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
453
+ retriever=dict(
454
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
455
+ name='repobench-p',
456
+ path='opencompass/Longbench',
457
+ reader_cfg=dict(
458
+ input_columns=[
459
+ 'context',
460
+ 'input',
461
+ ],
462
+ output_column='answers',
463
+ test_range='[0:63]',
464
+ test_split='test',
465
+ train_split='test'),
466
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
467
+ dict(
468
+ abbr='LongBench_passage_retrieval_en_0',
469
+ eval_cfg=dict(
470
+ evaluator=dict(
471
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
472
+ pred_role='BOT'),
473
+ infer_cfg=dict(
474
+ inferencer=dict(
475
+ max_out_len=32,
476
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
477
+ prompt_template=dict(
478
+ template=dict(round=[
479
+ dict(
480
+ prompt=
481
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
482
+ role='HUMAN'),
483
+ ]),
484
+ type=
485
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
486
+ retriever=dict(
487
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
488
+ name='passage_retrieval_en',
489
+ path='opencompass/Longbench',
490
+ reader_cfg=dict(
491
+ input_columns=[
492
+ 'context',
493
+ 'input',
494
+ ],
495
+ output_column='answers',
496
+ test_range='[0:25]',
497
+ test_split='test',
498
+ train_split='test'),
499
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
500
+ dict(
501
+ abbr='LongBench_passage_retrieval_zh_0',
502
+ eval_cfg=dict(
503
+ evaluator=dict(
504
+ language='zh',
505
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
506
+ pred_role='BOT'),
507
+ infer_cfg=dict(
508
+ inferencer=dict(
509
+ max_out_len=32,
510
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
511
+ prompt_template=dict(
512
+ template=dict(round=[
513
+ dict(
514
+ prompt=
515
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
516
+ role='HUMAN'),
517
+ ]),
518
+ type=
519
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
520
+ retriever=dict(
521
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
522
+ name='passage_retrieval_zh',
523
+ path='opencompass/Longbench',
524
+ reader_cfg=dict(
525
+ input_columns=[
526
+ 'context',
527
+ 'input',
528
+ ],
529
+ output_column='answers',
530
+ test_range='[0:25]',
531
+ test_split='test',
532
+ train_split='test'),
533
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
534
+ dict(
535
+ abbr='LongBench_passage_count_0',
536
+ eval_cfg=dict(
537
+ evaluator=dict(
538
+ type='opencompass.datasets.LongBenchCountEvaluator'),
539
+ pred_role='BOT'),
540
+ infer_cfg=dict(
541
+ inferencer=dict(
542
+ max_out_len=32,
543
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
544
+ prompt_template=dict(
545
+ template=dict(round=[
546
+ dict(
547
+ prompt=
548
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
549
+ role='HUMAN'),
550
+ ]),
551
+ type=
552
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
553
+ retriever=dict(
554
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
555
+ name='passage_count',
556
+ path='opencompass/Longbench',
557
+ reader_cfg=dict(
558
+ input_columns=[
559
+ 'context',
560
+ 'input',
561
+ ],
562
+ output_column='answers',
563
+ test_range='[0:25]',
564
+ test_split='test',
565
+ train_split='test'),
566
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
567
+ dict(
568
+ abbr='LongBench_trec_0',
569
+ eval_cfg=dict(
570
+ evaluator=dict(
571
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
572
+ ),
573
+ pred_postprocessor=dict(
574
+ type='opencompass.datasets.trec_postprocess'),
575
+ pred_role='BOT'),
576
+ infer_cfg=dict(
577
+ inferencer=dict(
578
+ max_out_len=64,
579
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
580
+ prompt_template=dict(
581
+ template=dict(round=[
582
+ dict(
583
+ prompt=
584
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
585
+ role='HUMAN'),
586
+ ]),
587
+ type=
588
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
589
+ retriever=dict(
590
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
591
+ name='trec',
592
+ path='opencompass/Longbench',
593
+ reader_cfg=dict(
594
+ input_columns=[
595
+ 'context',
596
+ 'input',
597
+ ],
598
+ output_column='all_labels',
599
+ test_range='[0:25]',
600
+ test_split='test',
601
+ train_split='test'),
602
+ type='opencompass.datasets.LongBenchtrecDataset'),
603
+ dict(
604
+ abbr='LongBench_lsht_0',
605
+ eval_cfg=dict(
606
+ evaluator=dict(
607
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
608
+ ),
609
+ pred_postprocessor=dict(
610
+ type='opencompass.datasets.lsht_postprocess'),
611
+ pred_role='BOT'),
612
+ infer_cfg=dict(
613
+ inferencer=dict(
614
+ max_out_len=64,
615
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
616
+ prompt_template=dict(
617
+ template=dict(round=[
618
+ dict(
619
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
620
+ role='HUMAN'),
621
+ ]),
622
+ type=
623
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
624
+ retriever=dict(
625
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
626
+ name='lsht',
627
+ path='opencompass/Longbench',
628
+ reader_cfg=dict(
629
+ input_columns=[
630
+ 'context',
631
+ 'input',
632
+ ],
633
+ output_column='all_labels',
634
+ test_range='[0:25]',
635
+ test_split='test',
636
+ train_split='test'),
637
+ type='opencompass.datasets.LongBenchlshtDataset'),
638
+ dict(
639
+ abbr='LongBench_multi_news_0',
640
+ eval_cfg=dict(
641
+ evaluator=dict(
642
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
643
+ pred_role='BOT'),
644
+ infer_cfg=dict(
645
+ inferencer=dict(
646
+ max_out_len=512,
647
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
648
+ prompt_template=dict(
649
+ template=dict(round=[
650
+ dict(
651
+ prompt=
652
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
653
+ role='HUMAN'),
654
+ ]),
655
+ type=
656
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
657
+ retriever=dict(
658
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
659
+ name='multi_news',
660
+ path='opencompass/Longbench',
661
+ reader_cfg=dict(
662
+ input_columns=[
663
+ 'context',
664
+ ],
665
+ output_column='answers',
666
+ test_range='[0:25]',
667
+ test_split='test',
668
+ train_split='test'),
669
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
670
+ dict(
671
+ abbr='LongBench_samsum_0',
672
+ eval_cfg=dict(
673
+ evaluator=dict(
674
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
675
+ pred_postprocessor=dict(
676
+ type='opencompass.datasets.samsum_postprocess'),
677
+ pred_role='BOT'),
678
+ infer_cfg=dict(
679
+ inferencer=dict(
680
+ max_out_len=128,
681
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
682
+ prompt_template=dict(
683
+ template=dict(round=[
684
+ dict(
685
+ prompt=
686
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
687
+ role='HUMAN'),
688
+ ]),
689
+ type=
690
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
691
+ retriever=dict(
692
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
693
+ name='samsum',
694
+ path='opencompass/Longbench',
695
+ reader_cfg=dict(
696
+ input_columns=[
697
+ 'context',
698
+ 'input',
699
+ ],
700
+ output_column='answers',
701
+ test_range='[0:25]',
702
+ test_split='test',
703
+ train_split='test'),
704
+ type='opencompass.datasets.LongBenchsamsumDataset'),
705
+ dict(
706
+ abbr='LongBench_2wikimqa_0',
707
+ eval_cfg=dict(
708
+ evaluator=dict(
709
+ type='opencompass.datasets.LongBenchF1Evaluator'),
710
+ pred_role='BOT'),
711
+ infer_cfg=dict(
712
+ inferencer=dict(
713
+ max_out_len=32,
714
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
715
+ prompt_template=dict(
716
+ template=dict(round=[
717
+ dict(
718
+ prompt=
719
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
720
+ role='HUMAN'),
721
+ ]),
722
+ type=
723
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
724
+ retriever=dict(
725
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
726
+ name='2wikimqa',
727
+ path='opencompass/Longbench',
728
+ reader_cfg=dict(
729
+ input_columns=[
730
+ 'context',
731
+ 'input',
732
+ ],
733
+ output_column='answers',
734
+ test_range='[0:25]',
735
+ test_split='test',
736
+ train_split='test'),
737
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
738
+ dict(
739
+ abbr='LongBench_hotpotqa_0',
740
+ eval_cfg=dict(
741
+ evaluator=dict(
742
+ type='opencompass.datasets.LongBenchF1Evaluator'),
743
+ pred_role='BOT'),
744
+ infer_cfg=dict(
745
+ inferencer=dict(
746
+ max_out_len=32,
747
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
748
+ prompt_template=dict(
749
+ template=dict(round=[
750
+ dict(
751
+ prompt=
752
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
753
+ role='HUMAN'),
754
+ ]),
755
+ type=
756
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
757
+ retriever=dict(
758
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
759
+ name='hotpotqa',
760
+ path='opencompass/Longbench',
761
+ reader_cfg=dict(
762
+ input_columns=[
763
+ 'context',
764
+ 'input',
765
+ ],
766
+ output_column='answers',
767
+ test_range='[0:25]',
768
+ test_split='test',
769
+ train_split='test'),
770
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
771
+ dict(
772
+ abbr='LongBench_musique_0',
773
+ eval_cfg=dict(
774
+ evaluator=dict(
775
+ type='opencompass.datasets.LongBenchF1Evaluator'),
776
+ pred_role='BOT'),
777
+ infer_cfg=dict(
778
+ inferencer=dict(
779
+ max_out_len=32,
780
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
781
+ prompt_template=dict(
782
+ template=dict(round=[
783
+ dict(
784
+ prompt=
785
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
786
+ role='HUMAN'),
787
+ ]),
788
+ type=
789
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
790
+ retriever=dict(
791
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
792
+ name='musique',
793
+ path='opencompass/Longbench',
794
+ reader_cfg=dict(
795
+ input_columns=[
796
+ 'context',
797
+ 'input',
798
+ ],
799
+ output_column='answers',
800
+ test_range='[0:25]',
801
+ test_split='test',
802
+ train_split='test'),
803
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
804
+ dict(
805
+ abbr='LongBench_multifieldqa_en_0',
806
+ eval_cfg=dict(
807
+ evaluator=dict(
808
+ type='opencompass.datasets.LongBenchF1Evaluator'),
809
+ pred_role='BOT'),
810
+ infer_cfg=dict(
811
+ inferencer=dict(
812
+ max_out_len=64,
813
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
814
+ prompt_template=dict(
815
+ template=dict(round=[
816
+ dict(
817
+ prompt=
818
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
819
+ role='HUMAN'),
820
+ ]),
821
+ type=
822
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
823
+ retriever=dict(
824
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
825
+ name='multifieldqa_en',
826
+ path='opencompass/Longbench',
827
+ reader_cfg=dict(
828
+ input_columns=[
829
+ 'context',
830
+ 'input',
831
+ ],
832
+ output_column='answers',
833
+ test_range='[0:19]',
834
+ test_split='test',
835
+ train_split='test'),
836
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
837
+ dict(
838
+ abbr='LongBench_multifieldqa_zh_0',
839
+ eval_cfg=dict(
840
+ evaluator=dict(
841
+ language='zh',
842
+ type='opencompass.datasets.LongBenchF1Evaluator'),
843
+ pred_role='BOT'),
844
+ infer_cfg=dict(
845
+ inferencer=dict(
846
+ max_out_len=64,
847
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
848
+ prompt_template=dict(
849
+ template=dict(round=[
850
+ dict(
851
+ prompt=
852
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
853
+ role='HUMAN'),
854
+ ]),
855
+ type=
856
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
857
+ retriever=dict(
858
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
859
+ name='multifieldqa_zh',
860
+ path='opencompass/Longbench',
861
+ reader_cfg=dict(
862
+ input_columns=[
863
+ 'context',
864
+ 'input',
865
+ ],
866
+ output_column='answers',
867
+ test_range='[0:25]',
868
+ test_split='test',
869
+ train_split='test'),
870
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
871
+ dict(
872
+ abbr='LongBench_narrativeqa_0',
873
+ eval_cfg=dict(
874
+ evaluator=dict(
875
+ type='opencompass.datasets.LongBenchF1Evaluator'),
876
+ pred_role='BOT'),
877
+ infer_cfg=dict(
878
+ inferencer=dict(
879
+ max_out_len=128,
880
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
881
+ prompt_template=dict(
882
+ template=dict(round=[
883
+ dict(
884
+ prompt=
885
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
886
+ role='HUMAN'),
887
+ ]),
888
+ type=
889
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
890
+ retriever=dict(
891
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
892
+ name='narrativeqa',
893
+ path='opencompass/Longbench',
894
+ reader_cfg=dict(
895
+ input_columns=[
896
+ 'context',
897
+ 'input',
898
+ ],
899
+ output_column='answers',
900
+ test_range='[0:25]',
901
+ test_split='test',
902
+ train_split='test'),
903
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
904
+ dict(
905
+ abbr='LongBench_qasper_0',
906
+ eval_cfg=dict(
907
+ evaluator=dict(
908
+ type='opencompass.datasets.LongBenchF1Evaluator'),
909
+ pred_role='BOT'),
910
+ infer_cfg=dict(
911
+ inferencer=dict(
912
+ max_out_len=32,
913
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
914
+ prompt_template=dict(
915
+ template=dict(round=[
916
+ dict(
917
+ prompt=
918
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
919
+ role='HUMAN'),
920
+ ]),
921
+ type=
922
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
923
+ retriever=dict(
924
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
925
+ name='qasper',
926
+ path='opencompass/Longbench',
927
+ reader_cfg=dict(
928
+ input_columns=[
929
+ 'context',
930
+ 'input',
931
+ ],
932
+ output_column='answers',
933
+ test_range='[0:25]',
934
+ test_split='test',
935
+ train_split='test'),
936
+ type='opencompass.datasets.LongBenchqasperDataset'),
937
+ dict(
938
+ abbr='LongBench_triviaqa_0',
939
+ eval_cfg=dict(
940
+ evaluator=dict(
941
+ type='opencompass.datasets.LongBenchF1Evaluator'),
942
+ pred_postprocessor=dict(
943
+ type='opencompass.datasets.triviaqa_postprocess'),
944
+ pred_role='BOT'),
945
+ infer_cfg=dict(
946
+ inferencer=dict(
947
+ max_out_len=32,
948
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
949
+ prompt_template=dict(
950
+ template=dict(round=[
951
+ dict(
952
+ prompt=
953
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
954
+ role='HUMAN'),
955
+ ]),
956
+ type=
957
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
958
+ retriever=dict(
959
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
960
+ name='triviaqa',
961
+ path='opencompass/Longbench',
962
+ reader_cfg=dict(
963
+ input_columns=[
964
+ 'context',
965
+ 'input',
966
+ ],
967
+ output_column='answers',
968
+ test_range='[0:25]',
969
+ test_split='test',
970
+ train_split='test'),
971
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
972
+ dict(
973
+ abbr='LongBench_gov_report_0',
974
+ eval_cfg=dict(
975
+ evaluator=dict(
976
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
977
+ pred_role='BOT'),
978
+ infer_cfg=dict(
979
+ inferencer=dict(
980
+ max_out_len=512,
981
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
982
+ prompt_template=dict(
983
+ template=dict(round=[
984
+ dict(
985
+ prompt=
986
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
987
+ role='HUMAN'),
988
+ ]),
989
+ type=
990
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
991
+ retriever=dict(
992
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
993
+ name='gov_report',
994
+ path='opencompass/Longbench',
995
+ reader_cfg=dict(
996
+ input_columns=[
997
+ 'context',
998
+ ],
999
+ output_column='answers',
1000
+ test_range='[0:25]',
1001
+ test_split='test',
1002
+ train_split='test'),
1003
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
1004
+ dict(
1005
+ abbr='LongBench_qmsum_0',
1006
+ eval_cfg=dict(
1007
+ evaluator=dict(
1008
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1009
+ pred_role='BOT'),
1010
+ infer_cfg=dict(
1011
+ inferencer=dict(
1012
+ max_out_len=512,
1013
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1014
+ prompt_template=dict(
1015
+ template=dict(round=[
1016
+ dict(
1017
+ prompt=
1018
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
1019
+ role='HUMAN'),
1020
+ ]),
1021
+ type=
1022
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1023
+ retriever=dict(
1024
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1025
+ name='qmsum',
1026
+ path='opencompass/Longbench',
1027
+ reader_cfg=dict(
1028
+ input_columns=[
1029
+ 'context',
1030
+ 'input',
1031
+ ],
1032
+ output_column='answers',
1033
+ test_range='[0:25]',
1034
+ test_split='test',
1035
+ train_split='test'),
1036
+ type='opencompass.datasets.LongBenchqmsumDataset'),
1037
+ dict(
1038
+ abbr='LongBench_vcsum_0',
1039
+ eval_cfg=dict(
1040
+ evaluator=dict(
1041
+ language='zh',
1042
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1043
+ pred_role='BOT'),
1044
+ infer_cfg=dict(
1045
+ inferencer=dict(
1046
+ max_out_len=512,
1047
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1048
+ prompt_template=dict(
1049
+ template=dict(round=[
1050
+ dict(
1051
+ prompt=
1052
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
1053
+ role='HUMAN'),
1054
+ ]),
1055
+ type=
1056
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1057
+ retriever=dict(
1058
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1059
+ name='vcsum',
1060
+ path='opencompass/Longbench',
1061
+ reader_cfg=dict(
1062
+ input_columns=[
1063
+ 'context',
1064
+ ],
1065
+ output_column='answers',
1066
+ test_range='[0:25]',
1067
+ test_split='test',
1068
+ train_split='test'),
1069
+ type='opencompass.datasets.LongBenchvcsumDataset'),
1070
+ dict(
1071
+ abbr='LongBench_dureader_0',
1072
+ eval_cfg=dict(
1073
+ evaluator=dict(
1074
+ language='zh',
1075
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1076
+ pred_role='BOT'),
1077
+ infer_cfg=dict(
1078
+ inferencer=dict(
1079
+ max_out_len=128,
1080
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1081
+ prompt_template=dict(
1082
+ template=dict(round=[
1083
+ dict(
1084
+ prompt=
1085
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
1086
+ role='HUMAN'),
1087
+ ]),
1088
+ type=
1089
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1090
+ retriever=dict(
1091
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1092
+ name='dureader',
1093
+ path='opencompass/Longbench',
1094
+ reader_cfg=dict(
1095
+ input_columns=[
1096
+ 'context',
1097
+ 'input',
1098
+ ],
1099
+ output_column='answers',
1100
+ test_range='[0:25]',
1101
+ test_split='test',
1102
+ train_split='test'),
1103
+ type='opencompass.datasets.LongBenchdureaderDataset'),
1104
+ dict(
1105
+ abbr='LongBench_lcc_0',
1106
+ eval_cfg=dict(
1107
+ evaluator=dict(
1108
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1109
+ pred_role='BOT'),
1110
+ infer_cfg=dict(
1111
+ inferencer=dict(
1112
+ max_out_len=64,
1113
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1114
+ prompt_template=dict(
1115
+ template=dict(round=[
1116
+ dict(
1117
+ prompt=
1118
+ 'Please complete the code given below. \n{context}Next line of code:\n',
1119
+ role='HUMAN'),
1120
+ ]),
1121
+ type=
1122
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1123
+ retriever=dict(
1124
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1125
+ name='lcc',
1126
+ path='opencompass/Longbench',
1127
+ reader_cfg=dict(
1128
+ input_columns=[
1129
+ 'context',
1130
+ ],
1131
+ output_column='answers',
1132
+ test_range='[0:63]',
1133
+ test_split='test',
1134
+ train_split='test'),
1135
+ type='opencompass.datasets.LongBenchlccDataset'),
1136
+ dict(
1137
+ abbr='LongBench_repobench-p_0',
1138
+ eval_cfg=dict(
1139
+ evaluator=dict(
1140
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1141
+ pred_role='BOT'),
1142
+ infer_cfg=dict(
1143
+ inferencer=dict(
1144
+ max_out_len=64,
1145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1146
+ prompt_template=dict(
1147
+ template=dict(round=[
1148
+ dict(
1149
+ prompt=
1150
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
1151
+ role='HUMAN'),
1152
+ ]),
1153
+ type=
1154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1155
+ retriever=dict(
1156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1157
+ name='repobench-p',
1158
+ path='opencompass/Longbench',
1159
+ reader_cfg=dict(
1160
+ input_columns=[
1161
+ 'context',
1162
+ 'input',
1163
+ ],
1164
+ output_column='answers',
1165
+ test_range='[0:63]',
1166
+ test_split='test',
1167
+ train_split='test'),
1168
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
1169
+ dict(
1170
+ abbr='LongBench_passage_retrieval_en_0',
1171
+ eval_cfg=dict(
1172
+ evaluator=dict(
1173
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1174
+ pred_role='BOT'),
1175
+ infer_cfg=dict(
1176
+ inferencer=dict(
1177
+ max_out_len=32,
1178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1179
+ prompt_template=dict(
1180
+ template=dict(round=[
1181
+ dict(
1182
+ prompt=
1183
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
1184
+ role='HUMAN'),
1185
+ ]),
1186
+ type=
1187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1188
+ retriever=dict(
1189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1190
+ name='passage_retrieval_en',
1191
+ path='opencompass/Longbench',
1192
+ reader_cfg=dict(
1193
+ input_columns=[
1194
+ 'context',
1195
+ 'input',
1196
+ ],
1197
+ output_column='answers',
1198
+ test_range='[0:25]',
1199
+ test_split='test',
1200
+ train_split='test'),
1201
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
1202
+ dict(
1203
+ abbr='LongBench_passage_retrieval_zh_0',
1204
+ eval_cfg=dict(
1205
+ evaluator=dict(
1206
+ language='zh',
1207
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1208
+ pred_role='BOT'),
1209
+ infer_cfg=dict(
1210
+ inferencer=dict(
1211
+ max_out_len=32,
1212
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1213
+ prompt_template=dict(
1214
+ template=dict(round=[
1215
+ dict(
1216
+ prompt=
1217
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
1218
+ role='HUMAN'),
1219
+ ]),
1220
+ type=
1221
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1222
+ retriever=dict(
1223
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1224
+ name='passage_retrieval_zh',
1225
+ path='opencompass/Longbench',
1226
+ reader_cfg=dict(
1227
+ input_columns=[
1228
+ 'context',
1229
+ 'input',
1230
+ ],
1231
+ output_column='answers',
1232
+ test_range='[0:25]',
1233
+ test_split='test',
1234
+ train_split='test'),
1235
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
1236
+ dict(
1237
+ abbr='LongBench_passage_count_0',
1238
+ eval_cfg=dict(
1239
+ evaluator=dict(
1240
+ type='opencompass.datasets.LongBenchCountEvaluator'),
1241
+ pred_role='BOT'),
1242
+ infer_cfg=dict(
1243
+ inferencer=dict(
1244
+ max_out_len=32,
1245
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1246
+ prompt_template=dict(
1247
+ template=dict(round=[
1248
+ dict(
1249
+ prompt=
1250
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
1251
+ role='HUMAN'),
1252
+ ]),
1253
+ type=
1254
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1255
+ retriever=dict(
1256
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1257
+ name='passage_count',
1258
+ path='opencompass/Longbench',
1259
+ reader_cfg=dict(
1260
+ input_columns=[
1261
+ 'context',
1262
+ 'input',
1263
+ ],
1264
+ output_column='answers',
1265
+ test_range='[0:25]',
1266
+ test_split='test',
1267
+ train_split='test'),
1268
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
1269
+ dict(
1270
+ abbr='LongBench_trec_0',
1271
+ eval_cfg=dict(
1272
+ evaluator=dict(
1273
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1274
+ ),
1275
+ pred_postprocessor=dict(
1276
+ type='opencompass.datasets.trec_postprocess'),
1277
+ pred_role='BOT'),
1278
+ infer_cfg=dict(
1279
+ inferencer=dict(
1280
+ max_out_len=64,
1281
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1282
+ prompt_template=dict(
1283
+ template=dict(round=[
1284
+ dict(
1285
+ prompt=
1286
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
1287
+ role='HUMAN'),
1288
+ ]),
1289
+ type=
1290
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1291
+ retriever=dict(
1292
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1293
+ name='trec',
1294
+ path='opencompass/Longbench',
1295
+ reader_cfg=dict(
1296
+ input_columns=[
1297
+ 'context',
1298
+ 'input',
1299
+ ],
1300
+ output_column='all_labels',
1301
+ test_range='[0:25]',
1302
+ test_split='test',
1303
+ train_split='test'),
1304
+ type='opencompass.datasets.LongBenchtrecDataset'),
1305
+ dict(
1306
+ abbr='LongBench_lsht_0',
1307
+ eval_cfg=dict(
1308
+ evaluator=dict(
1309
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1310
+ ),
1311
+ pred_postprocessor=dict(
1312
+ type='opencompass.datasets.lsht_postprocess'),
1313
+ pred_role='BOT'),
1314
+ infer_cfg=dict(
1315
+ inferencer=dict(
1316
+ max_out_len=64,
1317
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1318
+ prompt_template=dict(
1319
+ template=dict(round=[
1320
+ dict(
1321
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
1322
+ role='HUMAN'),
1323
+ ]),
1324
+ type=
1325
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1326
+ retriever=dict(
1327
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1328
+ name='lsht',
1329
+ path='opencompass/Longbench',
1330
+ reader_cfg=dict(
1331
+ input_columns=[
1332
+ 'context',
1333
+ 'input',
1334
+ ],
1335
+ output_column='all_labels',
1336
+ test_range='[0:25]',
1337
+ test_split='test',
1338
+ train_split='test'),
1339
+ type='opencompass.datasets.LongBenchlshtDataset'),
1340
+ dict(
1341
+ abbr='LongBench_multi_news_0',
1342
+ eval_cfg=dict(
1343
+ evaluator=dict(
1344
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1345
+ pred_role='BOT'),
1346
+ infer_cfg=dict(
1347
+ inferencer=dict(
1348
+ max_out_len=512,
1349
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1350
+ prompt_template=dict(
1351
+ template=dict(round=[
1352
+ dict(
1353
+ prompt=
1354
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
1355
+ role='HUMAN'),
1356
+ ]),
1357
+ type=
1358
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1359
+ retriever=dict(
1360
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1361
+ name='multi_news',
1362
+ path='opencompass/Longbench',
1363
+ reader_cfg=dict(
1364
+ input_columns=[
1365
+ 'context',
1366
+ ],
1367
+ output_column='answers',
1368
+ test_range='[0:25]',
1369
+ test_split='test',
1370
+ train_split='test'),
1371
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
1372
+ dict(
1373
+ abbr='LongBench_samsum_0',
1374
+ eval_cfg=dict(
1375
+ evaluator=dict(
1376
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1377
+ pred_postprocessor=dict(
1378
+ type='opencompass.datasets.samsum_postprocess'),
1379
+ pred_role='BOT'),
1380
+ infer_cfg=dict(
1381
+ inferencer=dict(
1382
+ max_out_len=128,
1383
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1384
+ prompt_template=dict(
1385
+ template=dict(round=[
1386
+ dict(
1387
+ prompt=
1388
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
1389
+ role='HUMAN'),
1390
+ ]),
1391
+ type=
1392
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1393
+ retriever=dict(
1394
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1395
+ name='samsum',
1396
+ path='opencompass/Longbench',
1397
+ reader_cfg=dict(
1398
+ input_columns=[
1399
+ 'context',
1400
+ 'input',
1401
+ ],
1402
+ output_column='answers',
1403
+ test_range='[0:25]',
1404
+ test_split='test',
1405
+ train_split='test'),
1406
+ type='opencompass.datasets.LongBenchsamsumDataset'),
1407
+ ],
1408
+ ]
1409
+ models = [
1410
+ dict(
1411
+ abbr='delta_net',
1412
+ batch_size=128,
1413
+ max_seq_len=2048,
1414
+ model_kwargs=dict(
1415
+ device_map='auto',
1416
+ torch_dtype='torch.bfloat16',
1417
+ trust_remote_code=True),
1418
+ path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1419
+ run_cfg=dict(num_gpus=1),
1420
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
1421
+ tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1422
+ type='opencompass.models.HuggingFaceBaseModel'),
1423
+ ]
1424
+ work_dir = 'outputs/default/20251127_202918'
tmp/0985e09b-75af-404f-ac0c-079c3aa085fb_params.py ADDED
File without changes
tmp/09d7374d-16f6-44e6-a2fa-f4925f8fb3fc_params.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_trec',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
8
+ ),
9
+ pred_postprocessor=dict(
10
+ type='opencompass.datasets.trec_postprocess'),
11
+ pred_role='BOT'),
12
+ infer_cfg=dict(
13
+ inferencer=dict(
14
+ max_out_len=64,
15
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
16
+ prompt_template=dict(
17
+ template=dict(round=[
18
+ dict(
19
+ prompt=
20
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
21
+ role='HUMAN'),
22
+ ]),
23
+ type=
24
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
25
+ retriever=dict(
26
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
27
+ name='trec',
28
+ path='opencompass/Longbench',
29
+ reader_cfg=dict(
30
+ input_columns=[
31
+ 'context',
32
+ 'input',
33
+ ],
34
+ output_column='all_labels',
35
+ test_split='test',
36
+ train_split='test'),
37
+ type='opencompass.datasets.LongBenchtrecDataset'),
38
+ ],
39
+ ]
40
+ eval = dict(runner=dict(task=dict(dump_details=True)))
41
+ models = [
42
+ dict(
43
+ abbr='gated_deltanet',
44
+ batch_size=128,
45
+ max_seq_len=2048,
46
+ model_kwargs=dict(
47
+ device_map='auto',
48
+ torch_dtype='torch.bfloat16',
49
+ trust_remote_code=True),
50
+ path='download_model/hgrn2-1.3B-100B',
51
+ run_cfg=dict(num_gpus=1),
52
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
53
+ tokenizer_path='download_model/hgrn2-1.3B-100B',
54
+ type='opencompass.models.HuggingFaceBaseModel'),
55
+ ]
56
+ work_dir = 'outputs/default/20251219_163447'
tmp/0a5aa083-12c4-41a8-92db-57a728f50ed5_params.py ADDED
File without changes
tmp/0bd141af-ea86-420f-b26c-b2890fc57de2_params.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_trec',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
8
+ ),
9
+ pred_postprocessor=dict(
10
+ type='opencompass.datasets.trec_postprocess'),
11
+ pred_role='BOT'),
12
+ infer_cfg=dict(
13
+ inferencer=dict(
14
+ max_out_len=64,
15
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
16
+ prompt_template=dict(
17
+ template=dict(round=[
18
+ dict(
19
+ prompt=
20
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
21
+ role='HUMAN'),
22
+ ]),
23
+ type=
24
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
25
+ retriever=dict(
26
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
27
+ name='trec',
28
+ path='opencompass/Longbench',
29
+ reader_cfg=dict(
30
+ input_columns=[
31
+ 'context',
32
+ 'input',
33
+ ],
34
+ output_column='all_labels',
35
+ test_split='test',
36
+ train_split='test'),
37
+ type='opencompass.datasets.LongBenchtrecDataset'),
38
+ ],
39
+ ]
40
+ eval = dict(runner=dict(task=dict(dump_details=True)))
41
+ models = [
42
+ dict(
43
+ abbr='gated_deltanet',
44
+ batch_size=128,
45
+ max_seq_len=2048,
46
+ model_kwargs=dict(
47
+ device_map='auto',
48
+ torch_dtype='torch.bfloat16',
49
+ trust_remote_code=True),
50
+ path='download_model/hgrn2-1.3B-100B',
51
+ run_cfg=dict(num_gpus=1),
52
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
53
+ tokenizer_path='download_model/hgrn2-1.3B-100B',
54
+ type='opencompass.models.HuggingFaceBaseModel'),
55
+ ]
56
+ work_dir = 'outputs/default/20251219_164057'
tmp/0c3d2c0a-49a1-40b1-b0b6-3d32b7381062_params.py ADDED
@@ -0,0 +1,1420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_2wikimqa_6',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchF1Evaluator'),
8
+ pred_role='BOT'),
9
+ infer_cfg=dict(
10
+ inferencer=dict(
11
+ max_out_len=32,
12
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
13
+ prompt_template=dict(
14
+ template=dict(round=[
15
+ dict(
16
+ prompt=
17
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
18
+ role='HUMAN'),
19
+ ]),
20
+ type=
21
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
22
+ retriever=dict(
23
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
24
+ name='2wikimqa',
25
+ path='opencompass/Longbench',
26
+ reader_cfg=dict(
27
+ input_columns=[
28
+ 'context',
29
+ 'input',
30
+ ],
31
+ output_column='answers',
32
+ test_range='[150:175]',
33
+ test_split='test',
34
+ train_split='test'),
35
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
36
+ dict(
37
+ abbr='LongBench_hotpotqa_6',
38
+ eval_cfg=dict(
39
+ evaluator=dict(
40
+ type='opencompass.datasets.LongBenchF1Evaluator'),
41
+ pred_role='BOT'),
42
+ infer_cfg=dict(
43
+ inferencer=dict(
44
+ max_out_len=32,
45
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
46
+ prompt_template=dict(
47
+ template=dict(round=[
48
+ dict(
49
+ prompt=
50
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
51
+ role='HUMAN'),
52
+ ]),
53
+ type=
54
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
55
+ retriever=dict(
56
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
57
+ name='hotpotqa',
58
+ path='opencompass/Longbench',
59
+ reader_cfg=dict(
60
+ input_columns=[
61
+ 'context',
62
+ 'input',
63
+ ],
64
+ output_column='answers',
65
+ test_range='[150:175]',
66
+ test_split='test',
67
+ train_split='test'),
68
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
69
+ dict(
70
+ abbr='LongBench_musique_6',
71
+ eval_cfg=dict(
72
+ evaluator=dict(
73
+ type='opencompass.datasets.LongBenchF1Evaluator'),
74
+ pred_role='BOT'),
75
+ infer_cfg=dict(
76
+ inferencer=dict(
77
+ max_out_len=32,
78
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
79
+ prompt_template=dict(
80
+ template=dict(round=[
81
+ dict(
82
+ prompt=
83
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
84
+ role='HUMAN'),
85
+ ]),
86
+ type=
87
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
88
+ retriever=dict(
89
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
90
+ name='musique',
91
+ path='opencompass/Longbench',
92
+ reader_cfg=dict(
93
+ input_columns=[
94
+ 'context',
95
+ 'input',
96
+ ],
97
+ output_column='answers',
98
+ test_range='[150:175]',
99
+ test_split='test',
100
+ train_split='test'),
101
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
102
+ dict(
103
+ abbr='LongBench_multifieldqa_en_6',
104
+ eval_cfg=dict(
105
+ evaluator=dict(
106
+ type='opencompass.datasets.LongBenchF1Evaluator'),
107
+ pred_role='BOT'),
108
+ infer_cfg=dict(
109
+ inferencer=dict(
110
+ max_out_len=64,
111
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
112
+ prompt_template=dict(
113
+ template=dict(round=[
114
+ dict(
115
+ prompt=
116
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
117
+ role='HUMAN'),
118
+ ]),
119
+ type=
120
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
121
+ retriever=dict(
122
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
123
+ name='multifieldqa_en',
124
+ path='opencompass/Longbench',
125
+ reader_cfg=dict(
126
+ input_columns=[
127
+ 'context',
128
+ 'input',
129
+ ],
130
+ output_column='answers',
131
+ test_range='[114:133]',
132
+ test_split='test',
133
+ train_split='test'),
134
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
135
+ dict(
136
+ abbr='LongBench_multifieldqa_zh_6',
137
+ eval_cfg=dict(
138
+ evaluator=dict(
139
+ language='zh',
140
+ type='opencompass.datasets.LongBenchF1Evaluator'),
141
+ pred_role='BOT'),
142
+ infer_cfg=dict(
143
+ inferencer=dict(
144
+ max_out_len=64,
145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
146
+ prompt_template=dict(
147
+ template=dict(round=[
148
+ dict(
149
+ prompt=
150
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
151
+ role='HUMAN'),
152
+ ]),
153
+ type=
154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
155
+ retriever=dict(
156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
157
+ name='multifieldqa_zh',
158
+ path='opencompass/Longbench',
159
+ reader_cfg=dict(
160
+ input_columns=[
161
+ 'context',
162
+ 'input',
163
+ ],
164
+ output_column='answers',
165
+ test_range='[150:175]',
166
+ test_split='test',
167
+ train_split='test'),
168
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
169
+ dict(
170
+ abbr='LongBench_narrativeqa_6',
171
+ eval_cfg=dict(
172
+ evaluator=dict(
173
+ type='opencompass.datasets.LongBenchF1Evaluator'),
174
+ pred_role='BOT'),
175
+ infer_cfg=dict(
176
+ inferencer=dict(
177
+ max_out_len=128,
178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
179
+ prompt_template=dict(
180
+ template=dict(round=[
181
+ dict(
182
+ prompt=
183
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
184
+ role='HUMAN'),
185
+ ]),
186
+ type=
187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
188
+ retriever=dict(
189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
190
+ name='narrativeqa',
191
+ path='opencompass/Longbench',
192
+ reader_cfg=dict(
193
+ input_columns=[
194
+ 'context',
195
+ 'input',
196
+ ],
197
+ output_column='answers',
198
+ test_range='[150:175]',
199
+ test_split='test',
200
+ train_split='test'),
201
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
202
+ dict(
203
+ abbr='LongBench_qasper_6',
204
+ eval_cfg=dict(
205
+ evaluator=dict(
206
+ type='opencompass.datasets.LongBenchF1Evaluator'),
207
+ pred_role='BOT'),
208
+ infer_cfg=dict(
209
+ inferencer=dict(
210
+ max_out_len=32,
211
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
212
+ prompt_template=dict(
213
+ template=dict(round=[
214
+ dict(
215
+ prompt=
216
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
217
+ role='HUMAN'),
218
+ ]),
219
+ type=
220
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
221
+ retriever=dict(
222
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
223
+ name='qasper',
224
+ path='opencompass/Longbench',
225
+ reader_cfg=dict(
226
+ input_columns=[
227
+ 'context',
228
+ 'input',
229
+ ],
230
+ output_column='answers',
231
+ test_range='[150:175]',
232
+ test_split='test',
233
+ train_split='test'),
234
+ type='opencompass.datasets.LongBenchqasperDataset'),
235
+ dict(
236
+ abbr='LongBench_triviaqa_6',
237
+ eval_cfg=dict(
238
+ evaluator=dict(
239
+ type='opencompass.datasets.LongBenchF1Evaluator'),
240
+ pred_postprocessor=dict(
241
+ type='opencompass.datasets.triviaqa_postprocess'),
242
+ pred_role='BOT'),
243
+ infer_cfg=dict(
244
+ inferencer=dict(
245
+ max_out_len=32,
246
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
247
+ prompt_template=dict(
248
+ template=dict(round=[
249
+ dict(
250
+ prompt=
251
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
252
+ role='HUMAN'),
253
+ ]),
254
+ type=
255
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
256
+ retriever=dict(
257
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
258
+ name='triviaqa',
259
+ path='opencompass/Longbench',
260
+ reader_cfg=dict(
261
+ input_columns=[
262
+ 'context',
263
+ 'input',
264
+ ],
265
+ output_column='answers',
266
+ test_range='[150:175]',
267
+ test_split='test',
268
+ train_split='test'),
269
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
270
+ dict(
271
+ abbr='LongBench_gov_report_6',
272
+ eval_cfg=dict(
273
+ evaluator=dict(
274
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
275
+ pred_role='BOT'),
276
+ infer_cfg=dict(
277
+ inferencer=dict(
278
+ max_out_len=512,
279
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
280
+ prompt_template=dict(
281
+ template=dict(round=[
282
+ dict(
283
+ prompt=
284
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
285
+ role='HUMAN'),
286
+ ]),
287
+ type=
288
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
289
+ retriever=dict(
290
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
291
+ name='gov_report',
292
+ path='opencompass/Longbench',
293
+ reader_cfg=dict(
294
+ input_columns=[
295
+ 'context',
296
+ ],
297
+ output_column='answers',
298
+ test_range='[150:175]',
299
+ test_split='test',
300
+ train_split='test'),
301
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
302
+ dict(
303
+ abbr='LongBench_qmsum_6',
304
+ eval_cfg=dict(
305
+ evaluator=dict(
306
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
307
+ pred_role='BOT'),
308
+ infer_cfg=dict(
309
+ inferencer=dict(
310
+ max_out_len=512,
311
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
312
+ prompt_template=dict(
313
+ template=dict(round=[
314
+ dict(
315
+ prompt=
316
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
317
+ role='HUMAN'),
318
+ ]),
319
+ type=
320
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
321
+ retriever=dict(
322
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
323
+ name='qmsum',
324
+ path='opencompass/Longbench',
325
+ reader_cfg=dict(
326
+ input_columns=[
327
+ 'context',
328
+ 'input',
329
+ ],
330
+ output_column='answers',
331
+ test_range='[150:175]',
332
+ test_split='test',
333
+ train_split='test'),
334
+ type='opencompass.datasets.LongBenchqmsumDataset'),
335
+ dict(
336
+ abbr='LongBench_vcsum_6',
337
+ eval_cfg=dict(
338
+ evaluator=dict(
339
+ language='zh',
340
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
341
+ pred_role='BOT'),
342
+ infer_cfg=dict(
343
+ inferencer=dict(
344
+ max_out_len=512,
345
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
346
+ prompt_template=dict(
347
+ template=dict(round=[
348
+ dict(
349
+ prompt=
350
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
351
+ role='HUMAN'),
352
+ ]),
353
+ type=
354
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
355
+ retriever=dict(
356
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
357
+ name='vcsum',
358
+ path='opencompass/Longbench',
359
+ reader_cfg=dict(
360
+ input_columns=[
361
+ 'context',
362
+ ],
363
+ output_column='answers',
364
+ test_range='[150:175]',
365
+ test_split='test',
366
+ train_split='test'),
367
+ type='opencompass.datasets.LongBenchvcsumDataset'),
368
+ dict(
369
+ abbr='LongBench_dureader_6',
370
+ eval_cfg=dict(
371
+ evaluator=dict(
372
+ language='zh',
373
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
374
+ pred_role='BOT'),
375
+ infer_cfg=dict(
376
+ inferencer=dict(
377
+ max_out_len=128,
378
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
379
+ prompt_template=dict(
380
+ template=dict(round=[
381
+ dict(
382
+ prompt=
383
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
384
+ role='HUMAN'),
385
+ ]),
386
+ type=
387
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
388
+ retriever=dict(
389
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
390
+ name='dureader',
391
+ path='opencompass/Longbench',
392
+ reader_cfg=dict(
393
+ input_columns=[
394
+ 'context',
395
+ 'input',
396
+ ],
397
+ output_column='answers',
398
+ test_range='[150:175]',
399
+ test_split='test',
400
+ train_split='test'),
401
+ type='opencompass.datasets.LongBenchdureaderDataset'),
402
+ dict(
403
+ abbr='LongBench_lcc_6',
404
+ eval_cfg=dict(
405
+ evaluator=dict(
406
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
407
+ pred_role='BOT'),
408
+ infer_cfg=dict(
409
+ inferencer=dict(
410
+ max_out_len=64,
411
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
412
+ prompt_template=dict(
413
+ template=dict(round=[
414
+ dict(
415
+ prompt=
416
+ 'Please complete the code given below. \n{context}Next line of code:\n',
417
+ role='HUMAN'),
418
+ ]),
419
+ type=
420
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
421
+ retriever=dict(
422
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
423
+ name='lcc',
424
+ path='opencompass/Longbench',
425
+ reader_cfg=dict(
426
+ input_columns=[
427
+ 'context',
428
+ ],
429
+ output_column='answers',
430
+ test_range='[378:441]',
431
+ test_split='test',
432
+ train_split='test'),
433
+ type='opencompass.datasets.LongBenchlccDataset'),
434
+ dict(
435
+ abbr='LongBench_repobench-p_6',
436
+ eval_cfg=dict(
437
+ evaluator=dict(
438
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
439
+ pred_role='BOT'),
440
+ infer_cfg=dict(
441
+ inferencer=dict(
442
+ max_out_len=64,
443
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
444
+ prompt_template=dict(
445
+ template=dict(round=[
446
+ dict(
447
+ prompt=
448
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
449
+ role='HUMAN'),
450
+ ]),
451
+ type=
452
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
453
+ retriever=dict(
454
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
455
+ name='repobench-p',
456
+ path='opencompass/Longbench',
457
+ reader_cfg=dict(
458
+ input_columns=[
459
+ 'context',
460
+ 'input',
461
+ ],
462
+ output_column='answers',
463
+ test_range='[378:441]',
464
+ test_split='test',
465
+ train_split='test'),
466
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
467
+ dict(
468
+ abbr='LongBench_passage_retrieval_en_6',
469
+ eval_cfg=dict(
470
+ evaluator=dict(
471
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
472
+ pred_role='BOT'),
473
+ infer_cfg=dict(
474
+ inferencer=dict(
475
+ max_out_len=32,
476
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
477
+ prompt_template=dict(
478
+ template=dict(round=[
479
+ dict(
480
+ prompt=
481
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
482
+ role='HUMAN'),
483
+ ]),
484
+ type=
485
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
486
+ retriever=dict(
487
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
488
+ name='passage_retrieval_en',
489
+ path='opencompass/Longbench',
490
+ reader_cfg=dict(
491
+ input_columns=[
492
+ 'context',
493
+ 'input',
494
+ ],
495
+ output_column='answers',
496
+ test_range='[150:175]',
497
+ test_split='test',
498
+ train_split='test'),
499
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
500
+ dict(
501
+ abbr='LongBench_passage_retrieval_zh_6',
502
+ eval_cfg=dict(
503
+ evaluator=dict(
504
+ language='zh',
505
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
506
+ pred_role='BOT'),
507
+ infer_cfg=dict(
508
+ inferencer=dict(
509
+ max_out_len=32,
510
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
511
+ prompt_template=dict(
512
+ template=dict(round=[
513
+ dict(
514
+ prompt=
515
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
516
+ role='HUMAN'),
517
+ ]),
518
+ type=
519
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
520
+ retriever=dict(
521
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
522
+ name='passage_retrieval_zh',
523
+ path='opencompass/Longbench',
524
+ reader_cfg=dict(
525
+ input_columns=[
526
+ 'context',
527
+ 'input',
528
+ ],
529
+ output_column='answers',
530
+ test_range='[150:175]',
531
+ test_split='test',
532
+ train_split='test'),
533
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
534
+ dict(
535
+ abbr='LongBench_passage_count_6',
536
+ eval_cfg=dict(
537
+ evaluator=dict(
538
+ type='opencompass.datasets.LongBenchCountEvaluator'),
539
+ pred_role='BOT'),
540
+ infer_cfg=dict(
541
+ inferencer=dict(
542
+ max_out_len=32,
543
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
544
+ prompt_template=dict(
545
+ template=dict(round=[
546
+ dict(
547
+ prompt=
548
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
549
+ role='HUMAN'),
550
+ ]),
551
+ type=
552
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
553
+ retriever=dict(
554
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
555
+ name='passage_count',
556
+ path='opencompass/Longbench',
557
+ reader_cfg=dict(
558
+ input_columns=[
559
+ 'context',
560
+ 'input',
561
+ ],
562
+ output_column='answers',
563
+ test_range='[150:175]',
564
+ test_split='test',
565
+ train_split='test'),
566
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
567
+ dict(
568
+ abbr='LongBench_trec_6',
569
+ eval_cfg=dict(
570
+ evaluator=dict(
571
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
572
+ ),
573
+ pred_postprocessor=dict(
574
+ type='opencompass.datasets.trec_postprocess'),
575
+ pred_role='BOT'),
576
+ infer_cfg=dict(
577
+ inferencer=dict(
578
+ max_out_len=64,
579
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
580
+ prompt_template=dict(
581
+ template=dict(round=[
582
+ dict(
583
+ prompt=
584
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
585
+ role='HUMAN'),
586
+ ]),
587
+ type=
588
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
589
+ retriever=dict(
590
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
591
+ name='trec',
592
+ path='opencompass/Longbench',
593
+ reader_cfg=dict(
594
+ input_columns=[
595
+ 'context',
596
+ 'input',
597
+ ],
598
+ output_column='all_labels',
599
+ test_range='[150:175]',
600
+ test_split='test',
601
+ train_split='test'),
602
+ type='opencompass.datasets.LongBenchtrecDataset'),
603
+ dict(
604
+ abbr='LongBench_lsht_6',
605
+ eval_cfg=dict(
606
+ evaluator=dict(
607
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
608
+ ),
609
+ pred_postprocessor=dict(
610
+ type='opencompass.datasets.lsht_postprocess'),
611
+ pred_role='BOT'),
612
+ infer_cfg=dict(
613
+ inferencer=dict(
614
+ max_out_len=64,
615
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
616
+ prompt_template=dict(
617
+ template=dict(round=[
618
+ dict(
619
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
620
+ role='HUMAN'),
621
+ ]),
622
+ type=
623
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
624
+ retriever=dict(
625
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
626
+ name='lsht',
627
+ path='opencompass/Longbench',
628
+ reader_cfg=dict(
629
+ input_columns=[
630
+ 'context',
631
+ 'input',
632
+ ],
633
+ output_column='all_labels',
634
+ test_range='[150:175]',
635
+ test_split='test',
636
+ train_split='test'),
637
+ type='opencompass.datasets.LongBenchlshtDataset'),
638
+ dict(
639
+ abbr='LongBench_multi_news_6',
640
+ eval_cfg=dict(
641
+ evaluator=dict(
642
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
643
+ pred_role='BOT'),
644
+ infer_cfg=dict(
645
+ inferencer=dict(
646
+ max_out_len=512,
647
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
648
+ prompt_template=dict(
649
+ template=dict(round=[
650
+ dict(
651
+ prompt=
652
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
653
+ role='HUMAN'),
654
+ ]),
655
+ type=
656
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
657
+ retriever=dict(
658
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
659
+ name='multi_news',
660
+ path='opencompass/Longbench',
661
+ reader_cfg=dict(
662
+ input_columns=[
663
+ 'context',
664
+ ],
665
+ output_column='answers',
666
+ test_range='[150:175]',
667
+ test_split='test',
668
+ train_split='test'),
669
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
670
+ dict(
671
+ abbr='LongBench_samsum_6',
672
+ eval_cfg=dict(
673
+ evaluator=dict(
674
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
675
+ pred_postprocessor=dict(
676
+ type='opencompass.datasets.samsum_postprocess'),
677
+ pred_role='BOT'),
678
+ infer_cfg=dict(
679
+ inferencer=dict(
680
+ max_out_len=128,
681
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
682
+ prompt_template=dict(
683
+ template=dict(round=[
684
+ dict(
685
+ prompt=
686
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
687
+ role='HUMAN'),
688
+ ]),
689
+ type=
690
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
691
+ retriever=dict(
692
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
693
+ name='samsum',
694
+ path='opencompass/Longbench',
695
+ reader_cfg=dict(
696
+ input_columns=[
697
+ 'context',
698
+ 'input',
699
+ ],
700
+ output_column='answers',
701
+ test_range='[150:175]',
702
+ test_split='test',
703
+ train_split='test'),
704
+ type='opencompass.datasets.LongBenchsamsumDataset'),
705
+ dict(
706
+ abbr='LongBench_2wikimqa_6',
707
+ eval_cfg=dict(
708
+ evaluator=dict(
709
+ type='opencompass.datasets.LongBenchF1Evaluator'),
710
+ pred_role='BOT'),
711
+ infer_cfg=dict(
712
+ inferencer=dict(
713
+ max_out_len=32,
714
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
715
+ prompt_template=dict(
716
+ template=dict(round=[
717
+ dict(
718
+ prompt=
719
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
720
+ role='HUMAN'),
721
+ ]),
722
+ type=
723
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
724
+ retriever=dict(
725
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
726
+ name='2wikimqa',
727
+ path='opencompass/Longbench',
728
+ reader_cfg=dict(
729
+ input_columns=[
730
+ 'context',
731
+ 'input',
732
+ ],
733
+ output_column='answers',
734
+ test_range='[150:175]',
735
+ test_split='test',
736
+ train_split='test'),
737
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
738
+ dict(
739
+ abbr='LongBench_hotpotqa_6',
740
+ eval_cfg=dict(
741
+ evaluator=dict(
742
+ type='opencompass.datasets.LongBenchF1Evaluator'),
743
+ pred_role='BOT'),
744
+ infer_cfg=dict(
745
+ inferencer=dict(
746
+ max_out_len=32,
747
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
748
+ prompt_template=dict(
749
+ template=dict(round=[
750
+ dict(
751
+ prompt=
752
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
753
+ role='HUMAN'),
754
+ ]),
755
+ type=
756
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
757
+ retriever=dict(
758
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
759
+ name='hotpotqa',
760
+ path='opencompass/Longbench',
761
+ reader_cfg=dict(
762
+ input_columns=[
763
+ 'context',
764
+ 'input',
765
+ ],
766
+ output_column='answers',
767
+ test_range='[150:175]',
768
+ test_split='test',
769
+ train_split='test'),
770
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
771
+ dict(
772
+ abbr='LongBench_musique_6',
773
+ eval_cfg=dict(
774
+ evaluator=dict(
775
+ type='opencompass.datasets.LongBenchF1Evaluator'),
776
+ pred_role='BOT'),
777
+ infer_cfg=dict(
778
+ inferencer=dict(
779
+ max_out_len=32,
780
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
781
+ prompt_template=dict(
782
+ template=dict(round=[
783
+ dict(
784
+ prompt=
785
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
786
+ role='HUMAN'),
787
+ ]),
788
+ type=
789
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
790
+ retriever=dict(
791
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
792
+ name='musique',
793
+ path='opencompass/Longbench',
794
+ reader_cfg=dict(
795
+ input_columns=[
796
+ 'context',
797
+ 'input',
798
+ ],
799
+ output_column='answers',
800
+ test_range='[150:175]',
801
+ test_split='test',
802
+ train_split='test'),
803
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
804
+ dict(
805
+ abbr='LongBench_multifieldqa_en_6',
806
+ eval_cfg=dict(
807
+ evaluator=dict(
808
+ type='opencompass.datasets.LongBenchF1Evaluator'),
809
+ pred_role='BOT'),
810
+ infer_cfg=dict(
811
+ inferencer=dict(
812
+ max_out_len=64,
813
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
814
+ prompt_template=dict(
815
+ template=dict(round=[
816
+ dict(
817
+ prompt=
818
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
819
+ role='HUMAN'),
820
+ ]),
821
+ type=
822
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
823
+ retriever=dict(
824
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
825
+ name='multifieldqa_en',
826
+ path='opencompass/Longbench',
827
+ reader_cfg=dict(
828
+ input_columns=[
829
+ 'context',
830
+ 'input',
831
+ ],
832
+ output_column='answers',
833
+ test_range='[114:133]',
834
+ test_split='test',
835
+ train_split='test'),
836
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
837
+ dict(
838
+ abbr='LongBench_multifieldqa_zh_6',
839
+ eval_cfg=dict(
840
+ evaluator=dict(
841
+ language='zh',
842
+ type='opencompass.datasets.LongBenchF1Evaluator'),
843
+ pred_role='BOT'),
844
+ infer_cfg=dict(
845
+ inferencer=dict(
846
+ max_out_len=64,
847
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
848
+ prompt_template=dict(
849
+ template=dict(round=[
850
+ dict(
851
+ prompt=
852
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
853
+ role='HUMAN'),
854
+ ]),
855
+ type=
856
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
857
+ retriever=dict(
858
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
859
+ name='multifieldqa_zh',
860
+ path='opencompass/Longbench',
861
+ reader_cfg=dict(
862
+ input_columns=[
863
+ 'context',
864
+ 'input',
865
+ ],
866
+ output_column='answers',
867
+ test_range='[150:175]',
868
+ test_split='test',
869
+ train_split='test'),
870
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
871
+ dict(
872
+ abbr='LongBench_narrativeqa_6',
873
+ eval_cfg=dict(
874
+ evaluator=dict(
875
+ type='opencompass.datasets.LongBenchF1Evaluator'),
876
+ pred_role='BOT'),
877
+ infer_cfg=dict(
878
+ inferencer=dict(
879
+ max_out_len=128,
880
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
881
+ prompt_template=dict(
882
+ template=dict(round=[
883
+ dict(
884
+ prompt=
885
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
886
+ role='HUMAN'),
887
+ ]),
888
+ type=
889
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
890
+ retriever=dict(
891
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
892
+ name='narrativeqa',
893
+ path='opencompass/Longbench',
894
+ reader_cfg=dict(
895
+ input_columns=[
896
+ 'context',
897
+ 'input',
898
+ ],
899
+ output_column='answers',
900
+ test_range='[150:175]',
901
+ test_split='test',
902
+ train_split='test'),
903
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
904
+ dict(
905
+ abbr='LongBench_qasper_6',
906
+ eval_cfg=dict(
907
+ evaluator=dict(
908
+ type='opencompass.datasets.LongBenchF1Evaluator'),
909
+ pred_role='BOT'),
910
+ infer_cfg=dict(
911
+ inferencer=dict(
912
+ max_out_len=32,
913
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
914
+ prompt_template=dict(
915
+ template=dict(round=[
916
+ dict(
917
+ prompt=
918
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
919
+ role='HUMAN'),
920
+ ]),
921
+ type=
922
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
923
+ retriever=dict(
924
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
925
+ name='qasper',
926
+ path='opencompass/Longbench',
927
+ reader_cfg=dict(
928
+ input_columns=[
929
+ 'context',
930
+ 'input',
931
+ ],
932
+ output_column='answers',
933
+ test_range='[150:175]',
934
+ test_split='test',
935
+ train_split='test'),
936
+ type='opencompass.datasets.LongBenchqasperDataset'),
937
+ dict(
938
+ abbr='LongBench_triviaqa_6',
939
+ eval_cfg=dict(
940
+ evaluator=dict(
941
+ type='opencompass.datasets.LongBenchF1Evaluator'),
942
+ pred_postprocessor=dict(
943
+ type='opencompass.datasets.triviaqa_postprocess'),
944
+ pred_role='BOT'),
945
+ infer_cfg=dict(
946
+ inferencer=dict(
947
+ max_out_len=32,
948
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
949
+ prompt_template=dict(
950
+ template=dict(round=[
951
+ dict(
952
+ prompt=
953
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
954
+ role='HUMAN'),
955
+ ]),
956
+ type=
957
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
958
+ retriever=dict(
959
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
960
+ name='triviaqa',
961
+ path='opencompass/Longbench',
962
+ reader_cfg=dict(
963
+ input_columns=[
964
+ 'context',
965
+ 'input',
966
+ ],
967
+ output_column='answers',
968
+ test_range='[150:175]',
969
+ test_split='test',
970
+ train_split='test'),
971
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
972
+ dict(
973
+ abbr='LongBench_gov_report_6',
974
+ eval_cfg=dict(
975
+ evaluator=dict(
976
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
977
+ pred_role='BOT'),
978
+ infer_cfg=dict(
979
+ inferencer=dict(
980
+ max_out_len=512,
981
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
982
+ prompt_template=dict(
983
+ template=dict(round=[
984
+ dict(
985
+ prompt=
986
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
987
+ role='HUMAN'),
988
+ ]),
989
+ type=
990
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
991
+ retriever=dict(
992
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
993
+ name='gov_report',
994
+ path='opencompass/Longbench',
995
+ reader_cfg=dict(
996
+ input_columns=[
997
+ 'context',
998
+ ],
999
+ output_column='answers',
1000
+ test_range='[150:175]',
1001
+ test_split='test',
1002
+ train_split='test'),
1003
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
1004
+ dict(
1005
+ abbr='LongBench_qmsum_6',
1006
+ eval_cfg=dict(
1007
+ evaluator=dict(
1008
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1009
+ pred_role='BOT'),
1010
+ infer_cfg=dict(
1011
+ inferencer=dict(
1012
+ max_out_len=512,
1013
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1014
+ prompt_template=dict(
1015
+ template=dict(round=[
1016
+ dict(
1017
+ prompt=
1018
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
1019
+ role='HUMAN'),
1020
+ ]),
1021
+ type=
1022
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1023
+ retriever=dict(
1024
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1025
+ name='qmsum',
1026
+ path='opencompass/Longbench',
1027
+ reader_cfg=dict(
1028
+ input_columns=[
1029
+ 'context',
1030
+ 'input',
1031
+ ],
1032
+ output_column='answers',
1033
+ test_range='[150:175]',
1034
+ test_split='test',
1035
+ train_split='test'),
1036
+ type='opencompass.datasets.LongBenchqmsumDataset'),
1037
+ dict(
1038
+ abbr='LongBench_vcsum_6',
1039
+ eval_cfg=dict(
1040
+ evaluator=dict(
1041
+ language='zh',
1042
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1043
+ pred_role='BOT'),
1044
+ infer_cfg=dict(
1045
+ inferencer=dict(
1046
+ max_out_len=512,
1047
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1048
+ prompt_template=dict(
1049
+ template=dict(round=[
1050
+ dict(
1051
+ prompt=
1052
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
1053
+ role='HUMAN'),
1054
+ ]),
1055
+ type=
1056
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1057
+ retriever=dict(
1058
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1059
+ name='vcsum',
1060
+ path='opencompass/Longbench',
1061
+ reader_cfg=dict(
1062
+ input_columns=[
1063
+ 'context',
1064
+ ],
1065
+ output_column='answers',
1066
+ test_range='[150:175]',
1067
+ test_split='test',
1068
+ train_split='test'),
1069
+ type='opencompass.datasets.LongBenchvcsumDataset'),
1070
+ dict(
1071
+ abbr='LongBench_dureader_6',
1072
+ eval_cfg=dict(
1073
+ evaluator=dict(
1074
+ language='zh',
1075
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1076
+ pred_role='BOT'),
1077
+ infer_cfg=dict(
1078
+ inferencer=dict(
1079
+ max_out_len=128,
1080
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1081
+ prompt_template=dict(
1082
+ template=dict(round=[
1083
+ dict(
1084
+ prompt=
1085
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
1086
+ role='HUMAN'),
1087
+ ]),
1088
+ type=
1089
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1090
+ retriever=dict(
1091
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1092
+ name='dureader',
1093
+ path='opencompass/Longbench',
1094
+ reader_cfg=dict(
1095
+ input_columns=[
1096
+ 'context',
1097
+ 'input',
1098
+ ],
1099
+ output_column='answers',
1100
+ test_range='[150:175]',
1101
+ test_split='test',
1102
+ train_split='test'),
1103
+ type='opencompass.datasets.LongBenchdureaderDataset'),
1104
+ dict(
1105
+ abbr='LongBench_lcc_6',
1106
+ eval_cfg=dict(
1107
+ evaluator=dict(
1108
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1109
+ pred_role='BOT'),
1110
+ infer_cfg=dict(
1111
+ inferencer=dict(
1112
+ max_out_len=64,
1113
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1114
+ prompt_template=dict(
1115
+ template=dict(round=[
1116
+ dict(
1117
+ prompt=
1118
+ 'Please complete the code given below. \n{context}Next line of code:\n',
1119
+ role='HUMAN'),
1120
+ ]),
1121
+ type=
1122
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1123
+ retriever=dict(
1124
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1125
+ name='lcc',
1126
+ path='opencompass/Longbench',
1127
+ reader_cfg=dict(
1128
+ input_columns=[
1129
+ 'context',
1130
+ ],
1131
+ output_column='answers',
1132
+ test_range='[378:441]',
1133
+ test_split='test',
1134
+ train_split='test'),
1135
+ type='opencompass.datasets.LongBenchlccDataset'),
1136
+ dict(
1137
+ abbr='LongBench_repobench-p_6',
1138
+ eval_cfg=dict(
1139
+ evaluator=dict(
1140
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1141
+ pred_role='BOT'),
1142
+ infer_cfg=dict(
1143
+ inferencer=dict(
1144
+ max_out_len=64,
1145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1146
+ prompt_template=dict(
1147
+ template=dict(round=[
1148
+ dict(
1149
+ prompt=
1150
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
1151
+ role='HUMAN'),
1152
+ ]),
1153
+ type=
1154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1155
+ retriever=dict(
1156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1157
+ name='repobench-p',
1158
+ path='opencompass/Longbench',
1159
+ reader_cfg=dict(
1160
+ input_columns=[
1161
+ 'context',
1162
+ 'input',
1163
+ ],
1164
+ output_column='answers',
1165
+ test_range='[378:441]',
1166
+ test_split='test',
1167
+ train_split='test'),
1168
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
1169
+ dict(
1170
+ abbr='LongBench_passage_retrieval_en_6',
1171
+ eval_cfg=dict(
1172
+ evaluator=dict(
1173
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1174
+ pred_role='BOT'),
1175
+ infer_cfg=dict(
1176
+ inferencer=dict(
1177
+ max_out_len=32,
1178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1179
+ prompt_template=dict(
1180
+ template=dict(round=[
1181
+ dict(
1182
+ prompt=
1183
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
1184
+ role='HUMAN'),
1185
+ ]),
1186
+ type=
1187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1188
+ retriever=dict(
1189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1190
+ name='passage_retrieval_en',
1191
+ path='opencompass/Longbench',
1192
+ reader_cfg=dict(
1193
+ input_columns=[
1194
+ 'context',
1195
+ 'input',
1196
+ ],
1197
+ output_column='answers',
1198
+ test_range='[150:175]',
1199
+ test_split='test',
1200
+ train_split='test'),
1201
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
1202
+ dict(
1203
+ abbr='LongBench_passage_retrieval_zh_6',
1204
+ eval_cfg=dict(
1205
+ evaluator=dict(
1206
+ language='zh',
1207
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1208
+ pred_role='BOT'),
1209
+ infer_cfg=dict(
1210
+ inferencer=dict(
1211
+ max_out_len=32,
1212
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1213
+ prompt_template=dict(
1214
+ template=dict(round=[
1215
+ dict(
1216
+ prompt=
1217
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
1218
+ role='HUMAN'),
1219
+ ]),
1220
+ type=
1221
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1222
+ retriever=dict(
1223
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1224
+ name='passage_retrieval_zh',
1225
+ path='opencompass/Longbench',
1226
+ reader_cfg=dict(
1227
+ input_columns=[
1228
+ 'context',
1229
+ 'input',
1230
+ ],
1231
+ output_column='answers',
1232
+ test_range='[150:175]',
1233
+ test_split='test',
1234
+ train_split='test'),
1235
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
1236
+ dict(
1237
+ abbr='LongBench_passage_count_6',
1238
+ eval_cfg=dict(
1239
+ evaluator=dict(
1240
+ type='opencompass.datasets.LongBenchCountEvaluator'),
1241
+ pred_role='BOT'),
1242
+ infer_cfg=dict(
1243
+ inferencer=dict(
1244
+ max_out_len=32,
1245
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1246
+ prompt_template=dict(
1247
+ template=dict(round=[
1248
+ dict(
1249
+ prompt=
1250
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
1251
+ role='HUMAN'),
1252
+ ]),
1253
+ type=
1254
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1255
+ retriever=dict(
1256
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1257
+ name='passage_count',
1258
+ path='opencompass/Longbench',
1259
+ reader_cfg=dict(
1260
+ input_columns=[
1261
+ 'context',
1262
+ 'input',
1263
+ ],
1264
+ output_column='answers',
1265
+ test_range='[150:175]',
1266
+ test_split='test',
1267
+ train_split='test'),
1268
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
1269
+ dict(
1270
+ abbr='LongBench_trec_6',
1271
+ eval_cfg=dict(
1272
+ evaluator=dict(
1273
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1274
+ ),
1275
+ pred_postprocessor=dict(
1276
+ type='opencompass.datasets.trec_postprocess'),
1277
+ pred_role='BOT'),
1278
+ infer_cfg=dict(
1279
+ inferencer=dict(
1280
+ max_out_len=64,
1281
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1282
+ prompt_template=dict(
1283
+ template=dict(round=[
1284
+ dict(
1285
+ prompt=
1286
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
1287
+ role='HUMAN'),
1288
+ ]),
1289
+ type=
1290
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1291
+ retriever=dict(
1292
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1293
+ name='trec',
1294
+ path='opencompass/Longbench',
1295
+ reader_cfg=dict(
1296
+ input_columns=[
1297
+ 'context',
1298
+ 'input',
1299
+ ],
1300
+ output_column='all_labels',
1301
+ test_range='[150:175]',
1302
+ test_split='test',
1303
+ train_split='test'),
1304
+ type='opencompass.datasets.LongBenchtrecDataset'),
1305
+ dict(
1306
+ abbr='LongBench_lsht_6',
1307
+ eval_cfg=dict(
1308
+ evaluator=dict(
1309
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1310
+ ),
1311
+ pred_postprocessor=dict(
1312
+ type='opencompass.datasets.lsht_postprocess'),
1313
+ pred_role='BOT'),
1314
+ infer_cfg=dict(
1315
+ inferencer=dict(
1316
+ max_out_len=64,
1317
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1318
+ prompt_template=dict(
1319
+ template=dict(round=[
1320
+ dict(
1321
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
1322
+ role='HUMAN'),
1323
+ ]),
1324
+ type=
1325
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1326
+ retriever=dict(
1327
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1328
+ name='lsht',
1329
+ path='opencompass/Longbench',
1330
+ reader_cfg=dict(
1331
+ input_columns=[
1332
+ 'context',
1333
+ 'input',
1334
+ ],
1335
+ output_column='all_labels',
1336
+ test_range='[150:175]',
1337
+ test_split='test',
1338
+ train_split='test'),
1339
+ type='opencompass.datasets.LongBenchlshtDataset'),
1340
+ dict(
1341
+ abbr='LongBench_multi_news_6',
1342
+ eval_cfg=dict(
1343
+ evaluator=dict(
1344
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1345
+ pred_role='BOT'),
1346
+ infer_cfg=dict(
1347
+ inferencer=dict(
1348
+ max_out_len=512,
1349
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1350
+ prompt_template=dict(
1351
+ template=dict(round=[
1352
+ dict(
1353
+ prompt=
1354
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
1355
+ role='HUMAN'),
1356
+ ]),
1357
+ type=
1358
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1359
+ retriever=dict(
1360
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1361
+ name='multi_news',
1362
+ path='opencompass/Longbench',
1363
+ reader_cfg=dict(
1364
+ input_columns=[
1365
+ 'context',
1366
+ ],
1367
+ output_column='answers',
1368
+ test_range='[150:175]',
1369
+ test_split='test',
1370
+ train_split='test'),
1371
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
1372
+ dict(
1373
+ abbr='LongBench_samsum_6',
1374
+ eval_cfg=dict(
1375
+ evaluator=dict(
1376
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1377
+ pred_postprocessor=dict(
1378
+ type='opencompass.datasets.samsum_postprocess'),
1379
+ pred_role='BOT'),
1380
+ infer_cfg=dict(
1381
+ inferencer=dict(
1382
+ max_out_len=128,
1383
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1384
+ prompt_template=dict(
1385
+ template=dict(round=[
1386
+ dict(
1387
+ prompt=
1388
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
1389
+ role='HUMAN'),
1390
+ ]),
1391
+ type=
1392
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1393
+ retriever=dict(
1394
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1395
+ name='samsum',
1396
+ path='opencompass/Longbench',
1397
+ reader_cfg=dict(
1398
+ input_columns=[
1399
+ 'context',
1400
+ 'input',
1401
+ ],
1402
+ output_column='answers',
1403
+ test_range='[150:175]',
1404
+ test_split='test',
1405
+ train_split='test'),
1406
+ type='opencompass.datasets.LongBenchsamsumDataset'),
1407
+ ],
1408
+ ]
1409
+ models = [
1410
+ dict(
1411
+ abbr='delta_net-1.3B',
1412
+ batch_size=16,
1413
+ max_out_len=100,
1414
+ max_seq_len=16384,
1415
+ path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1416
+ run_cfg=dict(num_gpus=1),
1417
+ tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1418
+ type='opencompass.models.HuggingFaceCausalLM'),
1419
+ ]
1420
+ work_dir = 'outputs/default/20251127_163453'
tmp/0d03fed5-a949-4dc0-815b-cf2f740d6181_params.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_2wikimqa',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchF1Evaluator'),
8
+ pred_role='BOT'),
9
+ infer_cfg=dict(
10
+ inferencer=dict(
11
+ max_out_len=32,
12
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
13
+ prompt_template=dict(
14
+ template=dict(round=[
15
+ dict(
16
+ prompt=
17
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
18
+ role='HUMAN'),
19
+ ]),
20
+ type=
21
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
22
+ retriever=dict(
23
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
24
+ name='2wikimqa',
25
+ path='opencompass/Longbench',
26
+ reader_cfg=dict(
27
+ input_columns=[
28
+ 'context',
29
+ 'input',
30
+ ],
31
+ output_column='answers',
32
+ test_split='test',
33
+ train_split='test'),
34
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
35
+ ],
36
+ ]
37
+ eval = dict(runner=dict(task=dict(dump_details=True)))
38
+ models = [
39
+ dict(
40
+ abbr='gated_deltanet',
41
+ batch_size=128,
42
+ max_seq_len=2048,
43
+ model_kwargs=dict(
44
+ device_map='auto',
45
+ torch_dtype='torch.bfloat16',
46
+ trust_remote_code=True),
47
+ path='download_model/hgrn2-1.3B-100B',
48
+ run_cfg=dict(num_gpus=1),
49
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
50
+ tokenizer_path='download_model/hgrn2-1.3B-100B',
51
+ type='opencompass.models.HuggingFaceBaseModel'),
52
+ ]
53
+ work_dir = 'outputs/default/20251219_163447'
tmp/0d2ff363-9d6a-489c-b18d-e978d436a065_params.py ADDED
File without changes
tmp/10481e04-ca08-4f83-972f-e8fccc958b91_params.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='triviaqa_wiki_1shot_0',
5
+ eval_cfg=dict(
6
+ evaluator=dict(type='opencompass.datasets.TriviaQAEvaluator'),
7
+ pred_role='BOT'),
8
+ infer_cfg=dict(
9
+ ice_template=dict(
10
+ template='Q: {question}\nA: {answer}.\n',
11
+ type=
12
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
13
+ inferencer=dict(
14
+ max_out_len=50,
15
+ stopping_criteria=[
16
+ 'Q:',
17
+ '\n',
18
+ ],
19
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
20
+ prompt_template=dict(
21
+ ice_token='</E>',
22
+ template='</E>Q: {question}\nA: ',
23
+ type=
24
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
25
+ retriever=dict(
26
+ fix_id_list=[
27
+ 0,
28
+ ],
29
+ type='opencompass.openicl.icl_retriever.FixKRetriever')),
30
+ path='opencompass/trivia_qa',
31
+ reader_cfg=dict(
32
+ input_columns=[
33
+ 'question',
34
+ ],
35
+ output_column='answer',
36
+ test_range='[0:1000]',
37
+ test_split='validation',
38
+ train_split='train'),
39
+ type='opencompass.datasets.TriviaQADatasetV2'),
40
+ ],
41
+ ]
42
+ models = [
43
+ dict(
44
+ abbr='mask_gdn_1B_hrr-rank4_hf',
45
+ batch_size=8,
46
+ generation_kwargs=dict(),
47
+ max_out_len=256,
48
+ max_seq_len=None,
49
+ model_kwargs=dict(),
50
+ pad_token_id=None,
51
+ path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4',
52
+ peft_kwargs=dict(),
53
+ peft_path=None,
54
+ run_cfg=dict(num_gpus=1),
55
+ stop_words=[],
56
+ tokenizer_kwargs=dict(),
57
+ tokenizer_path=None,
58
+ type='opencompass.models.huggingface_above_v4_33.HuggingFaceBaseModel'
59
+ ),
60
+ ]
61
+ work_dir = 'outputs/default/20251127_190244'
tmp/104a1807-a194-4864-99ea-1a9fe1a47bac_params.py ADDED
File without changes
tmp/11308d03-3ab0-43b0-9f06-64b71c4140c1_params.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_lsht',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
8
+ ),
9
+ pred_postprocessor=dict(
10
+ type='opencompass.datasets.lsht_postprocess'),
11
+ pred_role='BOT'),
12
+ infer_cfg=dict(
13
+ inferencer=dict(
14
+ max_out_len=64,
15
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
16
+ prompt_template=dict(
17
+ template=dict(round=[
18
+ dict(
19
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
20
+ role='HUMAN'),
21
+ ]),
22
+ type=
23
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
24
+ retriever=dict(
25
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
26
+ name='lsht',
27
+ path='opencompass/Longbench',
28
+ reader_cfg=dict(
29
+ input_columns=[
30
+ 'context',
31
+ 'input',
32
+ ],
33
+ output_column='all_labels',
34
+ test_split='test',
35
+ train_split='test'),
36
+ type='opencompass.datasets.LongBenchlshtDataset'),
37
+ ],
38
+ ]
39
+ eval = dict(runner=dict(task=dict(dump_details=True)))
40
+ models = [
41
+ dict(
42
+ abbr='retnet',
43
+ batch_size=128,
44
+ max_seq_len=2048,
45
+ model_kwargs=dict(
46
+ device_map='auto',
47
+ torch_dtype='torch.bfloat16',
48
+ trust_remote_code=True),
49
+ path='/mnt/jfzn/msj/retnet-1.3B-100B',
50
+ run_cfg=dict(num_gpus=1),
51
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
52
+ tokenizer_path='/mnt/jfzn/msj/retnet-1.3B-100B',
53
+ type='opencompass.models.HuggingFaceBaseModel'),
54
+ ]
55
+ work_dir = 'outputs/default/20251207_222645'
tmp/1405e46f-8be4-462d-a794-3b47ef9839c2_params.py ADDED
@@ -0,0 +1,1424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets = [
2
+ [
3
+ dict(
4
+ abbr='LongBench_2wikimqa_4',
5
+ eval_cfg=dict(
6
+ evaluator=dict(
7
+ type='opencompass.datasets.LongBenchF1Evaluator'),
8
+ pred_role='BOT'),
9
+ infer_cfg=dict(
10
+ inferencer=dict(
11
+ max_out_len=32,
12
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
13
+ prompt_template=dict(
14
+ template=dict(round=[
15
+ dict(
16
+ prompt=
17
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
18
+ role='HUMAN'),
19
+ ]),
20
+ type=
21
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
22
+ retriever=dict(
23
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
24
+ name='2wikimqa',
25
+ path='opencompass/Longbench',
26
+ reader_cfg=dict(
27
+ input_columns=[
28
+ 'context',
29
+ 'input',
30
+ ],
31
+ output_column='answers',
32
+ test_range='[100:125]',
33
+ test_split='test',
34
+ train_split='test'),
35
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
36
+ dict(
37
+ abbr='LongBench_hotpotqa_4',
38
+ eval_cfg=dict(
39
+ evaluator=dict(
40
+ type='opencompass.datasets.LongBenchF1Evaluator'),
41
+ pred_role='BOT'),
42
+ infer_cfg=dict(
43
+ inferencer=dict(
44
+ max_out_len=32,
45
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
46
+ prompt_template=dict(
47
+ template=dict(round=[
48
+ dict(
49
+ prompt=
50
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
51
+ role='HUMAN'),
52
+ ]),
53
+ type=
54
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
55
+ retriever=dict(
56
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
57
+ name='hotpotqa',
58
+ path='opencompass/Longbench',
59
+ reader_cfg=dict(
60
+ input_columns=[
61
+ 'context',
62
+ 'input',
63
+ ],
64
+ output_column='answers',
65
+ test_range='[100:125]',
66
+ test_split='test',
67
+ train_split='test'),
68
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
69
+ dict(
70
+ abbr='LongBench_musique_4',
71
+ eval_cfg=dict(
72
+ evaluator=dict(
73
+ type='opencompass.datasets.LongBenchF1Evaluator'),
74
+ pred_role='BOT'),
75
+ infer_cfg=dict(
76
+ inferencer=dict(
77
+ max_out_len=32,
78
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
79
+ prompt_template=dict(
80
+ template=dict(round=[
81
+ dict(
82
+ prompt=
83
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
84
+ role='HUMAN'),
85
+ ]),
86
+ type=
87
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
88
+ retriever=dict(
89
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
90
+ name='musique',
91
+ path='opencompass/Longbench',
92
+ reader_cfg=dict(
93
+ input_columns=[
94
+ 'context',
95
+ 'input',
96
+ ],
97
+ output_column='answers',
98
+ test_range='[100:125]',
99
+ test_split='test',
100
+ train_split='test'),
101
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
102
+ dict(
103
+ abbr='LongBench_multifieldqa_en_4',
104
+ eval_cfg=dict(
105
+ evaluator=dict(
106
+ type='opencompass.datasets.LongBenchF1Evaluator'),
107
+ pred_role='BOT'),
108
+ infer_cfg=dict(
109
+ inferencer=dict(
110
+ max_out_len=64,
111
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
112
+ prompt_template=dict(
113
+ template=dict(round=[
114
+ dict(
115
+ prompt=
116
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
117
+ role='HUMAN'),
118
+ ]),
119
+ type=
120
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
121
+ retriever=dict(
122
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
123
+ name='multifieldqa_en',
124
+ path='opencompass/Longbench',
125
+ reader_cfg=dict(
126
+ input_columns=[
127
+ 'context',
128
+ 'input',
129
+ ],
130
+ output_column='answers',
131
+ test_range='[76:95]',
132
+ test_split='test',
133
+ train_split='test'),
134
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
135
+ dict(
136
+ abbr='LongBench_multifieldqa_zh_4',
137
+ eval_cfg=dict(
138
+ evaluator=dict(
139
+ language='zh',
140
+ type='opencompass.datasets.LongBenchF1Evaluator'),
141
+ pred_role='BOT'),
142
+ infer_cfg=dict(
143
+ inferencer=dict(
144
+ max_out_len=64,
145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
146
+ prompt_template=dict(
147
+ template=dict(round=[
148
+ dict(
149
+ prompt=
150
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
151
+ role='HUMAN'),
152
+ ]),
153
+ type=
154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
155
+ retriever=dict(
156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
157
+ name='multifieldqa_zh',
158
+ path='opencompass/Longbench',
159
+ reader_cfg=dict(
160
+ input_columns=[
161
+ 'context',
162
+ 'input',
163
+ ],
164
+ output_column='answers',
165
+ test_range='[100:125]',
166
+ test_split='test',
167
+ train_split='test'),
168
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
169
+ dict(
170
+ abbr='LongBench_narrativeqa_4',
171
+ eval_cfg=dict(
172
+ evaluator=dict(
173
+ type='opencompass.datasets.LongBenchF1Evaluator'),
174
+ pred_role='BOT'),
175
+ infer_cfg=dict(
176
+ inferencer=dict(
177
+ max_out_len=128,
178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
179
+ prompt_template=dict(
180
+ template=dict(round=[
181
+ dict(
182
+ prompt=
183
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
184
+ role='HUMAN'),
185
+ ]),
186
+ type=
187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
188
+ retriever=dict(
189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
190
+ name='narrativeqa',
191
+ path='opencompass/Longbench',
192
+ reader_cfg=dict(
193
+ input_columns=[
194
+ 'context',
195
+ 'input',
196
+ ],
197
+ output_column='answers',
198
+ test_range='[100:125]',
199
+ test_split='test',
200
+ train_split='test'),
201
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
202
+ dict(
203
+ abbr='LongBench_qasper_4',
204
+ eval_cfg=dict(
205
+ evaluator=dict(
206
+ type='opencompass.datasets.LongBenchF1Evaluator'),
207
+ pred_role='BOT'),
208
+ infer_cfg=dict(
209
+ inferencer=dict(
210
+ max_out_len=32,
211
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
212
+ prompt_template=dict(
213
+ template=dict(round=[
214
+ dict(
215
+ prompt=
216
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
217
+ role='HUMAN'),
218
+ ]),
219
+ type=
220
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
221
+ retriever=dict(
222
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
223
+ name='qasper',
224
+ path='opencompass/Longbench',
225
+ reader_cfg=dict(
226
+ input_columns=[
227
+ 'context',
228
+ 'input',
229
+ ],
230
+ output_column='answers',
231
+ test_range='[100:125]',
232
+ test_split='test',
233
+ train_split='test'),
234
+ type='opencompass.datasets.LongBenchqasperDataset'),
235
+ dict(
236
+ abbr='LongBench_triviaqa_4',
237
+ eval_cfg=dict(
238
+ evaluator=dict(
239
+ type='opencompass.datasets.LongBenchF1Evaluator'),
240
+ pred_postprocessor=dict(
241
+ type='opencompass.datasets.triviaqa_postprocess'),
242
+ pred_role='BOT'),
243
+ infer_cfg=dict(
244
+ inferencer=dict(
245
+ max_out_len=32,
246
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
247
+ prompt_template=dict(
248
+ template=dict(round=[
249
+ dict(
250
+ prompt=
251
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
252
+ role='HUMAN'),
253
+ ]),
254
+ type=
255
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
256
+ retriever=dict(
257
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
258
+ name='triviaqa',
259
+ path='opencompass/Longbench',
260
+ reader_cfg=dict(
261
+ input_columns=[
262
+ 'context',
263
+ 'input',
264
+ ],
265
+ output_column='answers',
266
+ test_range='[100:125]',
267
+ test_split='test',
268
+ train_split='test'),
269
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
270
+ dict(
271
+ abbr='LongBench_gov_report_4',
272
+ eval_cfg=dict(
273
+ evaluator=dict(
274
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
275
+ pred_role='BOT'),
276
+ infer_cfg=dict(
277
+ inferencer=dict(
278
+ max_out_len=512,
279
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
280
+ prompt_template=dict(
281
+ template=dict(round=[
282
+ dict(
283
+ prompt=
284
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
285
+ role='HUMAN'),
286
+ ]),
287
+ type=
288
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
289
+ retriever=dict(
290
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
291
+ name='gov_report',
292
+ path='opencompass/Longbench',
293
+ reader_cfg=dict(
294
+ input_columns=[
295
+ 'context',
296
+ ],
297
+ output_column='answers',
298
+ test_range='[100:125]',
299
+ test_split='test',
300
+ train_split='test'),
301
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
302
+ dict(
303
+ abbr='LongBench_qmsum_4',
304
+ eval_cfg=dict(
305
+ evaluator=dict(
306
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
307
+ pred_role='BOT'),
308
+ infer_cfg=dict(
309
+ inferencer=dict(
310
+ max_out_len=512,
311
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
312
+ prompt_template=dict(
313
+ template=dict(round=[
314
+ dict(
315
+ prompt=
316
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
317
+ role='HUMAN'),
318
+ ]),
319
+ type=
320
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
321
+ retriever=dict(
322
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
323
+ name='qmsum',
324
+ path='opencompass/Longbench',
325
+ reader_cfg=dict(
326
+ input_columns=[
327
+ 'context',
328
+ 'input',
329
+ ],
330
+ output_column='answers',
331
+ test_range='[100:125]',
332
+ test_split='test',
333
+ train_split='test'),
334
+ type='opencompass.datasets.LongBenchqmsumDataset'),
335
+ dict(
336
+ abbr='LongBench_vcsum_4',
337
+ eval_cfg=dict(
338
+ evaluator=dict(
339
+ language='zh',
340
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
341
+ pred_role='BOT'),
342
+ infer_cfg=dict(
343
+ inferencer=dict(
344
+ max_out_len=512,
345
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
346
+ prompt_template=dict(
347
+ template=dict(round=[
348
+ dict(
349
+ prompt=
350
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
351
+ role='HUMAN'),
352
+ ]),
353
+ type=
354
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
355
+ retriever=dict(
356
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
357
+ name='vcsum',
358
+ path='opencompass/Longbench',
359
+ reader_cfg=dict(
360
+ input_columns=[
361
+ 'context',
362
+ ],
363
+ output_column='answers',
364
+ test_range='[100:125]',
365
+ test_split='test',
366
+ train_split='test'),
367
+ type='opencompass.datasets.LongBenchvcsumDataset'),
368
+ dict(
369
+ abbr='LongBench_dureader_4',
370
+ eval_cfg=dict(
371
+ evaluator=dict(
372
+ language='zh',
373
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
374
+ pred_role='BOT'),
375
+ infer_cfg=dict(
376
+ inferencer=dict(
377
+ max_out_len=128,
378
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
379
+ prompt_template=dict(
380
+ template=dict(round=[
381
+ dict(
382
+ prompt=
383
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
384
+ role='HUMAN'),
385
+ ]),
386
+ type=
387
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
388
+ retriever=dict(
389
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
390
+ name='dureader',
391
+ path='opencompass/Longbench',
392
+ reader_cfg=dict(
393
+ input_columns=[
394
+ 'context',
395
+ 'input',
396
+ ],
397
+ output_column='answers',
398
+ test_range='[100:125]',
399
+ test_split='test',
400
+ train_split='test'),
401
+ type='opencompass.datasets.LongBenchdureaderDataset'),
402
+ dict(
403
+ abbr='LongBench_lcc_4',
404
+ eval_cfg=dict(
405
+ evaluator=dict(
406
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
407
+ pred_role='BOT'),
408
+ infer_cfg=dict(
409
+ inferencer=dict(
410
+ max_out_len=64,
411
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
412
+ prompt_template=dict(
413
+ template=dict(round=[
414
+ dict(
415
+ prompt=
416
+ 'Please complete the code given below. \n{context}Next line of code:\n',
417
+ role='HUMAN'),
418
+ ]),
419
+ type=
420
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
421
+ retriever=dict(
422
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
423
+ name='lcc',
424
+ path='opencompass/Longbench',
425
+ reader_cfg=dict(
426
+ input_columns=[
427
+ 'context',
428
+ ],
429
+ output_column='answers',
430
+ test_range='[252:315]',
431
+ test_split='test',
432
+ train_split='test'),
433
+ type='opencompass.datasets.LongBenchlccDataset'),
434
+ dict(
435
+ abbr='LongBench_repobench-p_4',
436
+ eval_cfg=dict(
437
+ evaluator=dict(
438
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
439
+ pred_role='BOT'),
440
+ infer_cfg=dict(
441
+ inferencer=dict(
442
+ max_out_len=64,
443
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
444
+ prompt_template=dict(
445
+ template=dict(round=[
446
+ dict(
447
+ prompt=
448
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
449
+ role='HUMAN'),
450
+ ]),
451
+ type=
452
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
453
+ retriever=dict(
454
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
455
+ name='repobench-p',
456
+ path='opencompass/Longbench',
457
+ reader_cfg=dict(
458
+ input_columns=[
459
+ 'context',
460
+ 'input',
461
+ ],
462
+ output_column='answers',
463
+ test_range='[252:315]',
464
+ test_split='test',
465
+ train_split='test'),
466
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
467
+ dict(
468
+ abbr='LongBench_passage_retrieval_en_4',
469
+ eval_cfg=dict(
470
+ evaluator=dict(
471
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
472
+ pred_role='BOT'),
473
+ infer_cfg=dict(
474
+ inferencer=dict(
475
+ max_out_len=32,
476
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
477
+ prompt_template=dict(
478
+ template=dict(round=[
479
+ dict(
480
+ prompt=
481
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
482
+ role='HUMAN'),
483
+ ]),
484
+ type=
485
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
486
+ retriever=dict(
487
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
488
+ name='passage_retrieval_en',
489
+ path='opencompass/Longbench',
490
+ reader_cfg=dict(
491
+ input_columns=[
492
+ 'context',
493
+ 'input',
494
+ ],
495
+ output_column='answers',
496
+ test_range='[100:125]',
497
+ test_split='test',
498
+ train_split='test'),
499
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
500
+ dict(
501
+ abbr='LongBench_passage_retrieval_zh_4',
502
+ eval_cfg=dict(
503
+ evaluator=dict(
504
+ language='zh',
505
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
506
+ pred_role='BOT'),
507
+ infer_cfg=dict(
508
+ inferencer=dict(
509
+ max_out_len=32,
510
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
511
+ prompt_template=dict(
512
+ template=dict(round=[
513
+ dict(
514
+ prompt=
515
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
516
+ role='HUMAN'),
517
+ ]),
518
+ type=
519
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
520
+ retriever=dict(
521
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
522
+ name='passage_retrieval_zh',
523
+ path='opencompass/Longbench',
524
+ reader_cfg=dict(
525
+ input_columns=[
526
+ 'context',
527
+ 'input',
528
+ ],
529
+ output_column='answers',
530
+ test_range='[100:125]',
531
+ test_split='test',
532
+ train_split='test'),
533
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
534
+ dict(
535
+ abbr='LongBench_passage_count_4',
536
+ eval_cfg=dict(
537
+ evaluator=dict(
538
+ type='opencompass.datasets.LongBenchCountEvaluator'),
539
+ pred_role='BOT'),
540
+ infer_cfg=dict(
541
+ inferencer=dict(
542
+ max_out_len=32,
543
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
544
+ prompt_template=dict(
545
+ template=dict(round=[
546
+ dict(
547
+ prompt=
548
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
549
+ role='HUMAN'),
550
+ ]),
551
+ type=
552
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
553
+ retriever=dict(
554
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
555
+ name='passage_count',
556
+ path='opencompass/Longbench',
557
+ reader_cfg=dict(
558
+ input_columns=[
559
+ 'context',
560
+ 'input',
561
+ ],
562
+ output_column='answers',
563
+ test_range='[100:125]',
564
+ test_split='test',
565
+ train_split='test'),
566
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
567
+ dict(
568
+ abbr='LongBench_trec_4',
569
+ eval_cfg=dict(
570
+ evaluator=dict(
571
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
572
+ ),
573
+ pred_postprocessor=dict(
574
+ type='opencompass.datasets.trec_postprocess'),
575
+ pred_role='BOT'),
576
+ infer_cfg=dict(
577
+ inferencer=dict(
578
+ max_out_len=64,
579
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
580
+ prompt_template=dict(
581
+ template=dict(round=[
582
+ dict(
583
+ prompt=
584
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
585
+ role='HUMAN'),
586
+ ]),
587
+ type=
588
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
589
+ retriever=dict(
590
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
591
+ name='trec',
592
+ path='opencompass/Longbench',
593
+ reader_cfg=dict(
594
+ input_columns=[
595
+ 'context',
596
+ 'input',
597
+ ],
598
+ output_column='all_labels',
599
+ test_range='[100:125]',
600
+ test_split='test',
601
+ train_split='test'),
602
+ type='opencompass.datasets.LongBenchtrecDataset'),
603
+ dict(
604
+ abbr='LongBench_lsht_4',
605
+ eval_cfg=dict(
606
+ evaluator=dict(
607
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
608
+ ),
609
+ pred_postprocessor=dict(
610
+ type='opencompass.datasets.lsht_postprocess'),
611
+ pred_role='BOT'),
612
+ infer_cfg=dict(
613
+ inferencer=dict(
614
+ max_out_len=64,
615
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
616
+ prompt_template=dict(
617
+ template=dict(round=[
618
+ dict(
619
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
620
+ role='HUMAN'),
621
+ ]),
622
+ type=
623
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
624
+ retriever=dict(
625
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
626
+ name='lsht',
627
+ path='opencompass/Longbench',
628
+ reader_cfg=dict(
629
+ input_columns=[
630
+ 'context',
631
+ 'input',
632
+ ],
633
+ output_column='all_labels',
634
+ test_range='[100:125]',
635
+ test_split='test',
636
+ train_split='test'),
637
+ type='opencompass.datasets.LongBenchlshtDataset'),
638
+ dict(
639
+ abbr='LongBench_multi_news_4',
640
+ eval_cfg=dict(
641
+ evaluator=dict(
642
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
643
+ pred_role='BOT'),
644
+ infer_cfg=dict(
645
+ inferencer=dict(
646
+ max_out_len=512,
647
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
648
+ prompt_template=dict(
649
+ template=dict(round=[
650
+ dict(
651
+ prompt=
652
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
653
+ role='HUMAN'),
654
+ ]),
655
+ type=
656
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
657
+ retriever=dict(
658
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
659
+ name='multi_news',
660
+ path='opencompass/Longbench',
661
+ reader_cfg=dict(
662
+ input_columns=[
663
+ 'context',
664
+ ],
665
+ output_column='answers',
666
+ test_range='[100:125]',
667
+ test_split='test',
668
+ train_split='test'),
669
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
670
+ dict(
671
+ abbr='LongBench_samsum_4',
672
+ eval_cfg=dict(
673
+ evaluator=dict(
674
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
675
+ pred_postprocessor=dict(
676
+ type='opencompass.datasets.samsum_postprocess'),
677
+ pred_role='BOT'),
678
+ infer_cfg=dict(
679
+ inferencer=dict(
680
+ max_out_len=128,
681
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
682
+ prompt_template=dict(
683
+ template=dict(round=[
684
+ dict(
685
+ prompt=
686
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
687
+ role='HUMAN'),
688
+ ]),
689
+ type=
690
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
691
+ retriever=dict(
692
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
693
+ name='samsum',
694
+ path='opencompass/Longbench',
695
+ reader_cfg=dict(
696
+ input_columns=[
697
+ 'context',
698
+ 'input',
699
+ ],
700
+ output_column='answers',
701
+ test_range='[100:125]',
702
+ test_split='test',
703
+ train_split='test'),
704
+ type='opencompass.datasets.LongBenchsamsumDataset'),
705
+ dict(
706
+ abbr='LongBench_2wikimqa_4',
707
+ eval_cfg=dict(
708
+ evaluator=dict(
709
+ type='opencompass.datasets.LongBenchF1Evaluator'),
710
+ pred_role='BOT'),
711
+ infer_cfg=dict(
712
+ inferencer=dict(
713
+ max_out_len=32,
714
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
715
+ prompt_template=dict(
716
+ template=dict(round=[
717
+ dict(
718
+ prompt=
719
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
720
+ role='HUMAN'),
721
+ ]),
722
+ type=
723
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
724
+ retriever=dict(
725
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
726
+ name='2wikimqa',
727
+ path='opencompass/Longbench',
728
+ reader_cfg=dict(
729
+ input_columns=[
730
+ 'context',
731
+ 'input',
732
+ ],
733
+ output_column='answers',
734
+ test_range='[100:125]',
735
+ test_split='test',
736
+ train_split='test'),
737
+ type='opencompass.datasets.LongBench2wikimqaDataset'),
738
+ dict(
739
+ abbr='LongBench_hotpotqa_4',
740
+ eval_cfg=dict(
741
+ evaluator=dict(
742
+ type='opencompass.datasets.LongBenchF1Evaluator'),
743
+ pred_role='BOT'),
744
+ infer_cfg=dict(
745
+ inferencer=dict(
746
+ max_out_len=32,
747
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
748
+ prompt_template=dict(
749
+ template=dict(round=[
750
+ dict(
751
+ prompt=
752
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
753
+ role='HUMAN'),
754
+ ]),
755
+ type=
756
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
757
+ retriever=dict(
758
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
759
+ name='hotpotqa',
760
+ path='opencompass/Longbench',
761
+ reader_cfg=dict(
762
+ input_columns=[
763
+ 'context',
764
+ 'input',
765
+ ],
766
+ output_column='answers',
767
+ test_range='[100:125]',
768
+ test_split='test',
769
+ train_split='test'),
770
+ type='opencompass.datasets.LongBenchhotpotqaDataset'),
771
+ dict(
772
+ abbr='LongBench_musique_4',
773
+ eval_cfg=dict(
774
+ evaluator=dict(
775
+ type='opencompass.datasets.LongBenchF1Evaluator'),
776
+ pred_role='BOT'),
777
+ infer_cfg=dict(
778
+ inferencer=dict(
779
+ max_out_len=32,
780
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
781
+ prompt_template=dict(
782
+ template=dict(round=[
783
+ dict(
784
+ prompt=
785
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
786
+ role='HUMAN'),
787
+ ]),
788
+ type=
789
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
790
+ retriever=dict(
791
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
792
+ name='musique',
793
+ path='opencompass/Longbench',
794
+ reader_cfg=dict(
795
+ input_columns=[
796
+ 'context',
797
+ 'input',
798
+ ],
799
+ output_column='answers',
800
+ test_range='[100:125]',
801
+ test_split='test',
802
+ train_split='test'),
803
+ type='opencompass.datasets.LongBenchmusiqueDataset'),
804
+ dict(
805
+ abbr='LongBench_multifieldqa_en_4',
806
+ eval_cfg=dict(
807
+ evaluator=dict(
808
+ type='opencompass.datasets.LongBenchF1Evaluator'),
809
+ pred_role='BOT'),
810
+ infer_cfg=dict(
811
+ inferencer=dict(
812
+ max_out_len=64,
813
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
814
+ prompt_template=dict(
815
+ template=dict(round=[
816
+ dict(
817
+ prompt=
818
+ 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
819
+ role='HUMAN'),
820
+ ]),
821
+ type=
822
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
823
+ retriever=dict(
824
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
825
+ name='multifieldqa_en',
826
+ path='opencompass/Longbench',
827
+ reader_cfg=dict(
828
+ input_columns=[
829
+ 'context',
830
+ 'input',
831
+ ],
832
+ output_column='answers',
833
+ test_range='[76:95]',
834
+ test_split='test',
835
+ train_split='test'),
836
+ type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
837
+ dict(
838
+ abbr='LongBench_multifieldqa_zh_4',
839
+ eval_cfg=dict(
840
+ evaluator=dict(
841
+ language='zh',
842
+ type='opencompass.datasets.LongBenchF1Evaluator'),
843
+ pred_role='BOT'),
844
+ infer_cfg=dict(
845
+ inferencer=dict(
846
+ max_out_len=64,
847
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
848
+ prompt_template=dict(
849
+ template=dict(round=[
850
+ dict(
851
+ prompt=
852
+ '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
853
+ role='HUMAN'),
854
+ ]),
855
+ type=
856
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
857
+ retriever=dict(
858
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
859
+ name='multifieldqa_zh',
860
+ path='opencompass/Longbench',
861
+ reader_cfg=dict(
862
+ input_columns=[
863
+ 'context',
864
+ 'input',
865
+ ],
866
+ output_column='answers',
867
+ test_range='[100:125]',
868
+ test_split='test',
869
+ train_split='test'),
870
+ type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
871
+ dict(
872
+ abbr='LongBench_narrativeqa_4',
873
+ eval_cfg=dict(
874
+ evaluator=dict(
875
+ type='opencompass.datasets.LongBenchF1Evaluator'),
876
+ pred_role='BOT'),
877
+ infer_cfg=dict(
878
+ inferencer=dict(
879
+ max_out_len=128,
880
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
881
+ prompt_template=dict(
882
+ template=dict(round=[
883
+ dict(
884
+ prompt=
885
+ 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
886
+ role='HUMAN'),
887
+ ]),
888
+ type=
889
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
890
+ retriever=dict(
891
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
892
+ name='narrativeqa',
893
+ path='opencompass/Longbench',
894
+ reader_cfg=dict(
895
+ input_columns=[
896
+ 'context',
897
+ 'input',
898
+ ],
899
+ output_column='answers',
900
+ test_range='[100:125]',
901
+ test_split='test',
902
+ train_split='test'),
903
+ type='opencompass.datasets.LongBenchnarrativeqaDataset'),
904
+ dict(
905
+ abbr='LongBench_qasper_4',
906
+ eval_cfg=dict(
907
+ evaluator=dict(
908
+ type='opencompass.datasets.LongBenchF1Evaluator'),
909
+ pred_role='BOT'),
910
+ infer_cfg=dict(
911
+ inferencer=dict(
912
+ max_out_len=32,
913
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
914
+ prompt_template=dict(
915
+ template=dict(round=[
916
+ dict(
917
+ prompt=
918
+ 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
919
+ role='HUMAN'),
920
+ ]),
921
+ type=
922
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
923
+ retriever=dict(
924
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
925
+ name='qasper',
926
+ path='opencompass/Longbench',
927
+ reader_cfg=dict(
928
+ input_columns=[
929
+ 'context',
930
+ 'input',
931
+ ],
932
+ output_column='answers',
933
+ test_range='[100:125]',
934
+ test_split='test',
935
+ train_split='test'),
936
+ type='opencompass.datasets.LongBenchqasperDataset'),
937
+ dict(
938
+ abbr='LongBench_triviaqa_4',
939
+ eval_cfg=dict(
940
+ evaluator=dict(
941
+ type='opencompass.datasets.LongBenchF1Evaluator'),
942
+ pred_postprocessor=dict(
943
+ type='opencompass.datasets.triviaqa_postprocess'),
944
+ pred_role='BOT'),
945
+ infer_cfg=dict(
946
+ inferencer=dict(
947
+ max_out_len=32,
948
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
949
+ prompt_template=dict(
950
+ template=dict(round=[
951
+ dict(
952
+ prompt=
953
+ 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
954
+ role='HUMAN'),
955
+ ]),
956
+ type=
957
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
958
+ retriever=dict(
959
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
960
+ name='triviaqa',
961
+ path='opencompass/Longbench',
962
+ reader_cfg=dict(
963
+ input_columns=[
964
+ 'context',
965
+ 'input',
966
+ ],
967
+ output_column='answers',
968
+ test_range='[100:125]',
969
+ test_split='test',
970
+ train_split='test'),
971
+ type='opencompass.datasets.LongBenchtriviaqaDataset'),
972
+ dict(
973
+ abbr='LongBench_gov_report_4',
974
+ eval_cfg=dict(
975
+ evaluator=dict(
976
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
977
+ pred_role='BOT'),
978
+ infer_cfg=dict(
979
+ inferencer=dict(
980
+ max_out_len=512,
981
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
982
+ prompt_template=dict(
983
+ template=dict(round=[
984
+ dict(
985
+ prompt=
986
+ 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
987
+ role='HUMAN'),
988
+ ]),
989
+ type=
990
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
991
+ retriever=dict(
992
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
993
+ name='gov_report',
994
+ path='opencompass/Longbench',
995
+ reader_cfg=dict(
996
+ input_columns=[
997
+ 'context',
998
+ ],
999
+ output_column='answers',
1000
+ test_range='[100:125]',
1001
+ test_split='test',
1002
+ train_split='test'),
1003
+ type='opencompass.datasets.LongBenchgov_reportDataset'),
1004
+ dict(
1005
+ abbr='LongBench_qmsum_4',
1006
+ eval_cfg=dict(
1007
+ evaluator=dict(
1008
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1009
+ pred_role='BOT'),
1010
+ infer_cfg=dict(
1011
+ inferencer=dict(
1012
+ max_out_len=512,
1013
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1014
+ prompt_template=dict(
1015
+ template=dict(round=[
1016
+ dict(
1017
+ prompt=
1018
+ 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
1019
+ role='HUMAN'),
1020
+ ]),
1021
+ type=
1022
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1023
+ retriever=dict(
1024
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1025
+ name='qmsum',
1026
+ path='opencompass/Longbench',
1027
+ reader_cfg=dict(
1028
+ input_columns=[
1029
+ 'context',
1030
+ 'input',
1031
+ ],
1032
+ output_column='answers',
1033
+ test_range='[100:125]',
1034
+ test_split='test',
1035
+ train_split='test'),
1036
+ type='opencompass.datasets.LongBenchqmsumDataset'),
1037
+ dict(
1038
+ abbr='LongBench_vcsum_4',
1039
+ eval_cfg=dict(
1040
+ evaluator=dict(
1041
+ language='zh',
1042
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1043
+ pred_role='BOT'),
1044
+ infer_cfg=dict(
1045
+ inferencer=dict(
1046
+ max_out_len=512,
1047
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1048
+ prompt_template=dict(
1049
+ template=dict(round=[
1050
+ dict(
1051
+ prompt=
1052
+ '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
1053
+ role='HUMAN'),
1054
+ ]),
1055
+ type=
1056
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1057
+ retriever=dict(
1058
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1059
+ name='vcsum',
1060
+ path='opencompass/Longbench',
1061
+ reader_cfg=dict(
1062
+ input_columns=[
1063
+ 'context',
1064
+ ],
1065
+ output_column='answers',
1066
+ test_range='[100:125]',
1067
+ test_split='test',
1068
+ train_split='test'),
1069
+ type='opencompass.datasets.LongBenchvcsumDataset'),
1070
+ dict(
1071
+ abbr='LongBench_dureader_4',
1072
+ eval_cfg=dict(
1073
+ evaluator=dict(
1074
+ language='zh',
1075
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1076
+ pred_role='BOT'),
1077
+ infer_cfg=dict(
1078
+ inferencer=dict(
1079
+ max_out_len=128,
1080
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1081
+ prompt_template=dict(
1082
+ template=dict(round=[
1083
+ dict(
1084
+ prompt=
1085
+ '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
1086
+ role='HUMAN'),
1087
+ ]),
1088
+ type=
1089
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1090
+ retriever=dict(
1091
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1092
+ name='dureader',
1093
+ path='opencompass/Longbench',
1094
+ reader_cfg=dict(
1095
+ input_columns=[
1096
+ 'context',
1097
+ 'input',
1098
+ ],
1099
+ output_column='answers',
1100
+ test_range='[100:125]',
1101
+ test_split='test',
1102
+ train_split='test'),
1103
+ type='opencompass.datasets.LongBenchdureaderDataset'),
1104
+ dict(
1105
+ abbr='LongBench_lcc_4',
1106
+ eval_cfg=dict(
1107
+ evaluator=dict(
1108
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1109
+ pred_role='BOT'),
1110
+ infer_cfg=dict(
1111
+ inferencer=dict(
1112
+ max_out_len=64,
1113
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1114
+ prompt_template=dict(
1115
+ template=dict(round=[
1116
+ dict(
1117
+ prompt=
1118
+ 'Please complete the code given below. \n{context}Next line of code:\n',
1119
+ role='HUMAN'),
1120
+ ]),
1121
+ type=
1122
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1123
+ retriever=dict(
1124
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1125
+ name='lcc',
1126
+ path='opencompass/Longbench',
1127
+ reader_cfg=dict(
1128
+ input_columns=[
1129
+ 'context',
1130
+ ],
1131
+ output_column='answers',
1132
+ test_range='[252:315]',
1133
+ test_split='test',
1134
+ train_split='test'),
1135
+ type='opencompass.datasets.LongBenchlccDataset'),
1136
+ dict(
1137
+ abbr='LongBench_repobench-p_4',
1138
+ eval_cfg=dict(
1139
+ evaluator=dict(
1140
+ type='opencompass.datasets.LongBenchCodeSimEvaluator'),
1141
+ pred_role='BOT'),
1142
+ infer_cfg=dict(
1143
+ inferencer=dict(
1144
+ max_out_len=64,
1145
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1146
+ prompt_template=dict(
1147
+ template=dict(round=[
1148
+ dict(
1149
+ prompt=
1150
+ 'Please complete the code given below. \n{context}{input}Next line of code:\n',
1151
+ role='HUMAN'),
1152
+ ]),
1153
+ type=
1154
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1155
+ retriever=dict(
1156
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1157
+ name='repobench-p',
1158
+ path='opencompass/Longbench',
1159
+ reader_cfg=dict(
1160
+ input_columns=[
1161
+ 'context',
1162
+ 'input',
1163
+ ],
1164
+ output_column='answers',
1165
+ test_range='[252:315]',
1166
+ test_split='test',
1167
+ train_split='test'),
1168
+ type='opencompass.datasets.LongBenchrepobenchDataset'),
1169
+ dict(
1170
+ abbr='LongBench_passage_retrieval_en_4',
1171
+ eval_cfg=dict(
1172
+ evaluator=dict(
1173
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1174
+ pred_role='BOT'),
1175
+ infer_cfg=dict(
1176
+ inferencer=dict(
1177
+ max_out_len=32,
1178
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1179
+ prompt_template=dict(
1180
+ template=dict(round=[
1181
+ dict(
1182
+ prompt=
1183
+ 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
1184
+ role='HUMAN'),
1185
+ ]),
1186
+ type=
1187
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1188
+ retriever=dict(
1189
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1190
+ name='passage_retrieval_en',
1191
+ path='opencompass/Longbench',
1192
+ reader_cfg=dict(
1193
+ input_columns=[
1194
+ 'context',
1195
+ 'input',
1196
+ ],
1197
+ output_column='answers',
1198
+ test_range='[100:125]',
1199
+ test_split='test',
1200
+ train_split='test'),
1201
+ type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
1202
+ dict(
1203
+ abbr='LongBench_passage_retrieval_zh_4',
1204
+ eval_cfg=dict(
1205
+ evaluator=dict(
1206
+ language='zh',
1207
+ type='opencompass.datasets.LongBenchRetrievalEvaluator'),
1208
+ pred_role='BOT'),
1209
+ infer_cfg=dict(
1210
+ inferencer=dict(
1211
+ max_out_len=32,
1212
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1213
+ prompt_template=dict(
1214
+ template=dict(round=[
1215
+ dict(
1216
+ prompt=
1217
+ '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
1218
+ role='HUMAN'),
1219
+ ]),
1220
+ type=
1221
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1222
+ retriever=dict(
1223
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1224
+ name='passage_retrieval_zh',
1225
+ path='opencompass/Longbench',
1226
+ reader_cfg=dict(
1227
+ input_columns=[
1228
+ 'context',
1229
+ 'input',
1230
+ ],
1231
+ output_column='answers',
1232
+ test_range='[100:125]',
1233
+ test_split='test',
1234
+ train_split='test'),
1235
+ type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
1236
+ dict(
1237
+ abbr='LongBench_passage_count_4',
1238
+ eval_cfg=dict(
1239
+ evaluator=dict(
1240
+ type='opencompass.datasets.LongBenchCountEvaluator'),
1241
+ pred_role='BOT'),
1242
+ infer_cfg=dict(
1243
+ inferencer=dict(
1244
+ max_out_len=32,
1245
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1246
+ prompt_template=dict(
1247
+ template=dict(round=[
1248
+ dict(
1249
+ prompt=
1250
+ 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
1251
+ role='HUMAN'),
1252
+ ]),
1253
+ type=
1254
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1255
+ retriever=dict(
1256
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1257
+ name='passage_count',
1258
+ path='opencompass/Longbench',
1259
+ reader_cfg=dict(
1260
+ input_columns=[
1261
+ 'context',
1262
+ 'input',
1263
+ ],
1264
+ output_column='answers',
1265
+ test_range='[100:125]',
1266
+ test_split='test',
1267
+ train_split='test'),
1268
+ type='opencompass.datasets.LongBenchpassage_countDataset'),
1269
+ dict(
1270
+ abbr='LongBench_trec_4',
1271
+ eval_cfg=dict(
1272
+ evaluator=dict(
1273
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1274
+ ),
1275
+ pred_postprocessor=dict(
1276
+ type='opencompass.datasets.trec_postprocess'),
1277
+ pred_role='BOT'),
1278
+ infer_cfg=dict(
1279
+ inferencer=dict(
1280
+ max_out_len=64,
1281
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1282
+ prompt_template=dict(
1283
+ template=dict(round=[
1284
+ dict(
1285
+ prompt=
1286
+ 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
1287
+ role='HUMAN'),
1288
+ ]),
1289
+ type=
1290
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1291
+ retriever=dict(
1292
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1293
+ name='trec',
1294
+ path='opencompass/Longbench',
1295
+ reader_cfg=dict(
1296
+ input_columns=[
1297
+ 'context',
1298
+ 'input',
1299
+ ],
1300
+ output_column='all_labels',
1301
+ test_range='[100:125]',
1302
+ test_split='test',
1303
+ train_split='test'),
1304
+ type='opencompass.datasets.LongBenchtrecDataset'),
1305
+ dict(
1306
+ abbr='LongBench_lsht_4',
1307
+ eval_cfg=dict(
1308
+ evaluator=dict(
1309
+ type='opencompass.datasets.LongBenchClassificationEvaluator'
1310
+ ),
1311
+ pred_postprocessor=dict(
1312
+ type='opencompass.datasets.lsht_postprocess'),
1313
+ pred_role='BOT'),
1314
+ infer_cfg=dict(
1315
+ inferencer=dict(
1316
+ max_out_len=64,
1317
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1318
+ prompt_template=dict(
1319
+ template=dict(round=[
1320
+ dict(
1321
+ prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
1322
+ role='HUMAN'),
1323
+ ]),
1324
+ type=
1325
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1326
+ retriever=dict(
1327
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1328
+ name='lsht',
1329
+ path='opencompass/Longbench',
1330
+ reader_cfg=dict(
1331
+ input_columns=[
1332
+ 'context',
1333
+ 'input',
1334
+ ],
1335
+ output_column='all_labels',
1336
+ test_range='[100:125]',
1337
+ test_split='test',
1338
+ train_split='test'),
1339
+ type='opencompass.datasets.LongBenchlshtDataset'),
1340
+ dict(
1341
+ abbr='LongBench_multi_news_4',
1342
+ eval_cfg=dict(
1343
+ evaluator=dict(
1344
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1345
+ pred_role='BOT'),
1346
+ infer_cfg=dict(
1347
+ inferencer=dict(
1348
+ max_out_len=512,
1349
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1350
+ prompt_template=dict(
1351
+ template=dict(round=[
1352
+ dict(
1353
+ prompt=
1354
+ 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
1355
+ role='HUMAN'),
1356
+ ]),
1357
+ type=
1358
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1359
+ retriever=dict(
1360
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1361
+ name='multi_news',
1362
+ path='opencompass/Longbench',
1363
+ reader_cfg=dict(
1364
+ input_columns=[
1365
+ 'context',
1366
+ ],
1367
+ output_column='answers',
1368
+ test_range='[100:125]',
1369
+ test_split='test',
1370
+ train_split='test'),
1371
+ type='opencompass.datasets.LongBenchmulti_newsDataset'),
1372
+ dict(
1373
+ abbr='LongBench_samsum_4',
1374
+ eval_cfg=dict(
1375
+ evaluator=dict(
1376
+ type='opencompass.datasets.LongBenchRougeEvaluator'),
1377
+ pred_postprocessor=dict(
1378
+ type='opencompass.datasets.samsum_postprocess'),
1379
+ pred_role='BOT'),
1380
+ infer_cfg=dict(
1381
+ inferencer=dict(
1382
+ max_out_len=128,
1383
+ type='opencompass.openicl.icl_inferencer.GenInferencer'),
1384
+ prompt_template=dict(
1385
+ template=dict(round=[
1386
+ dict(
1387
+ prompt=
1388
+ 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
1389
+ role='HUMAN'),
1390
+ ]),
1391
+ type=
1392
+ 'opencompass.openicl.icl_prompt_template.PromptTemplate'),
1393
+ retriever=dict(
1394
+ type='opencompass.openicl.icl_retriever.ZeroRetriever')),
1395
+ name='samsum',
1396
+ path='opencompass/Longbench',
1397
+ reader_cfg=dict(
1398
+ input_columns=[
1399
+ 'context',
1400
+ 'input',
1401
+ ],
1402
+ output_column='answers',
1403
+ test_range='[100:125]',
1404
+ test_split='test',
1405
+ train_split='test'),
1406
+ type='opencompass.datasets.LongBenchsamsumDataset'),
1407
+ ],
1408
+ ]
1409
+ models = [
1410
+ dict(
1411
+ abbr='delta_net',
1412
+ batch_size=128,
1413
+ max_seq_len=2048,
1414
+ model_kwargs=dict(
1415
+ device_map='auto',
1416
+ torch_dtype='torch.bfloat16',
1417
+ trust_remote_code=True),
1418
+ path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1419
+ run_cfg=dict(num_gpus=1),
1420
+ tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
1421
+ tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
1422
+ type='opencompass.models.HuggingFaceBaseModel'),
1423
+ ]
1424
+ work_dir = 'outputs/default/20251127_221150'