Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- examples/eval_OlympiadBench.py +36 -0
- examples/eval_attack.py +28 -0
- examples/eval_base_demo.py +14 -0
- examples/eval_charm_rea.py +66 -0
- examples/eval_chat_agent_baseline.py +38 -0
- examples/eval_code_passk.py +53 -0
- examples/eval_corebench_2409_base_objective.py +175 -0
- examples/eval_deepseek_r1.py +212 -0
- examples/eval_ds1000_interpreter.py +45 -0
- examples/eval_eese_api_judge.py +47 -0
- examples/eval_gpt4.py +44 -0
- examples/eval_hf_llama_7b.py +8 -0
- examples/eval_inference_ppl.py +51 -0
- examples/eval_internLM.py +9 -0
- examples/eval_internlm_7b.py +9 -0
- examples/eval_internlm_chat_turbomind.py +96 -0
- examples/eval_internlm_turbomind.py +55 -0
- examples/eval_judge_dataset_all.py +61 -0
- examples/eval_judgebench.py +52 -0
- examples/eval_judgerbench.py +58 -0
- examples/eval_judgerbenchv2.py +53 -0
- examples/eval_korbench.py +14 -0
- examples/eval_livestembench.py +66 -0
- examples/eval_llm_judge.py +116 -0
- examples/eval_lmdeploy_demo.py +10 -0
- examples/eval_longbenchv2.py +28 -0
- examples/eval_math_llm_judge.py +136 -0
- examples/eval_math_verify.py +77 -0
- examples/eval_mmlu_cf.py +36 -0
- examples/eval_mmlu_pro.py +39 -0
- examples/eval_mmlu_with_zero_retriever_overwritten.py +16 -0
- examples/eval_multi_prompt_demo.py +52 -0
- examples/eval_musr.py +34 -0
- examples/eval_needlebench_v2.py +27 -0
- examples/eval_qwen3.py +142 -0
- examples/eval_qwen_7b_chat.py +58 -0
- examples/eval_qwen_7b_chat_lawbench.py +13 -0
- examples/eval_rewardbench.py +53 -0
- examples/eval_rmb.py +53 -0
- examples/eval_ruler.py +97 -0
- examples/eval_rwkv5_3b.py +7 -0
- examples/eval_simpleqa.py +45 -0
- examples/eval_subjective.py +104 -0
- examples/eval_subjective_bradleyterry.py +120 -0
- examples/eval_teval.py +81 -0
- examples/eval_with_model_dataset_combinations.py +45 -0
- tmp/38bf021a-c80f-4a23-9021-f2adc82afa5d_params.py +1424 -0
- tmp/3baffa8c-bc69-4789-aa49-f30266896eb4_params.py +0 -0
- tmp/3bc1afd5-60f6-4b89-9fc0-909218b5c248_params.py +53 -0
- tmp/401500cf-6431-490c-9e43-14532e24796f_params.py +1424 -0
examples/eval_OlympiadBench.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_gen_be8b13 import olympiadbench_datasets
|
| 5 |
+
|
| 6 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
|
| 7 |
+
|
| 8 |
+
from opencompass.configs.summarizers.OlympiadBench import summarizer
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], [])
|
| 12 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 13 |
+
|
| 14 |
+
from opencompass.runners import LocalRunner
|
| 15 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 16 |
+
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
| 17 |
+
|
| 18 |
+
infer = dict(
|
| 19 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 20 |
+
runner=dict(
|
| 21 |
+
type=LocalRunner,
|
| 22 |
+
max_num_workers=8,
|
| 23 |
+
task=dict(type=OpenICLInferTask)
|
| 24 |
+
),
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
eval = dict(
|
| 28 |
+
partitioner=dict(type=NaivePartitioner, n=10),
|
| 29 |
+
runner=dict(
|
| 30 |
+
type=LocalRunner,
|
| 31 |
+
max_num_workers=256,
|
| 32 |
+
task=dict(type=OpenICLEvalTask)
|
| 33 |
+
),
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
work_dir = 'outputs/debug/OlympiadBench'
|
examples/eval_attack.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
from opencompass.partitioners import NaivePartitioner
|
| 4 |
+
from opencompass.runners import LocalRunner
|
| 5 |
+
from opencompass.tasks import OpenICLAttackTask
|
| 6 |
+
|
| 7 |
+
with read_base():
|
| 8 |
+
# choose a list of datasets
|
| 9 |
+
from opencompass.configs.datasets.promptbench.promptbench_wnli_gen_50662f import \
|
| 10 |
+
wnli_datasets
|
| 11 |
+
from opencompass.configs.models.qwen.hf_qwen2_1_5b import models
|
| 12 |
+
|
| 13 |
+
datasets = wnli_datasets
|
| 14 |
+
|
| 15 |
+
# Please run whole dataset at a time, aka use `NaivePartitioner` only
|
| 16 |
+
# Please use `OpenICLAttackTask` if want to perform attack experiment
|
| 17 |
+
infer = dict(
|
| 18 |
+
partitioner=dict(type=NaivePartitioner),
|
| 19 |
+
runner=dict(type=LocalRunner,
|
| 20 |
+
max_num_workers=8,
|
| 21 |
+
task=dict(type=OpenICLAttackTask)),
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
attack = dict(
|
| 25 |
+
attack='textfooler',
|
| 26 |
+
query_budget=100,
|
| 27 |
+
prompt_topk=1,
|
| 28 |
+
)
|
examples/eval_base_demo.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.demo.demo_gsm8k_base_gen import \
|
| 5 |
+
gsm8k_datasets
|
| 6 |
+
from opencompass.configs.datasets.demo.demo_math_base_gen import \
|
| 7 |
+
math_datasets
|
| 8 |
+
from opencompass.configs.models.hf_internlm.hf_internlm2_1_8b import \
|
| 9 |
+
models as hf_internlm2_1_8b_models
|
| 10 |
+
from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
|
| 11 |
+
models as hf_qwen2_1_5b_models
|
| 12 |
+
|
| 13 |
+
datasets = gsm8k_datasets + math_datasets
|
| 14 |
+
models = hf_qwen2_1_5b_models + hf_internlm2_1_8b_models
|
examples/eval_charm_rea.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.CHARM.charm_reason_gen_f8fca2 import \
|
| 5 |
+
charm_reason_datasets as datasets
|
| 6 |
+
|
| 7 |
+
# ------>>>>>> https://arxiv.org/abs/2403.14112
|
| 8 |
+
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
|
| 9 |
+
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
|
| 10 |
+
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
|
| 11 |
+
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
|
| 12 |
+
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
|
| 13 |
+
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
|
| 14 |
+
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
|
| 15 |
+
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
|
| 16 |
+
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
|
| 17 |
+
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
|
| 18 |
+
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
|
| 19 |
+
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
|
| 20 |
+
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
|
| 21 |
+
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
|
| 22 |
+
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
|
| 23 |
+
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
|
| 24 |
+
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
|
| 25 |
+
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
|
| 26 |
+
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
|
| 27 |
+
# <<<<<<------ https://arxiv.org/abs/2403.14112
|
| 28 |
+
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
|
| 29 |
+
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
|
| 30 |
+
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
|
| 31 |
+
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
|
| 32 |
+
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
|
| 33 |
+
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
|
| 34 |
+
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
|
| 35 |
+
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
|
| 36 |
+
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
|
| 37 |
+
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
|
| 38 |
+
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
|
| 39 |
+
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
|
| 40 |
+
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
|
| 41 |
+
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
|
| 42 |
+
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
|
| 43 |
+
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
|
| 44 |
+
from .summarizers.charm_reason import summarizer
|
| 45 |
+
|
| 46 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 47 |
+
work_dir = './outputs/CHARM_rea/chat/'
|
| 48 |
+
|
| 49 |
+
# dataset version metric mode internlm2-chat-7b-turbomind
|
| 50 |
+
# ------------------------------------------------------------- --------- ------------- ------ -----------------------------
|
| 51 |
+
# charm-reason-Direct - naive_average gen 49.51
|
| 52 |
+
# charm-reason-ZH-CoT - naive_average gen 61.33
|
| 53 |
+
# charm-reason-EN-CoT - naive_average gen 54.55
|
| 54 |
+
# charm-reason-XLT - naive_average gen 58.46
|
| 55 |
+
# charm-reason-Translate-EN - naive_average gen 56.15
|
| 56 |
+
# - - - -
|
| 57 |
+
# charm-reason-Chinese_Direct - naive_average gen 47.14
|
| 58 |
+
# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
|
| 59 |
+
# charm-reason-Chinese_EN-CoT - naive_average gen 48.31
|
| 60 |
+
# charm-reason-Chinese_XLT - naive_average gen 53.57
|
| 61 |
+
# charm-reason-Chinese_Translate-EN - naive_average gen 48.21
|
| 62 |
+
# charm-reason-Global_Direct - naive_average gen 51.88
|
| 63 |
+
# charm-reason-Global_ZH-CoT - naive_average gen 64.26
|
| 64 |
+
# charm-reason-Global_EN-CoT - naive_average gen 60.79
|
| 65 |
+
# charm-reason-Global_XLT - naive_average gen 63.36
|
| 66 |
+
# charm-reason-Global_Translate-EN - naive_average gen 64.10
|
examples/eval_chat_agent_baseline.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
from opencompass.models.openai_api import OpenAI
|
| 4 |
+
from opencompass.partitioners import SizePartitioner
|
| 5 |
+
from opencompass.runners import LocalRunner
|
| 6 |
+
from opencompass.tasks import OpenICLInferTask
|
| 7 |
+
|
| 8 |
+
with read_base():
|
| 9 |
+
from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import \
|
| 10 |
+
gsm8k_datasets
|
| 11 |
+
from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
|
| 12 |
+
from opencompass.configs.datasets.MathBench.mathbench_gen import \
|
| 13 |
+
mathbench_datasets
|
| 14 |
+
from opencompass.configs.summarizers.math_baseline import summarizer
|
| 15 |
+
|
| 16 |
+
datasets = []
|
| 17 |
+
datasets += gsm8k_datasets
|
| 18 |
+
datasets += math_datasets
|
| 19 |
+
datasets += mathbench_datasets
|
| 20 |
+
|
| 21 |
+
models = [
|
| 22 |
+
dict(
|
| 23 |
+
abbr='gpt-3.5-react',
|
| 24 |
+
type=OpenAI,
|
| 25 |
+
path='gpt-3.5-turbo',
|
| 26 |
+
key='ENV',
|
| 27 |
+
query_per_second=1,
|
| 28 |
+
max_seq_len=4096,
|
| 29 |
+
batch_size=1,
|
| 30 |
+
),
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
infer = dict(
|
| 34 |
+
partitioner=dict(type=SizePartitioner, max_task_size=1000),
|
| 35 |
+
runner=dict(type=LocalRunner,
|
| 36 |
+
max_num_workers=16,
|
| 37 |
+
task=dict(type=OpenICLInferTask)),
|
| 38 |
+
)
|
examples/eval_code_passk.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config is used for pass@k evaluation with `num_return_sequences`
|
| 2 |
+
# That model can generate multiple responses for single input
|
| 3 |
+
from mmengine.config import read_base
|
| 4 |
+
|
| 5 |
+
from opencompass.models import HuggingFaceCausalLM
|
| 6 |
+
from opencompass.partitioners import SizePartitioner
|
| 7 |
+
from opencompass.runners import LocalRunner
|
| 8 |
+
from opencompass.tasks import OpenICLInferTask
|
| 9 |
+
|
| 10 |
+
with read_base():
|
| 11 |
+
from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import \
|
| 12 |
+
humaneval_datasets
|
| 13 |
+
from opencompass.configs.datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import \
|
| 14 |
+
mbpp_datasets
|
| 15 |
+
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import \
|
| 16 |
+
sanitized_mbpp_datasets
|
| 17 |
+
|
| 18 |
+
datasets = []
|
| 19 |
+
datasets += humaneval_datasets
|
| 20 |
+
datasets += mbpp_datasets
|
| 21 |
+
datasets += sanitized_mbpp_datasets
|
| 22 |
+
|
| 23 |
+
models = [
|
| 24 |
+
dict(
|
| 25 |
+
type=HuggingFaceCausalLM,
|
| 26 |
+
abbr='CodeLlama-7b-Python',
|
| 27 |
+
path='codellama/CodeLlama-7b-Python-hf',
|
| 28 |
+
tokenizer_path='codellama/CodeLlama-7b-Python-hf',
|
| 29 |
+
tokenizer_kwargs=dict(
|
| 30 |
+
padding_side='left',
|
| 31 |
+
truncation_side='left',
|
| 32 |
+
trust_remote_code=True,
|
| 33 |
+
),
|
| 34 |
+
max_out_len=1024,
|
| 35 |
+
max_seq_len=2048,
|
| 36 |
+
batch_size=8,
|
| 37 |
+
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
|
| 38 |
+
generation_kwargs=dict(
|
| 39 |
+
num_return_sequences=10,
|
| 40 |
+
do_sample=True,
|
| 41 |
+
top_p=0.95,
|
| 42 |
+
temperature=0.8,
|
| 43 |
+
),
|
| 44 |
+
run_cfg=dict(num_gpus=1, num_procs=1),
|
| 45 |
+
),
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
infer = dict(
|
| 49 |
+
partitioner=dict(type=SizePartitioner, max_task_size=300),
|
| 50 |
+
runner=dict(type=LocalRunner,
|
| 51 |
+
max_num_workers=16,
|
| 52 |
+
task=dict(type=OpenICLInferTask)),
|
| 53 |
+
)
|
examples/eval_corebench_2409_base_objective.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
|
| 3 |
+
from mmengine.config import read_base
|
| 4 |
+
|
| 5 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 6 |
+
from opencompass.runners import LocalRunner
|
| 7 |
+
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
| 8 |
+
|
| 9 |
+
#######################################################################
|
| 10 |
+
# PART 0 Essential Configs #
|
| 11 |
+
#######################################################################
|
| 12 |
+
with read_base():
|
| 13 |
+
# Datasets Part
|
| 14 |
+
## Core Set
|
| 15 |
+
# ## Examination
|
| 16 |
+
# ## Reasoning
|
| 17 |
+
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
|
| 18 |
+
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
|
| 19 |
+
cmmlu_datasets
|
| 20 |
+
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
|
| 21 |
+
# ## Scientific
|
| 22 |
+
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
|
| 23 |
+
gpqa_datasets
|
| 24 |
+
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
|
| 25 |
+
gsm8k_datasets
|
| 26 |
+
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
|
| 27 |
+
hellaswag_datasets
|
| 28 |
+
# ## Coding
|
| 29 |
+
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import \
|
| 30 |
+
humaneval_datasets
|
| 31 |
+
# ## Math
|
| 32 |
+
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
|
| 33 |
+
math_datasets
|
| 34 |
+
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
|
| 35 |
+
mathbench_datasets
|
| 36 |
+
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
|
| 37 |
+
sanitized_mbpp_datasets
|
| 38 |
+
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
|
| 39 |
+
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
|
| 40 |
+
mmlu_pro_datasets
|
| 41 |
+
# Model List
|
| 42 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
|
| 43 |
+
models as lmdeploy_qwen2_5_1_5b_model
|
| 44 |
+
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
|
| 45 |
+
from opencompass.configs.summarizers.groups.cmmlu import \
|
| 46 |
+
cmmlu_summary_groups
|
| 47 |
+
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
|
| 48 |
+
mathbench_2024_summary_groups
|
| 49 |
+
# TODO: Add LiveCodeBench
|
| 50 |
+
# ## Instruction Following
|
| 51 |
+
# from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
|
| 52 |
+
# Summarizer
|
| 53 |
+
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
|
| 54 |
+
from opencompass.configs.summarizers.groups.mmlu_pro import \
|
| 55 |
+
mmlu_pro_summary_groups
|
| 56 |
+
|
| 57 |
+
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
|
| 58 |
+
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
|
| 59 |
+
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
|
| 60 |
+
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
|
| 61 |
+
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
|
| 62 |
+
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
|
| 63 |
+
|
| 64 |
+
#######################################################################
|
| 65 |
+
# PART 1 Datasets List #
|
| 66 |
+
#######################################################################
|
| 67 |
+
# datasets list for evaluation
|
| 68 |
+
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
| 69 |
+
|
| 70 |
+
#######################################################################
|
| 71 |
+
# PART 2 Datset Summarizer #
|
| 72 |
+
#######################################################################
|
| 73 |
+
# with read_base():
|
| 74 |
+
|
| 75 |
+
core_summary_groups = [
|
| 76 |
+
{
|
| 77 |
+
'name':
|
| 78 |
+
'core_average',
|
| 79 |
+
'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
|
| 80 |
+
['cmmlu', 'accuracy'], ['bbh', 'naive_average'],
|
| 81 |
+
['hellaswag', 'accuracy'], ['drop', 'accuracy'],
|
| 82 |
+
['math', 'accuracy'], ['gsm8k', 'accuracy'],
|
| 83 |
+
['mathbench-t (average)', 'naive_average'],
|
| 84 |
+
['GPQA_diamond', 'accuracy'],
|
| 85 |
+
['openai_humaneval', 'humaneval_pass@1'],
|
| 86 |
+
['IFEval', 'Prompt-level-strict-accuracy'],
|
| 87 |
+
['sanitized_mbpp', 'score'],
|
| 88 |
+
['mathbench-t (average)', 'naive_average']],
|
| 89 |
+
},
|
| 90 |
+
]
|
| 91 |
+
|
| 92 |
+
summarizer = dict(
|
| 93 |
+
dataset_abbrs=[
|
| 94 |
+
['mmlu', 'accuracy'],
|
| 95 |
+
['mmlu_pro', 'accuracy'],
|
| 96 |
+
['cmmlu', 'accuracy'],
|
| 97 |
+
['bbh', 'naive_average'],
|
| 98 |
+
['hellaswag', 'accuracy'],
|
| 99 |
+
['drop', 'accuracy'],
|
| 100 |
+
['math', 'accuracy'],
|
| 101 |
+
['gsm8k', 'accuracy'],
|
| 102 |
+
['mathbench-t (average)', 'naive_average'],
|
| 103 |
+
['GPQA_diamond', 'accuracy'],
|
| 104 |
+
['openai_humaneval', 'humaneval_pass@1'],
|
| 105 |
+
['IFEval', 'Prompt-level-strict-accuracy'],
|
| 106 |
+
['sanitized_mbpp', 'score'],
|
| 107 |
+
'mathbench-a (average)',
|
| 108 |
+
'mathbench-t (average)'
|
| 109 |
+
'',
|
| 110 |
+
['mmlu', 'accuracy'],
|
| 111 |
+
['mmlu-stem', 'accuracy'],
|
| 112 |
+
['mmlu-social-science', 'accuracy'],
|
| 113 |
+
['mmlu-humanities', 'accuracy'],
|
| 114 |
+
['mmlu-other', 'accuracy'],
|
| 115 |
+
'',
|
| 116 |
+
['mmlu_pro', 'accuracy'],
|
| 117 |
+
['mmlu_pro_math', 'accuracy'],
|
| 118 |
+
['mmlu_pro_physics', 'accuracy'],
|
| 119 |
+
['mmlu_pro_chemistry', 'accuracy'],
|
| 120 |
+
['mmlu_pro_law', 'accuracy'],
|
| 121 |
+
['mmlu_pro_engineering', 'accuracy'],
|
| 122 |
+
['mmlu_pro_other', 'accuracy'],
|
| 123 |
+
['mmlu_pro_economics', 'accuracy'],
|
| 124 |
+
['mmlu_pro_health', 'accuracy'],
|
| 125 |
+
['mmlu_pro_psychology', 'accuracy'],
|
| 126 |
+
['mmlu_pro_business', 'accuracy'],
|
| 127 |
+
['mmlu_pro_biology', 'accuracy'],
|
| 128 |
+
['mmlu_pro_philosophy', 'accuracy'],
|
| 129 |
+
['mmlu_pro_computer_science', 'accuracy'],
|
| 130 |
+
['mmlu_pro_history', 'accuracy'],
|
| 131 |
+
'',
|
| 132 |
+
['cmmlu', 'accuracy'],
|
| 133 |
+
['cmmlu-stem', 'accuracy'],
|
| 134 |
+
['cmmlu-social-science', 'accuracy'],
|
| 135 |
+
['cmmlu-humanities', 'accuracy'],
|
| 136 |
+
['cmmlu-other', 'accuracy'],
|
| 137 |
+
['cmmlu-china-specific', 'accuracy'],
|
| 138 |
+
],
|
| 139 |
+
summary_groups=sum(
|
| 140 |
+
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
#######################################################################
|
| 144 |
+
# PART 3 Models List #
|
| 145 |
+
#######################################################################
|
| 146 |
+
|
| 147 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 148 |
+
|
| 149 |
+
#######################################################################
|
| 150 |
+
# PART 4 Inference/Evaluation Configuaration #
|
| 151 |
+
#######################################################################
|
| 152 |
+
|
| 153 |
+
# Local Runner
|
| 154 |
+
infer = dict(
|
| 155 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 156 |
+
runner=dict(
|
| 157 |
+
type=LocalRunner,
|
| 158 |
+
max_num_workers=16,
|
| 159 |
+
retry=0, # Modify if needed
|
| 160 |
+
task=dict(type=OpenICLInferTask)),
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
# eval with local runner
|
| 164 |
+
eval = dict(
|
| 165 |
+
partitioner=dict(type=NaivePartitioner, n=10),
|
| 166 |
+
runner=dict(type=LocalRunner,
|
| 167 |
+
max_num_workers=16,
|
| 168 |
+
task=dict(type=OpenICLEvalTask)),
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
#######################################################################
|
| 172 |
+
# PART 5 Utils Configuaration #
|
| 173 |
+
#######################################################################
|
| 174 |
+
base_exp_dir = 'outputs/corebench_2409_objective/'
|
| 175 |
+
work_dir = osp.join(base_exp_dir, 'base_objective')
|
examples/eval_deepseek_r1.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluation config for DeepSeek-R1-Distill reasoning models on math benchmarks,
# scored by an LLM verifier (judge) served over an OpenAI-compatible API.
# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard

import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
    TurboMindModelwithChatTemplate,
)

#######################################################################
#                          PART 1  Datasets List                      #
#######################################################################
with read_base():
    # You can comment out the datasets you don't want to evaluate

    # Datasets
    # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets

    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

# Collect every imported `*_datasets` list into one flat list.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# Set LLM Verifier used for each dataset

verifier_cfg = dict(
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path
    key='sk-1234', # You need to set your own API key
    openai_api_base=[
        'http://172.30.56.1:4000/v1', # You need to set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    # max_seq_len=32768,
    max_seq_len=49152,
)

# Attach the verifier only to datasets whose evaluator already declares a
# judge_cfg slot (i.e. LLM-verified datasets); others are left untouched.
for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768 # You can unset this line if you want to avoid length cutoff
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg


#######################################################################
#                       PART 2  Model List                            #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

models += [
    # You can comment out the models you don't want to evaluate
    # All models use sampling mode
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        gen_config=dict(
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
        # Strip the <think>...</think> reasoning before answer extraction.
        pred_postprocessor=dict(type=extract_non_reasoning_content)
    ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=32768),
    #     max_seq_len=32768,
    #     max_out_len=32768,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=2),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content)
    # ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=16384),
    #     max_seq_len=32768,
    #     max_out_len=16384,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=4),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content)
    # ),
]

#######################################################################
#                 PART 3  Inference/Evaluation                        #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=1
        # Similar with data-parallelism, how many workers for evaluation,
        # each worker will evaluate a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker
        # For example, If you have 8 GPUs, for 7B model using 1 GPU for one instance, you can set num_worker=8
        # to max-utilize the GPUs.
        # If you have 8 GPUs, for 14B model using 2 GPUs for one instance, you can set num_worker=4
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask)
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner, n=8
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(
            type=OpenICLEvalTask)
    ),
)


#######################################################################
#                       PART 4  Summarizer                            #
#######################################################################


summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

# Average accuracy across the repeated runs of each benchmark.
# NOTE: fixed misspelling 'Aveage8' -> 'Average8' (renamed consistently with
# the summarizer references below; changes only the displayed metric name).
summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)]
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets':[[
            f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ]
    }
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        # ['aime2024', 'accuracy'],
        ['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
        ['OlympiadBenchMath', 'accuracy'],
        ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)


#######################################################################
#                       PART 5  Utils                                 #
#######################################################################

work_dir = 'outputs/deepseek_r1_reasoning'
examples/eval_ds1000_interpreter.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate DS-1000 with a ReAct-style code agent: an OpenAI LLM that can
# execute Python through a lagent PythonInterpreter action.
from mmengine.config import read_base

from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

# Tool description shown to the agent when it decides whether to call the tool.
PYTHON_INTERPRETER_DESCRIPTION = """\
It can run a Python code. The code must be a valid code that contains only python method.
"""

# Actions (tools) available to the agent.
actions = [
    dict(
        type=PythonInterpreter,
        description=PYTHON_INTERPRETER_DESCRIPTION,
        # answer_expr=None: the interpreter returns raw execution output
        # instead of evaluating a designated answer expression.
        answer_expr=None,
    )
]

with read_base():
    from opencompass.configs.datasets.ds1000.ds1000_gen_5c4bec import \
        ds1000_datasets as datasets

models = [
    dict(abbr='gpt-3.5-react',
         type=CodeAgent,
         llm=dict(
             type=OpenAI,
             path='gpt-3.5-turbo',
             key='ENV',  # read from $OPENAI_API_KEY
             query_per_second=1,
             max_seq_len=4096,
         ),
         actions=actions,
         batch_size=8),
]

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=40000),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)
examples/eval_eese_api_judge.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate the EESE benchmark with an API-served judge model.

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.eese.eese_judge_gen import \
        eese_datasets
    # choose a model of interest
    from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
        models as gpt4

from opencompass.models import OpenAISDK

# configure the judge model
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

# Placeholder judge settings — replace path/key/base with your own endpoint.
judge_cfg = dict(
    abbr='model-judge',
    type=OpenAISDK,
    path='model-name',
    key='your-api-key',
    openai_api_base=['openai-url'],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=1,
    temperature=0.001,
    tokenizer_path='gpt-4o',
    verbose=True,
    max_out_len=16384,
    max_seq_len=49152,
)

datasets = eese_datasets
models = gpt4

# Merge judge_cfg into each dataset instead of overwriting it outright.
for dataset in datasets:
    if 'eval_cfg' in dataset and 'evaluator' in dataset['eval_cfg']:
        # Fetch the existing judge_cfg, or start from an empty dict.
        existing_judge_cfg = dataset['eval_cfg']['evaluator'].get('judge_cfg', {})
        # Update it in place: keep prior settings, add/override with the new ones.
        existing_judge_cfg.update(judge_cfg)
        # Write the merged config back to the evaluator.
        dataset['eval_cfg']['evaluator']['judge_cfg'] = existing_judge_cfg
examples/eval_gpt4.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate GPT-4 via the OpenAI API on the medium chat dataset collection.
from mmengine.config import read_base

from opencompass.models import OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from opencompass.configs.datasets.collections.chat_medium import datasets
    from opencompass.configs.summarizers.medium import summarizer

# GPT4 needs a special humaneval postprocessor
from opencompass.datasets.humaneval import humaneval_gpt_postprocess

# Swap in the GPT-specific postprocessor for the HumanEval dataset only.
for _dataset in datasets:
    if _dataset['path'] == 'openai_humaneval':
        _dataset['eval_cfg']['pred_postprocessor'][
            'type'] = humaneval_gpt_postprocess

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

models = [
    dict(
        abbr='GPT4',
        type=OpenAI,
        path='gpt-4-0613',
        key=
        'ENV',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=4,
                task=dict(type=OpenICLInferTask)),
)
examples/eval_hf_llama_7b.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate HuggingFace LLaMA-7B on the PIQA and SIQA subsets of the
# base-medium-llama collection.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.collections.base_medium_llama import (
        piqa_datasets, siqa_datasets)
    from opencompass.configs.models.hf_llama.hf_llama_7b import models

datasets = [*piqa_datasets, *siqa_datasets]
examples/eval_inference_ppl.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Compute inference perplexity (loss) for several base models.
from mmengine.config import read_base

with read_base():
    # Inference PPL datasets
    from opencompass.configs.datasets.inference_ppl.inference_ppl import inference_ppl_datasets

    # Model configs
    from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
    from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
    from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
    from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b

from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

# -------------Inference Stage ----------------------------------------

datasets = [*inference_ppl_datasets]
# Fix: this was previously named `workdir`, which OpenCompass never reads
# (the framework looks for `work_dir`), so outputs silently went to the
# default directory instead.
work_dir = 'outputs/inference_ppl'

models = [
    *qwen1_5_7b,
    *qwen1_5_14b,
    *llama2_7b,
    *llama2_13b,
]

# Set custom batch_size and num_gpus for faster loss calculation
# Smaller batch_size should give more precise results, at the cost of worse efficiency
model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))

# Apply the shared runtime settings to every model config.
for mdl in models:
    mdl.update(model_cfg)

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
        max_num_workers=256,  # Maximum concurrent evaluation task count
    ),
)

# -------------Evaluation Stage ----------------------------------------
eval = dict(partitioner=dict(type=NaivePartitioner),
            runner=dict(
                type=LocalRunner,
                task=dict(type=OpenICLEvalTask),
                max_num_workers=256,
            ))
examples/eval_internLM.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate InternLM-7B (native InternLM runner) on the base-medium collection.
from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.collections.base_medium import datasets
    # choose a model of interest
    from opencompass.configs.models.internlm.internlm_7b import models
    # and output the results in a choosen format
    from opencompass.configs.summarizers.medium import summarizer
examples/eval_internlm_7b.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate InternLM-7B (HuggingFace backend) on the base-medium collection.
from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.collections.base_medium import datasets
    # choose a model of interest
    from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
    # and output the results in a choosen format
    from opencompass.configs.summarizers.medium import summarizer
examples/eval_internlm_chat_turbomind.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate InternLM chat models served through the TurboMind (LMDeploy)
# inference engine. Three model configs are defined; only the one placed in
# `models` at the bottom is actually run.
from mmengine.config import read_base

from opencompass.models.turbomind import TurboMindModel

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import \
        crowspairs_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \
        WSC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a choosen format
    from opencompass.configs.summarizers.medium import summarizer

# Collect all imported `*_datasets` lists into one flat list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# Chat template for InternLM v1 (eos 103028 = <eoa>).
internlm_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
],
                              eos_token_id=103028)

# Chat template for InternLM2 (ChatML-style markers, eos 92542 = <|im_end|>).
internlm2_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
    dict(role='BOT',
         begin='<|im_start|>assistant\n',
         end='<|im_end|>\n',
         generate=True),
],
                               eos_token_id=92542)

# config for internlm-chat-7b
internlm_chat_7b = dict(
    type=TurboMindModel,
    abbr='internlm-chat-7b-turbomind',
    path='internlm/internlm-chat-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    meta_template=internlm_meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)

# config for internlm2-chat-7b
internlm2_chat_7b = dict(type=TurboMindModel,
                         abbr='internlm2-chat-7b-turbomind',
                         path='internlm/internlm2-chat-7b',
                         engine_config=dict(session_len=2048,
                                            max_batch_size=32,
                                            rope_scaling_factor=1.0),
                         gen_config=dict(top_k=1,
                                         top_p=0.8,
                                         temperature=1.0,
                                         max_new_tokens=100),
                         max_out_len=100,
                         max_seq_len=2048,
                         batch_size=32,
                         concurrency=32,
                         meta_template=internlm2_meta_template,
                         run_cfg=dict(num_gpus=1, num_procs=1),
                         end_str='<|im_end|>')

# config for internlm-chat-20b
internlm_chat_20b = dict(
    type=TurboMindModel,
    abbr='internlm-chat-20b-turbomind',
    path='internlm/internlm-chat-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    meta_template=internlm_meta_template,
    run_cfg=dict(num_gpus=1, num_procs=1),
    end_str='<eoa>',
)

# Only the 20B chat model is selected for this run; swap in the others above
# to evaluate them instead.
models = [internlm_chat_20b]
examples/eval_internlm_turbomind.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate InternLM base models served through the TurboMind (LMDeploy)
# inference engine. Two model configs are defined; only the one placed in
# `models` at the bottom is actually run.
from mmengine.config import read_base

from opencompass.models.turbomind import TurboMindModel

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a choosen format
    from opencompass.configs.summarizers.medium import summarizer

# Collect all imported `*_datasets` lists into one flat list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

# # config for internlm-7b model
internlm_7b = dict(
    type=TurboMindModel,
    abbr='internlm-7b-turbomind',
    path='internlm/internlm-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    run_cfg=dict(num_gpus=1, num_procs=1),
)

# config for internlm-20b model
internlm_20b = dict(
    type=TurboMindModel,
    abbr='internlm-20b-turbomind',
    path='internlm/internlm-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    run_cfg=dict(num_gpus=1, num_procs=1),
)

# Only the 20B model is selected for this run; use `internlm_7b` instead (or
# both) to evaluate the 7B model.
models = [internlm_20b]
examples/eval_judge_dataset_all.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run a judge model (here Qwen-7B via TurboMind) across all judge-evaluation
# datasets: JudgerBench v2, RMB, RewardBench and JudgeBench.
from mmengine.config import read_base
with read_base():
    # Aliased to `*_datasets` so the locals() scan below picks them up.
    from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset as get_judgerbenchv2_datasets
    from opencompass.configs.datasets.judge.rmb import get_rmb_dataset as get_rmb_datasets
    from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
    from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets

    from opencompass.configs.summarizers.judgedataset_all import summarizer

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.models import TurboMindModelwithChatTemplate


api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
# Collect every `*_datasets` list imported above into one flat list.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)


models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # near-zero temperature + top_k=1 for effectively greedy decoding
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]


infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)


work_dir = './outputs/judge_dataset_all/'
examples/eval_judgebench.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run a judge model (here Qwen-7B via TurboMind) on the JudgeBench dataset.
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
datasets = [*get_judgebench_datasets]

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # near-zero temperature + top_k=1 for effectively greedy decoding
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]


infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)


work_dir = './outputs/judgebench/'
examples/eval_judgerbench.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Evaluate CompassJudger-1-7B-Instruct on JudgerBench via TurboMind.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.judgerbench.judgerbench import judgerbench_datasets

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='CompassJudger-1-7B-Instruct',
        path='opencompass/CompassJudger-1-7B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # near-zero temperature + top_k=1 for effectively greedy decoding
        gen_config=dict(top_k=1,
                        temperature=1e-6,
                        top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]

datasets = judgerbench_datasets

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)
# -------------Evalation Stage ----------------------------------------

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner,
        n=10,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)

work_dir = 'outputs/judgerbench/'
examples/eval_judgerbenchv2.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate a candidate judge model on JudgerBench v2."""
from mmengine.config import read_base

with read_base():
    # Dataset builder and result summarizer for JudgerBench v2.
    from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset
    from opencompass.configs.summarizers.judgerbenchv2 import summarizer

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

# Maps template roles to API roles for API-style chat models.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
# Flatten the imported dataset collection into the top-level `datasets` list.
datasets = [*get_judgerbenchv2_dataset]

from opencompass.models import TurboMindModelwithChatTemplate

# Model under evaluation, served via LMDeploy/TurboMind.
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        # Near-greedy decoding: top_k=1 with a near-zero temperature.
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]


# Inference stage: split work across 2 shards and run them locally.
infer = dict(
    # partitioner=dict(type=NaivePartitioner),
    partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)


work_dir = './outputs/judgerbenchv2/'
|
examples/eval_korbench.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate InternLM2.5-7B on the KOR-Bench suite (mixed, 0-shot, 3-shot)."""
from mmengine import read_base

with read_base():
    # KOR-Bench dataset variants.
    from opencompass.configs.datasets.korbench.korbench_mixed_gen_d00bdd import (
        korbench_mixed_datasets as mixed_datasets)
    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import (
        korbench_0shot_single_datasets as zero_shot_datasets)
    from opencompass.configs.datasets.korbench.korbench_single_3_shot_gen import (
        korbench_3shot_single_datasets as three_shot_datasets)
    # Model under evaluation.
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import (
        models as hf_internlm2_5_7b)

# Same ordering as concatenation: zero-shot, three-shot, then mixed.
datasets = [*zero_shot_datasets, *three_shot_datasets, *mixed_datasets]
models = hf_internlm2_5_7b
|
examples/eval_livestembench.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate Qwen2.5 instruct models on LiveSTEMBench with an API judge."""
from mmengine.config import read_base

from opencompass.models import OpenAISDK

with read_base():
    # Select a dataset list.
    from opencompass.configs.datasets.livestembench.livestembench_gen_3e3c50 import \
        livestembench_datasets
    # Select the models of interest.
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as qwen2_5_7b_instruct_lmdeploy_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as qwen2_5_72b_instruct_lmdeploy_model

# Collect every `*_datasets` list defined above into one flat list.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = [
    *qwen2_5_7b_instruct_lmdeploy_model, *qwen2_5_72b_instruct_lmdeploy_model
]

# Judge model configuration.
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

# OpenAI-compatible endpoint used as the LLM judge during evaluation.
judge_cfg = dict(
    abbr='qwen2-5-72b-instruct',
    type=OpenAISDK,
    path='YOUR_SERVER_MODEL_NAME',  # name of your deployed model
    key='None',
    openai_api_base=[
        'http://localhost:23333/v1',  # address where your model is deployed
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=16,
    temperature=0.001,
    max_completion_tokens=32768,
)

# Attach the judge to every dataset's evaluator (mutates the imported cfgs).
for dataset in datasets:
    dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg

# -------------Inference Stage ----------------------------------------

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask),
    ),
)

work_dir = './outputs/livestembench'
|
examples/eval_llm_judge.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MATH-500 evaluation of Qwen2.5-7B with a Qwen2.5-14B LLM-as-judge grader."""
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK

# Import pre-configured models from OpenCompass
with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
        models as lmdeploy_qwen2_5_14b_instruct_model,
    )

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import CustomDataset


# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')

# Inference configuration (zero-shot generation of a boxed final answer)
math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge.
# NOTE(review): the template asks for "A"/"B" in one place and
# CORRECT/INCORRECT in another — the downstream postprocessor appears to
# tolerate this, but confirm against generic_llmjudge_postprocess.
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

# Evaluation configuration using LLM as judge
math_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    )
                ],
                round=[
                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                ],
            ),
        ),
        dataset_cfg=dict(
            type=CustomDataset,
            path='opencompass/math',
            file_name='test_prm800k_500.jsonl',
            reader_cfg=math_reader_cfg,
        ),
        # The 14B model config (first entry) acts as the judge.
        judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
)

# Dataset configuration
datasets = [
    dict(
        type=CustomDataset,
        path='opencompass/math',
        file_name='test_prm800k_500.jsonl',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]

# Model to be evaluated
models = lmdeploy_qwen2_5_7b_instruct_model

# Limiting test to first 8 examples for quick testing
math_reader_cfg['test_range'] = '[0:8]'

# Output directory
work_dir = 'outputs/llm_judge'
|
examples/eval_lmdeploy_demo.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Minimal LMDeploy demo: InternLM2.5-1.8B-chat on the GSM8K demo subset."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import (
        gsm8k_datasets)
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_1_8b_chat import (
        models)

# Expose the standard top-level variables OpenCompass expects.
datasets = gsm8k_datasets
models = models  # re-bound for explicitness; imported above
|
examples/eval_longbenchv2.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate several chat models on LongBench v2 at 128k context length."""
from mmengine.config import read_base

with read_base():
    # Models
    # Datasets
    from opencompass.configs.datasets.longbenchv2.longbenchv2_gen import \
        LongBenchv2_datasets as LongBenchv2_datasets
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as lmdeploy_qwen2_5_7b_instruct_model

# Gather all imported `*_datasets` / `*_model` lists into flat lists.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

# Override every model config in place for long-context evaluation:
# 128k window, 2-way tensor parallel on 2 GPUs.
for model in models:
    model['max_seq_len'] = 128 * 1024
    model['engine_config']['session_len'] = 128 * 1024
    model['engine_config']['tp'] = 2
    model['run_cfg']['num_gpus'] = 2
    # Drop middle tokens to make input length shorter than session_len, use 128k to keep sync with Longbenchv2 original code
    # Drop middle now only support LMDeploy models
    model['drop_middle'] = True

work_dir = './outputs/longbenchv2'
|
examples/eval_math_llm_judge.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
"""MATH evaluation where Llama3-70B judges Llama3-8B's answers for equivalence."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model  # noqa: F401, F403
    from opencompass.configs.models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model  # noqa: F401, F403
    from opencompass.configs.datasets.math.math_llm_judge import math_datasets  # noqa: F401, F403

from opencompass.datasets import math_judement_preprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import AllObjSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# -------------Prompt Settings ----------------------------------------
# Few-shot judge prompt: decides whether two math expressions are equivalent
# up to trivial simplification. {obj_gold}/{prediction} are filled in later.
eng_obj_prompt = """
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications

Examples:

Expression 1: $2x+3$
Expression 2: $3+2x$

[Yes]

Expression 1: 3/2
Expression 2: 1.5

[Yes]

Expression 1: $x^2+2x+1$
Expression 2: $y^2+2y+1$

[No]

Expression 1: $x^2+2x+1$
Expression 2: $(x+1)^2$

[Yes]

Expression 1: 3245/5
Expression 2: 649

[No]
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)

Expression 1: 2/(-3)
Expression 2: -2/3

[Yes]
(trivial simplifications are allowed)

Expression 1: 72 degrees
Expression 2: 72

[Yes]
(give benefit of the doubt to units)

Expression 1: 64
Expression 2: 64 square feet

[Yes]
(give benefit of the doubt to units)

Expression 1: 64
Expression 2:

[No]
(only mark as equivalent if both expressions are nonempty)

---

YOUR TASK


Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
Expression 1: {obj_gold}
Expression 2: {prediction}

"""

# -------------Inference Stage ----------------------------------------
# eval models
models = [*hf_llama3_8b_instruct_model]
# judge models
judge_models = hf_llama3_70b_instruct_model

eng_datasets = [*math_datasets]
chn_datasets = []
datasets = eng_datasets + chn_datasets
work_dir = 'outputs/obj_all/'

# Replace each dataset's evaluator with the LLM judge configured above.
for d in eng_datasets:
    d['eval_cfg'] = dict(
        evaluator=dict(
            type=LMEvaluator,
            # If you need to preprocess the prediction before judging,
            # you can specify the pred_postprocessor function here
            pred_postprocessor=dict(type=math_judement_preprocess),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt=eng_obj_prompt),
                ]),
            ),
        ),
        pred_role='BOT',
    )

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=40000),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)

# ------------- Evaluation Configuration --------------------------------
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=80000,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=AllObjSummarizer)
|
examples/eval_math_verify.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MATH-500 evaluation of several DeepSeek-R1 distilled models via TurboMind."""
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

with read_base():
    from opencompass.configs.datasets.math.math_500_gen import math_datasets

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-llama-8b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        # NOTE(review): max_new_tokens=4096 caps generation well below
        # max_out_len=32768 (unlike the 7B/14B entries below) — confirm
        # whether this shorter budget is intentional.
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        # Strip the <think> reasoning section before answer extraction.
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        # Sampling setup (temperature 0.6 / top_p 0.95).
        gen_config=dict(
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
        # NOTE(review): same 4096-token cap as the llama-8b entry — verify.
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-14b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
        # 14B model runs 2-way tensor parallel on 2 GPUs.
        engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
        gen_config=dict(
            top_k=1,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
]

datasets = [*math_datasets]


work_dir = './outputs/math_500'
|
examples/eval_mmlu_cf.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate Llama3-8B (LMDeploy) and Qwen2.5-7B (HF) on MMLU-CF."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import \
        mmlu_cf_datasets
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import \
        models as hf_qwen2_5_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_cf import summarizer

# Flatten every imported `*_datasets` list (and any pre-existing `datasets`).
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)

work_dir = 'outputs/debug/mmlu_cf'
|
examples/eval_mmlu_pro.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate Qwen2-7B and Llama3-8B instruct models on MMLU-Pro."""
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import \
        mmlu_pro_datasets
    # Reuse shared infer/eval stage configs from the internal cluster presets.
    from opencompass.configs.internal.clusters.local import eval
    from opencompass.configs.internal.clusters.local import \
        infer_num_worker as infer
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_pro import summarizer

# Flatten every imported `*_datasets` list (and any pre-existing `datasets`).
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

work_dir = 'outputs/debug/mmlu_pro'

# Reference results from a previous run of this config:
# dataset                    version    metric         mode      qwen2-7b-instruct-turbomind    llama-3-8b-instruct-turbomind
# -------------------------  ---------  -------------  ------  -----------------------------  -------------------------------
# mmlu_pro                   -          naive_average  gen                             46.18                            43.92
# mmlu_pro_biology           736233     accuracy       gen                             63.74                            64.02
# mmlu_pro_business          736233     accuracy       gen                             53.23                            46.01
# mmlu_pro_chemistry         736233     accuracy       gen                             35.25                            32.42
# mmlu_pro_computer_science  736233     accuracy       gen                             47.07                            44.88
# mmlu_pro_economics         736233     accuracy       gen                             59.00                            53.79
# mmlu_pro_engineering       736233     accuracy       gen                             26.73                            33.54
# mmlu_pro_health            736233     accuracy       gen                             47.31                            51.34
# mmlu_pro_history           736233     accuracy       gen                             42.78                            42.26
# mmlu_pro_law               736233     accuracy       gen                             28.07                            26.98
# mmlu_pro_math              736233     accuracy       gen                             53.59                            37.53
# mmlu_pro_philosophy        736233     accuracy       gen                             42.28                            42.48
# mmlu_pro_physics           736233     accuracy       gen                             39.11                            33.64
# mmlu_pro_psychology        736233     accuracy       gen                             60.90                            59.65
# mmlu_pro_other             736233     accuracy       gen                             47.40                            46.32
examples/eval_mmlu_with_zero_retriever_overwritten.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate Qwen-7B-chat on MMLU with the retriever forced to zero-shot.

The imported MMLU config is a 5-shot setup; each dataset copy below gets its
retriever overwritten with ZeroRetriever so no in-context examples are used.
"""
from copy import deepcopy

from mmengine.config import read_base

from opencompass.openicl.icl_retriever import ZeroRetriever

with read_base():
    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
        mmlu_datasets  # this is a dataset evaluated with 5-shot
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models

datasets = []
for cfg in mmlu_datasets:
    # Deep-copy first so the shared imported config stays untouched.
    patched = deepcopy(cfg)
    patched['infer_cfg']['retriever'] = dict(type=ZeroRetriever)
    datasets.append(patched)
|
examples/eval_multi_prompt_demo.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate InternLM-chat-7B on Winogrande with a multi-prompt summarizer."""
from mmengine.config import read_base

from opencompass.models import HuggingFaceCausalLM

with read_base():
    from opencompass.configs.datasets.winogrande.winogrande_gen_a027b6 import \
        winogrande_datasets

datasets = [*winogrande_datasets]

# InternLM chat markup: user/bot turns delimited by <eoh>/<eoa>.
_meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='internlm-chat-7b-hf',
        path='internlm/internlm-chat-7b',
        tokenizer_path='internlm/internlm-chat-7b',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

# Abbreviations of every Winogrande prompt-variant sub-dataset.
_winogrande_all = [d['abbr'] for d in winogrande_datasets]

# Report the mean and the std-dev across prompt variants.
summarizer = dict(summary_groups=[
    {
        'name': 'winogrande',
        'subsets': _winogrande_all
    },
    {
        'name': 'winogrande_std',
        'subsets': _winogrande_all,
        'std': True
    },
])
|
examples/eval_musr.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
|
| 3 |
+
from mmengine.config import read_base
|
| 4 |
+
|
| 5 |
+
with read_base():
|
| 6 |
+
from opencompass.configs.datasets.musr.musr_gen_3c6e15 import musr_datasets
|
| 7 |
+
from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
|
| 8 |
+
models as lmdeploy_glm4_9b_chat_model
|
| 9 |
+
from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
|
| 10 |
+
models as lmdeploy_gemma_9b_it_model
|
| 11 |
+
from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
|
| 12 |
+
models as lmdeploy_gemma_27b_it_model
|
| 13 |
+
# from opencompass.configs.models.hf_internlm.hf_internlm2_5_1_8b_chat import models
|
| 14 |
+
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
| 15 |
+
models as lmdeploy_internlm2_5_7b_chat_model
|
| 16 |
+
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
|
| 17 |
+
models as lmdeploy_llama3_1_8b_instruct_model
|
| 18 |
+
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import \
|
| 19 |
+
models as lmdeploy_ministral_8b_instruct_2410_model
|
| 20 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
|
| 21 |
+
models as lmdeploy_qwen2_5_7b_instruct_model
|
| 22 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
|
| 23 |
+
models as lmdeploy_qwen2_5_14b_instruct_model
|
| 24 |
+
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
|
| 25 |
+
models as lmdeploy_qwen2_5_32b_instruct_model
|
| 26 |
+
from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
|
| 27 |
+
models as lmdeploy_yi_1_5_9b_chat_model
|
| 28 |
+
from opencompass.configs.summarizers.groups.musr_average import summarizer
|
| 29 |
+
|
| 30 |
+
datasets = [*musr_datasets]
|
| 31 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 32 |
+
|
| 33 |
+
base_exp_dir = 'outputs/musr/'
|
| 34 |
+
work_dir = osp.join(base_exp_dir, 'musr_eval')
|
examples/eval_needlebench_v2.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
# we use mmengine.config to import other config files
|
| 3 |
+
|
| 4 |
+
with read_base():
|
| 5 |
+
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b
|
| 6 |
+
|
| 7 |
+
# Evaluate needlebench_32k, adjust the configuration to use 4k, 32k, 128k, 200k, or 1000k if necessary.
|
| 8 |
+
# from opencompass.configs.datasets.needlebench_v2.needlebench_v2_32k.needlebench_v2_32k import needlebench_datasets
|
| 9 |
+
# from opencompass.configs.summarizers.needlebench import needlebench_32k_summarizer as summarizer
|
| 10 |
+
|
| 11 |
+
# only eval original "needle in a haystack test" in needlebench_32k
|
| 12 |
+
from opencompass.configs.datasets.needlebench_v2.needlebench_v2_32k.needlebench_v2_single_32k import needlebench_zh_datasets, needlebench_en_datasets
|
| 13 |
+
from opencompass.configs.summarizers.needlebench import needlebench_v2_32k_summarizer as summarizer
|
| 14 |
+
|
| 15 |
+
# eval Ancestral Tracing Challenge(ATC)
|
| 16 |
+
# from opencompass.configs.datasets.needlebench_v2.atc.atc_0shot_nocot_2_power_en import needlebench_datasets
|
| 17 |
+
# ATC use default summarizer thus no need to import summarizer
|
| 18 |
+
|
| 19 |
+
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
|
| 20 |
+
|
| 21 |
+
for m in internlm2_chat_7b:
|
| 22 |
+
m['max_seq_len'] = 32768 # Ensure InternLM2-7B model can receive the full long text; for other models, adjust according to their supported maximum sequence length.
|
| 23 |
+
m['max_out_len'] = 4096
|
| 24 |
+
|
| 25 |
+
models = internlm2_chat_7b
|
| 26 |
+
|
| 27 |
+
work_dir = './outputs/needlebench'
|
examples/eval_qwen3.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os.path as osp
|
| 3 |
+
from opencompass.models import OpenAISDK
|
| 4 |
+
from mmengine.config import read_base
|
| 5 |
+
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
| 6 |
+
from opencompass.runners import LocalRunner
|
| 7 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 8 |
+
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
| 9 |
+
|
| 10 |
+
with read_base():
|
| 11 |
+
from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets
|
| 12 |
+
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
|
| 13 |
+
from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets
|
| 14 |
+
|
| 15 |
+
#######################################################################
|
| 16 |
+
# PART 0 Meta Info #
|
| 17 |
+
#######################################################################
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
api_meta_template = dict(round=[
|
| 21 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 22 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 23 |
+
],
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
judge_cfg = dict(
|
| 28 |
+
abbr='qwen2-5-32B-Instruct',
|
| 29 |
+
type=OpenAISDK,
|
| 30 |
+
path='Qwen/Qwen2.5-32B-Instruct',
|
| 31 |
+
key='sk-1234',
|
| 32 |
+
openai_api_base=[
|
| 33 |
+
'http://x.x.x.x:4000/v1',
|
| 34 |
+
],
|
| 35 |
+
meta_template=api_meta_template,
|
| 36 |
+
query_per_second=8,
|
| 37 |
+
batch_size=256,
|
| 38 |
+
temperature=0.001,
|
| 39 |
+
# max_completion_tokens=32768,
|
| 40 |
+
tokenizer_path='gpt-4o-2024-05-13',
|
| 41 |
+
# verbose=True,
|
| 42 |
+
max_out_len=16384,
|
| 43 |
+
max_seq_len=32768,
|
| 44 |
+
# max_seq_len=49152,
|
| 45 |
+
mode='mid',
|
| 46 |
+
retry=10
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
#######################################################################
|
| 50 |
+
# PART 1 Datasets List #
|
| 51 |
+
#######################################################################
|
| 52 |
+
|
| 53 |
+
repeated_info = [
|
| 54 |
+
(math_datasets, 4),
|
| 55 |
+
(aime2024_datasets, 32),
|
| 56 |
+
(aime2025_datasets, 32),
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
for datasets_, num in repeated_info:
|
| 60 |
+
for dataset_ in datasets_:
|
| 61 |
+
dataset_['n'] = num
|
| 62 |
+
|
| 63 |
+
datasets = sum(
|
| 64 |
+
(v for k, v in locals().items() if k.endswith('_datasets')),
|
| 65 |
+
[],
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
for item in datasets:
|
| 69 |
+
item['infer_cfg']['inferencer']['max_out_len'] = 32768
|
| 70 |
+
try:
|
| 71 |
+
if 'judge_cfg' in item['eval_cfg']['evaluator']:
|
| 72 |
+
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
|
| 73 |
+
elif'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
|
| 74 |
+
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
|
| 75 |
+
except:
|
| 76 |
+
pass
|
| 77 |
+
#######################################################################
|
| 78 |
+
# PART 2 Dataset Summarizer #
|
| 79 |
+
#######################################################################
|
| 80 |
+
|
| 81 |
+
summarizer = dict(
|
| 82 |
+
dataset_abbrs=[
|
| 83 |
+
'MATH',
|
| 84 |
+
['math_prm800k_500', 'accuracy (4 runs average)'],
|
| 85 |
+
['aime2024', 'accuracy (32 runs average)'],
|
| 86 |
+
['aime2025', 'accuracy (32 runs average)'],
|
| 87 |
+
['livemathbench_hard', 'naive_average'],
|
| 88 |
+
['OlympiadBenchMath', 'accuracy'],
|
| 89 |
+
['olymmath', 'naive_average'],
|
| 90 |
+
],
|
| 91 |
+
summary_groups = sum(
|
| 92 |
+
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
|
| 93 |
+
),
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
#######################################################################
|
| 97 |
+
# PART 3 Models List #
|
| 98 |
+
#######################################################################
|
| 99 |
+
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
|
| 100 |
+
models += [
|
| 101 |
+
|
| 102 |
+
dict(
|
| 103 |
+
abbr='Qwen_Qwen3-235B-A22B',
|
| 104 |
+
type=OpenAISDK,
|
| 105 |
+
path='Qwen/Qwen3-235B-A22B',
|
| 106 |
+
key='sk-admin',
|
| 107 |
+
openai_api_base=[
|
| 108 |
+
'http://106.15.231.215:40007/v1/',
|
| 109 |
+
],
|
| 110 |
+
meta_template=dict(
|
| 111 |
+
# begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''),
|
| 112 |
+
round=[
|
| 113 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 114 |
+
# XXX: all system roles are mapped to human in purpose
|
| 115 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 116 |
+
]
|
| 117 |
+
),
|
| 118 |
+
query_per_second=16,
|
| 119 |
+
batch_size=128,
|
| 120 |
+
# batch_size=1,
|
| 121 |
+
temperature=0.6,
|
| 122 |
+
# max_completion_tokens=32768,
|
| 123 |
+
tokenizer_path='gpt-4',
|
| 124 |
+
# verbose=True,
|
| 125 |
+
max_out_len=32768,
|
| 126 |
+
max_seq_len=32768,
|
| 127 |
+
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
| 128 |
+
),
|
| 129 |
+
]
|
| 130 |
+
|
| 131 |
+
infer = dict(
|
| 132 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 133 |
+
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
eval = dict(
|
| 137 |
+
partitioner=dict(type=NaivePartitioner, n=8),
|
| 138 |
+
runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
base_exp_dir = 'outputs/qwen3_reasoning'
|
| 142 |
+
work_dir = osp.join(base_exp_dir, 'chat_objective')
|
examples/eval_qwen_7b_chat.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.collections.leaderboard.qwen_chat import \
|
| 5 |
+
datasets
|
| 6 |
+
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
|
| 7 |
+
from opencompass.configs.summarizers.leaderboard import summarizer
|
| 8 |
+
'''
|
| 9 |
+
dataset version metric mode qwen-7b-chat-hf
|
| 10 |
+
-------------------------------------- --------- ---------------- ------ -----------------
|
| 11 |
+
--------- 考试 Exam --------- - - - -
|
| 12 |
+
ceval - naive_average gen 56.07
|
| 13 |
+
agieval - naive_average mixed 39.51
|
| 14 |
+
mmlu - naive_average gen 53.49
|
| 15 |
+
cmmlu - naive_average gen 55.29
|
| 16 |
+
GaokaoBench - weighted_average gen 48.01
|
| 17 |
+
ARC-c ca1e8e accuracy ppl 74.92
|
| 18 |
+
ARC-e ca1e8e accuracy ppl 85.71
|
| 19 |
+
--------- 语言 Language --------- - - - -
|
| 20 |
+
WiC efbd01 accuracy gen 51.41
|
| 21 |
+
chid-dev 25f3d3 accuracy ppl 77.72
|
| 22 |
+
afqmc-dev 4a1636 accuracy gen 69.00
|
| 23 |
+
WSC 678cb5 accuracy ppl 67.31
|
| 24 |
+
tydiqa-goldp - naive_average gen 15.32
|
| 25 |
+
flores_100 - naive_average gen 10.00
|
| 26 |
+
--------- 知识 Knowledge --------- - - - -
|
| 27 |
+
BoolQ 463fee accuracy ppl 83.18
|
| 28 |
+
commonsense_qa ddaabf accuracy gen 76.41
|
| 29 |
+
triviaqa b6904f score gen 43.25
|
| 30 |
+
nq 23dc1a score gen 16.26
|
| 31 |
+
--------- 理解 Understanding --------- - - - -
|
| 32 |
+
C3 e6778d accuracy gen 81.53
|
| 33 |
+
race-middle e0908b accuracy gen 83.01
|
| 34 |
+
race-high e0908b accuracy gen 77.79
|
| 35 |
+
openbookqa_fact 49689a accuracy ppl 86.40
|
| 36 |
+
csl_dev 3c4211 accuracy ppl 64.38
|
| 37 |
+
lcsts 0b3969 rouge1 gen 12.75
|
| 38 |
+
Xsum 207e69 rouge1 gen 20.21
|
| 39 |
+
eprstmt-dev ed0c5d accuracy ppl 85.00
|
| 40 |
+
lambada de1af2 accuracy gen 59.19
|
| 41 |
+
--------- 推理 Reasoning --------- - - - -
|
| 42 |
+
cmnli 15e783 accuracy ppl 48.08
|
| 43 |
+
ocnli 15e783 accuracy ppl 51.40
|
| 44 |
+
AX_b 689df1 accuracy ppl 65.67
|
| 45 |
+
AX_g 808a19 accuracy ppl 76.12
|
| 46 |
+
RTE 808a19 accuracy ppl 68.95
|
| 47 |
+
COPA 59f42c accuracy gen 92.00
|
| 48 |
+
ReCoRD 6f7cfc score gen 0.16
|
| 49 |
+
hellaswag 8d79e0 accuracy ppl 69.28
|
| 50 |
+
piqa 34eee7 accuracy ppl 72.20
|
| 51 |
+
siqa ea30d1 accuracy ppl 72.88
|
| 52 |
+
math 2c0b9e accuracy gen 7.84
|
| 53 |
+
gsm8k 4c7f6e accuracy gen 45.41
|
| 54 |
+
drop 53a0a7 score gen 39.62
|
| 55 |
+
openai_humaneval dd0dff humaneval_pass@1 gen 10.98
|
| 56 |
+
mbpp 60ca11 score gen 20.60
|
| 57 |
+
bbh - naive_average gen 42.61
|
| 58 |
+
'''
|
examples/eval_qwen_7b_chat_lawbench.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.lawbench.lawbench_one_shot_gen_002588 import \
|
| 5 |
+
lawbench_datasets as lawbench_one_shot_datasets
|
| 6 |
+
from opencompass.configs.datasets.lawbench.lawbench_zero_shot_gen_002588 import \
|
| 7 |
+
lawbench_datasets as lawbench_zero_shot_datasets
|
| 8 |
+
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
|
| 9 |
+
from opencompass.configs.summarizers.lawbench import summarizer
|
| 10 |
+
|
| 11 |
+
datasets = lawbench_zero_shot_datasets + lawbench_one_shot_datasets
|
| 12 |
+
for d in datasets:
|
| 13 |
+
d['infer_cfg']['inferencer']['save_every'] = 1
|
examples/eval_rewardbench.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
with read_base():
|
| 3 |
+
from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
|
| 4 |
+
from opencompass.configs.summarizers.rewardbench import summarizer
|
| 5 |
+
|
| 6 |
+
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
| 7 |
+
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
| 8 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 9 |
+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
| 10 |
+
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
| 11 |
+
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
| 12 |
+
from opencompass.runners import SlurmSequentialRunner
|
| 13 |
+
from opencompass.tasks import OpenICLInferTask
|
| 14 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 15 |
+
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
| 16 |
+
|
| 17 |
+
api_meta_template = dict(
|
| 18 |
+
round=[
|
| 19 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 20 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 21 |
+
]
|
| 22 |
+
)
|
| 23 |
+
datasets = [*get_rewardbench_datasets]
|
| 24 |
+
|
| 25 |
+
from opencompass.models import TurboMindModelwithChatTemplate
|
| 26 |
+
|
| 27 |
+
models = [
|
| 28 |
+
dict(
|
| 29 |
+
type=TurboMindModelwithChatTemplate,
|
| 30 |
+
abbr='qwen-7b-hf',
|
| 31 |
+
path='Qwen/Qwen-7B',
|
| 32 |
+
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
| 33 |
+
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
| 34 |
+
max_seq_len=16384,
|
| 35 |
+
max_out_len=2048,
|
| 36 |
+
batch_size=16,
|
| 37 |
+
run_cfg=dict(num_gpus=1),
|
| 38 |
+
),
|
| 39 |
+
]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
infer = dict(
|
| 43 |
+
partitioner=dict(type=NaivePartitioner),
|
| 44 |
+
runner=dict(
|
| 45 |
+
type=LocalRunner,
|
| 46 |
+
max_num_workers=72,
|
| 47 |
+
task=dict(type=OpenICLInferTask),
|
| 48 |
+
),
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
work_dir = './outputs/rewardbench/'
|
examples/eval_rmb.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
with read_base():
|
| 3 |
+
from opencompass.configs.datasets.judge.rmb import get_rmb_dataset
|
| 4 |
+
|
| 5 |
+
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
| 6 |
+
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
|
| 7 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 8 |
+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
| 9 |
+
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
|
| 10 |
+
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
|
| 11 |
+
from opencompass.runners import SlurmSequentialRunner
|
| 12 |
+
from opencompass.tasks import OpenICLInferTask
|
| 13 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 14 |
+
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
|
| 15 |
+
|
| 16 |
+
api_meta_template = dict(
|
| 17 |
+
round=[
|
| 18 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 19 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 20 |
+
]
|
| 21 |
+
)
|
| 22 |
+
datasets = [*get_rmb_dataset]
|
| 23 |
+
|
| 24 |
+
from opencompass.models import TurboMindModelwithChatTemplate
|
| 25 |
+
|
| 26 |
+
models = [
|
| 27 |
+
dict(
|
| 28 |
+
type=TurboMindModelwithChatTemplate,
|
| 29 |
+
abbr='qwen-7b-hf',
|
| 30 |
+
path='Qwen/Qwen-7B',
|
| 31 |
+
engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
|
| 32 |
+
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
|
| 33 |
+
max_seq_len=16384,
|
| 34 |
+
max_out_len=2048,
|
| 35 |
+
batch_size=16,
|
| 36 |
+
run_cfg=dict(num_gpus=1),
|
| 37 |
+
),
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
infer = dict(
|
| 42 |
+
# partitioner=dict(type=NaivePartitioner),
|
| 43 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 44 |
+
runner=dict(
|
| 45 |
+
type=LocalRunner,
|
| 46 |
+
max_num_workers=72,
|
| 47 |
+
task=dict(type=OpenICLInferTask),
|
| 48 |
+
),
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
work_dir = './outputs/rmb/'
|
examples/eval_ruler.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 4 |
+
from opencompass.runners import LocalRunner
|
| 5 |
+
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
| 6 |
+
|
| 7 |
+
with read_base():
|
| 8 |
+
from opencompass.configs.datasets.ruler.ruler_cwe_gen import cwe_datasets # CWE
|
| 9 |
+
from opencompass.configs.datasets.ruler.ruler_fwe_gen import fwe_datasets # FWE
|
| 10 |
+
from opencompass.configs.datasets.ruler.ruler_niah_gen import niah_datasets # Niah
|
| 11 |
+
from opencompass.configs.datasets.ruler.ruler_qa_gen import qa_datasets # QA
|
| 12 |
+
from opencompass.configs.datasets.ruler.ruler_vt_gen import vt_datasets # VT
|
| 13 |
+
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import (
|
| 14 |
+
models as internlm2_5_7b_chat_1m,
|
| 15 |
+
)
|
| 16 |
+
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
|
| 17 |
+
models as llama3_8b_instruct_model,
|
| 18 |
+
)
|
| 19 |
+
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
|
| 20 |
+
models as qwen2_7b_instruct_model,
|
| 21 |
+
)
|
| 22 |
+
from opencompass.configs.summarizers.groups.ruler import ruler_summary_groups
|
| 23 |
+
|
| 24 |
+
import_datasets = sum(
|
| 25 |
+
[niah_datasets, vt_datasets, fwe_datasets, cwe_datasets, qa_datasets], [])
|
| 26 |
+
|
| 27 |
+
# Evaluation config
|
| 28 |
+
NUM_SAMPLES = 500
|
| 29 |
+
# Change the context lengths to be tested
|
| 30 |
+
max_seq_lens = [1024 * 4, 1024 * 8, 1024 * 16, 1024 * 32]
|
| 31 |
+
abbr_suffixs = ['4k', '8k', '16k', '32k']
|
| 32 |
+
work_dir = './outputs/ruler'
|
| 33 |
+
|
| 34 |
+
# Model Settings
|
| 35 |
+
qwen2_7b_instruct_model[0]['max_seq_len'] = 33792
|
| 36 |
+
qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 33792
|
| 37 |
+
qwen2_7b_instruct_model[0]['engine_config']['tp'] = 2
|
| 38 |
+
qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 2
|
| 39 |
+
llama3_8b_instruct_model[0]['max_seq_len'] = 33792
|
| 40 |
+
llama3_8b_instruct_model[0]['engine_config']['session_len'] = 33792
|
| 41 |
+
llama3_8b_instruct_model[0]['engine_config']['tp'] = 2
|
| 42 |
+
llama3_8b_instruct_model[0]['run_cfg']['num_gpus'] = 2
|
| 43 |
+
model_settings = [
|
| 44 |
+
[qwen2_7b_instruct_model[0], 'Qwen/Qwen2-7B-Instruct'],
|
| 45 |
+
[llama3_8b_instruct_model[0], 'meta-llama/Meta-Llama-3-8B-Instruct'],
|
| 46 |
+
[internlm2_5_7b_chat_1m[0], 'internlm/internlm2_5-7b-chat-1m'],
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
# Dataset Model Combination
|
| 50 |
+
datasets = []
|
| 51 |
+
models = []
|
| 52 |
+
model_dataset_combinations = []
|
| 53 |
+
|
| 54 |
+
# Different seq length
|
| 55 |
+
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
|
| 56 |
+
for model, model_path in model_settings:
|
| 57 |
+
_tmp_datasets = []
|
| 58 |
+
for dataset in import_datasets:
|
| 59 |
+
tmp_dataset = dataset.deepcopy()
|
| 60 |
+
tmp_dataset['tokenizer_model'] = model_path
|
| 61 |
+
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
|
| 62 |
+
tmp_dataset['num_samples'] = NUM_SAMPLES
|
| 63 |
+
tmp_dataset['max_seq_length'] = max_seq_len
|
| 64 |
+
_tmp_datasets.append(tmp_dataset)
|
| 65 |
+
model_dataset_combinations.append(
|
| 66 |
+
dict(models=[model], datasets=_tmp_datasets))
|
| 67 |
+
models.append(model)
|
| 68 |
+
datasets.extend(_tmp_datasets)
|
| 69 |
+
|
| 70 |
+
infer = dict(
|
| 71 |
+
partitioner=dict(type=NumWorkerPartitioner),
|
| 72 |
+
runner=dict(type=LocalRunner,
|
| 73 |
+
max_num_workers=16,
|
| 74 |
+
task=dict(type=OpenICLInferTask),
|
| 75 |
+
retry=5),
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
eval = dict(
|
| 79 |
+
partitioner=dict(type=NaivePartitioner),
|
| 80 |
+
runner=dict(type=LocalRunner,
|
| 81 |
+
max_num_workers=32,
|
| 82 |
+
task=dict(type=OpenICLEvalTask)),
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
summarizer = dict(
|
| 86 |
+
dataset_abbrs=abbr_suffixs,
|
| 87 |
+
summary_groups=sum([ruler_summary_groups], []),
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 91 |
+
# dataset version metric mode qwen2-7b-instruct-turbomind llama-3-8b-instruct-turbomind internlm2_5-7b-chat-1m-turbomind
|
| 92 |
+
# --------- --------- ------------- ------ ----------------------------- ------------------------------- ----------------------------------
|
| 93 |
+
# 4k - naive_average gen 93.66 93.48 91.20
|
| 94 |
+
# 8k - naive_average gen 88.38 89.95 89.07
|
| 95 |
+
# 16k - naive_average gen 84.27 0.14 87.61
|
| 96 |
+
# 32k - naive_average gen 81.36 0.00 84.59
|
| 97 |
+
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
|
examples/eval_rwkv5_3b.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.collections.base_medium_llama import \
|
| 5 |
+
datasets
|
| 6 |
+
from opencompass.configs.models.rwkv.rwkv5_3b import models
|
| 7 |
+
from opencompass.configs.summarizers.leaderboard import summarizer
|
examples/eval_simpleqa.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
|
| 2 |
+
from mmengine.config import read_base
|
| 3 |
+
|
| 4 |
+
from opencompass.partitioners import NaivePartitioner
|
| 5 |
+
from opencompass.runners import LocalRunner
|
| 6 |
+
from opencompass.summarizers import DefaultSubjectiveSummarizer
|
| 7 |
+
from opencompass.tasks import OpenICLInferTask
|
| 8 |
+
|
| 9 |
+
with read_base():
|
| 10 |
+
from opencompass.configs.datasets.SimpleQA.simpleqa_gen import \
|
| 11 |
+
simpleqa_datasets
|
| 12 |
+
from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
|
| 13 |
+
models as gpt_4o_2024_05_13_model
|
| 14 |
+
|
| 15 |
+
models = gpt_4o_2024_05_13_model # model for generation
|
| 16 |
+
judge_models = gpt_4o_2024_05_13_model # model for evaluation
|
| 17 |
+
|
| 18 |
+
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
|
| 19 |
+
summarizer = dict(type=DefaultSubjectiveSummarizer)
|
| 20 |
+
|
| 21 |
+
# -------------Inferen Stage ----------------------------------------
|
| 22 |
+
|
| 23 |
+
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
|
| 24 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 25 |
+
from opencompass.runners import LocalRunner
|
| 26 |
+
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
| 27 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 28 |
+
|
| 29 |
+
infer = dict(
|
| 30 |
+
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
|
| 31 |
+
runner=dict(type=LocalRunner,
|
| 32 |
+
max_num_workers=8,
|
| 33 |
+
task=dict(type=OpenICLInferTask)),
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
eval = dict(
|
| 37 |
+
partitioner=dict(
|
| 38 |
+
type=SubjectiveNaivePartitioner,
|
| 39 |
+
models=models,
|
| 40 |
+
judge_models=judge_models,
|
| 41 |
+
),
|
| 42 |
+
runner=dict(type=LocalRunner,
|
| 43 |
+
max_num_workers=256,
|
| 44 |
+
task=dict(type=SubjectiveEvalTask)),
|
| 45 |
+
)
|
examples/eval_subjective.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
|
| 5 |
+
from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
|
| 6 |
+
from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import compassarena_datasets
|
| 7 |
+
from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
|
| 8 |
+
from opencompass.configs.datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
|
| 9 |
+
from opencompass.configs.datasets.subjective.fofo.fofo_judge import fofo_datasets
|
| 10 |
+
from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets
|
| 11 |
+
from opencompass.configs.datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
|
| 12 |
+
from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
|
| 13 |
+
|
| 14 |
+
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
|
| 15 |
+
HuggingFaceChatGLM3, OpenAI)
|
| 16 |
+
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
| 17 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 18 |
+
from opencompass.partitioners.sub_num_worker import \
|
| 19 |
+
SubjectiveNumWorkerPartitioner
|
| 20 |
+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
| 21 |
+
from opencompass.runners import LocalRunner, SlurmSequentialRunner
|
| 22 |
+
from opencompass.summarizers import SubjectiveSummarizer
|
| 23 |
+
from opencompass.tasks import OpenICLInferTask
|
| 24 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 25 |
+
|
| 26 |
+
api_meta_template = dict(round=[
|
| 27 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 28 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 29 |
+
])
|
| 30 |
+
|
| 31 |
+
# -------------Inference Stage ----------------------------------------
|
| 32 |
+
# For subjective evaluation, we often set do sample for models
|
| 33 |
+
models = [
|
| 34 |
+
dict(
|
| 35 |
+
type=HuggingFaceChatGLM3,
|
| 36 |
+
abbr='chatglm3-6b-hf',
|
| 37 |
+
path='THUDM/chatglm3-6b',
|
| 38 |
+
tokenizer_path='THUDM/chatglm3-6b',
|
| 39 |
+
model_kwargs=dict(
|
| 40 |
+
device_map='auto',
|
| 41 |
+
trust_remote_code=True,
|
| 42 |
+
),
|
| 43 |
+
tokenizer_kwargs=dict(
|
| 44 |
+
padding_side='left',
|
| 45 |
+
truncation_side='left',
|
| 46 |
+
trust_remote_code=True,
|
| 47 |
+
),
|
| 48 |
+
generation_kwargs=dict(
|
| 49 |
+
do_sample=
|
| 50 |
+
True, #For subjective evaluation, we suggest you do set do_sample when running model inference!
|
| 51 |
+
),
|
| 52 |
+
meta_template=api_meta_template,
|
| 53 |
+
max_out_len=2048,
|
| 54 |
+
max_seq_len=4096,
|
| 55 |
+
batch_size=8,
|
| 56 |
+
run_cfg=dict(num_gpus=1, num_procs=1),
|
| 57 |
+
)
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
datasets = [
|
| 61 |
+
*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets,
|
| 62 |
+
*compassarena_datasets, *compassbench_datasets, *fofo_datasets,
|
| 63 |
+
*mtbench_datasets, *mtbench101_datasets, *wildbench_datasets
|
| 64 |
+
] # add datasets you want
|
| 65 |
+
|
| 66 |
+
infer = dict(
|
| 67 |
+
partitioner=dict(type=NaivePartitioner),
|
| 68 |
+
runner=dict(type=LocalRunner,
|
| 69 |
+
max_num_workers=16,
|
| 70 |
+
task=dict(type=OpenICLInferTask)),
|
| 71 |
+
)
|
| 72 |
+
# -------------Evalation Stage ----------------------------------------
|
| 73 |
+
|
| 74 |
+
## ------------- JudgeLLM Configuration
|
| 75 |
+
judge_models = [
|
| 76 |
+
dict(
|
| 77 |
+
abbr='GPT4-Turbo',
|
| 78 |
+
type=OpenAI,
|
| 79 |
+
path='gpt-4-1106-preview',
|
| 80 |
+
key=
|
| 81 |
+
'xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
| 82 |
+
meta_template=api_meta_template,
|
| 83 |
+
query_per_second=16,
|
| 84 |
+
max_out_len=2048,
|
| 85 |
+
max_seq_len=2048,
|
| 86 |
+
batch_size=8,
|
| 87 |
+
temperature=0,
|
| 88 |
+
)
|
| 89 |
+
]
|
| 90 |
+
|
| 91 |
+
## ------------- Evaluation Configuration
|
| 92 |
+
eval = dict(
|
| 93 |
+
partitioner=dict(
|
| 94 |
+
type=SubjectiveNaivePartitioner,
|
| 95 |
+
models=models,
|
| 96 |
+
judge_models=judge_models,
|
| 97 |
+
),
|
| 98 |
+
runner=dict(type=LocalRunner,
|
| 99 |
+
max_num_workers=16,
|
| 100 |
+
task=dict(type=SubjectiveEvalTask)),
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
|
| 104 |
+
work_dir = 'outputs/subjective/'
|
examples/eval_subjective_bradleyterry.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base

with read_base():
    # Bradley-Terry variants of the pairwise subjective datasets.
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_bradleyterry import (
        alpacav2_datasets, )
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_bradleyterry import (
        arenahard_datasets, )
    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_bradleyterry import (
        compassarena_datasets, )
    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_bradleyterry import (
        wildbench_datasets, )

    # Candidate models served through LMDeploy.
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as lmdeploy_internlm2_5_7b_chat, )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
        models as lmdeploy_internlm2_5_20b_chat, )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct, )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
        models as lmdeploy_qwen2_5_14b_instruct, )
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
        models as lmdeploy_qwen2_7b_instruct, )

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
    SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import (CompassArenaBradleyTerrySummarizer,
                                     SubjectiveSummarizer)
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# Two-role chat template used when talking to API-style models.
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

# ------------- Inference Stage ----------------------------------------
# For subjective evaluation, sampling is usually enabled on the candidate
# models when running inference.
models = [
    *lmdeploy_internlm2_5_7b_chat,
    *lmdeploy_internlm2_5_20b_chat,
    *lmdeploy_qwen2_5_14b_instruct,
    *lmdeploy_qwen2_5_7b_instruct,
    *lmdeploy_qwen2_7b_instruct,
]

datasets = [
    *alpacav2_datasets,
    *arenahard_datasets,
    *compassarena_datasets,
    *wildbench_datasets,
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)
# ------------- Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
# A locally served CompassJudger model (via TurboMind, 4-way tensor
# parallelism) scores the pairwise comparisons.
judge_models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='CompassJudger-1-32B-Instruct',
        path='opencompass/CompassJudger-1-32B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
        # Near-greedy decoding so judgements are reproducible.
        gen_config=dict(top_k=1,
                        temperature=1e-6,
                        top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=4),
    )
]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=SubjectiveEvalTask),
    ),
)

## ------------- Summary Configuration
# This step fits a Bradley-Terry model (statistical model) with an option
# to include style features and control variables based on groups
# (group variables must be available in the input dataset for each
# observation).
summarizer = dict(
    type=CompassArenaBradleyTerrySummarizer,
    rating_system='bradleyterry',
    report_pred_win_rates=True,
    num_bootstrap=100,
    num_cpu=None,
    with_control_vars=True,
    normalize_style_features=False,
    odds_ratio=True,
)

work_dir = 'outputs/subjective/bradleyterry'
|
examples/eval_teval.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from copy import deepcopy

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
        teval_datasets as teval_en_datasets
    from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
        teval_datasets as teval_zh_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models as hf_internlm2_chat_7b_model
    from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import \
        models as hf_llama2_7b_chat_model
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
        models as hf_qwen_7b_chat_model
    from opencompass.configs.summarizers.teval import summarizer

# Explicit SYSTEM-round definitions for models whose chat template has a
# dedicated system role; other models fall back to a relabelled HUMAN round.
meta_template_system_patches = {
    'internlm2-chat-7b-hf':
    dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
    'internlm2-chat-20b-hf':
    dict(role='SYSTEM', begin='<|im_start|>system\n', end='<|im_end|>\n'),
}

# Collect every ``*_model`` list imported above into one flat list.
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
                     [])
models = []
for model_cfg in _origin_models:
    model_cfg = deepcopy(model_cfg)
    if 'meta_template' not in model_cfg or 'round' not in model_cfg[
            'meta_template']:
        raise ValueError(f'no meta_template.round in {model_cfg.get("abbr", None)}')

    rounds = model_cfg['meta_template']['round']
    if all(item['role'].upper() != 'SYSTEM' for item in rounds):
        # No SYSTEM round yet: prefer the per-model patch, otherwise clone
        # the HUMAN round and relabel it as SYSTEM.
        if model_cfg['abbr'] in meta_template_system_patches:
            sys_round = meta_template_system_patches[model_cfg['abbr']]
        else:
            sys_round = [
                item for item in rounds if item['role'].upper() == 'HUMAN'
            ][0]
        sys_round = deepcopy(sys_round)
        sys_round['role'] = 'SYSTEM'
        rounds.append(sys_round)

    print(
        f'model {model_cfg["abbr"]} is using the following meta_template: {model_cfg["meta_template"]}'
    )
    models.append(model_cfg)

datasets = teval_en_datasets + teval_zh_datasets
work_dir = './outputs/teval'
"""Dataset version metric mode
qwen-7b-chat-hf internlm2-chat-7b-hf llama-2-7b-chat-hf.

------------------------------------------- --------- -------------- ------- ----------------- ---------------------- --------------------
teval - naive_average unknown 57.69 78.18 36.63
teval-instruct_v1 10482d string_metric unknown 28.83 98.08 50.27
teval-instruct_v1 10482d json_metric unknown 94.32 97.08 0.15
teval-plan_str_v1 10482d f1_score unknown 66.24 84.12 45.72
teval-plan_json_v1 10482d f1_score unknown 63.62 77.71 19.95
teval-reason_str_v1 10482d thought unknown 54.14 63.58 44.92
teval-reason_retrieve_understand_json_v1 10482d thought unknown 33.77 54.72 21.49
teval-retrieve_str_v1 10482d name unknown 73.89 85.28 60.6
teval-reason_retrieve_understand_json_v1 10482d name unknown 31.15 68.97 15.34
teval-understand_str_v1 10482d args unknown 77.76 93.03 65.61
teval-reason_retrieve_understand_json_v1 10482d args unknown 44.16 72.23 26.84
teval-review_str_v1 10482d review_quality unknown 62.22 71.66 44.35
teval_zh - naive_average unknown 61.31 75.01 32.33
teval-instruct_v1_zh 10482d string_metric unknown 88.69 98.19 23.64
teval-instruct_v1_zh 10482d json_metric unknown 75.77 96.62 0.89
teval-plan_str_v1_zh 10482d f1_score unknown 62.43 70.69 47.82
teval-plan_json_v1_zh 10482d f1_score unknown 61.46 68.95 15.87
teval-reason_str_v1_zh 10482d thought unknown 59.43 68.14 46.96
teval-reason_retrieve_understand_json_v1_zh 10482d thought unknown 39.19 60.37 23.91
teval-retrieve_str_v1_zh 10482d name unknown 69.41 84.22 54.44
teval-reason_retrieve_understand_json_v1_zh 10482d name unknown 32.87 70.46 14.16
teval-understand_str_v1_zh 10482d args unknown 84.39 88.62 77.29
teval-reason_retrieve_understand_json_v1_zh 10482d args unknown 48.71 72.71 28.83
teval-review_str_v1_zh 10482d review_quality unknown 56.67 60.57 27.1
"""
|
examples/eval_with_model_dataset_combinations.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets as chat_ceval_datasets
    from opencompass.configs.datasets.ceval.ceval_ppl_578f8d import \
        ceval_datasets as base_ceval_datasets
    from opencompass.configs.internal.clusters.slurm import eval, infer
    from opencompass.configs.models.qwen.hf_qwen_7b import \
        models as hf_qwen_7b_base_models
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
        models as hf_qwen_7b_chat_models

# Alternative cluster runner setups, kept for reference:
# from opencompass.configs.internal.clusters.slurm import infer_split as infer, eval
# from opencompass.configs.internal.clusters.slurm import infer_size as infer, eval
# from opencompass.configs.internal.clusters.slurm import infer_size_split as infer, eval

# Keep this demo fast: one ppl subset for the base model, one gen subset for
# the chat model.
base_ceval_datasets = base_ceval_datasets[:1]
chat_ceval_datasets = chat_ceval_datasets[-1:]

# If you do not want to run all the combinations of models and datasets, you
# can specify the combinations you want to run here. This is useful when you
# deliberately want to skip some subset of the combinations.
# Models and datasets in different combinations are recommended to be disjoint
# (different `abbr` in model & dataset configs), as we haven't tested this case
# thoroughly.
model_dataset_combinations = [
    dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
    dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
    # dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
]

# The union of models and datasets in model_dataset_combinations should be
# stored in the `models` and `datasets` variables below. Otherwise, modules
# like summarizer will miss out some information.
models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
datasets = [*base_ceval_datasets, *chat_ceval_datasets]

work_dir = './outputs/default/mdcomb/'
"""
dataset version metric mode qwen-7b-hf qwen-7b-chat-hf
---------------------- --------- -------- ------ ------------ -----------------
ceval-computer_network 9b9417 accuracy ppl 52.63 -
ceval-physician 6e277d accuracy gen - 59.18
"""
|
tmp/38bf021a-c80f-4a23-9021-f2adc82afa5d_params.py
ADDED
|
@@ -0,0 +1,1424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets = [
|
| 2 |
+
[
|
| 3 |
+
dict(
|
| 4 |
+
abbr='LongBench_2wikimqa_3',
|
| 5 |
+
eval_cfg=dict(
|
| 6 |
+
evaluator=dict(
|
| 7 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 8 |
+
pred_role='BOT'),
|
| 9 |
+
infer_cfg=dict(
|
| 10 |
+
inferencer=dict(
|
| 11 |
+
max_out_len=32,
|
| 12 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
template=dict(round=[
|
| 15 |
+
dict(
|
| 16 |
+
prompt=
|
| 17 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 18 |
+
role='HUMAN'),
|
| 19 |
+
]),
|
| 20 |
+
type=
|
| 21 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 22 |
+
retriever=dict(
|
| 23 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 24 |
+
name='2wikimqa',
|
| 25 |
+
path='opencompass/Longbench',
|
| 26 |
+
reader_cfg=dict(
|
| 27 |
+
input_columns=[
|
| 28 |
+
'context',
|
| 29 |
+
'input',
|
| 30 |
+
],
|
| 31 |
+
output_column='answers',
|
| 32 |
+
test_range='[75:100]',
|
| 33 |
+
test_split='test',
|
| 34 |
+
train_split='test'),
|
| 35 |
+
type='opencompass.datasets.LongBench2wikimqaDataset'),
|
| 36 |
+
dict(
|
| 37 |
+
abbr='LongBench_hotpotqa_3',
|
| 38 |
+
eval_cfg=dict(
|
| 39 |
+
evaluator=dict(
|
| 40 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 41 |
+
pred_role='BOT'),
|
| 42 |
+
infer_cfg=dict(
|
| 43 |
+
inferencer=dict(
|
| 44 |
+
max_out_len=32,
|
| 45 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 46 |
+
prompt_template=dict(
|
| 47 |
+
template=dict(round=[
|
| 48 |
+
dict(
|
| 49 |
+
prompt=
|
| 50 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 51 |
+
role='HUMAN'),
|
| 52 |
+
]),
|
| 53 |
+
type=
|
| 54 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 55 |
+
retriever=dict(
|
| 56 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 57 |
+
name='hotpotqa',
|
| 58 |
+
path='opencompass/Longbench',
|
| 59 |
+
reader_cfg=dict(
|
| 60 |
+
input_columns=[
|
| 61 |
+
'context',
|
| 62 |
+
'input',
|
| 63 |
+
],
|
| 64 |
+
output_column='answers',
|
| 65 |
+
test_range='[75:100]',
|
| 66 |
+
test_split='test',
|
| 67 |
+
train_split='test'),
|
| 68 |
+
type='opencompass.datasets.LongBenchhotpotqaDataset'),
|
| 69 |
+
dict(
|
| 70 |
+
abbr='LongBench_musique_3',
|
| 71 |
+
eval_cfg=dict(
|
| 72 |
+
evaluator=dict(
|
| 73 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 74 |
+
pred_role='BOT'),
|
| 75 |
+
infer_cfg=dict(
|
| 76 |
+
inferencer=dict(
|
| 77 |
+
max_out_len=32,
|
| 78 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 79 |
+
prompt_template=dict(
|
| 80 |
+
template=dict(round=[
|
| 81 |
+
dict(
|
| 82 |
+
prompt=
|
| 83 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 84 |
+
role='HUMAN'),
|
| 85 |
+
]),
|
| 86 |
+
type=
|
| 87 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 88 |
+
retriever=dict(
|
| 89 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 90 |
+
name='musique',
|
| 91 |
+
path='opencompass/Longbench',
|
| 92 |
+
reader_cfg=dict(
|
| 93 |
+
input_columns=[
|
| 94 |
+
'context',
|
| 95 |
+
'input',
|
| 96 |
+
],
|
| 97 |
+
output_column='answers',
|
| 98 |
+
test_range='[75:100]',
|
| 99 |
+
test_split='test',
|
| 100 |
+
train_split='test'),
|
| 101 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 102 |
+
dict(
|
| 103 |
+
abbr='LongBench_multifieldqa_en_3',
|
| 104 |
+
eval_cfg=dict(
|
| 105 |
+
evaluator=dict(
|
| 106 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 107 |
+
pred_role='BOT'),
|
| 108 |
+
infer_cfg=dict(
|
| 109 |
+
inferencer=dict(
|
| 110 |
+
max_out_len=64,
|
| 111 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 112 |
+
prompt_template=dict(
|
| 113 |
+
template=dict(round=[
|
| 114 |
+
dict(
|
| 115 |
+
prompt=
|
| 116 |
+
'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 117 |
+
role='HUMAN'),
|
| 118 |
+
]),
|
| 119 |
+
type=
|
| 120 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 121 |
+
retriever=dict(
|
| 122 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 123 |
+
name='multifieldqa_en',
|
| 124 |
+
path='opencompass/Longbench',
|
| 125 |
+
reader_cfg=dict(
|
| 126 |
+
input_columns=[
|
| 127 |
+
'context',
|
| 128 |
+
'input',
|
| 129 |
+
],
|
| 130 |
+
output_column='answers',
|
| 131 |
+
test_range='[57:76]',
|
| 132 |
+
test_split='test',
|
| 133 |
+
train_split='test'),
|
| 134 |
+
type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
|
| 135 |
+
dict(
|
| 136 |
+
abbr='LongBench_multifieldqa_zh_3',
|
| 137 |
+
eval_cfg=dict(
|
| 138 |
+
evaluator=dict(
|
| 139 |
+
language='zh',
|
| 140 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 141 |
+
pred_role='BOT'),
|
| 142 |
+
infer_cfg=dict(
|
| 143 |
+
inferencer=dict(
|
| 144 |
+
max_out_len=64,
|
| 145 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 146 |
+
prompt_template=dict(
|
| 147 |
+
template=dict(round=[
|
| 148 |
+
dict(
|
| 149 |
+
prompt=
|
| 150 |
+
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
|
| 151 |
+
role='HUMAN'),
|
| 152 |
+
]),
|
| 153 |
+
type=
|
| 154 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 155 |
+
retriever=dict(
|
| 156 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 157 |
+
name='multifieldqa_zh',
|
| 158 |
+
path='opencompass/Longbench',
|
| 159 |
+
reader_cfg=dict(
|
| 160 |
+
input_columns=[
|
| 161 |
+
'context',
|
| 162 |
+
'input',
|
| 163 |
+
],
|
| 164 |
+
output_column='answers',
|
| 165 |
+
test_range='[75:100]',
|
| 166 |
+
test_split='test',
|
| 167 |
+
train_split='test'),
|
| 168 |
+
type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
|
| 169 |
+
dict(
|
| 170 |
+
abbr='LongBench_narrativeqa_3',
|
| 171 |
+
eval_cfg=dict(
|
| 172 |
+
evaluator=dict(
|
| 173 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 174 |
+
pred_role='BOT'),
|
| 175 |
+
infer_cfg=dict(
|
| 176 |
+
inferencer=dict(
|
| 177 |
+
max_out_len=128,
|
| 178 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 179 |
+
prompt_template=dict(
|
| 180 |
+
template=dict(round=[
|
| 181 |
+
dict(
|
| 182 |
+
prompt=
|
| 183 |
+
'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
|
| 184 |
+
role='HUMAN'),
|
| 185 |
+
]),
|
| 186 |
+
type=
|
| 187 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 188 |
+
retriever=dict(
|
| 189 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 190 |
+
name='narrativeqa',
|
| 191 |
+
path='opencompass/Longbench',
|
| 192 |
+
reader_cfg=dict(
|
| 193 |
+
input_columns=[
|
| 194 |
+
'context',
|
| 195 |
+
'input',
|
| 196 |
+
],
|
| 197 |
+
output_column='answers',
|
| 198 |
+
test_range='[75:100]',
|
| 199 |
+
test_split='test',
|
| 200 |
+
train_split='test'),
|
| 201 |
+
type='opencompass.datasets.LongBenchnarrativeqaDataset'),
|
| 202 |
+
dict(
|
| 203 |
+
abbr='LongBench_qasper_3',
|
| 204 |
+
eval_cfg=dict(
|
| 205 |
+
evaluator=dict(
|
| 206 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 207 |
+
pred_role='BOT'),
|
| 208 |
+
infer_cfg=dict(
|
| 209 |
+
inferencer=dict(
|
| 210 |
+
max_out_len=32,
|
| 211 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 212 |
+
prompt_template=dict(
|
| 213 |
+
template=dict(round=[
|
| 214 |
+
dict(
|
| 215 |
+
prompt=
|
| 216 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 217 |
+
role='HUMAN'),
|
| 218 |
+
]),
|
| 219 |
+
type=
|
| 220 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 221 |
+
retriever=dict(
|
| 222 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 223 |
+
name='qasper',
|
| 224 |
+
path='opencompass/Longbench',
|
| 225 |
+
reader_cfg=dict(
|
| 226 |
+
input_columns=[
|
| 227 |
+
'context',
|
| 228 |
+
'input',
|
| 229 |
+
],
|
| 230 |
+
output_column='answers',
|
| 231 |
+
test_range='[75:100]',
|
| 232 |
+
test_split='test',
|
| 233 |
+
train_split='test'),
|
| 234 |
+
type='opencompass.datasets.LongBenchqasperDataset'),
|
| 235 |
+
dict(
|
| 236 |
+
abbr='LongBench_triviaqa_3',
|
| 237 |
+
eval_cfg=dict(
|
| 238 |
+
evaluator=dict(
|
| 239 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 240 |
+
pred_postprocessor=dict(
|
| 241 |
+
type='opencompass.datasets.triviaqa_postprocess'),
|
| 242 |
+
pred_role='BOT'),
|
| 243 |
+
infer_cfg=dict(
|
| 244 |
+
inferencer=dict(
|
| 245 |
+
max_out_len=32,
|
| 246 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 247 |
+
prompt_template=dict(
|
| 248 |
+
template=dict(round=[
|
| 249 |
+
dict(
|
| 250 |
+
prompt=
|
| 251 |
+
'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 252 |
+
role='HUMAN'),
|
| 253 |
+
]),
|
| 254 |
+
type=
|
| 255 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 256 |
+
retriever=dict(
|
| 257 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 258 |
+
name='triviaqa',
|
| 259 |
+
path='opencompass/Longbench',
|
| 260 |
+
reader_cfg=dict(
|
| 261 |
+
input_columns=[
|
| 262 |
+
'context',
|
| 263 |
+
'input',
|
| 264 |
+
],
|
| 265 |
+
output_column='answers',
|
| 266 |
+
test_range='[75:100]',
|
| 267 |
+
test_split='test',
|
| 268 |
+
train_split='test'),
|
| 269 |
+
type='opencompass.datasets.LongBenchtriviaqaDataset'),
|
| 270 |
+
dict(
|
| 271 |
+
abbr='LongBench_gov_report_3',
|
| 272 |
+
eval_cfg=dict(
|
| 273 |
+
evaluator=dict(
|
| 274 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 275 |
+
pred_role='BOT'),
|
| 276 |
+
infer_cfg=dict(
|
| 277 |
+
inferencer=dict(
|
| 278 |
+
max_out_len=512,
|
| 279 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 280 |
+
prompt_template=dict(
|
| 281 |
+
template=dict(round=[
|
| 282 |
+
dict(
|
| 283 |
+
prompt=
|
| 284 |
+
'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
|
| 285 |
+
role='HUMAN'),
|
| 286 |
+
]),
|
| 287 |
+
type=
|
| 288 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 289 |
+
retriever=dict(
|
| 290 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 291 |
+
name='gov_report',
|
| 292 |
+
path='opencompass/Longbench',
|
| 293 |
+
reader_cfg=dict(
|
| 294 |
+
input_columns=[
|
| 295 |
+
'context',
|
| 296 |
+
],
|
| 297 |
+
output_column='answers',
|
| 298 |
+
test_range='[75:100]',
|
| 299 |
+
test_split='test',
|
| 300 |
+
train_split='test'),
|
| 301 |
+
type='opencompass.datasets.LongBenchgov_reportDataset'),
|
| 302 |
+
dict(
|
| 303 |
+
abbr='LongBench_qmsum_3',
|
| 304 |
+
eval_cfg=dict(
|
| 305 |
+
evaluator=dict(
|
| 306 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 307 |
+
pred_role='BOT'),
|
| 308 |
+
infer_cfg=dict(
|
| 309 |
+
inferencer=dict(
|
| 310 |
+
max_out_len=512,
|
| 311 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 312 |
+
prompt_template=dict(
|
| 313 |
+
template=dict(round=[
|
| 314 |
+
dict(
|
| 315 |
+
prompt=
|
| 316 |
+
'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 317 |
+
role='HUMAN'),
|
| 318 |
+
]),
|
| 319 |
+
type=
|
| 320 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 321 |
+
retriever=dict(
|
| 322 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 323 |
+
name='qmsum',
|
| 324 |
+
path='opencompass/Longbench',
|
| 325 |
+
reader_cfg=dict(
|
| 326 |
+
input_columns=[
|
| 327 |
+
'context',
|
| 328 |
+
'input',
|
| 329 |
+
],
|
| 330 |
+
output_column='answers',
|
| 331 |
+
test_range='[75:100]',
|
| 332 |
+
test_split='test',
|
| 333 |
+
train_split='test'),
|
| 334 |
+
type='opencompass.datasets.LongBenchqmsumDataset'),
|
| 335 |
+
dict(
|
| 336 |
+
abbr='LongBench_vcsum_3',
|
| 337 |
+
eval_cfg=dict(
|
| 338 |
+
evaluator=dict(
|
| 339 |
+
language='zh',
|
| 340 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 341 |
+
pred_role='BOT'),
|
| 342 |
+
infer_cfg=dict(
|
| 343 |
+
inferencer=dict(
|
| 344 |
+
max_out_len=512,
|
| 345 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 346 |
+
prompt_template=dict(
|
| 347 |
+
template=dict(round=[
|
| 348 |
+
dict(
|
| 349 |
+
prompt=
|
| 350 |
+
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 351 |
+
role='HUMAN'),
|
| 352 |
+
]),
|
| 353 |
+
type=
|
| 354 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 355 |
+
retriever=dict(
|
| 356 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 357 |
+
name='vcsum',
|
| 358 |
+
path='opencompass/Longbench',
|
| 359 |
+
reader_cfg=dict(
|
| 360 |
+
input_columns=[
|
| 361 |
+
'context',
|
| 362 |
+
],
|
| 363 |
+
output_column='answers',
|
| 364 |
+
test_range='[75:100]',
|
| 365 |
+
test_split='test',
|
| 366 |
+
train_split='test'),
|
| 367 |
+
type='opencompass.datasets.LongBenchvcsumDataset'),
|
| 368 |
+
dict(
|
| 369 |
+
abbr='LongBench_dureader_3',
|
| 370 |
+
eval_cfg=dict(
|
| 371 |
+
evaluator=dict(
|
| 372 |
+
language='zh',
|
| 373 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 374 |
+
pred_role='BOT'),
|
| 375 |
+
infer_cfg=dict(
|
| 376 |
+
inferencer=dict(
|
| 377 |
+
max_out_len=128,
|
| 378 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 379 |
+
prompt_template=dict(
|
| 380 |
+
template=dict(round=[
|
| 381 |
+
dict(
|
| 382 |
+
prompt=
|
| 383 |
+
'请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
|
| 384 |
+
role='HUMAN'),
|
| 385 |
+
]),
|
| 386 |
+
type=
|
| 387 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 388 |
+
retriever=dict(
|
| 389 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 390 |
+
name='dureader',
|
| 391 |
+
path='opencompass/Longbench',
|
| 392 |
+
reader_cfg=dict(
|
| 393 |
+
input_columns=[
|
| 394 |
+
'context',
|
| 395 |
+
'input',
|
| 396 |
+
],
|
| 397 |
+
output_column='answers',
|
| 398 |
+
test_range='[75:100]',
|
| 399 |
+
test_split='test',
|
| 400 |
+
train_split='test'),
|
| 401 |
+
type='opencompass.datasets.LongBenchdureaderDataset'),
|
| 402 |
+
dict(
|
| 403 |
+
abbr='LongBench_lcc_3',
|
| 404 |
+
eval_cfg=dict(
|
| 405 |
+
evaluator=dict(
|
| 406 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 407 |
+
pred_role='BOT'),
|
| 408 |
+
infer_cfg=dict(
|
| 409 |
+
inferencer=dict(
|
| 410 |
+
max_out_len=64,
|
| 411 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 412 |
+
prompt_template=dict(
|
| 413 |
+
template=dict(round=[
|
| 414 |
+
dict(
|
| 415 |
+
prompt=
|
| 416 |
+
'Please complete the code given below. \n{context}Next line of code:\n',
|
| 417 |
+
role='HUMAN'),
|
| 418 |
+
]),
|
| 419 |
+
type=
|
| 420 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 421 |
+
retriever=dict(
|
| 422 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 423 |
+
name='lcc',
|
| 424 |
+
path='opencompass/Longbench',
|
| 425 |
+
reader_cfg=dict(
|
| 426 |
+
input_columns=[
|
| 427 |
+
'context',
|
| 428 |
+
],
|
| 429 |
+
output_column='answers',
|
| 430 |
+
test_range='[189:252]',
|
| 431 |
+
test_split='test',
|
| 432 |
+
train_split='test'),
|
| 433 |
+
type='opencompass.datasets.LongBenchlccDataset'),
|
| 434 |
+
dict(
|
| 435 |
+
abbr='LongBench_repobench-p_3',
|
| 436 |
+
eval_cfg=dict(
|
| 437 |
+
evaluator=dict(
|
| 438 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 439 |
+
pred_role='BOT'),
|
| 440 |
+
infer_cfg=dict(
|
| 441 |
+
inferencer=dict(
|
| 442 |
+
max_out_len=64,
|
| 443 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 444 |
+
prompt_template=dict(
|
| 445 |
+
template=dict(round=[
|
| 446 |
+
dict(
|
| 447 |
+
prompt=
|
| 448 |
+
'Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 449 |
+
role='HUMAN'),
|
| 450 |
+
]),
|
| 451 |
+
type=
|
| 452 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 453 |
+
retriever=dict(
|
| 454 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 455 |
+
name='repobench-p',
|
| 456 |
+
path='opencompass/Longbench',
|
| 457 |
+
reader_cfg=dict(
|
| 458 |
+
input_columns=[
|
| 459 |
+
'context',
|
| 460 |
+
'input',
|
| 461 |
+
],
|
| 462 |
+
output_column='answers',
|
| 463 |
+
test_range='[189:252]',
|
| 464 |
+
test_split='test',
|
| 465 |
+
train_split='test'),
|
| 466 |
+
type='opencompass.datasets.LongBenchrepobenchDataset'),
|
| 467 |
+
dict(
|
| 468 |
+
abbr='LongBench_passage_retrieval_en_3',
|
| 469 |
+
eval_cfg=dict(
|
| 470 |
+
evaluator=dict(
|
| 471 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 472 |
+
pred_role='BOT'),
|
| 473 |
+
infer_cfg=dict(
|
| 474 |
+
inferencer=dict(
|
| 475 |
+
max_out_len=32,
|
| 476 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 477 |
+
prompt_template=dict(
|
| 478 |
+
template=dict(round=[
|
| 479 |
+
dict(
|
| 480 |
+
prompt=
|
| 481 |
+
'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 482 |
+
role='HUMAN'),
|
| 483 |
+
]),
|
| 484 |
+
type=
|
| 485 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 486 |
+
retriever=dict(
|
| 487 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 488 |
+
name='passage_retrieval_en',
|
| 489 |
+
path='opencompass/Longbench',
|
| 490 |
+
reader_cfg=dict(
|
| 491 |
+
input_columns=[
|
| 492 |
+
'context',
|
| 493 |
+
'input',
|
| 494 |
+
],
|
| 495 |
+
output_column='answers',
|
| 496 |
+
test_range='[75:100]',
|
| 497 |
+
test_split='test',
|
| 498 |
+
train_split='test'),
|
| 499 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
|
| 500 |
+
dict(
|
| 501 |
+
abbr='LongBench_passage_retrieval_zh_3',
|
| 502 |
+
eval_cfg=dict(
|
| 503 |
+
evaluator=dict(
|
| 504 |
+
language='zh',
|
| 505 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 506 |
+
pred_role='BOT'),
|
| 507 |
+
infer_cfg=dict(
|
| 508 |
+
inferencer=dict(
|
| 509 |
+
max_out_len=32,
|
| 510 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 511 |
+
prompt_template=dict(
|
| 512 |
+
template=dict(round=[
|
| 513 |
+
dict(
|
| 514 |
+
prompt=
|
| 515 |
+
'以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 516 |
+
role='HUMAN'),
|
| 517 |
+
]),
|
| 518 |
+
type=
|
| 519 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 520 |
+
retriever=dict(
|
| 521 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 522 |
+
name='passage_retrieval_zh',
|
| 523 |
+
path='opencompass/Longbench',
|
| 524 |
+
reader_cfg=dict(
|
| 525 |
+
input_columns=[
|
| 526 |
+
'context',
|
| 527 |
+
'input',
|
| 528 |
+
],
|
| 529 |
+
output_column='answers',
|
| 530 |
+
test_range='[75:100]',
|
| 531 |
+
test_split='test',
|
| 532 |
+
train_split='test'),
|
| 533 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
|
| 534 |
+
dict(
|
| 535 |
+
abbr='LongBench_passage_count_3',
|
| 536 |
+
eval_cfg=dict(
|
| 537 |
+
evaluator=dict(
|
| 538 |
+
type='opencompass.datasets.LongBenchCountEvaluator'),
|
| 539 |
+
pred_role='BOT'),
|
| 540 |
+
infer_cfg=dict(
|
| 541 |
+
inferencer=dict(
|
| 542 |
+
max_out_len=32,
|
| 543 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 544 |
+
prompt_template=dict(
|
| 545 |
+
template=dict(round=[
|
| 546 |
+
dict(
|
| 547 |
+
prompt=
|
| 548 |
+
'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
|
| 549 |
+
role='HUMAN'),
|
| 550 |
+
]),
|
| 551 |
+
type=
|
| 552 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 553 |
+
retriever=dict(
|
| 554 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 555 |
+
name='passage_count',
|
| 556 |
+
path='opencompass/Longbench',
|
| 557 |
+
reader_cfg=dict(
|
| 558 |
+
input_columns=[
|
| 559 |
+
'context',
|
| 560 |
+
'input',
|
| 561 |
+
],
|
| 562 |
+
output_column='answers',
|
| 563 |
+
test_range='[75:100]',
|
| 564 |
+
test_split='test',
|
| 565 |
+
train_split='test'),
|
| 566 |
+
type='opencompass.datasets.LongBenchpassage_countDataset'),
|
| 567 |
+
dict(
|
| 568 |
+
abbr='LongBench_trec_3',
|
| 569 |
+
eval_cfg=dict(
|
| 570 |
+
evaluator=dict(
|
| 571 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 572 |
+
),
|
| 573 |
+
pred_postprocessor=dict(
|
| 574 |
+
type='opencompass.datasets.trec_postprocess'),
|
| 575 |
+
pred_role='BOT'),
|
| 576 |
+
infer_cfg=dict(
|
| 577 |
+
inferencer=dict(
|
| 578 |
+
max_out_len=64,
|
| 579 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 580 |
+
prompt_template=dict(
|
| 581 |
+
template=dict(round=[
|
| 582 |
+
dict(
|
| 583 |
+
prompt=
|
| 584 |
+
'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 585 |
+
role='HUMAN'),
|
| 586 |
+
]),
|
| 587 |
+
type=
|
| 588 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 589 |
+
retriever=dict(
|
| 590 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 591 |
+
name='trec',
|
| 592 |
+
path='opencompass/Longbench',
|
| 593 |
+
reader_cfg=dict(
|
| 594 |
+
input_columns=[
|
| 595 |
+
'context',
|
| 596 |
+
'input',
|
| 597 |
+
],
|
| 598 |
+
output_column='all_labels',
|
| 599 |
+
test_range='[75:100]',
|
| 600 |
+
test_split='test',
|
| 601 |
+
train_split='test'),
|
| 602 |
+
type='opencompass.datasets.LongBenchtrecDataset'),
|
| 603 |
+
dict(
|
| 604 |
+
abbr='LongBench_lsht_3',
|
| 605 |
+
eval_cfg=dict(
|
| 606 |
+
evaluator=dict(
|
| 607 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 608 |
+
),
|
| 609 |
+
pred_postprocessor=dict(
|
| 610 |
+
type='opencompass.datasets.lsht_postprocess'),
|
| 611 |
+
pred_role='BOT'),
|
| 612 |
+
infer_cfg=dict(
|
| 613 |
+
inferencer=dict(
|
| 614 |
+
max_out_len=64,
|
| 615 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 616 |
+
prompt_template=dict(
|
| 617 |
+
template=dict(round=[
|
| 618 |
+
dict(
|
| 619 |
+
prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
|
| 620 |
+
role='HUMAN'),
|
| 621 |
+
]),
|
| 622 |
+
type=
|
| 623 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 624 |
+
retriever=dict(
|
| 625 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 626 |
+
name='lsht',
|
| 627 |
+
path='opencompass/Longbench',
|
| 628 |
+
reader_cfg=dict(
|
| 629 |
+
input_columns=[
|
| 630 |
+
'context',
|
| 631 |
+
'input',
|
| 632 |
+
],
|
| 633 |
+
output_column='all_labels',
|
| 634 |
+
test_range='[75:100]',
|
| 635 |
+
test_split='test',
|
| 636 |
+
train_split='test'),
|
| 637 |
+
type='opencompass.datasets.LongBenchlshtDataset'),
|
| 638 |
+
dict(
|
| 639 |
+
abbr='LongBench_multi_news_3',
|
| 640 |
+
eval_cfg=dict(
|
| 641 |
+
evaluator=dict(
|
| 642 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 643 |
+
pred_role='BOT'),
|
| 644 |
+
infer_cfg=dict(
|
| 645 |
+
inferencer=dict(
|
| 646 |
+
max_out_len=512,
|
| 647 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 648 |
+
prompt_template=dict(
|
| 649 |
+
template=dict(round=[
|
| 650 |
+
dict(
|
| 651 |
+
prompt=
|
| 652 |
+
'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
|
| 653 |
+
role='HUMAN'),
|
| 654 |
+
]),
|
| 655 |
+
type=
|
| 656 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 657 |
+
retriever=dict(
|
| 658 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 659 |
+
name='multi_news',
|
| 660 |
+
path='opencompass/Longbench',
|
| 661 |
+
reader_cfg=dict(
|
| 662 |
+
input_columns=[
|
| 663 |
+
'context',
|
| 664 |
+
],
|
| 665 |
+
output_column='answers',
|
| 666 |
+
test_range='[75:100]',
|
| 667 |
+
test_split='test',
|
| 668 |
+
train_split='test'),
|
| 669 |
+
type='opencompass.datasets.LongBenchmulti_newsDataset'),
|
| 670 |
+
dict(
|
| 671 |
+
abbr='LongBench_samsum_3',
|
| 672 |
+
eval_cfg=dict(
|
| 673 |
+
evaluator=dict(
|
| 674 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 675 |
+
pred_postprocessor=dict(
|
| 676 |
+
type='opencompass.datasets.samsum_postprocess'),
|
| 677 |
+
pred_role='BOT'),
|
| 678 |
+
infer_cfg=dict(
|
| 679 |
+
inferencer=dict(
|
| 680 |
+
max_out_len=128,
|
| 681 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 682 |
+
prompt_template=dict(
|
| 683 |
+
template=dict(round=[
|
| 684 |
+
dict(
|
| 685 |
+
prompt=
|
| 686 |
+
'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 687 |
+
role='HUMAN'),
|
| 688 |
+
]),
|
| 689 |
+
type=
|
| 690 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 691 |
+
retriever=dict(
|
| 692 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 693 |
+
name='samsum',
|
| 694 |
+
path='opencompass/Longbench',
|
| 695 |
+
reader_cfg=dict(
|
| 696 |
+
input_columns=[
|
| 697 |
+
'context',
|
| 698 |
+
'input',
|
| 699 |
+
],
|
| 700 |
+
output_column='answers',
|
| 701 |
+
test_range='[75:100]',
|
| 702 |
+
test_split='test',
|
| 703 |
+
train_split='test'),
|
| 704 |
+
type='opencompass.datasets.LongBenchsamsumDataset'),
|
| 705 |
+
dict(
|
| 706 |
+
abbr='LongBench_2wikimqa_3',
|
| 707 |
+
eval_cfg=dict(
|
| 708 |
+
evaluator=dict(
|
| 709 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 710 |
+
pred_role='BOT'),
|
| 711 |
+
infer_cfg=dict(
|
| 712 |
+
inferencer=dict(
|
| 713 |
+
max_out_len=32,
|
| 714 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 715 |
+
prompt_template=dict(
|
| 716 |
+
template=dict(round=[
|
| 717 |
+
dict(
|
| 718 |
+
prompt=
|
| 719 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 720 |
+
role='HUMAN'),
|
| 721 |
+
]),
|
| 722 |
+
type=
|
| 723 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 724 |
+
retriever=dict(
|
| 725 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 726 |
+
name='2wikimqa',
|
| 727 |
+
path='opencompass/Longbench',
|
| 728 |
+
reader_cfg=dict(
|
| 729 |
+
input_columns=[
|
| 730 |
+
'context',
|
| 731 |
+
'input',
|
| 732 |
+
],
|
| 733 |
+
output_column='answers',
|
| 734 |
+
test_range='[75:100]',
|
| 735 |
+
test_split='test',
|
| 736 |
+
train_split='test'),
|
| 737 |
+
type='opencompass.datasets.LongBench2wikimqaDataset'),
|
| 738 |
+
dict(
|
| 739 |
+
abbr='LongBench_hotpotqa_3',
|
| 740 |
+
eval_cfg=dict(
|
| 741 |
+
evaluator=dict(
|
| 742 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 743 |
+
pred_role='BOT'),
|
| 744 |
+
infer_cfg=dict(
|
| 745 |
+
inferencer=dict(
|
| 746 |
+
max_out_len=32,
|
| 747 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 748 |
+
prompt_template=dict(
|
| 749 |
+
template=dict(round=[
|
| 750 |
+
dict(
|
| 751 |
+
prompt=
|
| 752 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 753 |
+
role='HUMAN'),
|
| 754 |
+
]),
|
| 755 |
+
type=
|
| 756 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 757 |
+
retriever=dict(
|
| 758 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 759 |
+
name='hotpotqa',
|
| 760 |
+
path='opencompass/Longbench',
|
| 761 |
+
reader_cfg=dict(
|
| 762 |
+
input_columns=[
|
| 763 |
+
'context',
|
| 764 |
+
'input',
|
| 765 |
+
],
|
| 766 |
+
output_column='answers',
|
| 767 |
+
test_range='[75:100]',
|
| 768 |
+
test_split='test',
|
| 769 |
+
train_split='test'),
|
| 770 |
+
type='opencompass.datasets.LongBenchhotpotqaDataset'),
|
| 771 |
+
dict(
|
| 772 |
+
abbr='LongBench_musique_3',
|
| 773 |
+
eval_cfg=dict(
|
| 774 |
+
evaluator=dict(
|
| 775 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 776 |
+
pred_role='BOT'),
|
| 777 |
+
infer_cfg=dict(
|
| 778 |
+
inferencer=dict(
|
| 779 |
+
max_out_len=32,
|
| 780 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 781 |
+
prompt_template=dict(
|
| 782 |
+
template=dict(round=[
|
| 783 |
+
dict(
|
| 784 |
+
prompt=
|
| 785 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 786 |
+
role='HUMAN'),
|
| 787 |
+
]),
|
| 788 |
+
type=
|
| 789 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 790 |
+
retriever=dict(
|
| 791 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 792 |
+
name='musique',
|
| 793 |
+
path='opencompass/Longbench',
|
| 794 |
+
reader_cfg=dict(
|
| 795 |
+
input_columns=[
|
| 796 |
+
'context',
|
| 797 |
+
'input',
|
| 798 |
+
],
|
| 799 |
+
output_column='answers',
|
| 800 |
+
test_range='[75:100]',
|
| 801 |
+
test_split='test',
|
| 802 |
+
train_split='test'),
|
| 803 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 804 |
+
dict(
|
| 805 |
+
abbr='LongBench_multifieldqa_en_3',
|
| 806 |
+
eval_cfg=dict(
|
| 807 |
+
evaluator=dict(
|
| 808 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 809 |
+
pred_role='BOT'),
|
| 810 |
+
infer_cfg=dict(
|
| 811 |
+
inferencer=dict(
|
| 812 |
+
max_out_len=64,
|
| 813 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 814 |
+
prompt_template=dict(
|
| 815 |
+
template=dict(round=[
|
| 816 |
+
dict(
|
| 817 |
+
prompt=
|
| 818 |
+
'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 819 |
+
role='HUMAN'),
|
| 820 |
+
]),
|
| 821 |
+
type=
|
| 822 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 823 |
+
retriever=dict(
|
| 824 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 825 |
+
name='multifieldqa_en',
|
| 826 |
+
path='opencompass/Longbench',
|
| 827 |
+
reader_cfg=dict(
|
| 828 |
+
input_columns=[
|
| 829 |
+
'context',
|
| 830 |
+
'input',
|
| 831 |
+
],
|
| 832 |
+
output_column='answers',
|
| 833 |
+
test_range='[57:76]',
|
| 834 |
+
test_split='test',
|
| 835 |
+
train_split='test'),
|
| 836 |
+
type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
|
| 837 |
+
dict(
|
| 838 |
+
abbr='LongBench_multifieldqa_zh_3',
|
| 839 |
+
eval_cfg=dict(
|
| 840 |
+
evaluator=dict(
|
| 841 |
+
language='zh',
|
| 842 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 843 |
+
pred_role='BOT'),
|
| 844 |
+
infer_cfg=dict(
|
| 845 |
+
inferencer=dict(
|
| 846 |
+
max_out_len=64,
|
| 847 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 848 |
+
prompt_template=dict(
|
| 849 |
+
template=dict(round=[
|
| 850 |
+
dict(
|
| 851 |
+
prompt=
|
| 852 |
+
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
|
| 853 |
+
role='HUMAN'),
|
| 854 |
+
]),
|
| 855 |
+
type=
|
| 856 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 857 |
+
retriever=dict(
|
| 858 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 859 |
+
name='multifieldqa_zh',
|
| 860 |
+
path='opencompass/Longbench',
|
| 861 |
+
reader_cfg=dict(
|
| 862 |
+
input_columns=[
|
| 863 |
+
'context',
|
| 864 |
+
'input',
|
| 865 |
+
],
|
| 866 |
+
output_column='answers',
|
| 867 |
+
test_range='[75:100]',
|
| 868 |
+
test_split='test',
|
| 869 |
+
train_split='test'),
|
| 870 |
+
type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
|
| 871 |
+
dict(
|
| 872 |
+
abbr='LongBench_narrativeqa_3',
|
| 873 |
+
eval_cfg=dict(
|
| 874 |
+
evaluator=dict(
|
| 875 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 876 |
+
pred_role='BOT'),
|
| 877 |
+
infer_cfg=dict(
|
| 878 |
+
inferencer=dict(
|
| 879 |
+
max_out_len=128,
|
| 880 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 881 |
+
prompt_template=dict(
|
| 882 |
+
template=dict(round=[
|
| 883 |
+
dict(
|
| 884 |
+
prompt=
|
| 885 |
+
'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
|
| 886 |
+
role='HUMAN'),
|
| 887 |
+
]),
|
| 888 |
+
type=
|
| 889 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 890 |
+
retriever=dict(
|
| 891 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 892 |
+
name='narrativeqa',
|
| 893 |
+
path='opencompass/Longbench',
|
| 894 |
+
reader_cfg=dict(
|
| 895 |
+
input_columns=[
|
| 896 |
+
'context',
|
| 897 |
+
'input',
|
| 898 |
+
],
|
| 899 |
+
output_column='answers',
|
| 900 |
+
test_range='[75:100]',
|
| 901 |
+
test_split='test',
|
| 902 |
+
train_split='test'),
|
| 903 |
+
type='opencompass.datasets.LongBenchnarrativeqaDataset'),
|
| 904 |
+
dict(
|
| 905 |
+
abbr='LongBench_qasper_3',
|
| 906 |
+
eval_cfg=dict(
|
| 907 |
+
evaluator=dict(
|
| 908 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 909 |
+
pred_role='BOT'),
|
| 910 |
+
infer_cfg=dict(
|
| 911 |
+
inferencer=dict(
|
| 912 |
+
max_out_len=32,
|
| 913 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 914 |
+
prompt_template=dict(
|
| 915 |
+
template=dict(round=[
|
| 916 |
+
dict(
|
| 917 |
+
prompt=
|
| 918 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 919 |
+
role='HUMAN'),
|
| 920 |
+
]),
|
| 921 |
+
type=
|
| 922 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 923 |
+
retriever=dict(
|
| 924 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 925 |
+
name='qasper',
|
| 926 |
+
path='opencompass/Longbench',
|
| 927 |
+
reader_cfg=dict(
|
| 928 |
+
input_columns=[
|
| 929 |
+
'context',
|
| 930 |
+
'input',
|
| 931 |
+
],
|
| 932 |
+
output_column='answers',
|
| 933 |
+
test_range='[75:100]',
|
| 934 |
+
test_split='test',
|
| 935 |
+
train_split='test'),
|
| 936 |
+
type='opencompass.datasets.LongBenchqasperDataset'),
|
| 937 |
+
dict(
|
| 938 |
+
abbr='LongBench_triviaqa_3',
|
| 939 |
+
eval_cfg=dict(
|
| 940 |
+
evaluator=dict(
|
| 941 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 942 |
+
pred_postprocessor=dict(
|
| 943 |
+
type='opencompass.datasets.triviaqa_postprocess'),
|
| 944 |
+
pred_role='BOT'),
|
| 945 |
+
infer_cfg=dict(
|
| 946 |
+
inferencer=dict(
|
| 947 |
+
max_out_len=32,
|
| 948 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 949 |
+
prompt_template=dict(
|
| 950 |
+
template=dict(round=[
|
| 951 |
+
dict(
|
| 952 |
+
prompt=
|
| 953 |
+
'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 954 |
+
role='HUMAN'),
|
| 955 |
+
]),
|
| 956 |
+
type=
|
| 957 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 958 |
+
retriever=dict(
|
| 959 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 960 |
+
name='triviaqa',
|
| 961 |
+
path='opencompass/Longbench',
|
| 962 |
+
reader_cfg=dict(
|
| 963 |
+
input_columns=[
|
| 964 |
+
'context',
|
| 965 |
+
'input',
|
| 966 |
+
],
|
| 967 |
+
output_column='answers',
|
| 968 |
+
test_range='[75:100]',
|
| 969 |
+
test_split='test',
|
| 970 |
+
train_split='test'),
|
| 971 |
+
type='opencompass.datasets.LongBenchtriviaqaDataset'),
|
| 972 |
+
dict(
|
| 973 |
+
abbr='LongBench_gov_report_3',
|
| 974 |
+
eval_cfg=dict(
|
| 975 |
+
evaluator=dict(
|
| 976 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 977 |
+
pred_role='BOT'),
|
| 978 |
+
infer_cfg=dict(
|
| 979 |
+
inferencer=dict(
|
| 980 |
+
max_out_len=512,
|
| 981 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 982 |
+
prompt_template=dict(
|
| 983 |
+
template=dict(round=[
|
| 984 |
+
dict(
|
| 985 |
+
prompt=
|
| 986 |
+
'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
|
| 987 |
+
role='HUMAN'),
|
| 988 |
+
]),
|
| 989 |
+
type=
|
| 990 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 991 |
+
retriever=dict(
|
| 992 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 993 |
+
name='gov_report',
|
| 994 |
+
path='opencompass/Longbench',
|
| 995 |
+
reader_cfg=dict(
|
| 996 |
+
input_columns=[
|
| 997 |
+
'context',
|
| 998 |
+
],
|
| 999 |
+
output_column='answers',
|
| 1000 |
+
test_range='[75:100]',
|
| 1001 |
+
test_split='test',
|
| 1002 |
+
train_split='test'),
|
| 1003 |
+
type='opencompass.datasets.LongBenchgov_reportDataset'),
|
| 1004 |
+
dict(
|
| 1005 |
+
abbr='LongBench_qmsum_3',
|
| 1006 |
+
eval_cfg=dict(
|
| 1007 |
+
evaluator=dict(
|
| 1008 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1009 |
+
pred_role='BOT'),
|
| 1010 |
+
infer_cfg=dict(
|
| 1011 |
+
inferencer=dict(
|
| 1012 |
+
max_out_len=512,
|
| 1013 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1014 |
+
prompt_template=dict(
|
| 1015 |
+
template=dict(round=[
|
| 1016 |
+
dict(
|
| 1017 |
+
prompt=
|
| 1018 |
+
'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 1019 |
+
role='HUMAN'),
|
| 1020 |
+
]),
|
| 1021 |
+
type=
|
| 1022 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1023 |
+
retriever=dict(
|
| 1024 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1025 |
+
name='qmsum',
|
| 1026 |
+
path='opencompass/Longbench',
|
| 1027 |
+
reader_cfg=dict(
|
| 1028 |
+
input_columns=[
|
| 1029 |
+
'context',
|
| 1030 |
+
'input',
|
| 1031 |
+
],
|
| 1032 |
+
output_column='answers',
|
| 1033 |
+
test_range='[75:100]',
|
| 1034 |
+
test_split='test',
|
| 1035 |
+
train_split='test'),
|
| 1036 |
+
type='opencompass.datasets.LongBenchqmsumDataset'),
|
| 1037 |
+
dict(
|
| 1038 |
+
abbr='LongBench_vcsum_3',
|
| 1039 |
+
eval_cfg=dict(
|
| 1040 |
+
evaluator=dict(
|
| 1041 |
+
language='zh',
|
| 1042 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1043 |
+
pred_role='BOT'),
|
| 1044 |
+
infer_cfg=dict(
|
| 1045 |
+
inferencer=dict(
|
| 1046 |
+
max_out_len=512,
|
| 1047 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1048 |
+
prompt_template=dict(
|
| 1049 |
+
template=dict(round=[
|
| 1050 |
+
dict(
|
| 1051 |
+
prompt=
|
| 1052 |
+
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 1053 |
+
role='HUMAN'),
|
| 1054 |
+
]),
|
| 1055 |
+
type=
|
| 1056 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1057 |
+
retriever=dict(
|
| 1058 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1059 |
+
name='vcsum',
|
| 1060 |
+
path='opencompass/Longbench',
|
| 1061 |
+
reader_cfg=dict(
|
| 1062 |
+
input_columns=[
|
| 1063 |
+
'context',
|
| 1064 |
+
],
|
| 1065 |
+
output_column='answers',
|
| 1066 |
+
test_range='[75:100]',
|
| 1067 |
+
test_split='test',
|
| 1068 |
+
train_split='test'),
|
| 1069 |
+
type='opencompass.datasets.LongBenchvcsumDataset'),
|
| 1070 |
+
dict(
|
| 1071 |
+
abbr='LongBench_dureader_3',
|
| 1072 |
+
eval_cfg=dict(
|
| 1073 |
+
evaluator=dict(
|
| 1074 |
+
language='zh',
|
| 1075 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1076 |
+
pred_role='BOT'),
|
| 1077 |
+
infer_cfg=dict(
|
| 1078 |
+
inferencer=dict(
|
| 1079 |
+
max_out_len=128,
|
| 1080 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1081 |
+
prompt_template=dict(
|
| 1082 |
+
template=dict(round=[
|
| 1083 |
+
dict(
|
| 1084 |
+
prompt=
|
| 1085 |
+
'请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
|
| 1086 |
+
role='HUMAN'),
|
| 1087 |
+
]),
|
| 1088 |
+
type=
|
| 1089 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1090 |
+
retriever=dict(
|
| 1091 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1092 |
+
name='dureader',
|
| 1093 |
+
path='opencompass/Longbench',
|
| 1094 |
+
reader_cfg=dict(
|
| 1095 |
+
input_columns=[
|
| 1096 |
+
'context',
|
| 1097 |
+
'input',
|
| 1098 |
+
],
|
| 1099 |
+
output_column='answers',
|
| 1100 |
+
test_range='[75:100]',
|
| 1101 |
+
test_split='test',
|
| 1102 |
+
train_split='test'),
|
| 1103 |
+
type='opencompass.datasets.LongBenchdureaderDataset'),
|
| 1104 |
+
dict(
|
| 1105 |
+
abbr='LongBench_lcc_3',
|
| 1106 |
+
eval_cfg=dict(
|
| 1107 |
+
evaluator=dict(
|
| 1108 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 1109 |
+
pred_role='BOT'),
|
| 1110 |
+
infer_cfg=dict(
|
| 1111 |
+
inferencer=dict(
|
| 1112 |
+
max_out_len=64,
|
| 1113 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1114 |
+
prompt_template=dict(
|
| 1115 |
+
template=dict(round=[
|
| 1116 |
+
dict(
|
| 1117 |
+
prompt=
|
| 1118 |
+
'Please complete the code given below. \n{context}Next line of code:\n',
|
| 1119 |
+
role='HUMAN'),
|
| 1120 |
+
]),
|
| 1121 |
+
type=
|
| 1122 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1123 |
+
retriever=dict(
|
| 1124 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1125 |
+
name='lcc',
|
| 1126 |
+
path='opencompass/Longbench',
|
| 1127 |
+
reader_cfg=dict(
|
| 1128 |
+
input_columns=[
|
| 1129 |
+
'context',
|
| 1130 |
+
],
|
| 1131 |
+
output_column='answers',
|
| 1132 |
+
test_range='[189:252]',
|
| 1133 |
+
test_split='test',
|
| 1134 |
+
train_split='test'),
|
| 1135 |
+
type='opencompass.datasets.LongBenchlccDataset'),
|
| 1136 |
+
dict(
|
| 1137 |
+
abbr='LongBench_repobench-p_3',
|
| 1138 |
+
eval_cfg=dict(
|
| 1139 |
+
evaluator=dict(
|
| 1140 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 1141 |
+
pred_role='BOT'),
|
| 1142 |
+
infer_cfg=dict(
|
| 1143 |
+
inferencer=dict(
|
| 1144 |
+
max_out_len=64,
|
| 1145 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1146 |
+
prompt_template=dict(
|
| 1147 |
+
template=dict(round=[
|
| 1148 |
+
dict(
|
| 1149 |
+
prompt=
|
| 1150 |
+
'Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 1151 |
+
role='HUMAN'),
|
| 1152 |
+
]),
|
| 1153 |
+
type=
|
| 1154 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1155 |
+
retriever=dict(
|
| 1156 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1157 |
+
name='repobench-p',
|
| 1158 |
+
path='opencompass/Longbench',
|
| 1159 |
+
reader_cfg=dict(
|
| 1160 |
+
input_columns=[
|
| 1161 |
+
'context',
|
| 1162 |
+
'input',
|
| 1163 |
+
],
|
| 1164 |
+
output_column='answers',
|
| 1165 |
+
test_range='[189:252]',
|
| 1166 |
+
test_split='test',
|
| 1167 |
+
train_split='test'),
|
| 1168 |
+
type='opencompass.datasets.LongBenchrepobenchDataset'),
|
| 1169 |
+
dict(
|
| 1170 |
+
abbr='LongBench_passage_retrieval_en_3',
|
| 1171 |
+
eval_cfg=dict(
|
| 1172 |
+
evaluator=dict(
|
| 1173 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 1174 |
+
pred_role='BOT'),
|
| 1175 |
+
infer_cfg=dict(
|
| 1176 |
+
inferencer=dict(
|
| 1177 |
+
max_out_len=32,
|
| 1178 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1179 |
+
prompt_template=dict(
|
| 1180 |
+
template=dict(round=[
|
| 1181 |
+
dict(
|
| 1182 |
+
prompt=
|
| 1183 |
+
'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 1184 |
+
role='HUMAN'),
|
| 1185 |
+
]),
|
| 1186 |
+
type=
|
| 1187 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1188 |
+
retriever=dict(
|
| 1189 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1190 |
+
name='passage_retrieval_en',
|
| 1191 |
+
path='opencompass/Longbench',
|
| 1192 |
+
reader_cfg=dict(
|
| 1193 |
+
input_columns=[
|
| 1194 |
+
'context',
|
| 1195 |
+
'input',
|
| 1196 |
+
],
|
| 1197 |
+
output_column='answers',
|
| 1198 |
+
test_range='[75:100]',
|
| 1199 |
+
test_split='test',
|
| 1200 |
+
train_split='test'),
|
| 1201 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
|
| 1202 |
+
dict(
|
| 1203 |
+
abbr='LongBench_passage_retrieval_zh_3',
|
| 1204 |
+
eval_cfg=dict(
|
| 1205 |
+
evaluator=dict(
|
| 1206 |
+
language='zh',
|
| 1207 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 1208 |
+
pred_role='BOT'),
|
| 1209 |
+
infer_cfg=dict(
|
| 1210 |
+
inferencer=dict(
|
| 1211 |
+
max_out_len=32,
|
| 1212 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1213 |
+
prompt_template=dict(
|
| 1214 |
+
template=dict(round=[
|
| 1215 |
+
dict(
|
| 1216 |
+
prompt=
|
| 1217 |
+
'以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 1218 |
+
role='HUMAN'),
|
| 1219 |
+
]),
|
| 1220 |
+
type=
|
| 1221 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1222 |
+
retriever=dict(
|
| 1223 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1224 |
+
name='passage_retrieval_zh',
|
| 1225 |
+
path='opencompass/Longbench',
|
| 1226 |
+
reader_cfg=dict(
|
| 1227 |
+
input_columns=[
|
| 1228 |
+
'context',
|
| 1229 |
+
'input',
|
| 1230 |
+
],
|
| 1231 |
+
output_column='answers',
|
| 1232 |
+
test_range='[75:100]',
|
| 1233 |
+
test_split='test',
|
| 1234 |
+
train_split='test'),
|
| 1235 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
|
| 1236 |
+
dict(
|
| 1237 |
+
abbr='LongBench_passage_count_3',
|
| 1238 |
+
eval_cfg=dict(
|
| 1239 |
+
evaluator=dict(
|
| 1240 |
+
type='opencompass.datasets.LongBenchCountEvaluator'),
|
| 1241 |
+
pred_role='BOT'),
|
| 1242 |
+
infer_cfg=dict(
|
| 1243 |
+
inferencer=dict(
|
| 1244 |
+
max_out_len=32,
|
| 1245 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1246 |
+
prompt_template=dict(
|
| 1247 |
+
template=dict(round=[
|
| 1248 |
+
dict(
|
| 1249 |
+
prompt=
|
| 1250 |
+
'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
|
| 1251 |
+
role='HUMAN'),
|
| 1252 |
+
]),
|
| 1253 |
+
type=
|
| 1254 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1255 |
+
retriever=dict(
|
| 1256 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1257 |
+
name='passage_count',
|
| 1258 |
+
path='opencompass/Longbench',
|
| 1259 |
+
reader_cfg=dict(
|
| 1260 |
+
input_columns=[
|
| 1261 |
+
'context',
|
| 1262 |
+
'input',
|
| 1263 |
+
],
|
| 1264 |
+
output_column='answers',
|
| 1265 |
+
test_range='[75:100]',
|
| 1266 |
+
test_split='test',
|
| 1267 |
+
train_split='test'),
|
| 1268 |
+
type='opencompass.datasets.LongBenchpassage_countDataset'),
|
| 1269 |
+
dict(
|
| 1270 |
+
abbr='LongBench_trec_3',
|
| 1271 |
+
eval_cfg=dict(
|
| 1272 |
+
evaluator=dict(
|
| 1273 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 1274 |
+
),
|
| 1275 |
+
pred_postprocessor=dict(
|
| 1276 |
+
type='opencompass.datasets.trec_postprocess'),
|
| 1277 |
+
pred_role='BOT'),
|
| 1278 |
+
infer_cfg=dict(
|
| 1279 |
+
inferencer=dict(
|
| 1280 |
+
max_out_len=64,
|
| 1281 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1282 |
+
prompt_template=dict(
|
| 1283 |
+
template=dict(round=[
|
| 1284 |
+
dict(
|
| 1285 |
+
prompt=
|
| 1286 |
+
'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 1287 |
+
role='HUMAN'),
|
| 1288 |
+
]),
|
| 1289 |
+
type=
|
| 1290 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1291 |
+
retriever=dict(
|
| 1292 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1293 |
+
name='trec',
|
| 1294 |
+
path='opencompass/Longbench',
|
| 1295 |
+
reader_cfg=dict(
|
| 1296 |
+
input_columns=[
|
| 1297 |
+
'context',
|
| 1298 |
+
'input',
|
| 1299 |
+
],
|
| 1300 |
+
output_column='all_labels',
|
| 1301 |
+
test_range='[75:100]',
|
| 1302 |
+
test_split='test',
|
| 1303 |
+
train_split='test'),
|
| 1304 |
+
type='opencompass.datasets.LongBenchtrecDataset'),
|
| 1305 |
+
dict(
|
| 1306 |
+
abbr='LongBench_lsht_3',
|
| 1307 |
+
eval_cfg=dict(
|
| 1308 |
+
evaluator=dict(
|
| 1309 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 1310 |
+
),
|
| 1311 |
+
pred_postprocessor=dict(
|
| 1312 |
+
type='opencompass.datasets.lsht_postprocess'),
|
| 1313 |
+
pred_role='BOT'),
|
| 1314 |
+
infer_cfg=dict(
|
| 1315 |
+
inferencer=dict(
|
| 1316 |
+
max_out_len=64,
|
| 1317 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1318 |
+
prompt_template=dict(
|
| 1319 |
+
template=dict(round=[
|
| 1320 |
+
dict(
|
| 1321 |
+
prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
|
| 1322 |
+
role='HUMAN'),
|
| 1323 |
+
]),
|
| 1324 |
+
type=
|
| 1325 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1326 |
+
retriever=dict(
|
| 1327 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1328 |
+
name='lsht',
|
| 1329 |
+
path='opencompass/Longbench',
|
| 1330 |
+
reader_cfg=dict(
|
| 1331 |
+
input_columns=[
|
| 1332 |
+
'context',
|
| 1333 |
+
'input',
|
| 1334 |
+
],
|
| 1335 |
+
output_column='all_labels',
|
| 1336 |
+
test_range='[75:100]',
|
| 1337 |
+
test_split='test',
|
| 1338 |
+
train_split='test'),
|
| 1339 |
+
type='opencompass.datasets.LongBenchlshtDataset'),
|
| 1340 |
+
dict(
|
| 1341 |
+
abbr='LongBench_multi_news_3',
|
| 1342 |
+
eval_cfg=dict(
|
| 1343 |
+
evaluator=dict(
|
| 1344 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1345 |
+
pred_role='BOT'),
|
| 1346 |
+
infer_cfg=dict(
|
| 1347 |
+
inferencer=dict(
|
| 1348 |
+
max_out_len=512,
|
| 1349 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1350 |
+
prompt_template=dict(
|
| 1351 |
+
template=dict(round=[
|
| 1352 |
+
dict(
|
| 1353 |
+
prompt=
|
| 1354 |
+
'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
|
| 1355 |
+
role='HUMAN'),
|
| 1356 |
+
]),
|
| 1357 |
+
type=
|
| 1358 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1359 |
+
retriever=dict(
|
| 1360 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1361 |
+
name='multi_news',
|
| 1362 |
+
path='opencompass/Longbench',
|
| 1363 |
+
reader_cfg=dict(
|
| 1364 |
+
input_columns=[
|
| 1365 |
+
'context',
|
| 1366 |
+
],
|
| 1367 |
+
output_column='answers',
|
| 1368 |
+
test_range='[75:100]',
|
| 1369 |
+
test_split='test',
|
| 1370 |
+
train_split='test'),
|
| 1371 |
+
type='opencompass.datasets.LongBenchmulti_newsDataset'),
|
| 1372 |
+
dict(
|
| 1373 |
+
abbr='LongBench_samsum_3',
|
| 1374 |
+
eval_cfg=dict(
|
| 1375 |
+
evaluator=dict(
|
| 1376 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1377 |
+
pred_postprocessor=dict(
|
| 1378 |
+
type='opencompass.datasets.samsum_postprocess'),
|
| 1379 |
+
pred_role='BOT'),
|
| 1380 |
+
infer_cfg=dict(
|
| 1381 |
+
inferencer=dict(
|
| 1382 |
+
max_out_len=128,
|
| 1383 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1384 |
+
prompt_template=dict(
|
| 1385 |
+
template=dict(round=[
|
| 1386 |
+
dict(
|
| 1387 |
+
prompt=
|
| 1388 |
+
'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 1389 |
+
role='HUMAN'),
|
| 1390 |
+
]),
|
| 1391 |
+
type=
|
| 1392 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1393 |
+
retriever=dict(
|
| 1394 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1395 |
+
name='samsum',
|
| 1396 |
+
path='opencompass/Longbench',
|
| 1397 |
+
reader_cfg=dict(
|
| 1398 |
+
input_columns=[
|
| 1399 |
+
'context',
|
| 1400 |
+
'input',
|
| 1401 |
+
],
|
| 1402 |
+
output_column='answers',
|
| 1403 |
+
test_range='[75:100]',
|
| 1404 |
+
test_split='test',
|
| 1405 |
+
train_split='test'),
|
| 1406 |
+
type='opencompass.datasets.LongBenchsamsumDataset'),
|
| 1407 |
+
],
|
| 1408 |
+
]
|
| 1409 |
+
models = [
|
| 1410 |
+
dict(
|
| 1411 |
+
abbr='delta_net',
|
| 1412 |
+
batch_size=128,
|
| 1413 |
+
max_seq_len=2048,
|
| 1414 |
+
model_kwargs=dict(
|
| 1415 |
+
device_map='auto',
|
| 1416 |
+
torch_dtype='torch.bfloat16',
|
| 1417 |
+
trust_remote_code=True),
|
| 1418 |
+
path='/mnt/jfzn/msj/delta_net-1.3B-100B',
|
| 1419 |
+
run_cfg=dict(num_gpus=1),
|
| 1420 |
+
tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
|
| 1421 |
+
tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
|
| 1422 |
+
type='opencompass.models.HuggingFaceBaseModel'),
|
| 1423 |
+
]
|
| 1424 |
+
work_dir = 'outputs/default/20251127_221150'
|
tmp/3baffa8c-bc69-4789-aa49-f30266896eb4_params.py
ADDED
|
File without changes
|
tmp/3bc1afd5-60f6-4b89-9fc0-909218b5c248_params.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets = [
|
| 2 |
+
[
|
| 3 |
+
dict(
|
| 4 |
+
abbr='LongBench_musique',
|
| 5 |
+
eval_cfg=dict(
|
| 6 |
+
evaluator=dict(
|
| 7 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 8 |
+
pred_role='BOT'),
|
| 9 |
+
infer_cfg=dict(
|
| 10 |
+
inferencer=dict(
|
| 11 |
+
max_out_len=32,
|
| 12 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
template=dict(round=[
|
| 15 |
+
dict(
|
| 16 |
+
prompt=
|
| 17 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 18 |
+
role='HUMAN'),
|
| 19 |
+
]),
|
| 20 |
+
type=
|
| 21 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 22 |
+
retriever=dict(
|
| 23 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 24 |
+
name='musique',
|
| 25 |
+
path='opencompass/Longbench',
|
| 26 |
+
reader_cfg=dict(
|
| 27 |
+
input_columns=[
|
| 28 |
+
'context',
|
| 29 |
+
'input',
|
| 30 |
+
],
|
| 31 |
+
output_column='answers',
|
| 32 |
+
test_split='test',
|
| 33 |
+
train_split='test'),
|
| 34 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 35 |
+
],
|
| 36 |
+
]
|
| 37 |
+
eval = dict(runner=dict(task=dict(dump_details=True)))
|
| 38 |
+
models = [
|
| 39 |
+
dict(
|
| 40 |
+
abbr='gated_deltanet',
|
| 41 |
+
batch_size=128,
|
| 42 |
+
max_seq_len=2048,
|
| 43 |
+
model_kwargs=dict(
|
| 44 |
+
device_map='auto',
|
| 45 |
+
torch_dtype='torch.bfloat16',
|
| 46 |
+
trust_remote_code=True),
|
| 47 |
+
path='download_model/hgrn2-1.3B-100B',
|
| 48 |
+
run_cfg=dict(num_gpus=1),
|
| 49 |
+
tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
|
| 50 |
+
tokenizer_path='download_model/hgrn2-1.3B-100B',
|
| 51 |
+
type='opencompass.models.HuggingFaceBaseModel'),
|
| 52 |
+
]
|
| 53 |
+
work_dir = 'outputs/default/20251219_163447'
|
tmp/401500cf-6431-490c-9e43-14532e24796f_params.py
ADDED
|
@@ -0,0 +1,1424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets = [
|
| 2 |
+
[
|
| 3 |
+
dict(
|
| 4 |
+
abbr='LongBench_2wikimqa_0',
|
| 5 |
+
eval_cfg=dict(
|
| 6 |
+
evaluator=dict(
|
| 7 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 8 |
+
pred_role='BOT'),
|
| 9 |
+
infer_cfg=dict(
|
| 10 |
+
inferencer=dict(
|
| 11 |
+
max_out_len=32,
|
| 12 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
template=dict(round=[
|
| 15 |
+
dict(
|
| 16 |
+
prompt=
|
| 17 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 18 |
+
role='HUMAN'),
|
| 19 |
+
]),
|
| 20 |
+
type=
|
| 21 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 22 |
+
retriever=dict(
|
| 23 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 24 |
+
name='2wikimqa',
|
| 25 |
+
path='opencompass/Longbench',
|
| 26 |
+
reader_cfg=dict(
|
| 27 |
+
input_columns=[
|
| 28 |
+
'context',
|
| 29 |
+
'input',
|
| 30 |
+
],
|
| 31 |
+
output_column='answers',
|
| 32 |
+
test_range='[0:25]',
|
| 33 |
+
test_split='test',
|
| 34 |
+
train_split='test'),
|
| 35 |
+
type='opencompass.datasets.LongBench2wikimqaDataset'),
|
| 36 |
+
dict(
|
| 37 |
+
abbr='LongBench_hotpotqa_0',
|
| 38 |
+
eval_cfg=dict(
|
| 39 |
+
evaluator=dict(
|
| 40 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 41 |
+
pred_role='BOT'),
|
| 42 |
+
infer_cfg=dict(
|
| 43 |
+
inferencer=dict(
|
| 44 |
+
max_out_len=32,
|
| 45 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 46 |
+
prompt_template=dict(
|
| 47 |
+
template=dict(round=[
|
| 48 |
+
dict(
|
| 49 |
+
prompt=
|
| 50 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 51 |
+
role='HUMAN'),
|
| 52 |
+
]),
|
| 53 |
+
type=
|
| 54 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 55 |
+
retriever=dict(
|
| 56 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 57 |
+
name='hotpotqa',
|
| 58 |
+
path='opencompass/Longbench',
|
| 59 |
+
reader_cfg=dict(
|
| 60 |
+
input_columns=[
|
| 61 |
+
'context',
|
| 62 |
+
'input',
|
| 63 |
+
],
|
| 64 |
+
output_column='answers',
|
| 65 |
+
test_range='[0:25]',
|
| 66 |
+
test_split='test',
|
| 67 |
+
train_split='test'),
|
| 68 |
+
type='opencompass.datasets.LongBenchhotpotqaDataset'),
|
| 69 |
+
dict(
|
| 70 |
+
abbr='LongBench_musique_0',
|
| 71 |
+
eval_cfg=dict(
|
| 72 |
+
evaluator=dict(
|
| 73 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 74 |
+
pred_role='BOT'),
|
| 75 |
+
infer_cfg=dict(
|
| 76 |
+
inferencer=dict(
|
| 77 |
+
max_out_len=32,
|
| 78 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 79 |
+
prompt_template=dict(
|
| 80 |
+
template=dict(round=[
|
| 81 |
+
dict(
|
| 82 |
+
prompt=
|
| 83 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 84 |
+
role='HUMAN'),
|
| 85 |
+
]),
|
| 86 |
+
type=
|
| 87 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 88 |
+
retriever=dict(
|
| 89 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 90 |
+
name='musique',
|
| 91 |
+
path='opencompass/Longbench',
|
| 92 |
+
reader_cfg=dict(
|
| 93 |
+
input_columns=[
|
| 94 |
+
'context',
|
| 95 |
+
'input',
|
| 96 |
+
],
|
| 97 |
+
output_column='answers',
|
| 98 |
+
test_range='[0:25]',
|
| 99 |
+
test_split='test',
|
| 100 |
+
train_split='test'),
|
| 101 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 102 |
+
dict(
|
| 103 |
+
abbr='LongBench_multifieldqa_en_0',
|
| 104 |
+
eval_cfg=dict(
|
| 105 |
+
evaluator=dict(
|
| 106 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 107 |
+
pred_role='BOT'),
|
| 108 |
+
infer_cfg=dict(
|
| 109 |
+
inferencer=dict(
|
| 110 |
+
max_out_len=64,
|
| 111 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 112 |
+
prompt_template=dict(
|
| 113 |
+
template=dict(round=[
|
| 114 |
+
dict(
|
| 115 |
+
prompt=
|
| 116 |
+
'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 117 |
+
role='HUMAN'),
|
| 118 |
+
]),
|
| 119 |
+
type=
|
| 120 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 121 |
+
retriever=dict(
|
| 122 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 123 |
+
name='multifieldqa_en',
|
| 124 |
+
path='opencompass/Longbench',
|
| 125 |
+
reader_cfg=dict(
|
| 126 |
+
input_columns=[
|
| 127 |
+
'context',
|
| 128 |
+
'input',
|
| 129 |
+
],
|
| 130 |
+
output_column='answers',
|
| 131 |
+
test_range='[0:19]',
|
| 132 |
+
test_split='test',
|
| 133 |
+
train_split='test'),
|
| 134 |
+
type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
|
| 135 |
+
dict(
|
| 136 |
+
abbr='LongBench_multifieldqa_zh_0',
|
| 137 |
+
eval_cfg=dict(
|
| 138 |
+
evaluator=dict(
|
| 139 |
+
language='zh',
|
| 140 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 141 |
+
pred_role='BOT'),
|
| 142 |
+
infer_cfg=dict(
|
| 143 |
+
inferencer=dict(
|
| 144 |
+
max_out_len=64,
|
| 145 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 146 |
+
prompt_template=dict(
|
| 147 |
+
template=dict(round=[
|
| 148 |
+
dict(
|
| 149 |
+
prompt=
|
| 150 |
+
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
|
| 151 |
+
role='HUMAN'),
|
| 152 |
+
]),
|
| 153 |
+
type=
|
| 154 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 155 |
+
retriever=dict(
|
| 156 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 157 |
+
name='multifieldqa_zh',
|
| 158 |
+
path='opencompass/Longbench',
|
| 159 |
+
reader_cfg=dict(
|
| 160 |
+
input_columns=[
|
| 161 |
+
'context',
|
| 162 |
+
'input',
|
| 163 |
+
],
|
| 164 |
+
output_column='answers',
|
| 165 |
+
test_range='[0:25]',
|
| 166 |
+
test_split='test',
|
| 167 |
+
train_split='test'),
|
| 168 |
+
type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
|
| 169 |
+
dict(
|
| 170 |
+
abbr='LongBench_narrativeqa_0',
|
| 171 |
+
eval_cfg=dict(
|
| 172 |
+
evaluator=dict(
|
| 173 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 174 |
+
pred_role='BOT'),
|
| 175 |
+
infer_cfg=dict(
|
| 176 |
+
inferencer=dict(
|
| 177 |
+
max_out_len=128,
|
| 178 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 179 |
+
prompt_template=dict(
|
| 180 |
+
template=dict(round=[
|
| 181 |
+
dict(
|
| 182 |
+
prompt=
|
| 183 |
+
'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
|
| 184 |
+
role='HUMAN'),
|
| 185 |
+
]),
|
| 186 |
+
type=
|
| 187 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 188 |
+
retriever=dict(
|
| 189 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 190 |
+
name='narrativeqa',
|
| 191 |
+
path='opencompass/Longbench',
|
| 192 |
+
reader_cfg=dict(
|
| 193 |
+
input_columns=[
|
| 194 |
+
'context',
|
| 195 |
+
'input',
|
| 196 |
+
],
|
| 197 |
+
output_column='answers',
|
| 198 |
+
test_range='[0:25]',
|
| 199 |
+
test_split='test',
|
| 200 |
+
train_split='test'),
|
| 201 |
+
type='opencompass.datasets.LongBenchnarrativeqaDataset'),
|
| 202 |
+
dict(
|
| 203 |
+
abbr='LongBench_qasper_0',
|
| 204 |
+
eval_cfg=dict(
|
| 205 |
+
evaluator=dict(
|
| 206 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 207 |
+
pred_role='BOT'),
|
| 208 |
+
infer_cfg=dict(
|
| 209 |
+
inferencer=dict(
|
| 210 |
+
max_out_len=32,
|
| 211 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 212 |
+
prompt_template=dict(
|
| 213 |
+
template=dict(round=[
|
| 214 |
+
dict(
|
| 215 |
+
prompt=
|
| 216 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 217 |
+
role='HUMAN'),
|
| 218 |
+
]),
|
| 219 |
+
type=
|
| 220 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 221 |
+
retriever=dict(
|
| 222 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 223 |
+
name='qasper',
|
| 224 |
+
path='opencompass/Longbench',
|
| 225 |
+
reader_cfg=dict(
|
| 226 |
+
input_columns=[
|
| 227 |
+
'context',
|
| 228 |
+
'input',
|
| 229 |
+
],
|
| 230 |
+
output_column='answers',
|
| 231 |
+
test_range='[0:25]',
|
| 232 |
+
test_split='test',
|
| 233 |
+
train_split='test'),
|
| 234 |
+
type='opencompass.datasets.LongBenchqasperDataset'),
|
| 235 |
+
dict(
|
| 236 |
+
abbr='LongBench_triviaqa_0',
|
| 237 |
+
eval_cfg=dict(
|
| 238 |
+
evaluator=dict(
|
| 239 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 240 |
+
pred_postprocessor=dict(
|
| 241 |
+
type='opencompass.datasets.triviaqa_postprocess'),
|
| 242 |
+
pred_role='BOT'),
|
| 243 |
+
infer_cfg=dict(
|
| 244 |
+
inferencer=dict(
|
| 245 |
+
max_out_len=32,
|
| 246 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 247 |
+
prompt_template=dict(
|
| 248 |
+
template=dict(round=[
|
| 249 |
+
dict(
|
| 250 |
+
prompt=
|
| 251 |
+
'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 252 |
+
role='HUMAN'),
|
| 253 |
+
]),
|
| 254 |
+
type=
|
| 255 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 256 |
+
retriever=dict(
|
| 257 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 258 |
+
name='triviaqa',
|
| 259 |
+
path='opencompass/Longbench',
|
| 260 |
+
reader_cfg=dict(
|
| 261 |
+
input_columns=[
|
| 262 |
+
'context',
|
| 263 |
+
'input',
|
| 264 |
+
],
|
| 265 |
+
output_column='answers',
|
| 266 |
+
test_range='[0:25]',
|
| 267 |
+
test_split='test',
|
| 268 |
+
train_split='test'),
|
| 269 |
+
type='opencompass.datasets.LongBenchtriviaqaDataset'),
|
| 270 |
+
dict(
|
| 271 |
+
abbr='LongBench_gov_report_0',
|
| 272 |
+
eval_cfg=dict(
|
| 273 |
+
evaluator=dict(
|
| 274 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 275 |
+
pred_role='BOT'),
|
| 276 |
+
infer_cfg=dict(
|
| 277 |
+
inferencer=dict(
|
| 278 |
+
max_out_len=512,
|
| 279 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 280 |
+
prompt_template=dict(
|
| 281 |
+
template=dict(round=[
|
| 282 |
+
dict(
|
| 283 |
+
prompt=
|
| 284 |
+
'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
|
| 285 |
+
role='HUMAN'),
|
| 286 |
+
]),
|
| 287 |
+
type=
|
| 288 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 289 |
+
retriever=dict(
|
| 290 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 291 |
+
name='gov_report',
|
| 292 |
+
path='opencompass/Longbench',
|
| 293 |
+
reader_cfg=dict(
|
| 294 |
+
input_columns=[
|
| 295 |
+
'context',
|
| 296 |
+
],
|
| 297 |
+
output_column='answers',
|
| 298 |
+
test_range='[0:25]',
|
| 299 |
+
test_split='test',
|
| 300 |
+
train_split='test'),
|
| 301 |
+
type='opencompass.datasets.LongBenchgov_reportDataset'),
|
| 302 |
+
dict(
|
| 303 |
+
abbr='LongBench_qmsum_0',
|
| 304 |
+
eval_cfg=dict(
|
| 305 |
+
evaluator=dict(
|
| 306 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 307 |
+
pred_role='BOT'),
|
| 308 |
+
infer_cfg=dict(
|
| 309 |
+
inferencer=dict(
|
| 310 |
+
max_out_len=512,
|
| 311 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 312 |
+
prompt_template=dict(
|
| 313 |
+
template=dict(round=[
|
| 314 |
+
dict(
|
| 315 |
+
prompt=
|
| 316 |
+
'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 317 |
+
role='HUMAN'),
|
| 318 |
+
]),
|
| 319 |
+
type=
|
| 320 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 321 |
+
retriever=dict(
|
| 322 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 323 |
+
name='qmsum',
|
| 324 |
+
path='opencompass/Longbench',
|
| 325 |
+
reader_cfg=dict(
|
| 326 |
+
input_columns=[
|
| 327 |
+
'context',
|
| 328 |
+
'input',
|
| 329 |
+
],
|
| 330 |
+
output_column='answers',
|
| 331 |
+
test_range='[0:25]',
|
| 332 |
+
test_split='test',
|
| 333 |
+
train_split='test'),
|
| 334 |
+
type='opencompass.datasets.LongBenchqmsumDataset'),
|
| 335 |
+
dict(
|
| 336 |
+
abbr='LongBench_vcsum_0',
|
| 337 |
+
eval_cfg=dict(
|
| 338 |
+
evaluator=dict(
|
| 339 |
+
language='zh',
|
| 340 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 341 |
+
pred_role='BOT'),
|
| 342 |
+
infer_cfg=dict(
|
| 343 |
+
inferencer=dict(
|
| 344 |
+
max_out_len=512,
|
| 345 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 346 |
+
prompt_template=dict(
|
| 347 |
+
template=dict(round=[
|
| 348 |
+
dict(
|
| 349 |
+
prompt=
|
| 350 |
+
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 351 |
+
role='HUMAN'),
|
| 352 |
+
]),
|
| 353 |
+
type=
|
| 354 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 355 |
+
retriever=dict(
|
| 356 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 357 |
+
name='vcsum',
|
| 358 |
+
path='opencompass/Longbench',
|
| 359 |
+
reader_cfg=dict(
|
| 360 |
+
input_columns=[
|
| 361 |
+
'context',
|
| 362 |
+
],
|
| 363 |
+
output_column='answers',
|
| 364 |
+
test_range='[0:25]',
|
| 365 |
+
test_split='test',
|
| 366 |
+
train_split='test'),
|
| 367 |
+
type='opencompass.datasets.LongBenchvcsumDataset'),
|
| 368 |
+
dict(
|
| 369 |
+
abbr='LongBench_dureader_0',
|
| 370 |
+
eval_cfg=dict(
|
| 371 |
+
evaluator=dict(
|
| 372 |
+
language='zh',
|
| 373 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 374 |
+
pred_role='BOT'),
|
| 375 |
+
infer_cfg=dict(
|
| 376 |
+
inferencer=dict(
|
| 377 |
+
max_out_len=128,
|
| 378 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 379 |
+
prompt_template=dict(
|
| 380 |
+
template=dict(round=[
|
| 381 |
+
dict(
|
| 382 |
+
prompt=
|
| 383 |
+
'请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
|
| 384 |
+
role='HUMAN'),
|
| 385 |
+
]),
|
| 386 |
+
type=
|
| 387 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 388 |
+
retriever=dict(
|
| 389 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 390 |
+
name='dureader',
|
| 391 |
+
path='opencompass/Longbench',
|
| 392 |
+
reader_cfg=dict(
|
| 393 |
+
input_columns=[
|
| 394 |
+
'context',
|
| 395 |
+
'input',
|
| 396 |
+
],
|
| 397 |
+
output_column='answers',
|
| 398 |
+
test_range='[0:25]',
|
| 399 |
+
test_split='test',
|
| 400 |
+
train_split='test'),
|
| 401 |
+
type='opencompass.datasets.LongBenchdureaderDataset'),
|
| 402 |
+
dict(
|
| 403 |
+
abbr='LongBench_lcc_0',
|
| 404 |
+
eval_cfg=dict(
|
| 405 |
+
evaluator=dict(
|
| 406 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 407 |
+
pred_role='BOT'),
|
| 408 |
+
infer_cfg=dict(
|
| 409 |
+
inferencer=dict(
|
| 410 |
+
max_out_len=64,
|
| 411 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 412 |
+
prompt_template=dict(
|
| 413 |
+
template=dict(round=[
|
| 414 |
+
dict(
|
| 415 |
+
prompt=
|
| 416 |
+
'Please complete the code given below. \n{context}Next line of code:\n',
|
| 417 |
+
role='HUMAN'),
|
| 418 |
+
]),
|
| 419 |
+
type=
|
| 420 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 421 |
+
retriever=dict(
|
| 422 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 423 |
+
name='lcc',
|
| 424 |
+
path='opencompass/Longbench',
|
| 425 |
+
reader_cfg=dict(
|
| 426 |
+
input_columns=[
|
| 427 |
+
'context',
|
| 428 |
+
],
|
| 429 |
+
output_column='answers',
|
| 430 |
+
test_range='[0:63]',
|
| 431 |
+
test_split='test',
|
| 432 |
+
train_split='test'),
|
| 433 |
+
type='opencompass.datasets.LongBenchlccDataset'),
|
| 434 |
+
dict(
|
| 435 |
+
abbr='LongBench_repobench-p_0',
|
| 436 |
+
eval_cfg=dict(
|
| 437 |
+
evaluator=dict(
|
| 438 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 439 |
+
pred_role='BOT'),
|
| 440 |
+
infer_cfg=dict(
|
| 441 |
+
inferencer=dict(
|
| 442 |
+
max_out_len=64,
|
| 443 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 444 |
+
prompt_template=dict(
|
| 445 |
+
template=dict(round=[
|
| 446 |
+
dict(
|
| 447 |
+
prompt=
|
| 448 |
+
'Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 449 |
+
role='HUMAN'),
|
| 450 |
+
]),
|
| 451 |
+
type=
|
| 452 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 453 |
+
retriever=dict(
|
| 454 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 455 |
+
name='repobench-p',
|
| 456 |
+
path='opencompass/Longbench',
|
| 457 |
+
reader_cfg=dict(
|
| 458 |
+
input_columns=[
|
| 459 |
+
'context',
|
| 460 |
+
'input',
|
| 461 |
+
],
|
| 462 |
+
output_column='answers',
|
| 463 |
+
test_range='[0:63]',
|
| 464 |
+
test_split='test',
|
| 465 |
+
train_split='test'),
|
| 466 |
+
type='opencompass.datasets.LongBenchrepobenchDataset'),
|
| 467 |
+
dict(
|
| 468 |
+
abbr='LongBench_passage_retrieval_en_0',
|
| 469 |
+
eval_cfg=dict(
|
| 470 |
+
evaluator=dict(
|
| 471 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 472 |
+
pred_role='BOT'),
|
| 473 |
+
infer_cfg=dict(
|
| 474 |
+
inferencer=dict(
|
| 475 |
+
max_out_len=32,
|
| 476 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 477 |
+
prompt_template=dict(
|
| 478 |
+
template=dict(round=[
|
| 479 |
+
dict(
|
| 480 |
+
prompt=
|
| 481 |
+
'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 482 |
+
role='HUMAN'),
|
| 483 |
+
]),
|
| 484 |
+
type=
|
| 485 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 486 |
+
retriever=dict(
|
| 487 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 488 |
+
name='passage_retrieval_en',
|
| 489 |
+
path='opencompass/Longbench',
|
| 490 |
+
reader_cfg=dict(
|
| 491 |
+
input_columns=[
|
| 492 |
+
'context',
|
| 493 |
+
'input',
|
| 494 |
+
],
|
| 495 |
+
output_column='answers',
|
| 496 |
+
test_range='[0:25]',
|
| 497 |
+
test_split='test',
|
| 498 |
+
train_split='test'),
|
| 499 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
|
| 500 |
+
dict(
|
| 501 |
+
abbr='LongBench_passage_retrieval_zh_0',
|
| 502 |
+
eval_cfg=dict(
|
| 503 |
+
evaluator=dict(
|
| 504 |
+
language='zh',
|
| 505 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 506 |
+
pred_role='BOT'),
|
| 507 |
+
infer_cfg=dict(
|
| 508 |
+
inferencer=dict(
|
| 509 |
+
max_out_len=32,
|
| 510 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 511 |
+
prompt_template=dict(
|
| 512 |
+
template=dict(round=[
|
| 513 |
+
dict(
|
| 514 |
+
prompt=
|
| 515 |
+
'以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 516 |
+
role='HUMAN'),
|
| 517 |
+
]),
|
| 518 |
+
type=
|
| 519 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 520 |
+
retriever=dict(
|
| 521 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 522 |
+
name='passage_retrieval_zh',
|
| 523 |
+
path='opencompass/Longbench',
|
| 524 |
+
reader_cfg=dict(
|
| 525 |
+
input_columns=[
|
| 526 |
+
'context',
|
| 527 |
+
'input',
|
| 528 |
+
],
|
| 529 |
+
output_column='answers',
|
| 530 |
+
test_range='[0:25]',
|
| 531 |
+
test_split='test',
|
| 532 |
+
train_split='test'),
|
| 533 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
|
| 534 |
+
dict(
|
| 535 |
+
abbr='LongBench_passage_count_0',
|
| 536 |
+
eval_cfg=dict(
|
| 537 |
+
evaluator=dict(
|
| 538 |
+
type='opencompass.datasets.LongBenchCountEvaluator'),
|
| 539 |
+
pred_role='BOT'),
|
| 540 |
+
infer_cfg=dict(
|
| 541 |
+
inferencer=dict(
|
| 542 |
+
max_out_len=32,
|
| 543 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 544 |
+
prompt_template=dict(
|
| 545 |
+
template=dict(round=[
|
| 546 |
+
dict(
|
| 547 |
+
prompt=
|
| 548 |
+
'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
|
| 549 |
+
role='HUMAN'),
|
| 550 |
+
]),
|
| 551 |
+
type=
|
| 552 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 553 |
+
retriever=dict(
|
| 554 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 555 |
+
name='passage_count',
|
| 556 |
+
path='opencompass/Longbench',
|
| 557 |
+
reader_cfg=dict(
|
| 558 |
+
input_columns=[
|
| 559 |
+
'context',
|
| 560 |
+
'input',
|
| 561 |
+
],
|
| 562 |
+
output_column='answers',
|
| 563 |
+
test_range='[0:25]',
|
| 564 |
+
test_split='test',
|
| 565 |
+
train_split='test'),
|
| 566 |
+
type='opencompass.datasets.LongBenchpassage_countDataset'),
|
| 567 |
+
dict(
|
| 568 |
+
abbr='LongBench_trec_0',
|
| 569 |
+
eval_cfg=dict(
|
| 570 |
+
evaluator=dict(
|
| 571 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 572 |
+
),
|
| 573 |
+
pred_postprocessor=dict(
|
| 574 |
+
type='opencompass.datasets.trec_postprocess'),
|
| 575 |
+
pred_role='BOT'),
|
| 576 |
+
infer_cfg=dict(
|
| 577 |
+
inferencer=dict(
|
| 578 |
+
max_out_len=64,
|
| 579 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 580 |
+
prompt_template=dict(
|
| 581 |
+
template=dict(round=[
|
| 582 |
+
dict(
|
| 583 |
+
prompt=
|
| 584 |
+
'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 585 |
+
role='HUMAN'),
|
| 586 |
+
]),
|
| 587 |
+
type=
|
| 588 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 589 |
+
retriever=dict(
|
| 590 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 591 |
+
name='trec',
|
| 592 |
+
path='opencompass/Longbench',
|
| 593 |
+
reader_cfg=dict(
|
| 594 |
+
input_columns=[
|
| 595 |
+
'context',
|
| 596 |
+
'input',
|
| 597 |
+
],
|
| 598 |
+
output_column='all_labels',
|
| 599 |
+
test_range='[0:25]',
|
| 600 |
+
test_split='test',
|
| 601 |
+
train_split='test'),
|
| 602 |
+
type='opencompass.datasets.LongBenchtrecDataset'),
|
| 603 |
+
dict(
|
| 604 |
+
abbr='LongBench_lsht_0',
|
| 605 |
+
eval_cfg=dict(
|
| 606 |
+
evaluator=dict(
|
| 607 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 608 |
+
),
|
| 609 |
+
pred_postprocessor=dict(
|
| 610 |
+
type='opencompass.datasets.lsht_postprocess'),
|
| 611 |
+
pred_role='BOT'),
|
| 612 |
+
infer_cfg=dict(
|
| 613 |
+
inferencer=dict(
|
| 614 |
+
max_out_len=64,
|
| 615 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 616 |
+
prompt_template=dict(
|
| 617 |
+
template=dict(round=[
|
| 618 |
+
dict(
|
| 619 |
+
prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
|
| 620 |
+
role='HUMAN'),
|
| 621 |
+
]),
|
| 622 |
+
type=
|
| 623 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 624 |
+
retriever=dict(
|
| 625 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 626 |
+
name='lsht',
|
| 627 |
+
path='opencompass/Longbench',
|
| 628 |
+
reader_cfg=dict(
|
| 629 |
+
input_columns=[
|
| 630 |
+
'context',
|
| 631 |
+
'input',
|
| 632 |
+
],
|
| 633 |
+
output_column='all_labels',
|
| 634 |
+
test_range='[0:25]',
|
| 635 |
+
test_split='test',
|
| 636 |
+
train_split='test'),
|
| 637 |
+
type='opencompass.datasets.LongBenchlshtDataset'),
|
| 638 |
+
dict(
|
| 639 |
+
abbr='LongBench_multi_news_0',
|
| 640 |
+
eval_cfg=dict(
|
| 641 |
+
evaluator=dict(
|
| 642 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 643 |
+
pred_role='BOT'),
|
| 644 |
+
infer_cfg=dict(
|
| 645 |
+
inferencer=dict(
|
| 646 |
+
max_out_len=512,
|
| 647 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 648 |
+
prompt_template=dict(
|
| 649 |
+
template=dict(round=[
|
| 650 |
+
dict(
|
| 651 |
+
prompt=
|
| 652 |
+
'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
|
| 653 |
+
role='HUMAN'),
|
| 654 |
+
]),
|
| 655 |
+
type=
|
| 656 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 657 |
+
retriever=dict(
|
| 658 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 659 |
+
name='multi_news',
|
| 660 |
+
path='opencompass/Longbench',
|
| 661 |
+
reader_cfg=dict(
|
| 662 |
+
input_columns=[
|
| 663 |
+
'context',
|
| 664 |
+
],
|
| 665 |
+
output_column='answers',
|
| 666 |
+
test_range='[0:25]',
|
| 667 |
+
test_split='test',
|
| 668 |
+
train_split='test'),
|
| 669 |
+
type='opencompass.datasets.LongBenchmulti_newsDataset'),
|
| 670 |
+
dict(
|
| 671 |
+
abbr='LongBench_samsum_0',
|
| 672 |
+
eval_cfg=dict(
|
| 673 |
+
evaluator=dict(
|
| 674 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 675 |
+
pred_postprocessor=dict(
|
| 676 |
+
type='opencompass.datasets.samsum_postprocess'),
|
| 677 |
+
pred_role='BOT'),
|
| 678 |
+
infer_cfg=dict(
|
| 679 |
+
inferencer=dict(
|
| 680 |
+
max_out_len=128,
|
| 681 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 682 |
+
prompt_template=dict(
|
| 683 |
+
template=dict(round=[
|
| 684 |
+
dict(
|
| 685 |
+
prompt=
|
| 686 |
+
'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 687 |
+
role='HUMAN'),
|
| 688 |
+
]),
|
| 689 |
+
type=
|
| 690 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 691 |
+
retriever=dict(
|
| 692 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 693 |
+
name='samsum',
|
| 694 |
+
path='opencompass/Longbench',
|
| 695 |
+
reader_cfg=dict(
|
| 696 |
+
input_columns=[
|
| 697 |
+
'context',
|
| 698 |
+
'input',
|
| 699 |
+
],
|
| 700 |
+
output_column='answers',
|
| 701 |
+
test_range='[0:25]',
|
| 702 |
+
test_split='test',
|
| 703 |
+
train_split='test'),
|
| 704 |
+
type='opencompass.datasets.LongBenchsamsumDataset'),
|
| 705 |
+
dict(
|
| 706 |
+
abbr='LongBench_2wikimqa_0',
|
| 707 |
+
eval_cfg=dict(
|
| 708 |
+
evaluator=dict(
|
| 709 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 710 |
+
pred_role='BOT'),
|
| 711 |
+
infer_cfg=dict(
|
| 712 |
+
inferencer=dict(
|
| 713 |
+
max_out_len=32,
|
| 714 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 715 |
+
prompt_template=dict(
|
| 716 |
+
template=dict(round=[
|
| 717 |
+
dict(
|
| 718 |
+
prompt=
|
| 719 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 720 |
+
role='HUMAN'),
|
| 721 |
+
]),
|
| 722 |
+
type=
|
| 723 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 724 |
+
retriever=dict(
|
| 725 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 726 |
+
name='2wikimqa',
|
| 727 |
+
path='opencompass/Longbench',
|
| 728 |
+
reader_cfg=dict(
|
| 729 |
+
input_columns=[
|
| 730 |
+
'context',
|
| 731 |
+
'input',
|
| 732 |
+
],
|
| 733 |
+
output_column='answers',
|
| 734 |
+
test_range='[0:25]',
|
| 735 |
+
test_split='test',
|
| 736 |
+
train_split='test'),
|
| 737 |
+
type='opencompass.datasets.LongBench2wikimqaDataset'),
|
| 738 |
+
dict(
|
| 739 |
+
abbr='LongBench_hotpotqa_0',
|
| 740 |
+
eval_cfg=dict(
|
| 741 |
+
evaluator=dict(
|
| 742 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 743 |
+
pred_role='BOT'),
|
| 744 |
+
infer_cfg=dict(
|
| 745 |
+
inferencer=dict(
|
| 746 |
+
max_out_len=32,
|
| 747 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 748 |
+
prompt_template=dict(
|
| 749 |
+
template=dict(round=[
|
| 750 |
+
dict(
|
| 751 |
+
prompt=
|
| 752 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 753 |
+
role='HUMAN'),
|
| 754 |
+
]),
|
| 755 |
+
type=
|
| 756 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 757 |
+
retriever=dict(
|
| 758 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 759 |
+
name='hotpotqa',
|
| 760 |
+
path='opencompass/Longbench',
|
| 761 |
+
reader_cfg=dict(
|
| 762 |
+
input_columns=[
|
| 763 |
+
'context',
|
| 764 |
+
'input',
|
| 765 |
+
],
|
| 766 |
+
output_column='answers',
|
| 767 |
+
test_range='[0:25]',
|
| 768 |
+
test_split='test',
|
| 769 |
+
train_split='test'),
|
| 770 |
+
type='opencompass.datasets.LongBenchhotpotqaDataset'),
|
| 771 |
+
dict(
|
| 772 |
+
abbr='LongBench_musique_0',
|
| 773 |
+
eval_cfg=dict(
|
| 774 |
+
evaluator=dict(
|
| 775 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 776 |
+
pred_role='BOT'),
|
| 777 |
+
infer_cfg=dict(
|
| 778 |
+
inferencer=dict(
|
| 779 |
+
max_out_len=32,
|
| 780 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 781 |
+
prompt_template=dict(
|
| 782 |
+
template=dict(round=[
|
| 783 |
+
dict(
|
| 784 |
+
prompt=
|
| 785 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 786 |
+
role='HUMAN'),
|
| 787 |
+
]),
|
| 788 |
+
type=
|
| 789 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 790 |
+
retriever=dict(
|
| 791 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 792 |
+
name='musique',
|
| 793 |
+
path='opencompass/Longbench',
|
| 794 |
+
reader_cfg=dict(
|
| 795 |
+
input_columns=[
|
| 796 |
+
'context',
|
| 797 |
+
'input',
|
| 798 |
+
],
|
| 799 |
+
output_column='answers',
|
| 800 |
+
test_range='[0:25]',
|
| 801 |
+
test_split='test',
|
| 802 |
+
train_split='test'),
|
| 803 |
+
type='opencompass.datasets.LongBenchmusiqueDataset'),
|
| 804 |
+
dict(
|
| 805 |
+
abbr='LongBench_multifieldqa_en_0',
|
| 806 |
+
eval_cfg=dict(
|
| 807 |
+
evaluator=dict(
|
| 808 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 809 |
+
pred_role='BOT'),
|
| 810 |
+
infer_cfg=dict(
|
| 811 |
+
inferencer=dict(
|
| 812 |
+
max_out_len=64,
|
| 813 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 814 |
+
prompt_template=dict(
|
| 815 |
+
template=dict(round=[
|
| 816 |
+
dict(
|
| 817 |
+
prompt=
|
| 818 |
+
'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 819 |
+
role='HUMAN'),
|
| 820 |
+
]),
|
| 821 |
+
type=
|
| 822 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 823 |
+
retriever=dict(
|
| 824 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 825 |
+
name='multifieldqa_en',
|
| 826 |
+
path='opencompass/Longbench',
|
| 827 |
+
reader_cfg=dict(
|
| 828 |
+
input_columns=[
|
| 829 |
+
'context',
|
| 830 |
+
'input',
|
| 831 |
+
],
|
| 832 |
+
output_column='answers',
|
| 833 |
+
test_range='[0:19]',
|
| 834 |
+
test_split='test',
|
| 835 |
+
train_split='test'),
|
| 836 |
+
type='opencompass.datasets.LongBenchmultifieldqa_enDataset'),
|
| 837 |
+
dict(
|
| 838 |
+
abbr='LongBench_multifieldqa_zh_0',
|
| 839 |
+
eval_cfg=dict(
|
| 840 |
+
evaluator=dict(
|
| 841 |
+
language='zh',
|
| 842 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 843 |
+
pred_role='BOT'),
|
| 844 |
+
infer_cfg=dict(
|
| 845 |
+
inferencer=dict(
|
| 846 |
+
max_out_len=64,
|
| 847 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 848 |
+
prompt_template=dict(
|
| 849 |
+
template=dict(round=[
|
| 850 |
+
dict(
|
| 851 |
+
prompt=
|
| 852 |
+
'阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:',
|
| 853 |
+
role='HUMAN'),
|
| 854 |
+
]),
|
| 855 |
+
type=
|
| 856 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 857 |
+
retriever=dict(
|
| 858 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 859 |
+
name='multifieldqa_zh',
|
| 860 |
+
path='opencompass/Longbench',
|
| 861 |
+
reader_cfg=dict(
|
| 862 |
+
input_columns=[
|
| 863 |
+
'context',
|
| 864 |
+
'input',
|
| 865 |
+
],
|
| 866 |
+
output_column='answers',
|
| 867 |
+
test_range='[0:25]',
|
| 868 |
+
test_split='test',
|
| 869 |
+
train_split='test'),
|
| 870 |
+
type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'),
|
| 871 |
+
dict(
|
| 872 |
+
abbr='LongBench_narrativeqa_0',
|
| 873 |
+
eval_cfg=dict(
|
| 874 |
+
evaluator=dict(
|
| 875 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 876 |
+
pred_role='BOT'),
|
| 877 |
+
infer_cfg=dict(
|
| 878 |
+
inferencer=dict(
|
| 879 |
+
max_out_len=128,
|
| 880 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 881 |
+
prompt_template=dict(
|
| 882 |
+
template=dict(round=[
|
| 883 |
+
dict(
|
| 884 |
+
prompt=
|
| 885 |
+
'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
|
| 886 |
+
role='HUMAN'),
|
| 887 |
+
]),
|
| 888 |
+
type=
|
| 889 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 890 |
+
retriever=dict(
|
| 891 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 892 |
+
name='narrativeqa',
|
| 893 |
+
path='opencompass/Longbench',
|
| 894 |
+
reader_cfg=dict(
|
| 895 |
+
input_columns=[
|
| 896 |
+
'context',
|
| 897 |
+
'input',
|
| 898 |
+
],
|
| 899 |
+
output_column='answers',
|
| 900 |
+
test_range='[0:25]',
|
| 901 |
+
test_split='test',
|
| 902 |
+
train_split='test'),
|
| 903 |
+
type='opencompass.datasets.LongBenchnarrativeqaDataset'),
|
| 904 |
+
dict(
|
| 905 |
+
abbr='LongBench_qasper_0',
|
| 906 |
+
eval_cfg=dict(
|
| 907 |
+
evaluator=dict(
|
| 908 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 909 |
+
pred_role='BOT'),
|
| 910 |
+
infer_cfg=dict(
|
| 911 |
+
inferencer=dict(
|
| 912 |
+
max_out_len=32,
|
| 913 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 914 |
+
prompt_template=dict(
|
| 915 |
+
template=dict(round=[
|
| 916 |
+
dict(
|
| 917 |
+
prompt=
|
| 918 |
+
'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 919 |
+
role='HUMAN'),
|
| 920 |
+
]),
|
| 921 |
+
type=
|
| 922 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 923 |
+
retriever=dict(
|
| 924 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 925 |
+
name='qasper',
|
| 926 |
+
path='opencompass/Longbench',
|
| 927 |
+
reader_cfg=dict(
|
| 928 |
+
input_columns=[
|
| 929 |
+
'context',
|
| 930 |
+
'input',
|
| 931 |
+
],
|
| 932 |
+
output_column='answers',
|
| 933 |
+
test_range='[0:25]',
|
| 934 |
+
test_split='test',
|
| 935 |
+
train_split='test'),
|
| 936 |
+
type='opencompass.datasets.LongBenchqasperDataset'),
|
| 937 |
+
dict(
|
| 938 |
+
abbr='LongBench_triviaqa_0',
|
| 939 |
+
eval_cfg=dict(
|
| 940 |
+
evaluator=dict(
|
| 941 |
+
type='opencompass.datasets.LongBenchF1Evaluator'),
|
| 942 |
+
pred_postprocessor=dict(
|
| 943 |
+
type='opencompass.datasets.triviaqa_postprocess'),
|
| 944 |
+
pred_role='BOT'),
|
| 945 |
+
infer_cfg=dict(
|
| 946 |
+
inferencer=dict(
|
| 947 |
+
max_out_len=32,
|
| 948 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 949 |
+
prompt_template=dict(
|
| 950 |
+
template=dict(round=[
|
| 951 |
+
dict(
|
| 952 |
+
prompt=
|
| 953 |
+
'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 954 |
+
role='HUMAN'),
|
| 955 |
+
]),
|
| 956 |
+
type=
|
| 957 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 958 |
+
retriever=dict(
|
| 959 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 960 |
+
name='triviaqa',
|
| 961 |
+
path='opencompass/Longbench',
|
| 962 |
+
reader_cfg=dict(
|
| 963 |
+
input_columns=[
|
| 964 |
+
'context',
|
| 965 |
+
'input',
|
| 966 |
+
],
|
| 967 |
+
output_column='answers',
|
| 968 |
+
test_range='[0:25]',
|
| 969 |
+
test_split='test',
|
| 970 |
+
train_split='test'),
|
| 971 |
+
type='opencompass.datasets.LongBenchtriviaqaDataset'),
|
| 972 |
+
dict(
|
| 973 |
+
abbr='LongBench_gov_report_0',
|
| 974 |
+
eval_cfg=dict(
|
| 975 |
+
evaluator=dict(
|
| 976 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 977 |
+
pred_role='BOT'),
|
| 978 |
+
infer_cfg=dict(
|
| 979 |
+
inferencer=dict(
|
| 980 |
+
max_out_len=512,
|
| 981 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 982 |
+
prompt_template=dict(
|
| 983 |
+
template=dict(round=[
|
| 984 |
+
dict(
|
| 985 |
+
prompt=
|
| 986 |
+
'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:',
|
| 987 |
+
role='HUMAN'),
|
| 988 |
+
]),
|
| 989 |
+
type=
|
| 990 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 991 |
+
retriever=dict(
|
| 992 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 993 |
+
name='gov_report',
|
| 994 |
+
path='opencompass/Longbench',
|
| 995 |
+
reader_cfg=dict(
|
| 996 |
+
input_columns=[
|
| 997 |
+
'context',
|
| 998 |
+
],
|
| 999 |
+
output_column='answers',
|
| 1000 |
+
test_range='[0:25]',
|
| 1001 |
+
test_split='test',
|
| 1002 |
+
train_split='test'),
|
| 1003 |
+
type='opencompass.datasets.LongBenchgov_reportDataset'),
|
| 1004 |
+
dict(
|
| 1005 |
+
abbr='LongBench_qmsum_0',
|
| 1006 |
+
eval_cfg=dict(
|
| 1007 |
+
evaluator=dict(
|
| 1008 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1009 |
+
pred_role='BOT'),
|
| 1010 |
+
infer_cfg=dict(
|
| 1011 |
+
inferencer=dict(
|
| 1012 |
+
max_out_len=512,
|
| 1013 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1014 |
+
prompt_template=dict(
|
| 1015 |
+
template=dict(round=[
|
| 1016 |
+
dict(
|
| 1017 |
+
prompt=
|
| 1018 |
+
'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 1019 |
+
role='HUMAN'),
|
| 1020 |
+
]),
|
| 1021 |
+
type=
|
| 1022 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1023 |
+
retriever=dict(
|
| 1024 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1025 |
+
name='qmsum',
|
| 1026 |
+
path='opencompass/Longbench',
|
| 1027 |
+
reader_cfg=dict(
|
| 1028 |
+
input_columns=[
|
| 1029 |
+
'context',
|
| 1030 |
+
'input',
|
| 1031 |
+
],
|
| 1032 |
+
output_column='answers',
|
| 1033 |
+
test_range='[0:25]',
|
| 1034 |
+
test_split='test',
|
| 1035 |
+
train_split='test'),
|
| 1036 |
+
type='opencompass.datasets.LongBenchqmsumDataset'),
|
| 1037 |
+
dict(
|
| 1038 |
+
abbr='LongBench_vcsum_0',
|
| 1039 |
+
eval_cfg=dict(
|
| 1040 |
+
evaluator=dict(
|
| 1041 |
+
language='zh',
|
| 1042 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1043 |
+
pred_role='BOT'),
|
| 1044 |
+
infer_cfg=dict(
|
| 1045 |
+
inferencer=dict(
|
| 1046 |
+
max_out_len=512,
|
| 1047 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1048 |
+
prompt_template=dict(
|
| 1049 |
+
template=dict(round=[
|
| 1050 |
+
dict(
|
| 1051 |
+
prompt=
|
| 1052 |
+
'下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 1053 |
+
role='HUMAN'),
|
| 1054 |
+
]),
|
| 1055 |
+
type=
|
| 1056 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1057 |
+
retriever=dict(
|
| 1058 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1059 |
+
name='vcsum',
|
| 1060 |
+
path='opencompass/Longbench',
|
| 1061 |
+
reader_cfg=dict(
|
| 1062 |
+
input_columns=[
|
| 1063 |
+
'context',
|
| 1064 |
+
],
|
| 1065 |
+
output_column='answers',
|
| 1066 |
+
test_range='[0:25]',
|
| 1067 |
+
test_split='test',
|
| 1068 |
+
train_split='test'),
|
| 1069 |
+
type='opencompass.datasets.LongBenchvcsumDataset'),
|
| 1070 |
+
dict(
|
| 1071 |
+
abbr='LongBench_dureader_0',
|
| 1072 |
+
eval_cfg=dict(
|
| 1073 |
+
evaluator=dict(
|
| 1074 |
+
language='zh',
|
| 1075 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1076 |
+
pred_role='BOT'),
|
| 1077 |
+
infer_cfg=dict(
|
| 1078 |
+
inferencer=dict(
|
| 1079 |
+
max_out_len=128,
|
| 1080 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1081 |
+
prompt_template=dict(
|
| 1082 |
+
template=dict(round=[
|
| 1083 |
+
dict(
|
| 1084 |
+
prompt=
|
| 1085 |
+
'请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
|
| 1086 |
+
role='HUMAN'),
|
| 1087 |
+
]),
|
| 1088 |
+
type=
|
| 1089 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1090 |
+
retriever=dict(
|
| 1091 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1092 |
+
name='dureader',
|
| 1093 |
+
path='opencompass/Longbench',
|
| 1094 |
+
reader_cfg=dict(
|
| 1095 |
+
input_columns=[
|
| 1096 |
+
'context',
|
| 1097 |
+
'input',
|
| 1098 |
+
],
|
| 1099 |
+
output_column='answers',
|
| 1100 |
+
test_range='[0:25]',
|
| 1101 |
+
test_split='test',
|
| 1102 |
+
train_split='test'),
|
| 1103 |
+
type='opencompass.datasets.LongBenchdureaderDataset'),
|
| 1104 |
+
dict(
|
| 1105 |
+
abbr='LongBench_lcc_0',
|
| 1106 |
+
eval_cfg=dict(
|
| 1107 |
+
evaluator=dict(
|
| 1108 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 1109 |
+
pred_role='BOT'),
|
| 1110 |
+
infer_cfg=dict(
|
| 1111 |
+
inferencer=dict(
|
| 1112 |
+
max_out_len=64,
|
| 1113 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1114 |
+
prompt_template=dict(
|
| 1115 |
+
template=dict(round=[
|
| 1116 |
+
dict(
|
| 1117 |
+
prompt=
|
| 1118 |
+
'Please complete the code given below. \n{context}Next line of code:\n',
|
| 1119 |
+
role='HUMAN'),
|
| 1120 |
+
]),
|
| 1121 |
+
type=
|
| 1122 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1123 |
+
retriever=dict(
|
| 1124 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1125 |
+
name='lcc',
|
| 1126 |
+
path='opencompass/Longbench',
|
| 1127 |
+
reader_cfg=dict(
|
| 1128 |
+
input_columns=[
|
| 1129 |
+
'context',
|
| 1130 |
+
],
|
| 1131 |
+
output_column='answers',
|
| 1132 |
+
test_range='[0:63]',
|
| 1133 |
+
test_split='test',
|
| 1134 |
+
train_split='test'),
|
| 1135 |
+
type='opencompass.datasets.LongBenchlccDataset'),
|
| 1136 |
+
dict(
|
| 1137 |
+
abbr='LongBench_repobench-p_0',
|
| 1138 |
+
eval_cfg=dict(
|
| 1139 |
+
evaluator=dict(
|
| 1140 |
+
type='opencompass.datasets.LongBenchCodeSimEvaluator'),
|
| 1141 |
+
pred_role='BOT'),
|
| 1142 |
+
infer_cfg=dict(
|
| 1143 |
+
inferencer=dict(
|
| 1144 |
+
max_out_len=64,
|
| 1145 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1146 |
+
prompt_template=dict(
|
| 1147 |
+
template=dict(round=[
|
| 1148 |
+
dict(
|
| 1149 |
+
prompt=
|
| 1150 |
+
'Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 1151 |
+
role='HUMAN'),
|
| 1152 |
+
]),
|
| 1153 |
+
type=
|
| 1154 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1155 |
+
retriever=dict(
|
| 1156 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1157 |
+
name='repobench-p',
|
| 1158 |
+
path='opencompass/Longbench',
|
| 1159 |
+
reader_cfg=dict(
|
| 1160 |
+
input_columns=[
|
| 1161 |
+
'context',
|
| 1162 |
+
'input',
|
| 1163 |
+
],
|
| 1164 |
+
output_column='answers',
|
| 1165 |
+
test_range='[0:63]',
|
| 1166 |
+
test_split='test',
|
| 1167 |
+
train_split='test'),
|
| 1168 |
+
type='opencompass.datasets.LongBenchrepobenchDataset'),
|
| 1169 |
+
dict(
|
| 1170 |
+
abbr='LongBench_passage_retrieval_en_0',
|
| 1171 |
+
eval_cfg=dict(
|
| 1172 |
+
evaluator=dict(
|
| 1173 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 1174 |
+
pred_role='BOT'),
|
| 1175 |
+
infer_cfg=dict(
|
| 1176 |
+
inferencer=dict(
|
| 1177 |
+
max_out_len=32,
|
| 1178 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1179 |
+
prompt_template=dict(
|
| 1180 |
+
template=dict(round=[
|
| 1181 |
+
dict(
|
| 1182 |
+
prompt=
|
| 1183 |
+
'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 1184 |
+
role='HUMAN'),
|
| 1185 |
+
]),
|
| 1186 |
+
type=
|
| 1187 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1188 |
+
retriever=dict(
|
| 1189 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1190 |
+
name='passage_retrieval_en',
|
| 1191 |
+
path='opencompass/Longbench',
|
| 1192 |
+
reader_cfg=dict(
|
| 1193 |
+
input_columns=[
|
| 1194 |
+
'context',
|
| 1195 |
+
'input',
|
| 1196 |
+
],
|
| 1197 |
+
output_column='answers',
|
| 1198 |
+
test_range='[0:25]',
|
| 1199 |
+
test_split='test',
|
| 1200 |
+
train_split='test'),
|
| 1201 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'),
|
| 1202 |
+
dict(
|
| 1203 |
+
abbr='LongBench_passage_retrieval_zh_0',
|
| 1204 |
+
eval_cfg=dict(
|
| 1205 |
+
evaluator=dict(
|
| 1206 |
+
language='zh',
|
| 1207 |
+
type='opencompass.datasets.LongBenchRetrievalEvaluator'),
|
| 1208 |
+
pred_role='BOT'),
|
| 1209 |
+
infer_cfg=dict(
|
| 1210 |
+
inferencer=dict(
|
| 1211 |
+
max_out_len=32,
|
| 1212 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1213 |
+
prompt_template=dict(
|
| 1214 |
+
template=dict(round=[
|
| 1215 |
+
dict(
|
| 1216 |
+
prompt=
|
| 1217 |
+
'以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 1218 |
+
role='HUMAN'),
|
| 1219 |
+
]),
|
| 1220 |
+
type=
|
| 1221 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1222 |
+
retriever=dict(
|
| 1223 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1224 |
+
name='passage_retrieval_zh',
|
| 1225 |
+
path='opencompass/Longbench',
|
| 1226 |
+
reader_cfg=dict(
|
| 1227 |
+
input_columns=[
|
| 1228 |
+
'context',
|
| 1229 |
+
'input',
|
| 1230 |
+
],
|
| 1231 |
+
output_column='answers',
|
| 1232 |
+
test_range='[0:25]',
|
| 1233 |
+
test_split='test',
|
| 1234 |
+
train_split='test'),
|
| 1235 |
+
type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'),
|
| 1236 |
+
dict(
|
| 1237 |
+
abbr='LongBench_passage_count_0',
|
| 1238 |
+
eval_cfg=dict(
|
| 1239 |
+
evaluator=dict(
|
| 1240 |
+
type='opencompass.datasets.LongBenchCountEvaluator'),
|
| 1241 |
+
pred_role='BOT'),
|
| 1242 |
+
infer_cfg=dict(
|
| 1243 |
+
inferencer=dict(
|
| 1244 |
+
max_out_len=32,
|
| 1245 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1246 |
+
prompt_template=dict(
|
| 1247 |
+
template=dict(round=[
|
| 1248 |
+
dict(
|
| 1249 |
+
prompt=
|
| 1250 |
+
'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ',
|
| 1251 |
+
role='HUMAN'),
|
| 1252 |
+
]),
|
| 1253 |
+
type=
|
| 1254 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1255 |
+
retriever=dict(
|
| 1256 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1257 |
+
name='passage_count',
|
| 1258 |
+
path='opencompass/Longbench',
|
| 1259 |
+
reader_cfg=dict(
|
| 1260 |
+
input_columns=[
|
| 1261 |
+
'context',
|
| 1262 |
+
'input',
|
| 1263 |
+
],
|
| 1264 |
+
output_column='answers',
|
| 1265 |
+
test_range='[0:25]',
|
| 1266 |
+
test_split='test',
|
| 1267 |
+
train_split='test'),
|
| 1268 |
+
type='opencompass.datasets.LongBenchpassage_countDataset'),
|
| 1269 |
+
dict(
|
| 1270 |
+
abbr='LongBench_trec_0',
|
| 1271 |
+
eval_cfg=dict(
|
| 1272 |
+
evaluator=dict(
|
| 1273 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 1274 |
+
),
|
| 1275 |
+
pred_postprocessor=dict(
|
| 1276 |
+
type='opencompass.datasets.trec_postprocess'),
|
| 1277 |
+
pred_role='BOT'),
|
| 1278 |
+
infer_cfg=dict(
|
| 1279 |
+
inferencer=dict(
|
| 1280 |
+
max_out_len=64,
|
| 1281 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1282 |
+
prompt_template=dict(
|
| 1283 |
+
template=dict(round=[
|
| 1284 |
+
dict(
|
| 1285 |
+
prompt=
|
| 1286 |
+
'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 1287 |
+
role='HUMAN'),
|
| 1288 |
+
]),
|
| 1289 |
+
type=
|
| 1290 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1291 |
+
retriever=dict(
|
| 1292 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1293 |
+
name='trec',
|
| 1294 |
+
path='opencompass/Longbench',
|
| 1295 |
+
reader_cfg=dict(
|
| 1296 |
+
input_columns=[
|
| 1297 |
+
'context',
|
| 1298 |
+
'input',
|
| 1299 |
+
],
|
| 1300 |
+
output_column='all_labels',
|
| 1301 |
+
test_range='[0:25]',
|
| 1302 |
+
test_split='test',
|
| 1303 |
+
train_split='test'),
|
| 1304 |
+
type='opencompass.datasets.LongBenchtrecDataset'),
|
| 1305 |
+
dict(
|
| 1306 |
+
abbr='LongBench_lsht_0',
|
| 1307 |
+
eval_cfg=dict(
|
| 1308 |
+
evaluator=dict(
|
| 1309 |
+
type='opencompass.datasets.LongBenchClassificationEvaluator'
|
| 1310 |
+
),
|
| 1311 |
+
pred_postprocessor=dict(
|
| 1312 |
+
type='opencompass.datasets.lsht_postprocess'),
|
| 1313 |
+
pred_role='BOT'),
|
| 1314 |
+
infer_cfg=dict(
|
| 1315 |
+
inferencer=dict(
|
| 1316 |
+
max_out_len=64,
|
| 1317 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1318 |
+
prompt_template=dict(
|
| 1319 |
+
template=dict(round=[
|
| 1320 |
+
dict(
|
| 1321 |
+
prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}',
|
| 1322 |
+
role='HUMAN'),
|
| 1323 |
+
]),
|
| 1324 |
+
type=
|
| 1325 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1326 |
+
retriever=dict(
|
| 1327 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1328 |
+
name='lsht',
|
| 1329 |
+
path='opencompass/Longbench',
|
| 1330 |
+
reader_cfg=dict(
|
| 1331 |
+
input_columns=[
|
| 1332 |
+
'context',
|
| 1333 |
+
'input',
|
| 1334 |
+
],
|
| 1335 |
+
output_column='all_labels',
|
| 1336 |
+
test_range='[0:25]',
|
| 1337 |
+
test_split='test',
|
| 1338 |
+
train_split='test'),
|
| 1339 |
+
type='opencompass.datasets.LongBenchlshtDataset'),
|
| 1340 |
+
dict(
|
| 1341 |
+
abbr='LongBench_multi_news_0',
|
| 1342 |
+
eval_cfg=dict(
|
| 1343 |
+
evaluator=dict(
|
| 1344 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1345 |
+
pred_role='BOT'),
|
| 1346 |
+
infer_cfg=dict(
|
| 1347 |
+
inferencer=dict(
|
| 1348 |
+
max_out_len=512,
|
| 1349 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1350 |
+
prompt_template=dict(
|
| 1351 |
+
template=dict(round=[
|
| 1352 |
+
dict(
|
| 1353 |
+
prompt=
|
| 1354 |
+
'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
|
| 1355 |
+
role='HUMAN'),
|
| 1356 |
+
]),
|
| 1357 |
+
type=
|
| 1358 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1359 |
+
retriever=dict(
|
| 1360 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1361 |
+
name='multi_news',
|
| 1362 |
+
path='opencompass/Longbench',
|
| 1363 |
+
reader_cfg=dict(
|
| 1364 |
+
input_columns=[
|
| 1365 |
+
'context',
|
| 1366 |
+
],
|
| 1367 |
+
output_column='answers',
|
| 1368 |
+
test_range='[0:25]',
|
| 1369 |
+
test_split='test',
|
| 1370 |
+
train_split='test'),
|
| 1371 |
+
type='opencompass.datasets.LongBenchmulti_newsDataset'),
|
| 1372 |
+
dict(
|
| 1373 |
+
abbr='LongBench_samsum_0',
|
| 1374 |
+
eval_cfg=dict(
|
| 1375 |
+
evaluator=dict(
|
| 1376 |
+
type='opencompass.datasets.LongBenchRougeEvaluator'),
|
| 1377 |
+
pred_postprocessor=dict(
|
| 1378 |
+
type='opencompass.datasets.samsum_postprocess'),
|
| 1379 |
+
pred_role='BOT'),
|
| 1380 |
+
infer_cfg=dict(
|
| 1381 |
+
inferencer=dict(
|
| 1382 |
+
max_out_len=128,
|
| 1383 |
+
type='opencompass.openicl.icl_inferencer.GenInferencer'),
|
| 1384 |
+
prompt_template=dict(
|
| 1385 |
+
template=dict(round=[
|
| 1386 |
+
dict(
|
| 1387 |
+
prompt=
|
| 1388 |
+
'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 1389 |
+
role='HUMAN'),
|
| 1390 |
+
]),
|
| 1391 |
+
type=
|
| 1392 |
+
'opencompass.openicl.icl_prompt_template.PromptTemplate'),
|
| 1393 |
+
retriever=dict(
|
| 1394 |
+
type='opencompass.openicl.icl_retriever.ZeroRetriever')),
|
| 1395 |
+
name='samsum',
|
| 1396 |
+
path='opencompass/Longbench',
|
| 1397 |
+
reader_cfg=dict(
|
| 1398 |
+
input_columns=[
|
| 1399 |
+
'context',
|
| 1400 |
+
'input',
|
| 1401 |
+
],
|
| 1402 |
+
output_column='answers',
|
| 1403 |
+
test_range='[0:25]',
|
| 1404 |
+
test_split='test',
|
| 1405 |
+
train_split='test'),
|
| 1406 |
+
type='opencompass.datasets.LongBenchsamsumDataset'),
|
| 1407 |
+
],
|
| 1408 |
+
]
|
| 1409 |
+
models = [
|
| 1410 |
+
dict(
|
| 1411 |
+
abbr='delta_net',
|
| 1412 |
+
batch_size=128,
|
| 1413 |
+
max_seq_len=2048,
|
| 1414 |
+
model_kwargs=dict(
|
| 1415 |
+
device_map='auto',
|
| 1416 |
+
torch_dtype='torch.bfloat16',
|
| 1417 |
+
trust_remote_code=True),
|
| 1418 |
+
path='/mnt/jfzn/msj/delta_net-1.3B-100B',
|
| 1419 |
+
run_cfg=dict(num_gpus=1),
|
| 1420 |
+
tokenizer_kwargs=dict(padding_side='left', truncation_side='left'),
|
| 1421 |
+
tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B',
|
| 1422 |
+
type='opencompass.models.HuggingFaceBaseModel'),
|
| 1423 |
+
]
|
| 1424 |
+
work_dir = 'outputs/default/20251127_221150'
|