Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- build/lib/opencompass/configs/datasets/longbench/longbenchpassage_retrieval_en/longbench_passage_retrieval_en_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchpassage_retrieval_en/longbench_passage_retrieval_en_gen_734db5.py +46 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchpassage_retrieval_zh/longbench_passage_retrieval_zh_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchpassage_retrieval_zh/longbench_passage_retrieval_zh_gen_01cca2.py +46 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchqasper/longbench_qasper_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchqasper/longbench_qasper_gen_6b3efc.py +43 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchqmsum/longbench_qmsum_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchqmsum/longbench_qmsum_gen_d33331.py +43 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchrepobench/longbench_repobench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchrepobench/longbench_repobench_gen_6df953.py +43 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchsamsum/longbench_samsum_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchsamsum/longbench_samsum_gen_f4416d.py +49 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchtrec/longbench_trec_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchtrec/longbench_trec_gen_824187.py +49 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchtriviaqa/longbench_triviaqa_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchtriviaqa/longbench_triviaqa_gen_d30cb9.py +49 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchvcsum/longbench_vcsum_gen.py +4 -0
- build/lib/opencompass/configs/datasets/longbench/longbenchvcsum/longbench_vcsum_gen_f7a8ac.py +43 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalcmrc_mixup/lveval_cmrc_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalcmrc_mixup/lveval_cmrc_mixup_gen_465823.py +54 -0
- build/lib/opencompass/configs/datasets/lveval/lvevaldureader_mixup/lveval_dureader_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevaldureader_mixup/lveval_dureader_mixup_gen_465823.py +55 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalfactrecall_en/lveval_factrecall_en_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalfactrecall_en/lveval_factrecall_en_gen_9a836f.py +54 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalfactrecall_zh/lveval_factrecall_zh_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalfactrecall_zh/lveval_factrecall_zh_gen_dbee70.py +54 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalhotpotwikiqa_mixup/lveval_hotpotwikiqa_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalhotpotwikiqa_mixup/lveval_hotpotwikiqa_mixup_gen_77ce82.py +59 -0
- build/lib/opencompass/configs/datasets/lveval/lvevallic_mixup/lveval_lic_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevallic_mixup/lveval_lic_mixup_gen_01eb0c.py +54 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalloogle_CR_mixup/lveval_loogle_CR_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalloogle_CR_mixup/lveval_loogle_CR_mixup_gen_d7ea36.py +54 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalloogle_MIR_mixup/lveval_loogle_MIR_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalloogle_MIR_mixup/lveval_loogle_MIR_mixup_gen_d7ea36.py +54 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalloogle_SD_mixup/lveval_loogle_SD_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalloogle_SD_mixup/lveval_loogle_SD_mixup_gen_d7ea36.py +54 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalmultifieldqa_en_mixup/lveval_multifieldqa_en_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalmultifieldqa_en_mixup/lveval_multifieldqa_en_mixup_gen_d7ea36.py +59 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalmultifieldqa_zh_mixup/lveval_multifieldqa_zh_mixup_gen.py +6 -0
- build/lib/opencompass/configs/datasets/lveval/lvevalmultifieldqa_zh_mixup/lveval_multifieldqa_zh_mixup_gen_0fbdad.py +59 -0
- build/lib/opencompass/configs/datasets/needlebench/atc/atc.py +110 -0
- build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice.py +136 -0
- build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_20.py +133 -0
- build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_50.py +43 -0
- build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py +125 -0
- build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_80.py +43 -0
- build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py +97 -0
- build/lib/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_1000k.py +18 -0
- build/lib/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py +288 -0
- build/lib/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py +108 -0
build/lib/opencompass/configs/datasets/longbench/longbenchpassage_retrieval_en/longbench_passage_retrieval_en_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_passage_retrieval_en_gen_734db5 import LongBench_passage_retrieval_en_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchpassage_retrieval_en/longbench_passage_retrieval_en_gen_734db5.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LongBenchRetrievalEvaluator,
|
| 6 |
+
LongBenchpassage_retrieval_enDataset,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
LongBench_passage_retrieval_en_reader_cfg = dict(
|
| 10 |
+
input_columns=['context', 'input'],
|
| 11 |
+
output_column='answers',
|
| 12 |
+
train_split='test',
|
| 13 |
+
test_split='test',
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
LongBench_passage_retrieval_en_infer_cfg = dict(
|
| 17 |
+
prompt_template=dict(
|
| 18 |
+
type=PromptTemplate,
|
| 19 |
+
template=dict(
|
| 20 |
+
round=[
|
| 21 |
+
dict(
|
| 22 |
+
role='HUMAN',
|
| 23 |
+
prompt='Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
|
| 24 |
+
),
|
| 25 |
+
],
|
| 26 |
+
),
|
| 27 |
+
),
|
| 28 |
+
retriever=dict(type=ZeroRetriever),
|
| 29 |
+
inferencer=dict(type=GenInferencer, max_out_len=32),
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
LongBench_passage_retrieval_en_eval_cfg = dict(
|
| 33 |
+
evaluator=dict(type=LongBenchRetrievalEvaluator), pred_role='BOT'
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
LongBench_passage_retrieval_en_datasets = [
|
| 37 |
+
dict(
|
| 38 |
+
type=LongBenchpassage_retrieval_enDataset,
|
| 39 |
+
abbr='LongBench_passage_retrieval_en',
|
| 40 |
+
path='opencompass/Longbench',
|
| 41 |
+
name='passage_retrieval_en',
|
| 42 |
+
reader_cfg=LongBench_passage_retrieval_en_reader_cfg,
|
| 43 |
+
infer_cfg=LongBench_passage_retrieval_en_infer_cfg,
|
| 44 |
+
eval_cfg=LongBench_passage_retrieval_en_eval_cfg,
|
| 45 |
+
)
|
| 46 |
+
]
|
build/lib/opencompass/configs/datasets/longbench/longbenchpassage_retrieval_zh/longbench_passage_retrieval_zh_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_passage_retrieval_zh_gen_01cca2 import LongBench_passage_retrieval_zh_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchpassage_retrieval_zh/longbench_passage_retrieval_zh_gen_01cca2.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LongBenchRetrievalEvaluator,
|
| 6 |
+
LongBenchpassage_retrieval_zhDataset,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
LongBench_passage_retrieval_zh_reader_cfg = dict(
|
| 10 |
+
input_columns=['context', 'input'],
|
| 11 |
+
output_column='answers',
|
| 12 |
+
train_split='test',
|
| 13 |
+
test_split='test',
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
LongBench_passage_retrieval_zh_infer_cfg = dict(
|
| 17 |
+
prompt_template=dict(
|
| 18 |
+
type=PromptTemplate,
|
| 19 |
+
template=dict(
|
| 20 |
+
round=[
|
| 21 |
+
dict(
|
| 22 |
+
role='HUMAN',
|
| 23 |
+
prompt='以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
|
| 24 |
+
),
|
| 25 |
+
],
|
| 26 |
+
),
|
| 27 |
+
),
|
| 28 |
+
retriever=dict(type=ZeroRetriever),
|
| 29 |
+
inferencer=dict(type=GenInferencer, max_out_len=32),
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
LongBench_passage_retrieval_zh_eval_cfg = dict(
|
| 33 |
+
evaluator=dict(type=LongBenchRetrievalEvaluator, language='zh'), pred_role='BOT'
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
LongBench_passage_retrieval_zh_datasets = [
|
| 37 |
+
dict(
|
| 38 |
+
type=LongBenchpassage_retrieval_zhDataset,
|
| 39 |
+
abbr='LongBench_passage_retrieval_zh',
|
| 40 |
+
path='opencompass/Longbench',
|
| 41 |
+
name='passage_retrieval_zh',
|
| 42 |
+
reader_cfg=LongBench_passage_retrieval_zh_reader_cfg,
|
| 43 |
+
infer_cfg=LongBench_passage_retrieval_zh_infer_cfg,
|
| 44 |
+
eval_cfg=LongBench_passage_retrieval_zh_eval_cfg,
|
| 45 |
+
)
|
| 46 |
+
]
|
build/lib/opencompass/configs/datasets/longbench/longbenchqasper/longbench_qasper_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_qasper_gen_6b3efc import LongBench_qasper_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchqasper/longbench_qasper_gen_6b3efc.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import LongBenchF1Evaluator, LongBenchqasperDataset
|
| 5 |
+
|
| 6 |
+
LongBench_qasper_reader_cfg = dict(
|
| 7 |
+
input_columns=['context', 'input'],
|
| 8 |
+
output_column='answers',
|
| 9 |
+
train_split='test',
|
| 10 |
+
test_split='test',
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
LongBench_qasper_infer_cfg = dict(
|
| 14 |
+
prompt_template=dict(
|
| 15 |
+
type=PromptTemplate,
|
| 16 |
+
template=dict(
|
| 17 |
+
round=[
|
| 18 |
+
dict(
|
| 19 |
+
role='HUMAN',
|
| 20 |
+
prompt='Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:',
|
| 21 |
+
),
|
| 22 |
+
],
|
| 23 |
+
),
|
| 24 |
+
),
|
| 25 |
+
retriever=dict(type=ZeroRetriever),
|
| 26 |
+
inferencer=dict(type=GenInferencer, max_out_len=32),
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
LongBench_qasper_eval_cfg = dict(
|
| 30 |
+
evaluator=dict(type=LongBenchF1Evaluator), pred_role='BOT'
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
LongBench_qasper_datasets = [
|
| 34 |
+
dict(
|
| 35 |
+
type=LongBenchqasperDataset,
|
| 36 |
+
abbr='LongBench_qasper',
|
| 37 |
+
path='opencompass/Longbench',
|
| 38 |
+
name='qasper',
|
| 39 |
+
reader_cfg=LongBench_qasper_reader_cfg,
|
| 40 |
+
infer_cfg=LongBench_qasper_infer_cfg,
|
| 41 |
+
eval_cfg=LongBench_qasper_eval_cfg,
|
| 42 |
+
)
|
| 43 |
+
]
|
build/lib/opencompass/configs/datasets/longbench/longbenchqmsum/longbench_qmsum_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_qmsum_gen_d33331 import LongBench_qmsum_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchqmsum/longbench_qmsum_gen_d33331.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import LongBenchRougeEvaluator, LongBenchqmsumDataset
|
| 5 |
+
|
| 6 |
+
LongBench_qmsum_reader_cfg = dict(
|
| 7 |
+
input_columns=['context', 'input'],
|
| 8 |
+
output_column='answers',
|
| 9 |
+
train_split='test',
|
| 10 |
+
test_split='test',
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
LongBench_qmsum_infer_cfg = dict(
|
| 14 |
+
prompt_template=dict(
|
| 15 |
+
type=PromptTemplate,
|
| 16 |
+
template=dict(
|
| 17 |
+
round=[
|
| 18 |
+
dict(
|
| 19 |
+
role='HUMAN',
|
| 20 |
+
prompt='You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:',
|
| 21 |
+
),
|
| 22 |
+
],
|
| 23 |
+
),
|
| 24 |
+
),
|
| 25 |
+
retriever=dict(type=ZeroRetriever),
|
| 26 |
+
inferencer=dict(type=GenInferencer, max_out_len=512),
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
LongBench_qmsum_eval_cfg = dict(
|
| 30 |
+
evaluator=dict(type=LongBenchRougeEvaluator), pred_role='BOT'
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
LongBench_qmsum_datasets = [
|
| 34 |
+
dict(
|
| 35 |
+
type=LongBenchqmsumDataset,
|
| 36 |
+
abbr='LongBench_qmsum',
|
| 37 |
+
path='opencompass/Longbench',
|
| 38 |
+
name='qmsum',
|
| 39 |
+
reader_cfg=LongBench_qmsum_reader_cfg,
|
| 40 |
+
infer_cfg=LongBench_qmsum_infer_cfg,
|
| 41 |
+
eval_cfg=LongBench_qmsum_eval_cfg,
|
| 42 |
+
)
|
| 43 |
+
]
|
build/lib/opencompass/configs/datasets/longbench/longbenchrepobench/longbench_repobench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_repobench_gen_6df953 import LongBench_repobench_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchrepobench/longbench_repobench_gen_6df953.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import LongBenchCodeSimEvaluator, LongBenchrepobenchDataset
|
| 5 |
+
|
| 6 |
+
LongBench_repobench_reader_cfg = dict(
|
| 7 |
+
input_columns=['context', 'input'],
|
| 8 |
+
output_column='answers',
|
| 9 |
+
train_split='test',
|
| 10 |
+
test_split='test',
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
LongBench_repobench_infer_cfg = dict(
|
| 14 |
+
prompt_template=dict(
|
| 15 |
+
type=PromptTemplate,
|
| 16 |
+
template=dict(
|
| 17 |
+
round=[
|
| 18 |
+
dict(
|
| 19 |
+
role='HUMAN',
|
| 20 |
+
prompt='Please complete the code given below. \n{context}{input}Next line of code:\n',
|
| 21 |
+
),
|
| 22 |
+
],
|
| 23 |
+
),
|
| 24 |
+
),
|
| 25 |
+
retriever=dict(type=ZeroRetriever),
|
| 26 |
+
inferencer=dict(type=GenInferencer, max_out_len=64),
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
LongBench_repobench_eval_cfg = dict(
|
| 30 |
+
evaluator=dict(type=LongBenchCodeSimEvaluator), pred_role='BOT'
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
LongBench_repobench_datasets = [
|
| 34 |
+
dict(
|
| 35 |
+
type=LongBenchrepobenchDataset,
|
| 36 |
+
abbr='LongBench_repobench-p',
|
| 37 |
+
path='opencompass/Longbench',
|
| 38 |
+
name='repobench-p',
|
| 39 |
+
reader_cfg=LongBench_repobench_reader_cfg,
|
| 40 |
+
infer_cfg=LongBench_repobench_infer_cfg,
|
| 41 |
+
eval_cfg=LongBench_repobench_eval_cfg,
|
| 42 |
+
)
|
| 43 |
+
]
|
build/lib/opencompass/configs/datasets/longbench/longbenchsamsum/longbench_samsum_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_samsum_gen_f4416d import LongBench_samsum_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchsamsum/longbench_samsum_gen_f4416d.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LongBenchRougeEvaluator,
|
| 6 |
+
LongBenchsamsumDataset,
|
| 7 |
+
samsum_postprocess,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
LongBench_samsum_reader_cfg = dict(
|
| 11 |
+
input_columns=['context', 'input'],
|
| 12 |
+
output_column='answers',
|
| 13 |
+
train_split='test',
|
| 14 |
+
test_split='test',
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
LongBench_samsum_infer_cfg = dict(
|
| 18 |
+
prompt_template=dict(
|
| 19 |
+
type=PromptTemplate,
|
| 20 |
+
template=dict(
|
| 21 |
+
round=[
|
| 22 |
+
dict(
|
| 23 |
+
role='HUMAN',
|
| 24 |
+
prompt='Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
|
| 25 |
+
),
|
| 26 |
+
],
|
| 27 |
+
),
|
| 28 |
+
),
|
| 29 |
+
retriever=dict(type=ZeroRetriever),
|
| 30 |
+
inferencer=dict(type=GenInferencer, max_out_len=128),
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
LongBench_samsum_eval_cfg = dict(
|
| 34 |
+
evaluator=dict(type=LongBenchRougeEvaluator),
|
| 35 |
+
pred_role='BOT',
|
| 36 |
+
pred_postprocessor=dict(type=samsum_postprocess),
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
LongBench_samsum_datasets = [
|
| 40 |
+
dict(
|
| 41 |
+
type=LongBenchsamsumDataset,
|
| 42 |
+
abbr='LongBench_samsum',
|
| 43 |
+
path='opencompass/Longbench',
|
| 44 |
+
name='samsum',
|
| 45 |
+
reader_cfg=LongBench_samsum_reader_cfg,
|
| 46 |
+
infer_cfg=LongBench_samsum_infer_cfg,
|
| 47 |
+
eval_cfg=LongBench_samsum_eval_cfg,
|
| 48 |
+
)
|
| 49 |
+
]
|
build/lib/opencompass/configs/datasets/longbench/longbenchtrec/longbench_trec_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_trec_gen_824187 import LongBench_trec_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchtrec/longbench_trec_gen_824187.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LongBenchClassificationEvaluator,
|
| 6 |
+
LongBenchtrecDataset,
|
| 7 |
+
trec_postprocess,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
LongBench_trec_reader_cfg = dict(
|
| 11 |
+
input_columns=['context', 'input'],
|
| 12 |
+
output_column='all_labels',
|
| 13 |
+
train_split='test',
|
| 14 |
+
test_split='test',
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
LongBench_trec_infer_cfg = dict(
|
| 18 |
+
prompt_template=dict(
|
| 19 |
+
type=PromptTemplate,
|
| 20 |
+
template=dict(
|
| 21 |
+
round=[
|
| 22 |
+
dict(
|
| 23 |
+
role='HUMAN',
|
| 24 |
+
prompt='Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}',
|
| 25 |
+
),
|
| 26 |
+
],
|
| 27 |
+
),
|
| 28 |
+
),
|
| 29 |
+
retriever=dict(type=ZeroRetriever),
|
| 30 |
+
inferencer=dict(type=GenInferencer, max_out_len=64),
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
LongBench_trec_eval_cfg = dict(
|
| 34 |
+
evaluator=dict(type=LongBenchClassificationEvaluator),
|
| 35 |
+
pred_role='BOT',
|
| 36 |
+
pred_postprocessor=dict(type=trec_postprocess),
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
LongBench_trec_datasets = [
|
| 40 |
+
dict(
|
| 41 |
+
type=LongBenchtrecDataset,
|
| 42 |
+
abbr='LongBench_trec',
|
| 43 |
+
path='opencompass/Longbench',
|
| 44 |
+
name='trec',
|
| 45 |
+
reader_cfg=LongBench_trec_reader_cfg,
|
| 46 |
+
infer_cfg=LongBench_trec_infer_cfg,
|
| 47 |
+
eval_cfg=LongBench_trec_eval_cfg,
|
| 48 |
+
)
|
| 49 |
+
]
|
build/lib/opencompass/configs/datasets/longbench/longbenchtriviaqa/longbench_triviaqa_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_triviaqa_gen_d30cb9 import LongBench_triviaqa_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchtriviaqa/longbench_triviaqa_gen_d30cb9.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import (
|
| 5 |
+
LongBenchF1Evaluator,
|
| 6 |
+
LongBenchtriviaqaDataset,
|
| 7 |
+
triviaqa_postprocess,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
LongBench_triviaqa_reader_cfg = dict(
|
| 11 |
+
input_columns=['context', 'input'],
|
| 12 |
+
output_column='answers',
|
| 13 |
+
train_split='test',
|
| 14 |
+
test_split='test',
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
LongBench_triviaqa_infer_cfg = dict(
|
| 18 |
+
prompt_template=dict(
|
| 19 |
+
type=PromptTemplate,
|
| 20 |
+
template=dict(
|
| 21 |
+
round=[
|
| 22 |
+
dict(
|
| 23 |
+
role='HUMAN',
|
| 24 |
+
prompt='Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}',
|
| 25 |
+
),
|
| 26 |
+
],
|
| 27 |
+
),
|
| 28 |
+
),
|
| 29 |
+
retriever=dict(type=ZeroRetriever),
|
| 30 |
+
inferencer=dict(type=GenInferencer, max_out_len=32),
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
LongBench_triviaqa_eval_cfg = dict(
|
| 34 |
+
evaluator=dict(type=LongBenchF1Evaluator),
|
| 35 |
+
pred_role='BOT',
|
| 36 |
+
pred_postprocessor=dict(type=triviaqa_postprocess),
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
LongBench_triviaqa_datasets = [
|
| 40 |
+
dict(
|
| 41 |
+
type=LongBenchtriviaqaDataset,
|
| 42 |
+
abbr='LongBench_triviaqa',
|
| 43 |
+
path='opencompass/Longbench',
|
| 44 |
+
name='triviaqa',
|
| 45 |
+
reader_cfg=LongBench_triviaqa_reader_cfg,
|
| 46 |
+
infer_cfg=LongBench_triviaqa_infer_cfg,
|
| 47 |
+
eval_cfg=LongBench_triviaqa_eval_cfg,
|
| 48 |
+
)
|
| 49 |
+
]
|
build/lib/opencompass/configs/datasets/longbench/longbenchvcsum/longbench_vcsum_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .longbench_vcsum_gen_f7a8ac import LongBench_vcsum_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/longbench/longbenchvcsum/longbench_vcsum_gen_f7a8ac.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import LongBenchRougeEvaluator, LongBenchvcsumDataset
|
| 5 |
+
|
| 6 |
+
LongBench_vcsum_reader_cfg = dict(
|
| 7 |
+
input_columns=['context'],
|
| 8 |
+
output_column='answers',
|
| 9 |
+
train_split='test',
|
| 10 |
+
test_split='test',
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
LongBench_vcsum_infer_cfg = dict(
|
| 14 |
+
prompt_template=dict(
|
| 15 |
+
type=PromptTemplate,
|
| 16 |
+
template=dict(
|
| 17 |
+
round=[
|
| 18 |
+
dict(
|
| 19 |
+
role='HUMAN',
|
| 20 |
+
prompt='下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:',
|
| 21 |
+
),
|
| 22 |
+
],
|
| 23 |
+
),
|
| 24 |
+
),
|
| 25 |
+
retriever=dict(type=ZeroRetriever),
|
| 26 |
+
inferencer=dict(type=GenInferencer, max_out_len=512),
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
LongBench_vcsum_eval_cfg = dict(
|
| 30 |
+
evaluator=dict(type=LongBenchRougeEvaluator, language='zh'), pred_role='BOT'
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
LongBench_vcsum_datasets = [
|
| 34 |
+
dict(
|
| 35 |
+
type=LongBenchvcsumDataset,
|
| 36 |
+
abbr='LongBench_vcsum',
|
| 37 |
+
path='opencompass/Longbench',
|
| 38 |
+
name='vcsum',
|
| 39 |
+
reader_cfg=LongBench_vcsum_reader_cfg,
|
| 40 |
+
infer_cfg=LongBench_vcsum_infer_cfg,
|
| 41 |
+
eval_cfg=LongBench_vcsum_eval_cfg,
|
| 42 |
+
)
|
| 43 |
+
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalcmrc_mixup/lveval_cmrc_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .lveval_cmrc_mixup_gen_465823 import (
|
| 5 |
+
LVEval_cmrc_mixup_datasets,
|
| 6 |
+
) # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalcmrc_mixup/lveval_cmrc_mixup_gen_465823.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import LVEvalOPTF1Evaluator, LVEvalcmrcDataset
|
| 5 |
+
|
| 6 |
+
LVEval_cmrc_mixup_reader_cfg = dict(
|
| 7 |
+
input_columns=['context', 'input'],
|
| 8 |
+
output_column='answers',
|
| 9 |
+
train_split='test',
|
| 10 |
+
test_split='test',
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
LVEval_cmrc_mixup_infer_cfg = dict(
|
| 14 |
+
prompt_template=dict(
|
| 15 |
+
type=PromptTemplate,
|
| 16 |
+
template=dict(
|
| 17 |
+
round=[
|
| 18 |
+
dict(
|
| 19 |
+
role='HUMAN',
|
| 20 |
+
prompt='请根据下面给定的文章回答问题,问题和答案只与其中一篇文章有关。\n\n文章:{context}\n\n现在请基于上述文章回答下面的问题,问题和答案只与其中一篇文章有关。\n\n问题:{input}\n回答:',
|
| 21 |
+
),
|
| 22 |
+
],
|
| 23 |
+
),
|
| 24 |
+
),
|
| 25 |
+
retriever=dict(type=ZeroRetriever),
|
| 26 |
+
inferencer=dict(type=GenInferencer, max_out_len=64),
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
LVEval_cmrc_mixup_eval_cfg = dict(
|
| 30 |
+
evaluator=dict(type=LVEvalOPTF1Evaluator, language='zh'), pred_role='BOT'
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def get_dataset_names(dataset_name, length_levels):
|
| 37 |
+
datasets = []
|
| 38 |
+
for length in length_levels:
|
| 39 |
+
datasets.append(f'{dataset_name}_{length}')
|
| 40 |
+
return datasets
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
LVEval_cmrc_mixup_datasets = [
|
| 44 |
+
dict(
|
| 45 |
+
type=LVEvalcmrcDataset,
|
| 46 |
+
abbr='LVEval_' + name_len,
|
| 47 |
+
path='Infinigence/LVEval',
|
| 48 |
+
name=name_len,
|
| 49 |
+
reader_cfg=LVEval_cmrc_mixup_reader_cfg,
|
| 50 |
+
infer_cfg=LVEval_cmrc_mixup_infer_cfg,
|
| 51 |
+
eval_cfg=LVEval_cmrc_mixup_eval_cfg,
|
| 52 |
+
)
|
| 53 |
+
for name_len in get_dataset_names('cmrc_mixup', DATASET_LENGTH_LEVEL)
|
| 54 |
+
]
|
build/lib/opencompass/configs/datasets/lveval/lvevaldureader_mixup/lveval_dureader_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .lveval_dureader_mixup_gen_465823 import (
|
| 5 |
+
LVEval_dureader_mixup_datasets,
|
| 6 |
+
) # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevaldureader_mixup/lveval_dureader_mixup_gen_465823.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval dureader_mixup (zh long-context QA) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LVEvalOPTRougeEvaluator, LVEvaldureaderDataset

LVEval_dureader_mixup_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_dureader_mixup_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='请根据下面给定的文章回答问题,问题和答案只与其中一篇文章有关。\n\n文章:{context}\n\n现在请基于上述文章回答下面的问题,问题和答案只与其中一篇文章有关。\n\n问题:{input}\n回答:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64),
)

LVEval_dureader_mixup_eval_cfg = dict(
    evaluator=dict(type=LVEvalOPTRougeEvaluator, language='zh'),
    pred_role='BOT',
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_dureader_mixup_datasets = [
    dict(
        type=LVEvaldureaderDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_dureader_mixup_reader_cfg,
        infer_cfg=LVEval_dureader_mixup_infer_cfg,
        eval_cfg=LVEval_dureader_mixup_eval_cfg,
    )
    for name_len in get_dataset_names('dureader_mixup', DATASET_LENGTH_LEVEL)
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalfactrecall_en/lveval_factrecall_en_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval factrecall_en dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_factrecall_en_gen_9a836f import \
        LVEval_factrecall_en_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalfactrecall_en/lveval_factrecall_en_gen_9a836f.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval factrecall_en (en fact-recall) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LVEvalF1Evaluator, LVEvalfactrecallenDataset

LVEval_factrecall_en_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_factrecall_en_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Please answer the following questions based on the given article.\n\nArticle: {context}\n\nPlease answer the following questions based on the above article.\n\nQuestion: {input}\nAnswer:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    # Short answers expected: fact recall only needs a few tokens.
    inferencer=dict(type=GenInferencer, max_out_len=16),
)

LVEval_factrecall_en_eval_cfg = dict(
    evaluator=dict(type=LVEvalF1Evaluator, language='en'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_factrecall_en_datasets = [
    dict(
        type=LVEvalfactrecallenDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_factrecall_en_reader_cfg,
        infer_cfg=LVEval_factrecall_en_infer_cfg,
        eval_cfg=LVEval_factrecall_en_eval_cfg,
    )
    for name_len in get_dataset_names('factrecall_en', DATASET_LENGTH_LEVEL)
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalfactrecall_zh/lveval_factrecall_zh_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval factrecall_zh dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_factrecall_zh_gen_dbee70 import \
        LVEval_factrecall_zh_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalfactrecall_zh/lveval_factrecall_zh_gen_dbee70.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval factrecall_zh (zh fact-recall) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LVEvalF1Evaluator, LVEvalfactrecallzhDataset

LVEval_factrecall_zh_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_factrecall_zh_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='请基于给定的文章回答下述问题。\n\n文章:{context}\n\n现在请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    # Short answers expected: fact recall only needs a few tokens.
    inferencer=dict(type=GenInferencer, max_out_len=16),
)

LVEval_factrecall_zh_eval_cfg = dict(
    evaluator=dict(type=LVEvalF1Evaluator, language='zh'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_factrecall_zh_datasets = [
    dict(
        type=LVEvalfactrecallzhDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_factrecall_zh_reader_cfg,
        infer_cfg=LVEval_factrecall_zh_infer_cfg,
        eval_cfg=LVEval_factrecall_zh_eval_cfg,
    )
    for name_len in get_dataset_names('factrecall_zh', DATASET_LENGTH_LEVEL)
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalhotpotwikiqa_mixup/lveval_hotpotwikiqa_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval hotpotwikiqa_mixup dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_hotpotwikiqa_mixup_gen_77ce82 import \
        LVEval_hotpotwikiqa_mixup_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalhotpotwikiqa_mixup/lveval_hotpotwikiqa_mixup_gen_77ce82.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval hotpotwikiqa_mixup (en multi-hop QA) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LVEvalOPTF1Evaluator,
    LVEvalhotpotwikiqaDataset,
)

LVEval_hotpotwikiqa_mixup_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_hotpotwikiqa_mixup_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Answer the question based on the given passages. Questions and answers are only relevant to some passages. Only give me the answer and do not output any other explanation and evidence.\n\nArticle: {context}\n\nPlease answer the following question based on the above passages. Questions and answers are only relevant to some passages. Only give me the answer and do not output any other explanation and evidence.\n\nQuestion: {input}\nAnswer:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64),
)

LVEval_hotpotwikiqa_mixup_eval_cfg = dict(
    evaluator=dict(type=LVEvalOPTF1Evaluator, language='en'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_hotpotwikiqa_mixup_datasets = [
    dict(
        type=LVEvalhotpotwikiqaDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_hotpotwikiqa_mixup_reader_cfg,
        infer_cfg=LVEval_hotpotwikiqa_mixup_infer_cfg,
        eval_cfg=LVEval_hotpotwikiqa_mixup_eval_cfg,
    )
    for name_len in get_dataset_names(
        'hotpotwikiqa_mixup', DATASET_LENGTH_LEVEL
    )
]
|
build/lib/opencompass/configs/datasets/lveval/lvevallic_mixup/lveval_lic_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval lic_mixup dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_lic_mixup_gen_01eb0c import \
        LVEval_lic_mixup_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevallic_mixup/lveval_lic_mixup_gen_01eb0c.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval lic_mixup (zh long-context QA) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LVEvalOPTF1Evaluator, LVEvallicDataset

LVEval_lic_mixup_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_lic_mixup_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='请根据下面给定的文章回答问题,问题和答案只与其中一篇文章有关。\n\n文章:{context}\n\n请现在基于上述文章回答下面的问题,问题和答案只与其中一篇文章有关。\n\n问题:{input}\n回答:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64),
)

LVEval_lic_mixup_eval_cfg = dict(
    evaluator=dict(type=LVEvalOPTF1Evaluator, language='zh'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_lic_mixup_datasets = [
    dict(
        type=LVEvallicDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_lic_mixup_reader_cfg,
        infer_cfg=LVEval_lic_mixup_infer_cfg,
        eval_cfg=LVEval_lic_mixup_eval_cfg,
    )
    for name_len in get_dataset_names('lic_mixup', DATASET_LENGTH_LEVEL)
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalloogle_CR_mixup/lveval_loogle_CR_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval loogle_CR_mixup dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_loogle_CR_mixup_gen_d7ea36 import \
        LVEval_loogle_CR_mixup_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalloogle_CR_mixup/lveval_loogle_CR_mixup_gen_d7ea36.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval loogle_CR_mixup (en close-reading QA) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LVEvalOPTF1Evaluator, LVEvallooglecrDataset

LVEval_loogle_CR_mixup_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_loogle_CR_mixup_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Please answer the following question based on the given passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nArticle: {context}\n\nPlease answer the following question based on the above passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nQuestion: {input}\nAnswer:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64),
)

LVEval_loogle_CR_mixup_eval_cfg = dict(
    evaluator=dict(type=LVEvalOPTF1Evaluator, language='en'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_loogle_CR_mixup_datasets = [
    dict(
        type=LVEvallooglecrDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_loogle_CR_mixup_reader_cfg,
        infer_cfg=LVEval_loogle_CR_mixup_infer_cfg,
        eval_cfg=LVEval_loogle_CR_mixup_eval_cfg,
    )
    for name_len in get_dataset_names('loogle_CR_mixup', DATASET_LENGTH_LEVEL)
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalloogle_MIR_mixup/lveval_loogle_MIR_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval loogle_MIR_mixup dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_loogle_MIR_mixup_gen_d7ea36 import \
        LVEval_loogle_MIR_mixup_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalloogle_MIR_mixup/lveval_loogle_MIR_mixup_gen_d7ea36.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval loogle_MIR_mixup (en info-retrieval QA) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LVEvalOPTF1Evaluator, LVEvallooglemirDataset

LVEval_loogle_MIR_mixup_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_loogle_MIR_mixup_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Please answer the following question based on the given passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nArticle: {context}\n\nPlease answer the following question based on the above passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nQuestion: {input}\nAnswer:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64),
)

LVEval_loogle_MIR_mixup_eval_cfg = dict(
    evaluator=dict(type=LVEvalOPTF1Evaluator, language='en'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_loogle_MIR_mixup_datasets = [
    dict(
        type=LVEvallooglemirDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_loogle_MIR_mixup_reader_cfg,
        infer_cfg=LVEval_loogle_MIR_mixup_infer_cfg,
        eval_cfg=LVEval_loogle_MIR_mixup_eval_cfg,
    )
    for name_len in get_dataset_names('loogle_MIR_mixup', DATASET_LENGTH_LEVEL)
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalloogle_SD_mixup/lveval_loogle_SD_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval loogle_SD_mixup dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_loogle_SD_mixup_gen_d7ea36 import \
        LVEval_loogle_SD_mixup_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalloogle_SD_mixup/lveval_loogle_SD_mixup_gen_d7ea36.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval loogle_SD_mixup (en single-doc QA) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LVEvalOPTF1Evaluator, LVEvallooglesdDataset

LVEval_loogle_SD_mixup_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_loogle_SD_mixup_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Please answer the following question based on the given passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nArticle: {context}\n\nPlease answer the following question based on the above passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nQuestion: {input}\nAnswer:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64),
)

LVEval_loogle_SD_mixup_eval_cfg = dict(
    evaluator=dict(type=LVEvalOPTF1Evaluator, language='en'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_loogle_SD_mixup_datasets = [
    dict(
        type=LVEvallooglesdDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_loogle_SD_mixup_reader_cfg,
        infer_cfg=LVEval_loogle_SD_mixup_infer_cfg,
        eval_cfg=LVEval_loogle_SD_mixup_eval_cfg,
    )
    for name_len in get_dataset_names('loogle_SD_mixup', DATASET_LENGTH_LEVEL)
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalmultifieldqa_en_mixup/lveval_multifieldqa_en_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval multifieldqa_en_mixup dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_multifieldqa_en_mixup_gen_d7ea36 import \
        LVEval_multifieldqa_en_mixup_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalmultifieldqa_en_mixup/lveval_multifieldqa_en_mixup_gen_d7ea36.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval multifieldqa_en_mixup (en multi-field QA) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LVEvalOPTF1Evaluator,
    LVEvalmultifieldqaenDataset,
)

LVEval_multifieldqa_en_mixup_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_multifieldqa_en_mixup_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Please answer the following question based on the given passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nArticle: {context}\n\nPlease answer the following question based on the above passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nQuestion: {input}\nAnswer:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64),
)

LVEval_multifieldqa_en_mixup_eval_cfg = dict(
    evaluator=dict(type=LVEvalOPTF1Evaluator, language='en'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_multifieldqa_en_mixup_datasets = [
    dict(
        type=LVEvalmultifieldqaenDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_multifieldqa_en_mixup_reader_cfg,
        infer_cfg=LVEval_multifieldqa_en_mixup_infer_cfg,
        eval_cfg=LVEval_multifieldqa_en_mixup_eval_cfg,
    )
    for name_len in get_dataset_names(
        'multifieldqa_en_mixup', DATASET_LENGTH_LEVEL
    )
]
|
build/lib/opencompass/configs/datasets/lveval/lvevalmultifieldqa_zh_mixup/lveval_multifieldqa_zh_mixup_gen.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default gen config: forwards the LVEval multifieldqa_zh_mixup dataset list."""
from mmengine.config import read_base

with read_base():
    from .lveval_multifieldqa_zh_mixup_gen_0fbdad import \
        LVEval_multifieldqa_zh_mixup_datasets  # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/lveval/lvevalmultifieldqa_zh_mixup/lveval_multifieldqa_zh_mixup_gen_0fbdad.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LVEval multifieldqa_zh_mixup (zh multi-field QA) OpenICL gen-mode configs."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LVEvalOPTF1Evaluator,
    LVEvalmultifieldqazhDataset,
)

LVEval_multifieldqa_zh_mixup_reader_cfg = dict(
    input_columns=['context', 'input'],
    output_column='answers',
    train_split='test',
    test_split='test',
)

LVEval_multifieldqa_zh_mixup_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='请阅读以下文章并用中文回答问题,问题和答案只与其中一篇文章有关。只需要直接给出问题的答案,不要输出其他任何解释和证据。\n\n文章:{context}\n\n请基于上面的文章回答下面的问题,问题和答案只与其中一篇文章有关。只需要直接给出问题的答案,不要输出其他任何解释和证据。\n\n问题:{input}\n回答:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64),
)

LVEval_multifieldqa_zh_mixup_eval_cfg = dict(
    evaluator=dict(type=LVEvalOPTF1Evaluator, language='zh'), pred_role='BOT'
)

# Context-length variants provided by the benchmark data.
DATASET_LENGTH_LEVEL = ['16k', '32k', '64k', '128k', '256k']


def get_dataset_names(dataset_name, length_levels):
    """Return '<dataset_name>_<length>' for every requested length level."""
    # Comprehension replaces the original manual-append loop (same output).
    return [f'{dataset_name}_{length}' for length in length_levels]


LVEval_multifieldqa_zh_mixup_datasets = [
    dict(
        type=LVEvalmultifieldqazhDataset,
        abbr='LVEval_' + name_len,
        path='Infinigence/LVEval',
        name=name_len,
        reader_cfg=LVEval_multifieldqa_zh_mixup_reader_cfg,
        infer_cfg=LVEval_multifieldqa_zh_mixup_infer_cfg,
        eval_cfg=LVEval_multifieldqa_zh_mixup_eval_cfg,
    )
    for name_len in get_dataset_names(
        'multifieldqa_zh_mixup', DATASET_LENGTH_LEVEL
    )
]
|
build/lib/opencompass/configs/datasets/needlebench/atc/atc.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""NeedleBench ATC dataset configs: shared reader/infer/eval setup plus the
ordered-English dataset entries (further variants are defined below)."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.atc import NeedleBenchATCOrderedDataset
from opencompass.datasets.needlebench.atc import NeedleBenchATCDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess

needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')

needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt='{prompt}'),
            dict(role='BOT', prompt='{answer}\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchOriginEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT')

# Needle counts swept over: 2, 5, 8, ... up to 98.
needle_num_list = list(range(2, 100, 3))
document_depth_percent_intervals = 20
repeats = 30

path = 'opencompass/needlebench'
file_name = 'names.json'

needlebench_atc_datasets_zh = []
needlebench_atc_datasets_en = []
needlebench_atc_datasets_zh_ordered = []
needlebench_atc_datasets_en_ordered = []

# Ordered English variant: one dataset entry per needle count.
# NOTE(review): the abbr joins 'challenge' and 'needle' with no separator,
# exactly as in the original config — confirm this is intentional.
for num_needles in needle_num_list:
    needlebench_atc_datasets_en_ordered.append({
        'abbr': f'needlebench_atc_challengeneedle_{num_needles}_en_ordered',
        'type': NeedleBenchATCOrderedDataset,
        'path': path,
        'file_name': file_name,
        'num_needles': num_needles,
        'language': 'English',
        'repeats': repeats,
        'reader_cfg': needlebench_reader_cfg,
        'infer_cfg': needlebench_infer_cfg,
        'eval_cfg': needlebench_eval_cfg,
    })
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
for num_needles in needle_num_list:
|
| 62 |
+
# ordered Chinese version
|
| 63 |
+
dataset_dict = {
|
| 64 |
+
'abbr': f'needlebench_atc_challenge'
|
| 65 |
+
f'needle_{num_needles}_zh_ordered',
|
| 66 |
+
'type': NeedleBenchATCOrderedDataset,
|
| 67 |
+
'path': path,
|
| 68 |
+
'file_name': file_name,
|
| 69 |
+
'num_needles': num_needles,
|
| 70 |
+
'language': 'Chinese',
|
| 71 |
+
'repeats': repeats,
|
| 72 |
+
'reader_cfg': needlebench_reader_cfg,
|
| 73 |
+
'infer_cfg': needlebench_infer_cfg,
|
| 74 |
+
'eval_cfg': needlebench_eval_cfg
|
| 75 |
+
}
|
| 76 |
+
needlebench_atc_datasets_zh_ordered.append(dataset_dict)
|
| 77 |
+
|
| 78 |
+
for num_needles in needle_num_list:
|
| 79 |
+
# standard English version
|
| 80 |
+
dataset_dict = {
|
| 81 |
+
'abbr': f'needlebench_atc_challenge'
|
| 82 |
+
f'needle_{num_needles}_en',
|
| 83 |
+
'type': NeedleBenchATCDataset,
|
| 84 |
+
'path': path,
|
| 85 |
+
'file_name': file_name,
|
| 86 |
+
'num_needles': num_needles,
|
| 87 |
+
'language': 'English',
|
| 88 |
+
'repeats': repeats,
|
| 89 |
+
'reader_cfg': needlebench_reader_cfg,
|
| 90 |
+
'infer_cfg': needlebench_infer_cfg,
|
| 91 |
+
'eval_cfg': needlebench_eval_cfg
|
| 92 |
+
}
|
| 93 |
+
needlebench_atc_datasets_en.append(dataset_dict)
|
| 94 |
+
|
| 95 |
+
for num_needles in needle_num_list:
|
| 96 |
+
# standard Chinese version
|
| 97 |
+
dataset_dict = {
|
| 98 |
+
'abbr': f'needlebench_atc_challenge'
|
| 99 |
+
f'needle_{num_needles}_zh',
|
| 100 |
+
'type': NeedleBenchATCDataset,
|
| 101 |
+
'path': path,
|
| 102 |
+
'file_name': file_name,
|
| 103 |
+
'num_needles': num_needles,
|
| 104 |
+
'language': 'Chinese',
|
| 105 |
+
'repeats': repeats,
|
| 106 |
+
'reader_cfg': needlebench_reader_cfg,
|
| 107 |
+
'infer_cfg': needlebench_infer_cfg,
|
| 108 |
+
'eval_cfg': needlebench_eval_cfg
|
| 109 |
+
}
|
| 110 |
+
needlebench_atc_datasets_zh.append(dataset_dict)
|
build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import CircularEvaluator
|
| 5 |
+
from opencompass.datasets.needlebench.atc_choice import NeedleBenchATCDataset
|
| 6 |
+
from opencompass.utils.text_postprocessors import first_option_postprocess
|
| 7 |
+
|
| 8 |
+
# ----------------------- Prompt Content----------------------- #
|
| 9 |
+
|
| 10 |
+
few_shot_prompts = {
|
| 11 |
+
'single_choice_prompts': {
|
| 12 |
+
'single_choice_cn': [
|
| 13 |
+
dict(role='HUMAN', prompt='问题: 毕东作为刘红的爸爸,对刘红的成长有重要影响。 \n在上面提供的打乱的家族关系文本中,"刘红"的能够向上追溯到的最年长的亲人是谁?\nA. 毕东\nB. 刘红\nC. 李清亮\nD. 王展\n'),
|
| 14 |
+
dict(role='BOT', prompt='回答: A'),
|
| 15 |
+
dict(role='HUMAN', prompt='问题: 梅建平作为黄玉兰的姥姥,对黄玉兰的成长有重要影响。梅建平的妈妈是朱丽丽。蓝鑫把黄玉兰视为其母亲。焦慧不仅是朱丽丽的妈妈,还是朱丽丽的榜样。 \n在上面提供的打乱的家族关系文本中,"蓝鑫"的能够向上追溯到的最年长的亲人是谁?\nA. 梅建平\nB. 朱丽丽\nC. 蓝鑫\nD. 焦慧\n'),
|
| 16 |
+
dict(role='BOT', prompt='回答: D'),
|
| 17 |
+
dict(role='HUMAN', prompt='问题: 毕东把柳金凤视为其姥姥。奉兵作为柳金凤的妈妈,对柳金凤的成长有重要影响。余萍把杨颖视为其爸爸。毕东在郭建华的生命中扮演着父亲的角色。常宁的外公是余萍。刘慧是郭建华所生的孩子。刘慧在杨颖的生命中扮演着外公的角色。 \n在上面提供的打乱的家族关系文本中,"常宁"的能够向上追溯到的最年长的亲人是谁?\nA. 柳金凤\nB. 毕东\nC. 奉兵\nD. 余萍\n'),
|
| 18 |
+
dict(role='BOT', prompt='回答: C'),
|
| 19 |
+
dict(role='HUMAN', prompt='问题: 魏丽丽在谢平的生命中扮演着奶奶的角色。郭兵是魏阳的姥姥。谢平是郑玉珍的外婆。丁颖把武波视为其外公。丁颖在李建国的生命中扮演着外婆的角色。武波的父亲是刘瑜。许玲把余桂芳视为其父亲。刘瑜把许玲视为其爷爷。李建国对郭兵来说,不只是一个爷爷,还是一个朋友。魏丽丽的外公是魏阳。 \n在上面提供的打乱的家族关系文本中,"郑玉珍"的能够向上追溯到的最年长的亲人是谁?\nA. 魏丽丽\nB. 刘瑜\nC. 李建国\nD. 余桂芳\n'),
|
| 20 |
+
dict(role='BOT', prompt='回答: D'),
|
| 21 |
+
dict(role='HUMAN', prompt='问题: {question}'),
|
| 22 |
+
],
|
| 23 |
+
'single_choice_en': [
|
| 24 |
+
dict(role='HUMAN', prompt="Question: Jasmine Lane plays the role of James Hill's father in James Hill's life. \nGiven the scrambled family relationships described above, who is the eldest relative that 'James Hill' can trace back to in the context?\nA. Jasmine Lane\nB. James Hill\nC. Christopher Flores\nD. Paula Reeves\n"),
|
| 25 |
+
dict(role='BOT', prompt='Answer: A'),
|
| 26 |
+
dict(role='HUMAN', prompt="Question: For Michael Goodwin, Annette Valdez is not just a dad, but also a friend.For Annette Valdez, Mary Sanders is not just a father, but also a friend. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Michael Goodwin' can trace back to in the context?\nA. Michael Goodwin\nB. Annette Valdez\nC. Mary Sanders\nD. James Mitchell\n"),
|
| 27 |
+
dict(role='BOT', prompt='Answer: C'),
|
| 28 |
+
dict(role='HUMAN', prompt="Question: Laura Garcia considers Mr. Stephen Valdez DDS as their mom.Mr. Stephen Valdez DDS's maternal grandfather is Elizabeth Smith.Laura Garcia is Ashley Johnson's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Ashley Johnson' can trace back to in the context?\nA. Ashley Johnson\nB. Mr. Stephen Valdez DDS\nC. Laura Garcia\nD. Elizabeth Smith\n"),
|
| 29 |
+
dict(role='BOT', prompt='Answer: D'),
|
| 30 |
+
dict(role='HUMAN', prompt="Question: Justin Sanchez's paternal grandfather is Laurie Bean.For Laurie Bean, Christina Kim is not just a mother, but also a friend.Jessica Bass, as Christina Kim's maternal grandfather, has a significant impact on Christina Kim's upbringing.Sarah Hampton considers Justin Sanchez as their father. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Sarah Hampton' can trace back to in the context?\nA. Christina Kim\nB. Jessica Bass\nC. Laurie Bean\nD. Justin Sanchez\n"),
|
| 31 |
+
dict(role='BOT', prompt='Answer: B'),
|
| 32 |
+
dict(role='HUMAN', prompt='Question: {question}'),
|
| 33 |
+
],
|
| 34 |
+
'single_choice_cn_reasoning': [
|
| 35 |
+
dict(role='HUMAN', prompt='问题: 毕东作为刘红的爸爸,对刘红的成长有重要影响。 \n在上面提供的打乱的家族关系文本中,"刘红"的能够向上追溯到的最年长的亲人是谁?\nA. 毕东\nB. 刘红\nC. 李清亮\nD. 王展\n'),
|
| 36 |
+
dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 刘红的爸爸是毕东。\n\n因此,"刘红"能够向上追溯到的最年长的亲人是毕东。所以答案是:A'),
|
| 37 |
+
|
| 38 |
+
dict(role='HUMAN', prompt='问题: 梅建平作为黄玉兰的姥姥,对黄玉兰的成长有重要影响。梅建平的妈妈是朱丽丽。蓝鑫把黄玉兰视为其母亲。焦慧不仅是朱丽丽的妈妈,还是朱丽丽的榜样。 \n在上面提供的打乱的家族关系文本中,"蓝鑫"的能够向上追溯到的最年长的亲人是谁?\nA. 梅建平\nB. 朱丽丽\nC. 蓝鑫\nD. 焦慧\n'),
|
| 39 |
+
dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 蓝鑫把黄玉兰视为其母亲。\n- 黄玉兰的姥姥是梅建平。\n- 梅建平的妈妈是朱丽丽。\n- 朱丽丽的妈妈是焦慧。\n\n因此,"蓝鑫"能够向上追溯到的最年长的亲人是焦慧。所以答案是:D'),
|
| 40 |
+
|
| 41 |
+
dict(role='HUMAN', prompt='问题: 毕东把柳金凤视为其姥姥。奉兵作为柳金凤的妈妈,对柳金凤的成长有重要影响。余萍把杨颖视为其爸爸。毕东在郭建华的生命中扮演着父亲的角色。常宁的外公是余萍。刘慧是郭建华所生的孩子。刘慧在杨颖的生命中扮演着外公的角色。 \n在上面提供的打乱的家族关系文本中,"常宁"的能够向上追溯到的最年长的亲人是谁?\nA. 柳金凤\nB. 毕东\nC. 奉兵\nD. 余萍\n'),
|
| 42 |
+
dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 常宁的外公是余萍。\n- 余萍把杨颖视为其爸爸。\n- 杨颖的外公是刘慧。\n- 刘慧是郭建华所生的孩子。\n- 郭建华的父亲是毕东。\n- 毕东的姥姥是柳金凤。\n- 柳金凤的妈妈是奉兵。\n\n因此,"常宁"能够向上追溯到的最年长的亲人是奉兵。所以答案是:C'),
|
| 43 |
+
|
| 44 |
+
dict(role='HUMAN', prompt='问题: 魏丽丽在谢平的生命中扮演着奶奶的角色。郭兵是魏阳的姥姥。谢平是郑玉珍的外婆。丁颖把武波视为其外公。丁颖在李建国的生命中扮演着外婆的角色。武波的父亲是刘瑜。许玲把余桂芳视为其父亲。刘瑜把许玲视为其爷爷。李建国对郭兵来说,不只是一个爷爷,还是一个朋友。魏丽丽的外公是魏阳。 \n在上面提供的打乱的家族关系文本中,"郑玉珍"的能够向上追溯到的最年长的亲人是谁?\nA. 魏丽丽\nB. 刘瑜\nC. 李建国\nD. 余桂芳\n'),
|
| 45 |
+
dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 郑玉珍的外婆是谢平。\n- 谢平的奶奶是魏丽丽。\n- 魏丽丽的外公是魏阳。 \n- 魏阳的姥姥是郭兵。\n- 郭兵的爷爷是李建国。 \n- 李建国的外婆是丁颖。 \n- 丁颖的外公是武波。 \n- 武波的父亲是刘瑜。\n- 刘瑜的爷爷是许玲。\n- 许玲的父亲是余桂芳。 \n\n因此,"郑玉珍"能够向上追溯到的最年长的亲人是余桂芳。所以答案是:D'),
|
| 46 |
+
dict(role='HUMAN', prompt='问题: {question}'),
|
| 47 |
+
],
|
| 48 |
+
'single_choice_en_reasoning': [
|
| 49 |
+
dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"),
|
| 50 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"),
|
| 51 |
+
dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"),
|
| 52 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"),
|
| 53 |
+
dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"),
|
| 54 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"),
|
| 55 |
+
dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"),
|
| 56 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"),
|
| 57 |
+
dict(role='HUMAN', prompt='Question: {question}'),
|
| 58 |
+
],
|
| 59 |
+
},
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
# ----------------------- Prompt Settings ----------------------- #
|
| 63 |
+
needle_num_list = list(range(2, 20, 1))
|
| 64 |
+
path = 'opencompass/needlebench'
|
| 65 |
+
file_name = 'names.json'
|
| 66 |
+
|
| 67 |
+
repeats = 10
|
| 68 |
+
|
| 69 |
+
# Use Zero-Shot or not
|
| 70 |
+
with_few_shot = True
|
| 71 |
+
|
| 72 |
+
# Max for this dataset is 4, should be set with `with_few_shot`
|
| 73 |
+
few_shot_samples = 4
|
| 74 |
+
|
| 75 |
+
# Generate reasoning path or not, only for single choice
|
| 76 |
+
with_reasoning = True
|
| 77 |
+
|
| 78 |
+
# Use circular evaluation or not
|
| 79 |
+
with_circular_eval = True
|
| 80 |
+
|
| 81 |
+
needlebench_prompts = few_shot_prompts
|
| 82 |
+
single_choice_prompts = needlebench_prompts['single_choice_prompts']
|
| 83 |
+
|
| 84 |
+
# Set few shot prompt number
|
| 85 |
+
for _name in list(single_choice_prompts.keys()):
|
| 86 |
+
if with_few_shot:
|
| 87 |
+
assert few_shot_samples > 0 and few_shot_samples <= 4
|
| 88 |
+
single_choice_prompts[_name] = \
|
| 89 |
+
single_choice_prompts[_name][- few_shot_samples * 2 - 1:]
|
| 90 |
+
|
| 91 |
+
# ----------------------- Dataset Settings ----------------------- #
|
| 92 |
+
|
| 93 |
+
needlebench_datasets = []
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
needlebench_atc_reader_cfg = dict(input_columns=['question'],
|
| 97 |
+
output_column='answer')
|
| 98 |
+
|
| 99 |
+
for _name in list(single_choice_prompts.keys()):
|
| 100 |
+
|
| 101 |
+
needlebench_atc_infer_cfg = dict(
|
| 102 |
+
prompt_template=dict(
|
| 103 |
+
type=PromptTemplate,
|
| 104 |
+
template=dict(
|
| 105 |
+
round=(single_choice_prompts[_name])),
|
| 106 |
+
),
|
| 107 |
+
retriever=dict(type=ZeroRetriever),
|
| 108 |
+
inferencer=dict(type=GenInferencer,),
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
needlebench_atc_eval_cfg = dict(
|
| 112 |
+
evaluator=dict(type=CircularEvaluator),
|
| 113 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
| 114 |
+
|
| 115 |
+
current_needle_num_list = needle_num_list
|
| 116 |
+
|
| 117 |
+
for num_needles in current_needle_num_list:
|
| 118 |
+
abbr = (f'NeedleBenchATCDataset-'
|
| 119 |
+
f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
|
| 120 |
+
language = 'English' if 'en' in _name else 'Chinese'
|
| 121 |
+
if 'reasoning' in _name:
|
| 122 |
+
abbr += '-Reasoning'
|
| 123 |
+
dataset_dict = {
|
| 124 |
+
'abbr': abbr,
|
| 125 |
+
'type': NeedleBenchATCDataset,
|
| 126 |
+
'path': path,
|
| 127 |
+
'file_name': file_name,
|
| 128 |
+
'num_needles': num_needles,
|
| 129 |
+
'language': language,
|
| 130 |
+
'repeats': repeats,
|
| 131 |
+
'with_circular': with_circular_eval,
|
| 132 |
+
'reader_cfg': needlebench_atc_reader_cfg,
|
| 133 |
+
'infer_cfg': needlebench_atc_infer_cfg,
|
| 134 |
+
'eval_cfg': needlebench_atc_eval_cfg
|
| 135 |
+
}
|
| 136 |
+
needlebench_datasets.append(dataset_dict)
|
build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_20.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import CircularEvaluator
|
| 5 |
+
from opencompass.datasets.needlebench.atc_choice import NeedleBenchATCDataset
|
| 6 |
+
from opencompass.utils.text_postprocessors import first_option_postprocess
|
| 7 |
+
|
| 8 |
+
# ----------------------- Prompt Content----------------------- #
|
| 9 |
+
|
| 10 |
+
few_shot_prompts = {
|
| 11 |
+
'single_choice_prompts': {
|
| 12 |
+
'single_choice_cn': [
|
| 13 |
+
dict(role='HUMAN', prompt='问题: 毕东作为刘红的爸爸,对刘红的成长有重要影响。 \n在上面提供的打乱的家族关系文本中,"刘红"的能够向上追溯到的最年长的亲人是谁?\nA. 毕东\nB. 刘红\nC. 李清亮\nD. 王展\n'),
|
| 14 |
+
dict(role='BOT', prompt='回答: A'),
|
| 15 |
+
dict(role='HUMAN', prompt='问题: 梅建平作为黄玉兰的姥姥,对黄玉兰的成长有重要影响。梅建平的妈妈是朱丽丽。蓝鑫把黄玉兰视为其母亲。焦慧不仅是朱丽丽的妈妈,还是朱丽丽的榜样。 \n在上面提供的打乱的家族关系文本中,"蓝鑫"的能够向上追溯到的最年长的亲人是谁?\nA. 梅建平\nB. 朱丽丽\nC. 蓝鑫\nD. 焦慧\n'),
|
| 16 |
+
dict(role='BOT', prompt='回答: D'),
|
| 17 |
+
dict(role='HUMAN', prompt='问题: 毕东把柳金凤视为其姥姥。奉兵作为柳金凤的妈妈,对柳金凤的成长有重要影响。余萍把杨颖视为其爸爸。毕东在郭建华的生命中扮演着父亲的角色。常宁的外公是余萍。刘慧是郭建华所生的孩子。刘慧在杨颖的生命中扮演着外公的角色。 \n在上面提供的打乱的家族关系文本中,"常宁"的能够向上追溯到的最年长的亲人是谁?\nA. 柳金凤\nB. 毕东\nC. 奉兵\nD. 余萍\n'),
|
| 18 |
+
dict(role='BOT', prompt='回答: C'),
|
| 19 |
+
dict(role='HUMAN', prompt='问题: 魏丽丽在谢平的生命中扮演着奶奶的角色。郭兵是魏阳的姥姥。谢平是郑玉珍的外婆。丁颖把武波视为其外公。丁颖在李建国的生命中扮演着外婆的角色。武波的父亲是刘瑜。许玲把余桂芳视为其父亲。刘瑜把许玲视为其爷爷。李建国对郭兵来说,不只是一个爷爷,还是一个朋友。魏丽丽的外公是魏阳。 \n在上面提供的打乱的家族关系文本中,"郑玉珍"的能够向上追溯到的最年长的亲人是谁?\nA. 魏丽丽\nB. 刘瑜\nC. 李建国\nD. 余桂芳\n'),
|
| 20 |
+
dict(role='BOT', prompt='回答: D'),
|
| 21 |
+
dict(role='HUMAN', prompt='问题: {question}'),
|
| 22 |
+
],
|
| 23 |
+
'single_choice_en': [
|
| 24 |
+
dict(role='HUMAN', prompt="Question: Jasmine Lane plays the role of James Hill's father in James Hill's life. \nGiven the scrambled family relationships described above, who is the eldest relative that 'James Hill' can trace back to in the context?\nA. Jasmine Lane\nB. James Hill\nC. Christopher Flores\nD. Paula Reeves\n"),
|
| 25 |
+
dict(role='BOT', prompt='Answer: A'),
|
| 26 |
+
dict(role='HUMAN', prompt="Question: For Michael Goodwin, Annette Valdez is not just a dad, but also a friend.For Annette Valdez, Mary Sanders is not just a father, but also a friend. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Michael Goodwin' can trace back to in the context?\nA. Michael Goodwin\nB. Annette Valdez\nC. Mary Sanders\nD. James Mitchell\n"),
|
| 27 |
+
dict(role='BOT', prompt='Answer: C'),
|
| 28 |
+
dict(role='HUMAN', prompt="Question: Laura Garcia considers Mr. Stephen Valdez DDS as their mom.Mr. Stephen Valdez DDS's maternal grandfather is Elizabeth Smith.Laura Garcia is Ashley Johnson's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Ashley Johnson' can trace back to in the context?\nA. Ashley Johnson\nB. Mr. Stephen Valdez DDS\nC. Laura Garcia\nD. Elizabeth Smith\n"),
|
| 29 |
+
dict(role='BOT', prompt='Answer: D'),
|
| 30 |
+
dict(role='HUMAN', prompt="Question: Justin Sanchez's paternal grandfather is Laurie Bean.For Laurie Bean, Christina Kim is not just a mother, but also a friend.Jessica Bass, as Christina Kim's maternal grandfather, has a significant impact on Christina Kim's upbringing.Sarah Hampton considers Justin Sanchez as their father. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Sarah Hampton' can trace back to in the context?\nA. Christina Kim\nB. Jessica Bass\nC. Laurie Bean\nD. Justin Sanchez\n"),
|
| 31 |
+
dict(role='BOT', prompt='Answer: B'),
|
| 32 |
+
dict(role='HUMAN', prompt='Question: {question}'),
|
| 33 |
+
],
|
| 34 |
+
'single_choice_cn_reasoning': [
|
| 35 |
+
dict(role='HUMAN', prompt='问题: 毕东作为刘红的爸爸,对刘红的成长有重要影响。 \n在上面提供的打乱的家族关系文本中,"刘红"的能够向上追溯到的最年长的亲人是谁?\nA. 毕东\nB. 刘红\nC. 李清亮\nD. 王展\n'),
|
| 36 |
+
dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 刘红的爸爸是毕东。\n\n因此,"刘红"能够向上追溯到的最年长的亲人是毕东。所以答案是:A'),
|
| 37 |
+
|
| 38 |
+
dict(role='HUMAN', prompt='问题: 梅建平作为黄玉兰的姥姥,对黄玉兰的成长有重要影响。梅建平的妈妈是朱丽丽。蓝鑫把黄玉兰视为其母亲。焦慧不仅是朱丽丽的妈妈,还是朱丽丽的榜样。 \n在上面提供的打乱的家族关系文本中,"蓝鑫"的能够向上追溯到的最年长的亲人是谁?\nA. 梅建平\nB. 朱丽丽\nC. 蓝鑫\nD. 焦慧\n'),
|
| 39 |
+
dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 蓝鑫把黄玉兰视为其母亲。\n- 黄玉兰的姥姥是梅建平。\n- 梅建平的妈妈是朱丽丽。\n- 朱丽丽的妈妈是焦慧。\n\n因此,"蓝鑫"能够向上追溯到的最年长的亲人是焦慧。所以答案是:D'),
|
| 40 |
+
|
| 41 |
+
dict(role='HUMAN', prompt='问题: 毕东把柳金凤视为其姥姥。奉兵作为柳金凤的妈妈,对柳金凤的成长有重要影响。余萍把杨颖视为其爸爸。毕东在郭建华的生命中扮演着父亲的角色。常宁的外公是余萍。刘慧是郭建华所生的孩子。刘慧在杨颖的生命中扮演着外公的角色。 \n在上面提供的打乱的家族关系文本中,"常宁"的能够向上追溯到的最年长的亲人是谁?\nA. 柳金凤\nB. 毕东\nC. 奉兵\nD. 余萍\n'),
|
| 42 |
+
dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 常宁的外公是余萍。\n- 余萍把杨颖视为其爸爸。\n- 杨颖的外公是刘慧。\n- 刘慧是郭建华所生的孩子。\n- 郭建华的父亲是毕东。\n- 毕东的姥姥是柳金凤。\n- 柳金凤的妈妈是奉兵。\n\n因此,"常宁"能够向上追溯到的最年长的亲人是奉兵。所以答案是:C'),
|
| 43 |
+
|
| 44 |
+
dict(role='HUMAN', prompt='问题: 魏丽丽在谢平的生命中扮演着奶奶的角色。郭兵是魏阳的姥姥。谢平是郑玉珍的外婆。丁颖把武波视为其外公。丁颖在李建国的生命中扮演着外婆的角色。武波的父亲是刘瑜。许玲把余桂芳视为其父亲。刘瑜把许玲视为其爷爷。李建国对郭兵来说,不只是一个爷爷,还是一个朋友。魏丽丽的外公是魏阳。 \n在上面提供的打乱的家族关系文本中,"郑玉珍"的能够向上追溯到的最年长的亲人是谁?\nA. 魏丽丽\nB. 刘瑜\nC. 李建国\nD. 余桂芳\n'),
|
| 45 |
+
dict(role='BOT', prompt='回答: 根据提供的信息,我们可以构建以下家族关系链:\n\n- 郑玉珍的外婆是谢平。\n- 谢平的奶奶是魏丽丽。\n- 魏丽丽的外公是魏阳。 \n- 魏阳的姥姥是郭兵。\n- 郭兵的爷爷是李建国。 \n- 李建国的外婆是丁颖。 \n- 丁颖的外公是武波。 \n- 武波的父亲是刘瑜。\n- 刘瑜的爷爷是许玲。\n- 许玲的父亲是余桂芳。 \n\n因此,"郑玉珍"能够向上追溯到的最年长的亲人是余桂芳。所以答案是:D'),
|
| 46 |
+
dict(role='HUMAN', prompt='问题: {question}'),
|
| 47 |
+
],
|
| 48 |
+
'single_choice_en_reasoning': [
|
| 49 |
+
dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"),
|
| 50 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"),
|
| 51 |
+
dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"),
|
| 52 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"),
|
| 53 |
+
dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"),
|
| 54 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"),
|
| 55 |
+
dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"),
|
| 56 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"),
|
| 57 |
+
dict(role='HUMAN', prompt='Question: {question}'),
|
| 58 |
+
],
|
| 59 |
+
},
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
# ----------------------- Prompt Settings ----------------------- #
|
| 63 |
+
needle_num_list = list(range(2, 20, 1))
|
| 64 |
+
path = 'opencompass/needlebench'
|
| 65 |
+
file_name = 'names.json'
|
| 66 |
+
repeats = 10
|
| 67 |
+
|
| 68 |
+
# Use Zero-Shot or not
|
| 69 |
+
with_few_shot = True
|
| 70 |
+
|
| 71 |
+
# Max for this dataset is 4, should be set with `with_few_shot`
|
| 72 |
+
few_shot_samples = 4
|
| 73 |
+
|
| 74 |
+
# Generate reasoning path or not, only for single choice
|
| 75 |
+
with_reasoning = True
|
| 76 |
+
|
| 77 |
+
# Use circular evaluation or not
|
| 78 |
+
with_circular_eval = True
|
| 79 |
+
|
| 80 |
+
needlebench_prompts = few_shot_prompts
|
| 81 |
+
single_choice_prompts = needlebench_prompts['single_choice_prompts']
|
| 82 |
+
|
| 83 |
+
# Set few shot prompt number
|
| 84 |
+
for _name in list(single_choice_prompts.keys()):
|
| 85 |
+
if with_few_shot:
|
| 86 |
+
assert few_shot_samples > 0 and few_shot_samples <= 4
|
| 87 |
+
single_choice_prompts[_name] = \
|
| 88 |
+
single_choice_prompts[_name][- few_shot_samples * 2 - 1:]
|
| 89 |
+
|
| 90 |
+
# ----------------------- Dataset Settings ----------------------- #
|
| 91 |
+
|
| 92 |
+
needlebench_datasets = []
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
needlebench_atc_reader_cfg = dict(input_columns=['question'],
|
| 96 |
+
output_column='answer')
|
| 97 |
+
|
| 98 |
+
for _name in list(single_choice_prompts.keys()):
|
| 99 |
+
|
| 100 |
+
needlebench_atc_infer_cfg = dict(
|
| 101 |
+
prompt_template=dict(
|
| 102 |
+
type=PromptTemplate,
|
| 103 |
+
template=dict(
|
| 104 |
+
round=(single_choice_prompts[_name])),
|
| 105 |
+
),
|
| 106 |
+
retriever=dict(type=ZeroRetriever),
|
| 107 |
+
inferencer=dict(type=GenInferencer,),
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
needlebench_atc_eval_cfg = dict(
|
| 111 |
+
evaluator=dict(type=CircularEvaluator),
|
| 112 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
| 113 |
+
|
| 114 |
+
for num_needles in needle_num_list:
|
| 115 |
+
abbr = (f'NeedleBenchATCDataset-'
|
| 116 |
+
f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
|
| 117 |
+
language = 'English' if 'en' in _name else 'Chinese'
|
| 118 |
+
if 'reasoning' in _name:
|
| 119 |
+
abbr += '-Reasoning'
|
| 120 |
+
dataset_dict = {
|
| 121 |
+
'abbr': abbr,
|
| 122 |
+
'type': NeedleBenchATCDataset,
|
| 123 |
+
'path': path,
|
| 124 |
+
'file_name': file_name,
|
| 125 |
+
'num_needles': num_needles,
|
| 126 |
+
'language': language,
|
| 127 |
+
'repeats': repeats,
|
| 128 |
+
'with_circular': with_circular_eval,
|
| 129 |
+
'reader_cfg': needlebench_atc_reader_cfg,
|
| 130 |
+
'infer_cfg': needlebench_atc_infer_cfg,
|
| 131 |
+
'eval_cfg': needlebench_atc_eval_cfg
|
| 132 |
+
}
|
| 133 |
+
needlebench_datasets.append(dataset_dict)
|
build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_50.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
with read_base():
|
| 3 |
+
from .atc_choice_20 import *
|
| 4 |
+
|
| 5 |
+
needle_num_list = list(range(2, 50, 1))
|
| 6 |
+
needlebench_datasets = []
|
| 7 |
+
|
| 8 |
+
for _name in list(single_choice_prompts.keys()):
|
| 9 |
+
|
| 10 |
+
needlebench_atc_infer_cfg = dict(
|
| 11 |
+
prompt_template=dict(
|
| 12 |
+
type=PromptTemplate,
|
| 13 |
+
template=dict(
|
| 14 |
+
round=(single_choice_prompts[_name])),
|
| 15 |
+
),
|
| 16 |
+
retriever=dict(type=ZeroRetriever),
|
| 17 |
+
inferencer=dict(type=GenInferencer,),
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
needlebench_atc_eval_cfg = dict(
|
| 21 |
+
evaluator=dict(type=CircularEvaluator),
|
| 22 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
| 23 |
+
|
| 24 |
+
for num_needles in needle_num_list:
|
| 25 |
+
abbr = (f'NeedleBenchATCDataset-'
|
| 26 |
+
f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
|
| 27 |
+
language = 'English' if 'en' in _name else 'Chinese'
|
| 28 |
+
if 'reasoning' in _name:
|
| 29 |
+
abbr += '-Reasoning'
|
| 30 |
+
dataset_dict = {
|
| 31 |
+
'abbr': abbr,
|
| 32 |
+
'type': NeedleBenchATCDataset,
|
| 33 |
+
'path': path,
|
| 34 |
+
'file_name': file_name,
|
| 35 |
+
'num_needles': num_needles,
|
| 36 |
+
'language': language,
|
| 37 |
+
'repeats': repeats,
|
| 38 |
+
'with_circular': with_circular_eval,
|
| 39 |
+
'reader_cfg': needlebench_atc_reader_cfg,
|
| 40 |
+
'infer_cfg': needlebench_atc_infer_cfg,
|
| 41 |
+
'eval_cfg': needlebench_atc_eval_cfg
|
| 42 |
+
}
|
| 43 |
+
needlebench_datasets.append(dataset_dict)
|
build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_50_en_reasoning.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import CircularEvaluator
|
| 5 |
+
from opencompass.datasets.needlebench.atc_choice import NeedleBenchATCDataset
|
| 6 |
+
from opencompass.utils.text_postprocessors import first_option_postprocess
|
| 7 |
+
|
| 8 |
+
# ----------------------- Prompt Content----------------------- #
|
| 9 |
+
|
| 10 |
+
few_shot_prompts = {
|
| 11 |
+
'single_choice_prompts': {
|
| 12 |
+
'single_choice_en_reasoning': [
|
| 13 |
+
dict(
|
| 14 |
+
role='HUMAN',
|
| 15 |
+
prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n",
|
| 16 |
+
),
|
| 17 |
+
dict(
|
| 18 |
+
role='BOT',
|
| 19 |
+
prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C",
|
| 20 |
+
),
|
| 21 |
+
dict(
|
| 22 |
+
role='HUMAN',
|
| 23 |
+
prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n",
|
| 24 |
+
),
|
| 25 |
+
dict(
|
| 26 |
+
role='BOT',
|
| 27 |
+
prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D",
|
| 28 |
+
),
|
| 29 |
+
dict(
|
| 30 |
+
role='HUMAN',
|
| 31 |
+
prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n",
|
| 32 |
+
),
|
| 33 |
+
dict(
|
| 34 |
+
role='BOT',
|
| 35 |
+
prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B",
|
| 36 |
+
),
|
| 37 |
+
dict(
|
| 38 |
+
role='HUMAN',
|
| 39 |
+
prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n",
|
| 40 |
+
),
|
| 41 |
+
dict(
|
| 42 |
+
role='BOT',
|
| 43 |
+
prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A",
|
| 44 |
+
),
|
| 45 |
+
dict(role='HUMAN', prompt='Question: {question}'),
|
| 46 |
+
],
|
| 47 |
+
},
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
# ----------------------- Prompt Settings ----------------------- #
|
| 51 |
+
needle_num_list = list(range(2, 50, 1))
|
| 52 |
+
path = 'opencompass/needlebench'
|
| 53 |
+
file_name = 'names.json'
|
| 54 |
+
repeats = 10
|
| 55 |
+
|
| 56 |
+
# Use Zero-Shot or not
|
| 57 |
+
with_few_shot = True
|
| 58 |
+
|
| 59 |
+
# Max for this dataset is 4, should be set with `with_few_shot`
|
| 60 |
+
few_shot_samples = 4
|
| 61 |
+
|
| 62 |
+
# Generate reasoning path or not, only for single choice
|
| 63 |
+
with_reasoning = True
|
| 64 |
+
|
| 65 |
+
# Use circular evaluation or not
|
| 66 |
+
with_circular_eval = True
|
| 67 |
+
|
| 68 |
+
needlebench_prompts = few_shot_prompts
|
| 69 |
+
single_choice_prompts = needlebench_prompts['single_choice_prompts']
|
| 70 |
+
|
| 71 |
+
# Set few shot prompt number
|
| 72 |
+
for _name in list(single_choice_prompts.keys()):
|
| 73 |
+
if with_few_shot:
|
| 74 |
+
assert few_shot_samples > 0 and few_shot_samples <= 4
|
| 75 |
+
single_choice_prompts[_name] = single_choice_prompts[_name][
|
| 76 |
+
-few_shot_samples * 2 - 1 :
|
| 77 |
+
]
|
| 78 |
+
|
| 79 |
+
# ----------------------- Dataset Settings ----------------------- #
|
| 80 |
+
|
| 81 |
+
needlebench_datasets = []
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
needlebench_atc_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
| 85 |
+
|
| 86 |
+
for _name in list(single_choice_prompts.keys()):
|
| 87 |
+
|
| 88 |
+
needlebench_atc_infer_cfg = dict(
|
| 89 |
+
prompt_template=dict(
|
| 90 |
+
type=PromptTemplate,
|
| 91 |
+
template=dict(round=(single_choice_prompts[_name])),
|
| 92 |
+
),
|
| 93 |
+
retriever=dict(type=ZeroRetriever),
|
| 94 |
+
inferencer=dict(
|
| 95 |
+
type=GenInferencer,
|
| 96 |
+
),
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
needlebench_atc_eval_cfg = dict(
|
| 100 |
+
evaluator=dict(type=CircularEvaluator),
|
| 101 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
for num_needles in needle_num_list:
|
| 105 |
+
abbr = (
|
| 106 |
+
f'NeedleBenchATCDataset-'
|
| 107 |
+
f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}'
|
| 108 |
+
)
|
| 109 |
+
language = 'English' if 'en' in _name else 'Chinese'
|
| 110 |
+
if 'reasoning' in _name:
|
| 111 |
+
abbr += '-Reasoning'
|
| 112 |
+
dataset_dict = {
|
| 113 |
+
'abbr': abbr,
|
| 114 |
+
'type': NeedleBenchATCDataset,
|
| 115 |
+
'path': path,
|
| 116 |
+
'file_name':file_name,
|
| 117 |
+
'num_needles': num_needles,
|
| 118 |
+
'language': language,
|
| 119 |
+
'repeats': repeats,
|
| 120 |
+
'with_circular': with_circular_eval,
|
| 121 |
+
'reader_cfg': needlebench_atc_reader_cfg,
|
| 122 |
+
'infer_cfg': needlebench_atc_infer_cfg,
|
| 123 |
+
'eval_cfg': needlebench_atc_eval_cfg,
|
| 124 |
+
}
|
| 125 |
+
needlebench_datasets.append(dataset_dict)
|
build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_80.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
with read_base():
|
| 3 |
+
from .atc_choice_20 import *
|
| 4 |
+
|
| 5 |
+
needle_num_list = list(range(2, 80, 1))
|
| 6 |
+
needlebench_datasets = []
|
| 7 |
+
|
| 8 |
+
for _name in list(single_choice_prompts.keys()):
|
| 9 |
+
|
| 10 |
+
needlebench_atc_infer_cfg = dict(
|
| 11 |
+
prompt_template=dict(
|
| 12 |
+
type=PromptTemplate,
|
| 13 |
+
template=dict(
|
| 14 |
+
round=(single_choice_prompts[_name])),
|
| 15 |
+
),
|
| 16 |
+
retriever=dict(type=ZeroRetriever),
|
| 17 |
+
inferencer=dict(type=GenInferencer,),
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
needlebench_atc_eval_cfg = dict(
|
| 21 |
+
evaluator=dict(type=CircularEvaluator),
|
| 22 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
| 23 |
+
|
| 24 |
+
for num_needles in needle_num_list:
|
| 25 |
+
abbr = (f'NeedleBenchATCDataset-'
|
| 26 |
+
f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
|
| 27 |
+
language = 'English' if 'en' in _name else 'Chinese'
|
| 28 |
+
if 'reasoning' in _name:
|
| 29 |
+
abbr += '-Reasoning'
|
| 30 |
+
dataset_dict = {
|
| 31 |
+
'abbr': abbr,
|
| 32 |
+
'type': NeedleBenchATCDataset,
|
| 33 |
+
'path': path,
|
| 34 |
+
'file_name': file_name,
|
| 35 |
+
'num_needles': num_needles,
|
| 36 |
+
'language': language,
|
| 37 |
+
'repeats': repeats,
|
| 38 |
+
'with_circular': with_circular_eval,
|
| 39 |
+
'reader_cfg': needlebench_atc_reader_cfg,
|
| 40 |
+
'infer_cfg': needlebench_atc_infer_cfg,
|
| 41 |
+
'eval_cfg': needlebench_atc_eval_cfg
|
| 42 |
+
}
|
| 43 |
+
needlebench_datasets.append(dataset_dict)
|
build/lib/opencompass/configs/datasets/needlebench/atc/atc_choice_80_en_reasoning.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import CircularEvaluator
|
| 5 |
+
from opencompass.datasets.needlebench.atc_choice import NeedleBenchATCDataset
|
| 6 |
+
from opencompass.utils.text_postprocessors import first_option_postprocess
|
| 7 |
+
|
| 8 |
+
# ----------------------- Prompt Content----------------------- #
|
| 9 |
+
|
| 10 |
+
few_shot_prompts = {
|
| 11 |
+
'single_choice_prompts': {
|
| 12 |
+
'single_choice_en_reasoning': [
|
| 13 |
+
dict(role='HUMAN', prompt="Question: Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jessica Stewart' can trace back to in the context?\nA. Jack Burch\nB. Jessica Stewart\nC. Sharon House\nD. Carolyn Jackson\n"),
|
| 14 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- Sharon House, as Jessica Stewart's father, has a significant impact on Jessica Stewart's upbringing.\n\nTherefore, the eldest relative that 'Jessica Stewart' can trace back to in the context is Sharon House. The answer is: C"),
|
| 15 |
+
dict(role='HUMAN', prompt="Question: For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.Jacob Oconnor's paternal grandmother is Robert Hill. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jacob Oconnor' can trace back to in the context?\nA. Laura Holland\nB. Robert Hill\nC. Jacob Oconnor\nD. Mikayla Scott\n"),
|
| 16 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n-Jacob Oconnor's paternal grandmother is Robert Hill. \n- For Robert Hill, Mikayla Scott is not just a paternal grandfather, but also a friend.\n\nTherefore, the eldest relative that 'Jacob Oconnor' can trace back to in the context is Mikayla Scott. The answer is: D"),
|
| 17 |
+
dict(role='HUMAN', prompt="Question: Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life.Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.Michael Martinez is not only Misty Moore's father but also Misty Moore's role model. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Barbara Fuentes' can trace back to in the context?\nA. Michael Martinez\nB. Jennifer Garcia\nC. Misty Moore\nD. Barbara Fuentes\n"),
|
| 18 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n- Misty Moore plays the role of Barbara Fuentes's maternal grandfather in Barbara Fuentes's life. \n- Michael Martinez is not only Misty Moore's father but also Misty Moore's role model.\n- Jennifer Garcia, as Michael Martinez's grandmother, has a significant impact on Michael Martinez's upbringing.\n\nTherefore, the eldest relative that 'Barbara Fuentes' can trace back to in the context is Jennifer Garcia. The answer is: B"),
|
| 19 |
+
dict(role='HUMAN', prompt="Question: Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.Victor Dean considers Mary Gay as their grandfather.Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.David Hernandez is Jennifer Williams's mom. \nGiven the scrambled family relationships described above, who is the eldest relative that 'Jennifer Williams' can trace back to in the context?\nA. Marcus Miller\nB. Carlos Smith\nC. Mary Gay\nD. Victor Dean\n"),
|
| 20 |
+
dict(role='BOT', prompt="Answer: Based on the provided information, we can construct the following family relationship chain:\n\n- David Hernandez is Jennifer Williams's mom.\n- Danielle Yates is not only David Hernandez's paternal grandmother but also David Hernandez's role model.\n- Victor Dean is not only Danielle Yates's maternal grandmother but also Danielle Yates's role model.\n- Victor Dean considers Mary Gay as their grandfather. \n- Carlos Smith, as Mary Gay's grandfather, has a significant impact on Mary Gay's upbringing.\n- Marcus Miller, as Carlos Smith's paternal grandfather, has a significant impact on Carlos Smith's upbringing.\n\nTherefore, the eldest relative that 'Jennifer Williams' can trace back to in the context is Marcus Miller. The answer is: A"),
|
| 21 |
+
dict(role='HUMAN', prompt='Question: {question}'),
|
| 22 |
+
],
|
| 23 |
+
},
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
# ----------------------- Prompt Settings ----------------------- #
|
| 27 |
+
needle_num_list = list(range(2, 80, 1))
|
| 28 |
+
path = 'opencompass/needlebench'
|
| 29 |
+
file_name = 'names.json'
|
| 30 |
+
repeats = 10
|
| 31 |
+
|
| 32 |
+
# Use Zero-Shot or not
|
| 33 |
+
with_few_shot = True
|
| 34 |
+
|
| 35 |
+
# Max for this dataset is 4, should be set with `with_few_shot`
|
| 36 |
+
few_shot_samples = 4
|
| 37 |
+
|
| 38 |
+
# Generate reasoning path or not, only for single choice
|
| 39 |
+
with_reasoning = True
|
| 40 |
+
|
| 41 |
+
# Use circular evaluation or not
|
| 42 |
+
with_circular_eval = True
|
| 43 |
+
|
| 44 |
+
needlebench_prompts = few_shot_prompts
|
| 45 |
+
single_choice_prompts = needlebench_prompts['single_choice_prompts']
|
| 46 |
+
|
| 47 |
+
# Set few shot prompt number
|
| 48 |
+
for _name in list(single_choice_prompts.keys()):
|
| 49 |
+
if with_few_shot:
|
| 50 |
+
assert few_shot_samples > 0 and few_shot_samples <= 4
|
| 51 |
+
single_choice_prompts[_name] = \
|
| 52 |
+
single_choice_prompts[_name][- few_shot_samples * 2 - 1:]
|
| 53 |
+
|
| 54 |
+
# ----------------------- Dataset Settings ----------------------- #
|
| 55 |
+
|
| 56 |
+
needlebench_datasets = []
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
needlebench_atc_reader_cfg = dict(input_columns=['question'],
|
| 60 |
+
output_column='answer')
|
| 61 |
+
|
| 62 |
+
for _name in list(single_choice_prompts.keys()):
|
| 63 |
+
|
| 64 |
+
needlebench_atc_infer_cfg = dict(
|
| 65 |
+
prompt_template=dict(
|
| 66 |
+
type=PromptTemplate,
|
| 67 |
+
template=dict(
|
| 68 |
+
round=(single_choice_prompts[_name])),
|
| 69 |
+
),
|
| 70 |
+
retriever=dict(type=ZeroRetriever),
|
| 71 |
+
inferencer=dict(type=GenInferencer,),
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
needlebench_atc_eval_cfg = dict(
|
| 75 |
+
evaluator=dict(type=CircularEvaluator),
|
| 76 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
| 77 |
+
|
| 78 |
+
for num_needles in needle_num_list:
|
| 79 |
+
abbr = (f'NeedleBenchATCDataset-'
|
| 80 |
+
f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
|
| 81 |
+
language = 'English' if 'en' in _name else 'Chinese'
|
| 82 |
+
if 'reasoning' in _name:
|
| 83 |
+
abbr += '-Reasoning'
|
| 84 |
+
dataset_dict = {
|
| 85 |
+
'abbr': abbr,
|
| 86 |
+
'type': NeedleBenchATCDataset,
|
| 87 |
+
'path': path,
|
| 88 |
+
'file_name': file_name,
|
| 89 |
+
'num_needles': num_needles,
|
| 90 |
+
'language': language,
|
| 91 |
+
'repeats': repeats,
|
| 92 |
+
'with_circular': with_circular_eval,
|
| 93 |
+
'reader_cfg': needlebench_atc_reader_cfg,
|
| 94 |
+
'infer_cfg': needlebench_atc_infer_cfg,
|
| 95 |
+
'eval_cfg': needlebench_atc_eval_cfg
|
| 96 |
+
}
|
| 97 |
+
needlebench_datasets.append(dataset_dict)
|
build/lib/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_1000k.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .needlebench_multi_reasoning_1000k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
|
| 5 |
+
from .needlebench_multi_reasoning_1000k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
|
| 6 |
+
from .needlebench_multi_reasoning_1000k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
|
| 7 |
+
from .needlebench_multi_reasoning_1000k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
|
| 8 |
+
from .needlebench_multi_reasoning_1000k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
|
| 9 |
+
from .needlebench_multi_reasoning_1000k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
|
| 10 |
+
from .needlebench_multi_reasoning_1000k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
|
| 11 |
+
from .needlebench_multi_reasoning_1000k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
|
| 12 |
+
|
| 13 |
+
from .needlebench_single_1000k import needlebench_en_datasets as needlebench_origin_en_datasets
|
| 14 |
+
from .needlebench_single_1000k import needlebench_zh_datasets as needlebench_origin_zh_datasets
|
| 15 |
+
from .needlebench_multi_retrieval_1000k import needlebench_en_datasets as needlebench_parallel_en_datasets
|
| 16 |
+
from .needlebench_multi_retrieval_1000k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
|
| 17 |
+
|
| 18 |
+
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
build/lib/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning_1000k.py
ADDED
|
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
|
| 5 |
+
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
|
| 6 |
+
from opencompass.datasets.needlebench.origin import needlebench_postprocess
|
| 7 |
+
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
|
| 8 |
+
import math
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def logistic(x, L=100, x0=50, k=0.1):
|
| 12 |
+
return round(L / (1 + math.exp(-k * (x - x0))), 3)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def generate_linear_space(start, end, num):
|
| 16 |
+
if num == 1:
|
| 17 |
+
return [start]
|
| 18 |
+
elif num < 1:
|
| 19 |
+
raise ValueError('num must be at least 1.')
|
| 20 |
+
step = (end - start) / (num - 1)
|
| 21 |
+
return [start + step * i for i in range(num)]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def generate_depth_percents(intervals, interval_type):
|
| 25 |
+
if interval_type == 'linear':
|
| 26 |
+
return generate_linear_space(0, 100, intervals)
|
| 27 |
+
elif interval_type == 'sigmoid':
|
| 28 |
+
linear_space = generate_linear_space(0, 100, intervals)
|
| 29 |
+
return [logistic(x) for x in linear_space]
|
| 30 |
+
else:
|
| 31 |
+
raise ValueError('Unsupported interval type')
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
|
| 35 |
+
|
| 36 |
+
needlebench_infer_cfg = dict(
|
| 37 |
+
prompt_template=dict(
|
| 38 |
+
type=PromptTemplate,
|
| 39 |
+
template=dict(
|
| 40 |
+
round=[
|
| 41 |
+
dict(role='HUMAN', prompt='{prompt}'),
|
| 42 |
+
dict(role='BOT', prompt='{answer}\n'),
|
| 43 |
+
]
|
| 44 |
+
),
|
| 45 |
+
),
|
| 46 |
+
retriever=dict(type=ZeroRetriever),
|
| 47 |
+
inferencer=dict(type=GenInferencer),
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
needlebench_eval_cfg = dict(
|
| 51 |
+
evaluator=dict(type=NeedleBenchMultiEvaluator),
|
| 52 |
+
pred_postprocessor=dict(type=needlebench_postprocess),
|
| 53 |
+
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
|
| 54 |
+
pred_role='BOT',
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
|
| 58 |
+
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
|
| 59 |
+
|
| 60 |
+
# ----------English Version----------
|
| 61 |
+
base_path = 'opencompass/needlebench'
|
| 62 |
+
file_list = ['PaulGrahamEssays.jsonl']
|
| 63 |
+
|
| 64 |
+
needle_file_name = 'multi_needle_reasoning_en.json'
|
| 65 |
+
diff = 10
|
| 66 |
+
num_needles = 2
|
| 67 |
+
needlebench_2needle_en_datasets = []
|
| 68 |
+
language = 'English'
|
| 69 |
+
|
| 70 |
+
for original_context_length in context_lengths:
|
| 71 |
+
for depth_percent in depths_list:
|
| 72 |
+
dataset_dict = {
|
| 73 |
+
'abbr': f'Length{original_context_length}'
|
| 74 |
+
f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
|
| 75 |
+
'type': NeedleBenchMultiDataset,
|
| 76 |
+
'path': base_path,
|
| 77 |
+
'length': original_context_length,
|
| 78 |
+
'depth': int(depth_percent),
|
| 79 |
+
'tokenizer_model': 'gpt-4',
|
| 80 |
+
'file_list': file_list,
|
| 81 |
+
'num_repeats_per_file': 10,
|
| 82 |
+
'length_buffer': 600,
|
| 83 |
+
'guide': True,
|
| 84 |
+
'language': language,
|
| 85 |
+
'needle_file_name': needle_file_name,
|
| 86 |
+
'num_needles': num_needles,
|
| 87 |
+
'diff': diff,
|
| 88 |
+
'reader_cfg': needlebench_reader_cfg,
|
| 89 |
+
'infer_cfg': needlebench_infer_cfg,
|
| 90 |
+
'eval_cfg': needlebench_eval_cfg,
|
| 91 |
+
}
|
| 92 |
+
needlebench_2needle_en_datasets.append(dataset_dict)
|
| 93 |
+
|
| 94 |
+
num_needles = 3
|
| 95 |
+
needlebench_3needle_en_datasets = []
|
| 96 |
+
|
| 97 |
+
for original_context_length in context_lengths:
|
| 98 |
+
for depth_percent in depths_list:
|
| 99 |
+
dataset_dict = {
|
| 100 |
+
'abbr': f'Length{original_context_length}'
|
| 101 |
+
f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
|
| 102 |
+
'type': NeedleBenchMultiDataset,
|
| 103 |
+
'path': base_path,
|
| 104 |
+
'length': original_context_length,
|
| 105 |
+
'depth': int(depth_percent),
|
| 106 |
+
'tokenizer_model': 'gpt-4',
|
| 107 |
+
'file_list': file_list,
|
| 108 |
+
'num_repeats_per_file': 10,
|
| 109 |
+
'length_buffer': 600,
|
| 110 |
+
'guide': True,
|
| 111 |
+
'language': language,
|
| 112 |
+
'needle_file_name': needle_file_name,
|
| 113 |
+
'num_needles': num_needles,
|
| 114 |
+
'diff': diff,
|
| 115 |
+
'reader_cfg': needlebench_reader_cfg,
|
| 116 |
+
'infer_cfg': needlebench_infer_cfg,
|
| 117 |
+
'eval_cfg': needlebench_eval_cfg,
|
| 118 |
+
}
|
| 119 |
+
needlebench_3needle_en_datasets.append(dataset_dict)
|
| 120 |
+
|
| 121 |
+
num_needles = 4
|
| 122 |
+
needlebench_4needle_en_datasets = []
|
| 123 |
+
|
| 124 |
+
for original_context_length in context_lengths:
|
| 125 |
+
for depth_percent in depths_list:
|
| 126 |
+
dataset_dict = {
|
| 127 |
+
'abbr': f'Length{original_context_length}'
|
| 128 |
+
f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
|
| 129 |
+
'type': NeedleBenchMultiDataset,
|
| 130 |
+
'path': base_path,
|
| 131 |
+
'length': original_context_length,
|
| 132 |
+
'depth': int(depth_percent),
|
| 133 |
+
'tokenizer_model': 'gpt-4',
|
| 134 |
+
'file_list': file_list,
|
| 135 |
+
'num_repeats_per_file': 10,
|
| 136 |
+
'length_buffer': 600,
|
| 137 |
+
'guide': True,
|
| 138 |
+
'language': language,
|
| 139 |
+
'needle_file_name': needle_file_name,
|
| 140 |
+
'num_needles': num_needles,
|
| 141 |
+
'diff': diff,
|
| 142 |
+
'reader_cfg': needlebench_reader_cfg,
|
| 143 |
+
'infer_cfg': needlebench_infer_cfg,
|
| 144 |
+
'eval_cfg': needlebench_eval_cfg,
|
| 145 |
+
}
|
| 146 |
+
needlebench_4needle_en_datasets.append(dataset_dict)
|
| 147 |
+
|
| 148 |
+
num_needles = 5
|
| 149 |
+
needlebench_5needle_en_datasets = []
|
| 150 |
+
|
| 151 |
+
for original_context_length in context_lengths:
|
| 152 |
+
for depth_percent in depths_list:
|
| 153 |
+
dataset_dict = {
|
| 154 |
+
'abbr': f'Length{original_context_length}'
|
| 155 |
+
f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k',
|
| 156 |
+
'type': NeedleBenchMultiDataset,
|
| 157 |
+
'path': base_path,
|
| 158 |
+
'length': original_context_length,
|
| 159 |
+
'depth': int(depth_percent),
|
| 160 |
+
'tokenizer_model': 'gpt-4',
|
| 161 |
+
'file_list': file_list,
|
| 162 |
+
'num_repeats_per_file': 10,
|
| 163 |
+
'length_buffer': 600,
|
| 164 |
+
'guide': True,
|
| 165 |
+
'language': language,
|
| 166 |
+
'needle_file_name': needle_file_name,
|
| 167 |
+
'num_needles': num_needles,
|
| 168 |
+
'diff': diff,
|
| 169 |
+
'reader_cfg': needlebench_reader_cfg,
|
| 170 |
+
'infer_cfg': needlebench_infer_cfg,
|
| 171 |
+
'eval_cfg': needlebench_eval_cfg,
|
| 172 |
+
}
|
| 173 |
+
needlebench_5needle_en_datasets.append(dataset_dict)
|
| 174 |
+
|
# ----------Chinese Version----------
# Switch the haystack corpus and needle file to the Chinese variants; the
# (length, depth) grid itself is unchanged.
base_path = 'opencompass/needlebench'
file_list = ['zh_finance.jsonl']

needle_file_name = 'multi_needle_reasoning_zh.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = dict(
            abbr=f'Length{original_context_length}Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
            type=NeedleBenchMultiDataset,
            path=base_path,
            length=original_context_length,
            depth=int(depth_percent),
            tokenizer_model='gpt-4',
            file_list=file_list,
            num_repeats_per_file=10,
            # Chinese prompts use a smaller token-headroom buffer than English.
            length_buffer=200,
            guide=True,
            language=language,
            needle_file_name=needle_file_name,
            num_needles=num_needles,
            diff=diff,
            reader_cfg=needlebench_reader_cfg,
            infer_cfg=needlebench_infer_cfg,
            eval_cfg=needlebench_eval_cfg,
        )
        needlebench_2needle_zh_datasets.append(dataset_dict)
| 208 |
+
|
# Chinese, 3-needle variant over the same (length, depth) grid.
num_needles = 3
needlebench_3needle_zh_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = dict(
            abbr=f'Length{original_context_length}Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
            type=NeedleBenchMultiDataset,
            path=base_path,
            length=original_context_length,
            depth=int(depth_percent),
            tokenizer_model='gpt-4',
            file_list=file_list,
            num_repeats_per_file=10,
            length_buffer=200,
            guide=True,
            language=language,
            needle_file_name=needle_file_name,
            num_needles=num_needles,
            diff=diff,
            reader_cfg=needlebench_reader_cfg,
            infer_cfg=needlebench_infer_cfg,
            eval_cfg=needlebench_eval_cfg,
        )
        needlebench_3needle_zh_datasets.append(dataset_dict)
| 235 |
+
|
# Chinese, 4-needle variant over the same (length, depth) grid.
num_needles = 4
needlebench_4needle_zh_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = dict(
            abbr=f'Length{original_context_length}Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
            type=NeedleBenchMultiDataset,
            path=base_path,
            length=original_context_length,
            depth=int(depth_percent),
            tokenizer_model='gpt-4',
            file_list=file_list,
            num_repeats_per_file=10,
            length_buffer=200,
            guide=True,
            language=language,
            needle_file_name=needle_file_name,
            num_needles=num_needles,
            diff=diff,
            reader_cfg=needlebench_reader_cfg,
            infer_cfg=needlebench_infer_cfg,
            eval_cfg=needlebench_eval_cfg,
        )
        needlebench_4needle_zh_datasets.append(dataset_dict)
| 262 |
+
|
# Chinese, 5-needle variant over the same (length, depth) grid.
num_needles = 5
needlebench_5needle_zh_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = dict(
            abbr=f'Length{original_context_length}Depth{int(depth_percent)}_{num_needles}needle_zh_1000k',
            type=NeedleBenchMultiDataset,
            path=base_path,
            length=original_context_length,
            depth=int(depth_percent),
            tokenizer_model='gpt-4',
            file_list=file_list,
            num_repeats_per_file=10,
            length_buffer=200,
            guide=True,
            language=language,
            needle_file_name=needle_file_name,
            num_needles=num_needles,
            diff=diff,
            reader_cfg=needlebench_reader_cfg,
            infer_cfg=needlebench_infer_cfg,
            eval_cfg=needlebench_eval_cfg,
        )
        needlebench_5needle_zh_datasets.append(dataset_dict)
build/lib/opencompass/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval_1000k.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
|
| 5 |
+
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
|
| 6 |
+
from opencompass.datasets.needlebench.origin import needlebench_postprocess
|
| 7 |
+
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
|
| 8 |
+
import math
|
| 9 |
+
|
| 10 |
+
|
def logistic(x, L=100, x0=50, k=0.1):
    """Logistic (sigmoid) curve scaled to [0, L], rounded to 3 decimals.

    `x0` is the midpoint (where the output equals L/2) and `k` controls
    the steepness of the transition.
    """
    decay = math.exp(-k * (x - x0))
    return round(L / (1 + decay), 3)
| 13 |
+
|
| 14 |
+
|
def generate_linear_space(start, end, num):
    """Return `num` evenly spaced values from `start` to `end`, inclusive.

    Raises ValueError when `num` is less than 1.
    """
    if num < 1:
        raise ValueError('num must be at least 1.')
    if num == 1:
        # Degenerate case: a single point, returned unmodified.
        return [start]
    step = (end - start) / (num - 1)
    return [start + i * step for i in range(num)]
| 22 |
+
|
| 23 |
+
|
def generate_depth_percents(intervals, interval_type):
    """Return `intervals` needle-depth percentages in [0, 100].

    'linear' spaces them evenly; 'sigmoid' warps the evenly spaced values
    through the logistic curve. Any other `interval_type` raises ValueError.
    """
    if interval_type == 'linear':
        return generate_linear_space(0, 100, intervals)
    if interval_type == 'sigmoid':
        return [logistic(v) for v in generate_linear_space(0, 100, intervals)]
    raise ValueError('Unsupported interval type')
| 32 |
+
|
| 33 |
+
|
# Reader: each sample exposes a 'prompt' input column and an 'answer' target.
needlebench_reader_cfg = {'input_columns': ['prompt'], 'output_column': 'answer'}

# Inference: zero-shot generation — send the raw prompt, collect a completion.
needlebench_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'round': [
                {'role': 'HUMAN', 'prompt': '{prompt}'},
                {'role': 'BOT', 'prompt': '{answer}\n'},
            ]
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer},
}

# Evaluation: parallel-retrieval evaluator with pre/post-processing on both
# the prediction and the reference.
needlebench_eval_cfg = {
    'evaluator': {'type': NeedleBenchParallelEvaluator},
    'pred_postprocessor': {'type': needlebench_postprocess},
    'dataset_postprocessor': {'type': needlebench_dataset_postprocess},
    'pred_role': 'BOT',
}
| 56 |
+
|
# ---------- Shared constants for the 1000k parallel-retrieval benchmark ----------
# Context lengths (tokens) probed by the benchmark, from 20k up to 1M.
# (Was `list([...])` — the list() wrapper around a list literal is redundant.)
context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]
# NOTE(review): these two interval settings are kept for compatibility, but the
# datasets below use the hard-coded `depths` list instead — confirm before removing.
document_depth_percent_intervals = 20
document_depth_percent_interval_type = 'linear'

base_path = 'opencompass/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
# Needle insertion depths (percent of context) shared by all parallel datasets.
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
| 66 |
+
|
# One English parallel-retrieval dataset per probed context length; every
# dataset plants needles at all depths in `depths` simultaneously.
for original_context_length in context_lengths:
    dataset_dict = dict(
        abbr=f'Length{original_context_length}_parallel_en_1000k',
        type=NeedleBenchParallelDataset,
        path=base_path,
        needle_file_name=needle_file_name,
        length=original_context_length,
        depths=depths,
        tokenizer_model='gpt-4',
        file_list=file_list,
        num_repeats_per_file=25,
        # Reserve headroom below the context limit for prompt + answer.
        length_buffer=3000,
        guide=True,
        language='English',
        reader_cfg=needlebench_reader_cfg,
        infer_cfg=needlebench_infer_cfg,
        eval_cfg=needlebench_eval_cfg,
    )
    needlebench_en_datasets.append(dataset_dict)
| 86 |
+
|
# Chinese counterpart: swap in the finance corpus and a smaller length buffer;
# the context-length sweep and depths are identical to the English setup.
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []

for original_context_length in context_lengths:
    dataset_dict = dict(
        abbr=f'Length{original_context_length}_parallel_zh_1000k',
        type=NeedleBenchParallelDataset,
        path=base_path,
        needle_file_name=needle_file_name,
        length=original_context_length,
        depths=depths,
        tokenizer_model='gpt-4',
        file_list=file_list,
        num_repeats_per_file=25,
        length_buffer=200,
        guide=True,
        language='Chinese',
        reader_cfg=needlebench_reader_cfg,
        infer_cfg=needlebench_infer_cfg,
        eval_cfg=needlebench_eval_cfg,
    )
    needlebench_zh_datasets.append(dataset_dict)