File size: 8,751 Bytes
da806fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
import os.path as osp
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
# Import the dataset lists and summarizer groups used by this config inside
# mmengine's `read_base()` context.
#
# The names imported here matter: later statements in this file scan
# `locals()` and collect every variable whose name ends in `_datasets`,
# `_summary_groups` or `_model`, so adding or removing an import below changes
# what is evaluated and summarized.
#
# NOTE(review): the imports are sorted alphabetically by module path, so the
# "## <category>" headings below no longer sit above the datasets they were
# written for (e.g. gsm8k and hellaswag follow "## Scientific"). Treat the
# headings as a rough list of covered categories, not as accurate labels.
with read_base():
    # Datasets Part
    ## Core Set
    # ## Examination
    # ## Reasoning
    from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
    from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
        cmmlu_datasets
    from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
        drop_datasets
    # ## Scientific
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
        gpqa_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
        gsm8k_datasets
    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
        hellaswag_datasets
    # ## Coding
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    # TODO: Add LiveCodeBench
    # ## Instruction Following
    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
        ifeval_datasets
    # ## Math
    from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
        math_datasets
    from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
        mathbench_datasets
    from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
        sanitized_mbpp_datasets
    from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
        mmlu_datasets
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
        mmlu_pro_datasets
    # Summarizer groups (bbh, cmmlu, mmlu, mmlu_pro). The "# Summarizer"
    # heading below landed between them when the imports were sorted.
    # NOTE(review): no MathBench `*_summary_groups` import is visible here,
    # although the summarizer lists 'mathbench-a (average)' and
    # 'mathbench-t (average)' -- confirm whether one is needed.
    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
    from opencompass.configs.summarizers.groups.cmmlu import \
        cmmlu_summary_groups
    # Summarizer
    from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups
    # Model List
    # Uncomment one or more lines to choose the models to evaluate; each alias
    # ends in `_model` so that the `models = sum(...)` scan picks it up. With
    # every line commented out, that scan finds nothing.
    # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
    # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
    # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
    # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
    # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# Flatten every `*_datasets` list currently in scope into the single list of
# datasets to evaluate (order follows the namespace's insertion order).
datasets = sum(
    [
        dataset_group
        for var_name, dataset_group in locals().items()
        if var_name.endswith('_datasets')
    ],
    [],
)
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
# Summary group 'core_average': the [dataset_or_group, metric] pairs listed
# under `subsets` are the members of the group. Because the variable name ends
# in `_summary_groups`, the `locals()` scan in the summarizer picks it up.
core_summary_groups = [
    dict(
        name='core_average',
        subsets=[
            ['mmlu', 'accuracy'],
            ['mmlu_pro', 'accuracy'],
            ['cmmlu', 'accuracy'],
            ['bbh', 'score'],
            ['math', 'accuracy'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['GPQA_diamond', 'accuracy'],
            ['IFEval', 'Prompt-level-strict-accuracy'],
            ['drop', 'accuracy'],
            ['sanitized_mbpp', 'score'],
            ['gsm8k', 'accuracy'],
            ['hellaswag', 'accuracy'],
            ['mathbench-t (average)', 'naive_average'],
        ],
    ),
]
# Summarizer configuration.
#
# `dataset_abbrs` lists the rows of the summary in display order. Entries are
# either [dataset_or_group, metric] pairs or bare names; '' entries separate
# the sections (presumably rendered as spacer rows by the OpenCompass
# summarizer -- confirm).
#
# `summary_groups` is the concatenation of every `*_summary_groups` list in
# scope when this statement runs.
summarizer = dict(
    dataset_abbrs=[
        # --- Headline scores -------------------------------------------
        ['core_average', 'naive_average'],
        ['mmlu', 'accuracy'],
        ['mmlu_pro', 'accuracy'],
        ['cmmlu', 'accuracy'],
        ['bbh', 'score'],
        ['math', 'accuracy'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['GPQA_diamond', 'accuracy'],
        ['IFEval', 'Prompt-level-strict-accuracy'],
        ['drop', 'accuracy'],
        ['sanitized_mbpp', 'score'],
        ['gsm8k', 'accuracy'],
        ['hellaswag', 'accuracy'],
        'mathbench-a (average)',
        # The trailing comma matters: without it this string is implicitly
        # concatenated with the '' below and the section separator is lost.
        'mathbench-t (average)',
        '',
        # --- MMLU accuracy breakdown -----------------------------------
        ['mmlu', 'accuracy'],
        ['mmlu-stem', 'accuracy'],
        ['mmlu-social-science', 'accuracy'],
        ['mmlu-humanities', 'accuracy'],
        ['mmlu-other', 'accuracy'],
        '',
        # --- MMLU-Pro accuracy breakdown -------------------------------
        ['mmlu_pro', 'accuracy'],
        ['mmlu_pro_math', 'accuracy'],
        ['mmlu_pro_physics', 'accuracy'],
        ['mmlu_pro_chemistry', 'accuracy'],
        ['mmlu_pro_law', 'accuracy'],
        ['mmlu_pro_engineering', 'accuracy'],
        ['mmlu_pro_other', 'accuracy'],
        ['mmlu_pro_economics', 'accuracy'],
        ['mmlu_pro_health', 'accuracy'],
        ['mmlu_pro_psychology', 'accuracy'],
        ['mmlu_pro_business', 'accuracy'],
        ['mmlu_pro_biology', 'accuracy'],
        ['mmlu_pro_philosophy', 'accuracy'],
        ['mmlu_pro_computer_science', 'accuracy'],
        ['mmlu_pro_history', 'accuracy'],
        '',
        # --- CMMLU accuracy breakdown ----------------------------------
        ['cmmlu', 'accuracy'],
        ['cmmlu-stem', 'accuracy'],
        ['cmmlu-social-science', 'accuracy'],
        ['cmmlu-humanities', 'accuracy'],
        ['cmmlu-other', 'accuracy'],
        ['cmmlu-china-specific', 'accuracy'],
        '',
        # --- extract_rate rows -----------------------------------------
        # The humaneval and IFEval extract_rate rows are deliberately
        # disabled in the original config.
        ['bbh', 'extract_rate'],
        ['math', 'extract_rate'],
        # ['openai_humaneval', 'extract_rate'],
        ['GPQA_diamond', 'extract_rate'],
        # ['IFEval', 'extract_rate'],
        '',
        ['mmlu', 'extract_rate'],
        ['mmlu-stem', 'extract_rate'],
        ['mmlu-social-science', 'extract_rate'],
        ['mmlu-humanities', 'extract_rate'],
        ['mmlu-other', 'extract_rate'],
        '',
        ['mmlu_pro', 'extract_rate'],
        ['mmlu_pro_math', 'extract_rate'],
        ['mmlu_pro_physics', 'extract_rate'],
        ['mmlu_pro_chemistry', 'extract_rate'],
        ['mmlu_pro_law', 'extract_rate'],
        ['mmlu_pro_engineering', 'extract_rate'],
        ['mmlu_pro_other', 'extract_rate'],
        ['mmlu_pro_economics', 'extract_rate'],
        ['mmlu_pro_health', 'extract_rate'],
        ['mmlu_pro_psychology', 'extract_rate'],
        ['mmlu_pro_business', 'extract_rate'],
        ['mmlu_pro_biology', 'extract_rate'],
        ['mmlu_pro_philosophy', 'extract_rate'],
        ['mmlu_pro_computer_science', 'extract_rate'],
        ['mmlu_pro_history', 'extract_rate'],
        '',
        ['cmmlu', 'extract_rate'],
        ['cmmlu-stem', 'extract_rate'],
        ['cmmlu-social-science', 'extract_rate'],
        ['cmmlu-humanities', 'extract_rate'],
        ['cmmlu-other', 'extract_rate'],
        ['cmmlu-china-specific', 'extract_rate'],
    ],
    # Collect every `*_summary_groups` list defined or imported so far.
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
# Flatten every `*_model` list currently in scope into the list of models to
# evaluate. If no such variable exists (e.g. no model import is enabled),
# the result is an empty list.
models = sum(
    (
        model_group
        for var_name, model_group in locals().items()
        if var_name.endswith('_model')
    ),
    [],
)
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Inference stage: NumWorkerPartitioner (num_worker=8) feeding a LocalRunner
# that runs OpenICLInferTask with at most 16 workers.
infer = {
    'partitioner': {
        'type': NumWorkerPartitioner,
        'num_worker': 8,
    },
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'retry': 0,  # Modify if needed
        'task': {'type': OpenICLInferTask},
    },
}
# Evaluation stage: NaivePartitioner (n=10) feeding a LocalRunner that runs
# OpenICLEvalTask with at most 16 workers.
# NOTE(review): the name `eval` shadows the Python builtin; it is presumably
# the key OpenCompass reads from this config, so it must not be renamed
# without confirming.
eval = {
    'partitioner': {
        'type': NaivePartitioner,
        'n': 10,
    },
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'task': {'type': OpenICLEvalTask},
    },
}
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
# Root directory for this benchmark's outputs.
base_exp_dir = 'outputs/corebench_2409_objective/'

# Working directory for this config: the 'chat_objective' sub-directory of
# `base_exp_dir`.
work_dir = osp.join(
    base_exp_dir,
    'chat_objective',
)
|