File size: 5,999 Bytes
8082566 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# flake8: noqa
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
# Everything imported inside this `with read_base():` block is picked up
# further down by suffix: names ending in `_datasets`, `_summary_groups`
# and `_model` are concatenated via `locals()`. Keep those suffixes when
# adding or aliasing imports here.
with read_base():
    # Datasets
    from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import (
        aime2025_datasets,
    )
    from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_gen_772ea0 import (
        gpqa_datasets,
    )
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import (
        mmlu_pro_datasets,
    )
    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
        ifeval_datasets,
    )
    # Aliased so the name carries the `_datasets` suffix the collector expects.
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import (
        smolinstruct_datasets_0shot_instruct as smolinstruct_datasets,
    )
    from opencompass.configs.datasets.ChemBench.ChemBench_llmjudge_gen_c584cf import (
        chembench_datasets,
    )
    from opencompass.configs.datasets.matbench.matbench_llm_judge_gen_0e9276 import (
        matbench_datasets,
    )
    from opencompass.configs.datasets.ProteinLMBench.ProteinLMBench_llmjudge_gen_a67965 import (
        proteinlmbench_datasets,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.mmlu_pro import (
        mmlu_pro_summary_groups,
    )

    # Models
    # Aliased so the name carries the `_model` suffix the collector expects.
    from opencompass.configs.models.interns1.intern_s1 import (
        models as interns1_model,
    )
#######################################################################
# PART 1 Datasets List #
#######################################################################
# Datasets list for evaluation: concatenate every list imported above whose
# name ends in `_datasets`.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# LLM judge config: using LLM to evaluate predictions.
# NOTE(review): left empty in this file -- presumably it has to be filled in
# with the judge model's settings before LLM-judged datasets can be scored;
# confirm against the OpenCompass docs.
judge_cfg = dict()

for item in datasets:
    # Same generation budget for every dataset.
    item['infer_cfg']['inferencer']['max_out_len'] = 65536
    # Evaluators that carry `judge_cfg` directly.
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
    # Evaluators that nest it one level down, under `llm_evaluator`.
    if ('llm_evaluator' in item['eval_cfg']['evaluator']
            and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']):
        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# Flatten every imported `*_summary_groups` list into one fresh list; the
# imported lists themselves are not modified.
summary_groups = sum(
    (
        groups
        for name, groups in locals().items()
        if name.endswith('_summary_groups')
    ),
    [],
)

# Extra aggregate group covering the eight ChemBench subsets; the summarizer
# reports it as ['ChemBench', 'naive_average'].
summary_groups.append(
    {
        'name': 'ChemBench',
        'subsets': [
            'ChemBench_Name_Conversion',
            'ChemBench_Property_Prediction',
            'ChemBench_Mol2caption',
            'ChemBench_Caption2mol',
            'ChemBench_Product_Prediction',
            'ChemBench_Retrosynthesis',
            'ChemBench_Yield_Prediction',
            'ChemBench_Temperature_Prediction',
        ],
    }
)
# Summary layout. `dataset_abbrs` mixes bare strings with two-item
# [dataset_abbr, metric] lists; the bare strings (including '') appear to act
# as section headings / blank separator rows -- confirm against the
# OpenCompass summarizer docs.
summarizer = {
    'dataset_abbrs': [
        'Knowledge',
        ['mmlu_pro', 'accuracy'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['GPQA_diamond', 'accuracy'],
        '',
        'Math Calculation',
        ['aime2025', 'accuracy'],
        '',
        'Academic',
        ['ChemBench', 'naive_average'],
        ['ProteinLMBench', 'accuracy'],
        '',
        'SmolInstruct',
        ['NC-I2F-0shot-instruct', 'score'],
        ['NC-I2S-0shot-instruct', 'score'],
        ['NC-S2F-0shot-instruct', 'score'],
        ['NC-S2I-0shot-instruct', 'score'],
        ['PP-ESOL-0shot-instruct', 'score'],
        ['PP-Lipo-0shot-instruct', 'score'],
        ['PP-BBBP-0shot-instruct', 'accuracy'],
        ['PP-ClinTox-0shot-instruct', 'accuracy'],
        ['PP-HIV-0shot-instruct', 'accuracy'],
        ['PP-SIDER-0shot-instruct', 'accuracy'],
        ['MC-0shot-instruct', 'score'],
        ['MG-0shot-instruct', 'score'],
        ['FS-0shot-instruct', 'score'],
        ['RS-0shot-instruct', 'score'],
        '',
        # matbench rows follow without a heading of their own.
        ['matbench_expt_gap', 'mae'],
        ['matbench_steels', 'mae'],
        ['matbench_expt_is_metal', 'accuracy'],
        ['matbench_glass', 'accuracy'],
        '',
    ],
    'summary_groups': summary_groups,
}
#######################################################################
# PART 3 Models List #
#######################################################################
# Concatenate every model list imported above under a name ending in
# `_model` (in this file: `interns1_model`) into the single `models` list.
models = sum(
    (cfgs for name, cfgs in locals().items() if name.endswith('_model')),
    [],
)
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Inference stage: NumWorkerPartitioner (num_worker=8) feeding a LocalRunner
# that is capped at 16 workers and runs OpenICLInferTask.
infer = {
    'partitioner': {'type': NumWorkerPartitioner, 'num_worker': 8},
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'retry': 0,  # Modify if needed
        'task': {'type': OpenICLInferTask},
    },
}
# Evaluation stage: NaivePartitioner (n=10) feeding a LocalRunner that is
# capped at 16 workers and runs OpenICLEvalTask.
# NOTE(review): the name shadows the builtin `eval`; presumably the config
# schema requires this exact key, so it is left unchanged -- confirm.
eval = {
    'partitioner': {'type': NaivePartitioner, 'n': 10},
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'task': {'type': OpenICLEvalTask},
    },
}
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
# Relative path; presumably the directory OpenCompass writes this run's
# outputs to -- confirm against the OpenCompass docs.
work_dir = './outputs/oc_bench_intern_s1'
|