# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard
import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
TurboMindModelwithChatTemplate,
)
#######################################################################
# PART 1 Datasets List #
#######################################################################
# Load dataset and summarizer configs through OpenCompass's lazy-import
# mechanism: inside `read_base()` these imports bind `*_datasets` /
# `*_summary_groups` lists into this module's namespace.
with read_base():
    # You can comment out the datasets you don't want to evaluate
    # Datasets
    # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets
    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups
# Flatten every imported `*_datasets` list into one list of dataset configs.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)
# Judge (LLM verifier) shared by every dataset whose evaluator supports
# LLM-based answer verification. Served over an OpenAI-compatible API.
verifier_cfg = {
    'abbr': 'qwen2-5-32B-Instruct',
    'type': OpenAISDK,
    'path': 'Qwen/Qwen2.5-32B-Instruct',  # You need to set your own judge model path
    'key': 'sk-1234',  # You need to set your own API key
    'openai_api_base': [
        'http://172.30.56.1:4000/v1',  # You need to set your own API base
    ],
    'meta_template': {
        'round': [
            {'role': 'HUMAN', 'api_role': 'HUMAN'},
            {'role': 'BOT', 'api_role': 'BOT', 'generate': True},
        ],
    },
    'query_per_second': 16,
    'batch_size': 1024,
    # Near-zero temperature keeps judge verdicts effectively deterministic.
    'temperature': 0.001,
    'tokenizer_path': 'gpt-4o-2024-05-13',
    'verbose': True,
    'max_out_len': 16384,
    # 'max_seq_len': 32768,
    'max_seq_len': 49152,
}
# Wire the shared judge into each dataset's evaluator — but only where the
# evaluator already declares a `judge_cfg` slot (i.e. uses LLM verification).
for _item in datasets:
    # _item['infer_cfg']['inferencer']['max_out_len'] = 32768  # You can unset this line if you want to avoid length cutoff
    _evaluator = _item['eval_cfg']['evaluator']
    if 'judge_cfg' in _evaluator:
        _evaluator['judge_cfg'] = verifier_cfg
#######################################################################
#                       PART 2  Models List                           #
#######################################################################
# Start from any `*_model` configs already in scope, then add ours.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# You can comment out the models you don't want to evaluate.
# All models use sampling mode.
models.append(
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        # Sampling settings recommended for R1-distill reasoning models.
        gen_config=dict(
            do_sample=True, temperature=0.6, top_p=0.95, max_new_tokens=32768),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
        # Strip the <think> reasoning trace before answer extraction.
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    )
)
# Larger distill variants, kept disabled for reference:
# dict(
#     type=TurboMindModelwithChatTemplate,
#     abbr='deepseek-r1-distill-qwen-14b-turbomind',
#     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
#     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
#     gen_config=dict(
#         do_sample=True, temperature=0.6, top_p=0.95, max_new_tokens=32768),
#     max_seq_len=32768,
#     max_out_len=32768,
#     batch_size=128,
#     run_cfg=dict(num_gpus=2),
#     pred_postprocessor=dict(type=extract_non_reasoning_content),
# ),
# dict(
#     type=TurboMindModelwithChatTemplate,
#     abbr='deepseek-r1-distill-qwen-32b-turbomind',
#     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
#     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
#     gen_config=dict(
#         do_sample=True, temperature=0.6, top_p=0.95, max_new_tokens=16384),
#     max_seq_len=32768,
#     max_out_len=16384,
#     batch_size=128,
#     run_cfg=dict(num_gpus=4),
#     pred_postprocessor=dict(type=extract_non_reasoning_content),
# ),
#######################################################################
#                    PART 3  Inference/Evaluation                     #
#######################################################################
# Inference configuration.
# NumWorkerPartitioner behaves like data parallelism:
# Total GPUs = num_worker * num_gpus_per_worker.
# e.g. 8 GPUs, 7B model on 1 GPU each  -> num_worker=8 to max-utilize GPUs;
#      8 GPUs, 14B model on 2 GPUs each -> num_worker=4.
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
# Evaluation configuration: naive partitioning into n chunks, run locally.
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)),
)
#######################################################################
#                       PART 4  Summarizer                            #
#######################################################################
# Gather every imported `*_summary_groups` list (e.g. OlympiadBench's).
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
# Average the repeated runs: AIME2024 is sampled 8 times, and
# LiveMathBench-Hard 8 times per split (hard_cn / hard_en).
# NOTE: fixed typo "Aveage" -> "Average" in the group names; the names are
# defined and referenced only within this summarizer section.
summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ],
    },
])
# Summarizer: which metrics appear (and in what order) in the final table.
summarizer = dict(
    dataset_abbrs=[
        'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        # ['aime2024', 'accuracy'],
        ['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
        ['OlympiadBenchMath', 'accuracy'],
        ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)
#######################################################################
# PART 5 Utils #
#######################################################################
# Directory where OpenCompass writes predictions, eval results and logs.
work_dir = 'outputs/deepseek_r1_reasoning'