# Supported datasets:
# - AIME2024 (repeat-8 sampling)
# - MATH-500
# - OlympiadBench
# - OmniMath
# - LiveMathBench-202412-Hard
| from itertools import product | |
from opencompass.models import OpenAISDK, TurboMindModelwithChatTemplate
| from mmengine.config import read_base | |
| from opencompass.utils.text_postprocessors import extract_non_reasoning_content | |
| from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner | |
| from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask | |
| from opencompass.runners import LocalRunner | |
| ####################################################################### | |
| # PART 1 Datasets List # | |
| ####################################################################### | |
| with read_base(): | |
| # You can comment out the datasets you don't want to evaluate | |
| # Datasets | |
| # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run | |
| from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run | |
| # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets | |
| # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run | |
| # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets | |
| # Summarizer | |
| from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups | |
| datasets = sum( | |
| (v for k, v in locals().items() if k.endswith('_datasets')), | |
| [], | |
| ) | |
| # Set LLM Verifier used for each dataset | |
| verifier_cfg = dict( | |
| abbr='qwen2-5-32B-Instruct', | |
| type=OpenAISDK, | |
| path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path | |
| key='sk-1234', # You need to set your own API key | |
| openai_api_base=[ | |
| 'http://172.30.56.1:4000/v1', # You need to set your own API base | |
| ], | |
| meta_template=dict( | |
| round=[ | |
| dict(role='HUMAN', api_role='HUMAN'), | |
| dict(role='BOT', api_role='BOT', generate=True), | |
| ], | |
| ), | |
| query_per_second=16, | |
| batch_size=1024, | |
| temperature=0.001, | |
| tokenizer_path='gpt-4o-2024-05-13', | |
| verbose=True, | |
| max_out_len=16384, | |
    max_seq_len=49152,
| ) | |
| for item in datasets: | |
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment to raise the generation limit and avoid length cutoff
| if 'judge_cfg' in item['eval_cfg']['evaluator']: | |
| item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg | |
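# Note: the loop above only injects `verifier_cfg` into datasets whose
# evaluator already declares a `judge_cfg` slot, i.e. the *_llmverify_*
# configs imported in PART 1; rule-based evaluators are left untouched.
# Any OpenAI-compatible endpoint can back the verifier; the path, key, and
# API base above are placeholders you must replace with your own judge setup.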
| ####################################################################### | |
| # PART 2 Model List # | |
| ####################################################################### | |
| models = sum([v for k, v in locals().items() if k.endswith('_model')], []) | |
| models += [ | |
| # You can comment out the models you don't want to evaluate | |
| # All models use sampling mode | |
| dict( | |
| type=TurboMindModelwithChatTemplate, | |
| abbr='deepseek-r1-distill-qwen-7b-turbomind', | |
| path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', | |
| engine_config=dict(session_len=32768, max_batch_size=128, tp=1), | |
| gen_config=dict( | |
| do_sample=True, | |
| temperature=0.6, | |
| top_p=0.95, | |
| max_new_tokens=32768), | |
| max_seq_len=32768, | |
| max_out_len=32768, | |
| batch_size=64, | |
| run_cfg=dict(num_gpus=1), | |
| pred_postprocessor=dict(type=extract_non_reasoning_content) | |
| ), | |
| # dict( | |
| # type=TurboMindModelwithChatTemplate, | |
| # abbr='deepseek-r1-distill-qwen-14b-turbomind', | |
| # path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', | |
| # engine_config=dict(session_len=32768, max_batch_size=128, tp=2), | |
| # gen_config=dict( | |
| # do_sample=True, | |
| # temperature=0.6, | |
| # top_p=0.95, | |
| # max_new_tokens=32768), | |
| # max_seq_len=32768, | |
| # max_out_len=32768, | |
| # batch_size=128, | |
| # run_cfg=dict(num_gpus=2), | |
| # pred_postprocessor=dict(type=extract_non_reasoning_content) | |
| # ), | |
| # dict( | |
| # type=TurboMindModelwithChatTemplate, | |
| # abbr='deepseek-r1-distill-qwen-32b-turbomind', | |
| # path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', | |
| # engine_config=dict(session_len=32768, max_batch_size=128, tp=4), | |
| # gen_config=dict( | |
| # do_sample=True, | |
| # temperature=0.6, | |
| # top_p=0.95, | |
| # max_new_tokens=16384), | |
| # max_seq_len=32768, | |
| # max_out_len=16384, | |
| # batch_size=128, | |
| # run_cfg=dict(num_gpus=4), | |
| # pred_postprocessor=dict(type=extract_non_reasoning_content) | |
| # ), | |
| ] | |
| ####################################################################### | |
| # PART 3 Inference/Evaluation # | |
| ####################################################################### | |
| # Inference configuration | |
| infer = dict( | |
| partitioner=dict( | |
| type=NumWorkerPartitioner, | |
        num_worker=1,
        # Analogous to data parallelism: each worker runs inference on a slice
        # of the dataset, so total GPUs = num_worker * num_gpus_per_worker.
        # For example, with 8 GPUs and a 7B model that uses 1 GPU per instance,
        # set num_worker=8 to fully utilize the GPUs; with 8 GPUs and a 14B
        # model that uses 2 GPUs per instance, set num_worker=4.
| ), | |
| runner=dict( | |
| type=LocalRunner, | |
| task=dict(type=OpenICLInferTask) | |
| ), | |
| ) | |
| # Evaluation configuration | |
| eval = dict( | |
    partitioner=dict(
        type=NaivePartitioner,
        n=8,
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
    ),
| ) | |
| ####################################################################### | |
| # PART 4 Summarizer # | |
| ####################################################################### | |
| summary_groups = sum( | |
| [v for k, v in locals().items() if k.endswith('_summary_groups')], [] | |
| ) | |
summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ],
    },
])
| # Summarizer | |
| summarizer = dict( | |
| dataset_abbrs=[ | |
| 'MATH', | |
| # ['LiveMathBench-k1-n1', 'pass@1'], | |
| # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'], | |
| # ['aime2024', 'accuracy'], | |
| ['math_prm800k_500-llmjudge', 'accuracy'], | |
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
| ['OlympiadBenchMath', 'accuracy'], | |
| ['OmniMath', 'accuracy'], | |
| ], | |
| summary_groups=summary_groups, | |
| ) | |
| ####################################################################### | |
| # PART 5 Utils # | |
| ####################################################################### | |
| work_dir = 'outputs/deepseek_r1_reasoning' | |
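# To launch this config (assuming a standard OpenCompass install; both entry
# points are the project's usual ones):
#   opencompass path/to/this_config.py
# or, from a source checkout:
#   python run.py path/to/this_config.py
# Predictions, evaluation results, and summaries land under `work_dir`.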