# File: opencompass/.github/scripts/eval_regression_chat_sub_fullbench.py
# Uploaded by msj19: "Add files using upload-large-folder tool" (commit 9b40ad5, verified)
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate)
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
# Everything imported inside ``read_base()`` is pulled in as base configuration.
# The statements are kept exactly as written; only comments were touched.
with read_base():
    # No model configs are imported here; the models are defined inline
    # further down in this file.
    # Dataset configs: every name ending in ``_datasets`` is later collected
    # into the ``datasets`` list.
    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
        csimpleqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
        simpleqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
        alignbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
        alpacav2_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
        arenahard_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
        compassarena_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
        followbench_llmeval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
        mtbench101_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
        wildbench_datasets  # noqa: F401, E501

    # Shared job settings from the ``rjob`` config three package levels up.
    # ``sub_eval`` is customised below; ``infer`` is not referenced again in
    # this file (presumably it is picked up as a config key -- confirm).
    from ...rjob import infer, sub_eval  # noqa: F401, E501
# Concatenate every ``*_datasets`` list currently in the module namespace,
# skipping the mtbench101 and wildbench ones, which are appended last.
# A generator expression is used on purpose: ``locals()`` is evaluated once in
# module scope and the loop variables do not leak into the config namespace.
datasets = sum(
    (
        dataset_group
        for var_name, dataset_group in locals().items()
        if var_name.endswith('_datasets')
        and not any(tag in var_name for tag in ('mtbench101', 'wildbench'))
    ),
    [],
)
datasets.extend(mtbench101_datasets)
datasets.extend(wildbench_datasets)
# Role mapping for API-style chat models: one HUMAN turn followed by a BOT
# turn flagged with ``generate=True``, plus a reserved SYSTEM role.
# NOTE(review): no model defined in this file passes this template -- confirm
# whether it is still needed.
api_meta_template = {
    'round': [
        {'role': 'HUMAN', 'api_role': 'HUMAN'},
        {'role': 'BOT', 'api_role': 'BOT', 'generate': True},
    ],
    'reserved_roles': [
        {'role': 'SYSTEM', 'api_role': 'SYSTEM'},
    ],
}
# Qwen3-8B evaluated through the HuggingFace chat-template wrapper.
hf_model = {
    'type': HuggingFacewithChatTemplate,
    'abbr': 'qwen-3-8b-hf-fullbench',
    'path': 'Qwen/Qwen3-8B',
    'max_out_len': 8192,
    'batch_size': 8,
    'run_cfg': {'num_gpus': 1},
    # Predictions pass through extract_non_reasoning_content before scoring
    # (presumably to drop the model's reasoning section -- confirm).
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}
# The same Qwen3-8B checkpoint evaluated through the TurboMind wrapper, with
# greedy decoding (do_sample=False) and enable_thinking switched on.
tm_model = {
    'type': TurboMindModelwithChatTemplate,
    'abbr': 'qwen-3-8b-fullbench',
    'path': 'Qwen/Qwen3-8B',
    'engine_config': {'session_len': 32768, 'max_batch_size': 1, 'tp': 1},
    'gen_config': {'do_sample': False, 'enable_thinking': True},
    'max_seq_len': 32768,
    'max_out_len': 32768,
    'batch_size': 1,
    'run_cfg': {'num_gpus': 1},
    'pred_postprocessor': {'type': extract_non_reasoning_content},
}

# Models under evaluation: the HuggingFace variant first, then TurboMind.
models = [hf_model, tm_model]
# Single judge: Qwen3-8B on TurboMind with enable_thinking switched off and a
# 46000-token session/sequence/output budget.
judge_models = [
    {
        'type': TurboMindModelwithChatTemplate,
        'abbr': 'qwen-3-8b-judger',
        'path': 'Qwen/Qwen3-8B',
        'engine_config': {'session_len': 46000, 'max_batch_size': 1, 'tp': 1},
        'gen_config': {'do_sample': False, 'enable_thinking': False},
        'max_seq_len': 46000,
        'max_out_len': 46000,
        'batch_size': 1,
        'run_cfg': {'num_gpus': 1},
        'pred_postprocessor': {'type': extract_non_reasoning_content},
    },
]

# Attach the judge list to the partitioner of the imported ``sub_eval`` config,
# then expose that same object under the name ``eval``.
sub_eval['partitioner']['judge_models'] = judge_models
# The name deliberately shadows the ``eval`` builtin; presumably ``eval`` is
# the config key the runner reads -- confirm before renaming.
eval = sub_eval
# Summary groups handed to the summarizer. Each entry names a group and lists
# its members: either [dataset_abbr, metric] pairs or names of other groups.
summary_groups = [
    # CompassArena sub-areas: each group wraps one [dataset, sub-category] pair.
    dict(
        name='compassarena_language',
        subsets=[['compassarena_language', '内容总结']],
    ),
    dict(
        name='compassarena_knowledge',
        subsets=[['compassarena_knowledge', '生活常识_ZH']],
    ),
    dict(
        name='compassarena_reason_v2',
        subsets=[['compassarena_reason_v2', 'reasoning']],
    ),
    dict(
        name='compassarena_math_v2',
        subsets=[['compassarena_math_v2', '高等数学_ZH']],
    ),
    dict(
        name='compassarena_creationv2_zh',
        subsets=[['compassarena_creationv2_zh', '内容扩写_ZH']],
    ),
    # Top-level CompassArena group built from the five groups above.
    dict(
        name='CompassArena',
        subsets=[
            'compassarena_language',
            'compassarena_knowledge',
            'compassarena_reason_v2',
            'compassarena_math_v2',
            'compassarena_creationv2_zh',
        ],
    ),
    # NOTE(review): no fofo dataset is imported in this file -- confirm this
    # group is still expected to produce a value.
    dict(
        name='FoFo',
        subsets=[
            ['fofo_test_prompts', 'overall'],
            ['fofo_test_prompts_cn', 'overall'],
        ],
    ),
    dict(
        name='Followbench',
        subsets=[
            ['followbench_llmeval_en', 'HSR_AVG'],
            ['followbench_llmeval_en', 'SSR_AVG'],
        ],
    ),
]
# Summarizer: ``dataset_abbrs`` fixes which [dataset_or_group, metric] rows are
# reported and in what order; the '' entry separates the headline rows from the
# detailed breakdown that follows.
summarizer = {
    'dataset_abbrs': [
        # Headline rows.
        ['alignment_bench_v1_1', '总分'],
        ['alpaca_eval', 'total'],
        ['arenahard', 'score'],
        ['Followbench', 'naive_average'],
        ['CompassArena', 'naive_average'],
        ['FoFo', 'naive_average'],
        ['mtbench101', 'avg'],
        ['wildbench', 'average'],
        ['simpleqa', 'accuracy_given_attempted'],
        ['chinese_simpleqa', 'given_attempted_accuracy'],
        '',
        # Detailed rows, generated in the same order as the original listing.
        *[['alignment_bench_v1_1', category] for category in (
            '专业能力',
            '数学计算',
            '基本任务',
            '逻辑推理',
            '中文理解',
            '文本写作',
            '角色扮演',
            '综合问答',
        )],
        *[['alpaca_eval', subset] for subset in (
            'helpful_base',
            'koala',
            'oasst',
            'selfinstruct',
            'vicuna',
        )],
        *[[group, 'naive_average'] for group in (
            'compassarena_language',
            'compassarena_knowledge',
            'compassarena_reason_v2',
            'compassarena_math_v2',
            'compassarena_creationv2_zh',
        )],
        ['fofo_test_prompts', 'overall'],
        ['fofo_test_prompts_cn', 'overall'],
        ['followbench_llmeval_en', 'HSR_AVG'],
        ['followbench_llmeval_en', 'SSR_AVG'],
        # HSR_L1..HSR_L5 followed by SSR_L1..SSR_L5.
        *[['followbench_llmeval_en', f'HSR_L{level}'] for level in range(1, 6)],
        *[['followbench_llmeval_en', f'SSR_L{level}'] for level in range(1, 6)],
        ['simpleqa', 'f1'],
    ],
    'type': DefaultSubjectiveSummarizer,
    'summary_groups': summary_groups,
}