opencompass / .github /scripts /oc_score_baseline_fullbench.yaml
msj19's picture
Add files using upload-large-folder tool
9b40ad5 verified
qwen-3-8b-hf-fullbench:
objective_other:
C-MHChem_accuracy: 81.25
C-MHChem_f1: 0.81
C-MHChem_accuracy_given_attempted: 81.25
C-MHChem_attempted_ratio: 100
C-MHChem_correct_count: 13
C-MHChem_incorrect_count: 3
C-MHChem_not_attempted_count: 0
CPsyExam_accuracy: 87.5
CPsyExam_f1: 0.88
CPsyExam_accuracy_given_attempted: 87.5
CPsyExam_attempted_ratio: 100
CPsyExam_correct_count: 14
CPsyExam_incorrect_count: 2
CPsyExam_not_attempted_count: 0
MaScQA_accuracy: 75
MaScQA_f1: 0.75
MaScQA_accuracy_given_attempted: 75
MaScQA_attempted_ratio: 100
MaScQA_correct_count: 12
MaScQA_incorrect_count: 4
MaScQA_not_attempted_count: 0
UGPhysics_AtomicPhysics_zh_accuracy: 56.25
objective_v5:
race-high_accuracy: 87.5
ARC-c_accuracy: 100
BoolQ_accuracy: 93.75
triviaqa_wiki_1shot_score: 0
nq_open_1shot_score: 0
IFEval_Prompt-level-strict-accuracy: 87.50
drop_accuracy: 93.75
GPQA_diamond_accuracy: 62.50
hellaswag_accuracy: 75
TheoremQA_score: 6.25
musr_average_naive_average: 14.58
korbench_single_naive_average: 75.00
gsm8k_accuracy: 75
math_accuracy: 93.75
cmo_fib_accuracy: 0
aime2024_accuracy: 6.25
wikibench-wiki-single_choice_cncircular_perf_4: 25
sanitized_mbpp_score: 6.25
lcb_code_generation_pass@1: 6.25
lcb_code_execution_pass@1: 50
lcb_test_output_pass@1: 0
teval_naive_average: 69.16
bbh-logical_deduction_seven_objects_score: 12.5
bbh-multistep_arithmetic_two_score: 0
mmlu-other_accuracy: 77.88
cmmlu-china-specific_accuracy: 74.17
mmlu_pro_math_accuracy: 81.25
openai_mmmlu_lite_AR-XY_accuracy: 50
college_naive_average: 37.5
college_knowledge_naive_average: 50
objective_v6:
aime2024_accuracy: 87.5
aime2024_f1: 0.88
aime2024_accuracy_given_attempted: 87.5
aime2024_attempted_ratio: 100
aime2024_correct_count: 14
aime2024_incorrect_count: 2
aime2024_not_attempted_count: 0
aime2025_accuracy: 56.25
aime2025_f1: 0.56
aime2025_accuracy_given_attempted: 56.25
aime2025_attempted_ratio: 100
aime2025_correct_count: 9
aime2025_incorrect_count: 7
aime2025_not_attempted_count: 0
bbh-temporal_sequences_score: 75
bbh-temporal_sequences_score_given_attempted: 75
bbh-temporal_sequences_attempted_ratio: 100
bbh-temporal_sequences_correct_count: 12
bbh-temporal_sequences_incorrect_count: 4
bbh-temporal_sequences_not_attempted_count: 0
cmo_fib_accuracy: 31.25
drop_accuracy: 56.25
drop_f1: 0.56
drop_accuracy_given_attempted: 56.25
drop_attempted_ratio: 100
drop_correct_count: 9
drop_incorrect_count: 7
drop_not_attempted_count: 0
GaokaoBench_2010-2022_Math_II_MCQs_score: 100
GPQA_diamond_accuracy: 56.25
GPQA_diamond_f1: 0.56
GPQA_diamond_accuracy_given_attempted: 56.25
GPQA_diamond_attempted_ratio: 100
GPQA_diamond_correct_count: 9
GPQA_diamond_incorrect_count: 7
GPQA_diamond_not_attempted_count: 0
gsm8k_accuracy: 93.75
hellaswag_accuracy: 100
hellaswag_f1: 1
hellaswag_accuracy_given_attempted: 100
hellaswag_attempted_ratio: 100
hellaswag_correct_count: 16
hellaswag_incorrect_count: 0
hellaswag_not_attempted_count: 0
korbench_cipher_accuracy: 93.75
korbench_cipher_f1: 0.94
korbench_cipher_accuracy_given_attempted: 93.75
korbench_cipher_attempted_ratio: 100
korbench_cipher_correct_count: 15
korbench_cipher_incorrect_count: 1
korbench_cipher_not_attempted_count: 0
math_prm800k_500-llmjudge_accuracy: 93.75
math_prm800k_500-llmjudge_f1: 0.94
math_prm800k_500-llmjudge_accuracy_given_attempted: 93.75
math_prm800k_500-llmjudge_attempted_ratio: 100
math_prm800k_500-llmjudge_correct_count: 15
math_prm800k_500-llmjudge_incorrect_count: 1
math_prm800k_500-llmjudge_not_attempted_count: 0
mathbench-college-single_choice_cn_acc_4: 87.5
mathbench-college-single_choice_cn_acc_1: 75
mathbench-college-single_choice_cn_more_1_0: 100
mathbench-college-single_choice_cn_more_1_1: 75
mathbench-college-single_choice_cn_more_4_0: 100
mathbench-college-single_choice_cn_more_4_1: 100
mathbench-college-single_choice_cn_more_4_2: 100
mathbench-college-single_choice_cn_more_4_3: 100
mathbench-college-single_choice_cn_more_4_4: 50
mathbench-college-single_choice_cn_perf_1: 75
mathbench-college-single_choice_cn_perf_4: 50
mathbench-college-single_choice_cn_vote_4: 100
mathbench-college-single_choice_cn_vote_1: 75
mathbench-college-single_choice_cn_prior_A: 31.25
mathbench-college-single_choice_cn_prior_B: 12.5
mathbench-college-single_choice_cn_prior_C: 25
mathbench-college-single_choice_cn_prior_D: 25
mathbench-college-single_choice_cn_prior_-: 6.25
musr_murder_mysteries_accuracy: 68.75
musr_murder_mysteries_f1: 0.69
musr_murder_mysteries_accuracy_given_attempted: 68.75
musr_murder_mysteries_attempted_ratio: 100
musr_murder_mysteries_correct_count: 11
musr_murder_mysteries_incorrect_count: 5
musr_murder_mysteries_not_attempted_count: 0
supergpqa_accuracy: 68.75
supergpqa_total_correct: 11
supergpqa_total_count: 16
supergpqa_SuperGPQA-Engineering: 80
supergpqa_SuperGPQA-Philosophy: 0
supergpqa_SuperGPQA-Medicine: 100
supergpqa_SuperGPQA-Economics: 50
supergpqa_SuperGPQA-Science: 66.67
supergpqa_SuperGPQA-Engineering-Electronic Science and Technology: 100
supergpqa_SuperGPQA-Philosophy-Philosophy: 0
supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine: 100
supergpqa_SuperGPQA-Economics-Applied Economics: 0
supergpqa_SuperGPQA-Science-Mathematics: 80
supergpqa_SuperGPQA-Science-Physics: 0
supergpqa_SuperGPQA-Medicine-Clinical Medicine: 100
supergpqa_SuperGPQA-Engineering-Computer Science and Technology: 100
supergpqa_SuperGPQA-Engineering-Information and Communication Engineering: 50
supergpqa_SuperGPQA-Engineering-Control Science and Engineering: 100
supergpqa_SuperGPQA-Economics-Theoretical Economics: 100
supergpqa_SuperGPQA-Engineering-Electronic Science and Technology-Circuits and Systems: 100
supergpqa_SuperGPQA-Philosophy-Philosophy-Philosophical Aesthetics: 0
supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine-Traditional Chinese Medicine Theory: 100
supergpqa_SuperGPQA-Economics-Applied Economics-Finance: 0
supergpqa_SuperGPQA-Science-Mathematics-Combinatorial Mathematics: 100
supergpqa_SuperGPQA-Science-Physics-Particle and Nuclear Physics: 0
supergpqa_SuperGPQA-Medicine-Clinical Medicine-Internal Medicine: 100
supergpqa_SuperGPQA-Engineering-Computer Science and Technology-Computer Architecture: 100
supergpqa_SuperGPQA-Science-Mathematics-Ordinary Differential Equations: 100
supergpqa_SuperGPQA-Science-Mathematics-Mathematical Analysis: 0
supergpqa_SuperGPQA-Engineering-Information and Communication Engineering-Signal and Information Processing: 50
supergpqa_SuperGPQA-Science-Mathematics-Advanced Algebra: 100
supergpqa_SuperGPQA-Engineering-Control Science and Engineering-Control Theory and Control Engineering: 100
supergpqa_SuperGPQA-Economics-Theoretical Economics-Political Economy: 100
triviaqa_wiki_1shot_score: 50
ARC_Prize_Public_Evaluation_accuracy: 0.06
objective_v7:
aime2024_accuracy: 50
aime2025_accuracy: 25
bbeh_boolean_expressions_accuracy: 75
bbeh_boolean_expressions_f1: 0.75
bbeh_boolean_expressions_accuracy_given_attempted: 75
bbeh_boolean_expressions_attempted_ratio: 100
bbeh_boolean_expressions_correct_count: 12
bbeh_boolean_expressions_incorrect_count: 4
bbeh_boolean_expressions_not_attempted_count: 0
Chem_exam-competition_final_score: 40.39
Chem_exam-gaokao_final_score: 78.12
ChemBench_Name_Conversion_accuracy: 93.75
ChemBench_Name_Conversion_f1: 0.94
ChemBench_Name_Conversion_accuracy_given_attempted: 93.75
ChemBench_Name_Conversion_attempted_ratio: 100
ChemBench_Name_Conversion_correct_count: 15
ChemBench_Name_Conversion_incorrect_count: 1
ChemBench_Name_Conversion_not_attempted_count: 0
ClimaQA_Gold_mcq_accuracy: 87.5
ClimaQA_Gold_mcq_f1: 0.88
ClimaQA_Gold_mcq_accuracy_given_attempted: 87.5
ClimaQA_Gold_mcq_attempted_ratio: 100
ClimaQA_Gold_mcq_correct_count: 14
ClimaQA_Gold_mcq_incorrect_count: 2
ClimaQA_Gold_mcq_not_attempted_count: 0
cmmlu-agronomy_accuracy: 87.5
cmmlu-agronomy_f1: 0.88
cmmlu-agronomy_accuracy_given_attempted: 87.5
cmmlu-agronomy_attempted_ratio: 100
cmmlu-agronomy_correct_count: 14
cmmlu-agronomy_incorrect_count: 2
cmmlu-agronomy_not_attempted_count: 0
earth_silver_mcq_accuracy: 75
earth_silver_mcq_f1: 0.75
earth_silver_mcq_accuracy_given_attempted: 75
earth_silver_mcq_attempted_ratio: 100
earth_silver_mcq_correct_count: 12
earth_silver_mcq_incorrect_count: 4
earth_silver_mcq_not_attempted_count: 0
GPQA_diamond_accuracy: 56.25
hle_llmjudge_accuracy: 56.25
hle_llmjudge_f1: 0.56
hle_llmjudge_accuracy_given_attempted: 56.25
hle_llmjudge_attempted_ratio: 100
hle_llmjudge_correct_count: 9
hle_llmjudge_incorrect_count: 7
hle_llmjudge_not_attempted_count: 0
IFEval_Prompt-level-strict-accuracy: 87.5
IFEval_Inst-level-strict-accuracy: 91.67
IFEval_Prompt-level-loose-accuracy: 87.5
IFEval_Inst-level-loose-accuracy: 91.67
kcle_accuracy: 62.5
kcle_f1: 0.62
kcle_accuracy_given_attempted: 62.5
kcle_attempted_ratio: 100
kcle_correct_count: 10
kcle_incorrect_count: 6
kcle_not_attempted_count: 0
korbench_cipher_accuracy: 87.5
livemathbench_hard_custom_hard_cn_accuracy: 18.75
matbench_steels_mae: 635.88
math_prm800k_500_accuracy: 93.75
sanitized_mbpp_score: 6.25
sanitized_mbpp_pass: 1
sanitized_mbpp_timeout: 0
sanitized_mbpp_failed: 15
sanitized_mbpp_wrong_answer: 0
medxpertqa_accuracy: 50
medxpertqa_total_correct: 8
medxpertqa_total_count: 16
medxpertqa_MedXpertQA-Basic Science: 50
medxpertqa_MedXpertQA-Diagnosis: 37.5
medxpertqa_MedXpertQA-Treatment: 75
medxpertqa_MedXpertQA-Skeletal: 33.33
medxpertqa_MedXpertQA-Muscular: 100
medxpertqa_MedXpertQA-Respiratory: 0
medxpertqa_MedXpertQA-Endocrine: 100
medxpertqa_MedXpertQA-Cardiovascular: 50
medxpertqa_MedXpertQA-Lymphatic: 0
medxpertqa_MedXpertQA-Nervous: 100
medxpertqa_MedXpertQA-Reproductive: 0
medxpertqa_MedXpertQA-Reasoning: 50
medxpertqa_MedXpertQA-Understanding: 50
lukaemon_mmlu_college_biology_accuracy: 81.25
lukaemon_mmlu_college_biology_f1: 0.81
lukaemon_mmlu_college_biology_accuracy_given_attempted: 81.25
lukaemon_mmlu_college_biology_attempted_ratio: 100
lukaemon_mmlu_college_biology_correct_count: 13
lukaemon_mmlu_college_biology_incorrect_count: 3
lukaemon_mmlu_college_biology_not_attempted_count: 0
mmlu_pro_math_accuracy: 81.25
mmlu_pro_math_f1: 0.81
mmlu_pro_math_accuracy_given_attempted: 81.25
mmlu_pro_math_attempted_ratio: 100
mmlu_pro_math_correct_count: 13
mmlu_pro_math_incorrect_count: 3
mmlu_pro_math_not_attempted_count: 0
olymmath_llmjudge_en-hard_accuracy: 68.75
olymmath_llmjudge_en-hard_f1: 0.69
olymmath_llmjudge_en-hard_accuracy_given_attempted: 68.75
olymmath_llmjudge_en-hard_attempted_ratio: 100
olymmath_llmjudge_en-hard_correct_count: 11
olymmath_llmjudge_en-hard_incorrect_count: 5
olymmath_llmjudge_en-hard_not_attempted_count: 0
OlympiadBench_OE_TO_maths_en_COMP_accuracy: 50
OlympiadBench_OE_TO_maths_en_COMP_f1: 0.5
OlympiadBench_OE_TO_maths_en_COMP_accuracy_given_attempted: 50
OlympiadBench_OE_TO_maths_en_COMP_attempted_ratio: 100
OlympiadBench_OE_TO_maths_en_COMP_correct_count: 8
OlympiadBench_OE_TO_maths_en_COMP_incorrect_count: 8
OlympiadBench_OE_TO_maths_en_COMP_not_attempted_count: 0
phybench-eed_accuracy: 6.25
PHYSICS_atomic_dataset_textonly_accuracy: 81.25
PHYSICS_atomic_dataset_textonly_f1: 0.81
PHYSICS_atomic_dataset_textonly_accuracy_given_attempted: 81.25
PHYSICS_atomic_dataset_textonly_attempted_ratio: 100
PHYSICS_atomic_dataset_textonly_correct_count: 13
PHYSICS_atomic_dataset_textonly_incorrect_count: 3
PHYSICS_atomic_dataset_textonly_not_attempted_count: 0
ProteinLMBench_accuracy: 62.5
ProteinLMBench_f1: 0.62
ProteinLMBench_accuracy_given_attempted: 62.5
ProteinLMBench_attempted_ratio: 100
ProteinLMBench_correct_count: 10
ProteinLMBench_incorrect_count: 6
ProteinLMBench_not_attempted_count: 0
R-Bench_en_accuracy: 62.5
R-Bench_en_f1: 0.62
R-Bench_en_accuracy_given_attempted: 62.5
R-Bench_en_attempted_ratio: 100
R-Bench_en_correct_count: 10
R-Bench_en_incorrect_count: 6
R-Bench_en_not_attempted_count: 0
NC-I2F-0shot-instruct_score: 0
NC-I2F-0shot-instruct_valid_score: 68.75
srbench_mean_RMSE: 30057.14
srbench_mean_NMSE: 29.42
srbench_mean_R2: -28.42
srbench_SymbolicMatch: 0.18
supergpqa_Electronic_Science_and_Technology_accuracy: 68.75
lcb_code_generation_v6_pass@1: 37.5
chat_subjective:
alignment_bench_v1_1_总分: 0.46
arenahard_score: 100
Followbench_naive_average: 1
mtbench101_avg: 9
wildbench_average: 66.72
simpleqa_accuracy_given_attempted: 0.73
chinese_simpleqa_given_attempted_accuracy: 0.5
alignment_bench_v1_1_专业能力: 5.56
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
alignment_bench_v1_1_中文理解: 0
alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0
compassarena_language_naive_average: 52.63
compassarena_knowledge_naive_average: 85
compassarena_reason_v2_naive_average: 50
compassarena_creationv2_zh_naive_average: 95
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
followbench_llmeval_en_HSR_L2: 1
followbench_llmeval_en_HSR_L3: 1
followbench_llmeval_en_HSR_L4: 1
followbench_llmeval_en_HSR_L5: 1
followbench_llmeval_en_SSR_L1: 1
followbench_llmeval_en_SSR_L2: 1
followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0.62
qwen-3-8b-fullbench:
objective_other:
C-MHChem_accuracy: 75
C-MHChem_f1: 0.75
C-MHChem_accuracy_given_attempted: 75
C-MHChem_attempted_ratio: 100
C-MHChem_correct_count: 12
C-MHChem_incorrect_count: 4
C-MHChem_not_attempted_count: 0
CPsyExam_accuracy: 93.75
CPsyExam_f1: 0.94
CPsyExam_accuracy_given_attempted: 93.75
CPsyExam_attempted_ratio: 100
CPsyExam_correct_count: 15
CPsyExam_incorrect_count: 1
CPsyExam_not_attempted_count: 0
MaScQA_accuracy: 81.25
MaScQA_f1: 0.81
MaScQA_accuracy_given_attempted: 81.25
MaScQA_attempted_ratio: 100
MaScQA_correct_count: 13
MaScQA_incorrect_count: 3
MaScQA_not_attempted_count: 0
UGPhysics_AtomicPhysics_zh_accuracy: 68.75
objective_v5:
race-high_accuracy: 81.25
ARC-c_accuracy: 100
BoolQ_accuracy: 87.5
triviaqa_wiki_1shot_score: 0
nq_open_1shot_score: 0
IFEval_Prompt-level-strict-accuracy: 87.50
drop_accuracy: 93.75
GPQA_diamond_accuracy: 75
hellaswag_accuracy: 87.5
TheoremQA_score: 6.25
musr_average_naive_average: 20.83
korbench_single_naive_average: 72.5
gsm8k_accuracy: 68.75
math_accuracy: 93.75
cmo_fib_accuracy: 0
aime2024_accuracy: 0
wikibench-wiki-single_choice_cncircular_perf_4: 50
sanitized_mbpp_score: 6.25
lcb_code_generation_pass@1: 6.25
lcb_code_execution_pass@1: 68.75
lcb_test_output_pass@1: 0
teval_naive_average: 67.43
bbh-logical_deduction_seven_objects_score: 6.25
bbh-multistep_arithmetic_two_score: 0
mmlu-other_accuracy: 81.25
cmmlu-china-specific_accuracy: 75.83
mmlu_pro_math_accuracy: 87.5
openai_mmmlu_lite_AR-XY_accuracy: 43.75
college_naive_average: 25
college_knowledge_naive_average: 37.5
objective_v6:
aime2024_accuracy: 68.75
aime2024_f1: 0.69
aime2024_accuracy_given_attempted: 68.75
aime2024_attempted_ratio: 100
aime2024_correct_count: 11
aime2024_incorrect_count: 5
aime2024_not_attempted_count: 0
aime2025_accuracy: 56.25
aime2025_f1: 0.56
aime2025_accuracy_given_attempted: 56.25
aime2025_attempted_ratio: 100
aime2025_correct_count: 9
aime2025_incorrect_count: 7
aime2025_not_attempted_count: 0
bbh-temporal_sequences_score: 75
bbh-temporal_sequences_score_given_attempted: 75
bbh-temporal_sequences_attempted_ratio: 100
bbh-temporal_sequences_correct_count: 12
bbh-temporal_sequences_incorrect_count: 4
bbh-temporal_sequences_not_attempted_count: 0
cmo_fib_accuracy: 75
drop_accuracy: 56.25
drop_f1: 0.56
drop_accuracy_given_attempted: 56.25
drop_attempted_ratio: 100
drop_correct_count: 9
drop_incorrect_count: 7
drop_not_attempted_count: 0
GaokaoBench_2010-2022_Math_II_MCQs_score: 100
GPQA_diamond_accuracy: 62.5
GPQA_diamond_f1: 0.62
GPQA_diamond_accuracy_given_attempted: 62.5
GPQA_diamond_attempted_ratio: 100
GPQA_diamond_correct_count: 10
GPQA_diamond_incorrect_count: 6
GPQA_diamond_not_attempted_count: 0
gsm8k_accuracy: 100
hellaswag_accuracy: 100
hellaswag_f1: 1
hellaswag_accuracy_given_attempted: 100
hellaswag_attempted_ratio: 100
hellaswag_correct_count: 16
hellaswag_incorrect_count: 0
hellaswag_not_attempted_count: 0
korbench_cipher_accuracy: 93.75
korbench_cipher_f1: 0.94
korbench_cipher_accuracy_given_attempted: 93.75
korbench_cipher_attempted_ratio: 100
korbench_cipher_correct_count: 15
korbench_cipher_incorrect_count: 1
korbench_cipher_not_attempted_count: 0
math_prm800k_500-llmjudge_accuracy: 100
math_prm800k_500-llmjudge_f1: 1
math_prm800k_500-llmjudge_accuracy_given_attempted: 100
math_prm800k_500-llmjudge_attempted_ratio: 100
math_prm800k_500-llmjudge_correct_count: 16
math_prm800k_500-llmjudge_incorrect_count: 0
math_prm800k_500-llmjudge_not_attempted_count: 0
mathbench-college-single_choice_cn_acc_4: 93.75
mathbench-college-single_choice_cn_acc_1: 75
mathbench-college-single_choice_cn_more_1_0: 100
mathbench-college-single_choice_cn_more_1_1: 75
mathbench-college-single_choice_cn_more_4_0: 100
mathbench-college-single_choice_cn_more_4_1: 100
mathbench-college-single_choice_cn_more_4_2: 100
mathbench-college-single_choice_cn_more_4_3: 100
mathbench-college-single_choice_cn_more_4_4: 75
mathbench-college-single_choice_cn_perf_1: 75
mathbench-college-single_choice_cn_perf_4: 75
mathbench-college-single_choice_cn_vote_4: 100
mathbench-college-single_choice_cn_vote_1: 75
mathbench-college-single_choice_cn_prior_A: 31.25
mathbench-college-single_choice_cn_prior_B: 18.75
mathbench-college-single_choice_cn_prior_C: 25
mathbench-college-single_choice_cn_prior_D: 25
mathbench-college-single_choice_cn_prior_-: 0
musr_murder_mysteries_accuracy: 75
musr_murder_mysteries_f1: 0.75
musr_murder_mysteries_accuracy_given_attempted: 75
musr_murder_mysteries_attempted_ratio: 100
musr_murder_mysteries_correct_count: 12
musr_murder_mysteries_incorrect_count: 4
musr_murder_mysteries_not_attempted_count: 0
supergpqa_accuracy: 75
supergpqa_total_correct: 12
supergpqa_total_count: 16
supergpqa_SuperGPQA-Engineering: 80
supergpqa_SuperGPQA-Philosophy: 0
supergpqa_SuperGPQA-Medicine: 100
supergpqa_SuperGPQA-Economics: 100
supergpqa_SuperGPQA-Science: 66.67
supergpqa_SuperGPQA-Engineering-Electronic Science and Technology: 100
supergpqa_SuperGPQA-Philosophy-Philosophy: 0
supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine: 100
supergpqa_SuperGPQA-Economics-Applied Economics: 100
supergpqa_SuperGPQA-Science-Mathematics: 80
supergpqa_SuperGPQA-Science-Physics: 0
supergpqa_SuperGPQA-Medicine-Clinical Medicine: 100
supergpqa_SuperGPQA-Engineering-Computer Science and Technology: 100
supergpqa_SuperGPQA-Engineering-Information and Communication Engineering: 50
supergpqa_SuperGPQA-Engineering-Control Science and Engineering: 100
supergpqa_SuperGPQA-Economics-Theoretical Economics: 100
supergpqa_SuperGPQA-Engineering-Electronic Science and Technology-Circuits and Systems: 100
supergpqa_SuperGPQA-Philosophy-Philosophy-Philosophical Aesthetics: 0
supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine-Traditional Chinese Medicine Theory: 100
supergpqa_SuperGPQA-Economics-Applied Economics-Finance: 100
supergpqa_SuperGPQA-Science-Mathematics-Combinatorial Mathematics: 100
supergpqa_SuperGPQA-Science-Physics-Particle and Nuclear Physics: 0
supergpqa_SuperGPQA-Medicine-Clinical Medicine-Internal Medicine: 100
supergpqa_SuperGPQA-Engineering-Computer Science and Technology-Computer Architecture: 100
supergpqa_SuperGPQA-Science-Mathematics-Ordinary Differential Equations: 100
supergpqa_SuperGPQA-Science-Mathematics-Mathematical Analysis: 0
supergpqa_SuperGPQA-Engineering-Information and Communication Engineering-Signal and Information Processing: 50
supergpqa_SuperGPQA-Science-Mathematics-Advanced Algebra: 100
supergpqa_SuperGPQA-Engineering-Control Science and Engineering-Control Theory and Control Engineering: 100
supergpqa_SuperGPQA-Economics-Theoretical Economics-Political Economy: 100
triviaqa_wiki_1shot_score: 50
ARC_Prize_Public_Evaluation_accuracy: 0.06
objective_v7:
aime2024_accuracy: 75
aime2025_accuracy: 56.25
bbeh_boolean_expressions_accuracy: 68.75
bbeh_boolean_expressions_f1: 0.69
bbeh_boolean_expressions_accuracy_given_attempted: 68.75
bbeh_boolean_expressions_attempted_ratio: 100
bbeh_boolean_expressions_correct_count: 11
bbeh_boolean_expressions_incorrect_count: 5
bbeh_boolean_expressions_not_attempted_count: 0
Chem_exam-competition_final_score: 42.29
Chem_exam-gaokao_final_score: 71.88
ChemBench_Name_Conversion_accuracy: 100.00
ChemBench_Name_Conversion_f1: 1.00
ChemBench_Name_Conversion_accuracy_given_attempted: 100
ChemBench_Name_Conversion_attempted_ratio: 100.00
ChemBench_Name_Conversion_correct_count: 16.00
ChemBench_Name_Conversion_incorrect_count: 0
ChemBench_Name_Conversion_not_attempted_count: 0
ClimaQA_Gold_mcq_accuracy: 87.5
ClimaQA_Gold_mcq_f1: 0.88
ClimaQA_Gold_mcq_accuracy_given_attempted: 87.5
ClimaQA_Gold_mcq_attempted_ratio: 100
ClimaQA_Gold_mcq_correct_count: 14
ClimaQA_Gold_mcq_incorrect_count: 2
ClimaQA_Gold_mcq_not_attempted_count: 0
cmmlu-agronomy_accuracy: 87.50
cmmlu-agronomy_f1: 0.88
cmmlu-agronomy_accuracy_given_attempted: 87.50
cmmlu-agronomy_attempted_ratio: 100.00
cmmlu-agronomy_correct_count: 14.00
cmmlu-agronomy_incorrect_count: 2
cmmlu-agronomy_not_attempted_count: 0.00
earth_silver_mcq_accuracy: 81.25
earth_silver_mcq_f1: 0.81
earth_silver_mcq_accuracy_given_attempted: 81.25
earth_silver_mcq_attempted_ratio: 100
earth_silver_mcq_correct_count: 13
earth_silver_mcq_incorrect_count: 3
earth_silver_mcq_not_attempted_count: 0
GPQA_diamond_accuracy: 68.75
hle_llmjudge_accuracy: 43.75
hle_llmjudge_f1: 0.43
hle_llmjudge_accuracy_given_attempted: 43.75
hle_llmjudge_attempted_ratio: 100.00
hle_llmjudge_correct_count: 7.00
hle_llmjudge_incorrect_count: 9.00
hle_llmjudge_not_attempted_count: 0.00
IFEval_Prompt-level-strict-accuracy: 87.5
IFEval_Inst-level-strict-accuracy: 91.67
IFEval_Prompt-level-loose-accuracy: 87.5
IFEval_Inst-level-loose-accuracy: 91.67
kcle_accuracy: 75.00
kcle_f1: 0.75
kcle_accuracy_given_attempted: 75.00
kcle_attempted_ratio: 100.00
kcle_correct_count: 12
kcle_incorrect_count: 4
kcle_not_attempted_count: 0
korbench_cipher_accuracy: 81.25
livemathbench_hard_custom_hard_cn_accuracy: 25
matbench_steels_mae: 816.50
math_prm800k_500_accuracy: 100
sanitized_mbpp_score: 6.25
sanitized_mbpp_pass: 1
sanitized_mbpp_timeout: 0
sanitized_mbpp_failed: 15
sanitized_mbpp_wrong_answer: 0
medxpertqa_accuracy: 62.5
medxpertqa_total_correct: 10
medxpertqa_total_count: 16
medxpertqa_MedXpertQA-Basic Science: 75
medxpertqa_MedXpertQA-Diagnosis: 50
medxpertqa_MedXpertQA-Treatment: 75
medxpertqa_MedXpertQA-Skeletal: 66.67
medxpertqa_MedXpertQA-Muscular: 100
medxpertqa_MedXpertQA-Respiratory: 33.33
medxpertqa_MedXpertQA-Endocrine: 66.67
medxpertqa_MedXpertQA-Cardiovascular: 100
medxpertqa_MedXpertQA-Lymphatic: 0
medxpertqa_MedXpertQA-Nervous: 100
medxpertqa_MedXpertQA-Reproductive: 0
medxpertqa_MedXpertQA-Reasoning: 64.29
medxpertqa_MedXpertQA-Understanding: 50
lukaemon_mmlu_college_biology_accuracy: 81.25
lukaemon_mmlu_college_biology_f1: 0.81
lukaemon_mmlu_college_biology_accuracy_given_attempted: 81.25
lukaemon_mmlu_college_biology_attempted_ratio: 100
lukaemon_mmlu_college_biology_correct_count: 13
lukaemon_mmlu_college_biology_incorrect_count: 3
lukaemon_mmlu_college_biology_not_attempted_count: 0
mmlu_pro_math_accuracy: 68.75
mmlu_pro_math_f1: 0.69
mmlu_pro_math_accuracy_given_attempted: 68.75
mmlu_pro_math_attempted_ratio: 100
mmlu_pro_math_correct_count: 11
mmlu_pro_math_incorrect_count: 5
mmlu_pro_math_not_attempted_count: 0
olymmath_llmjudge_en-hard_accuracy: 68.75
olymmath_llmjudge_en-hard_f1: 0.69
olymmath_llmjudge_en-hard_accuracy_given_attempted: 68.75
olymmath_llmjudge_en-hard_attempted_ratio: 100.00
olymmath_llmjudge_en-hard_correct_count: 11
olymmath_llmjudge_en-hard_incorrect_count: 5
olymmath_llmjudge_en-hard_not_attempted_count: 0
OlympiadBench_OE_TO_maths_en_COMP_accuracy: 43.75
OlympiadBench_OE_TO_maths_en_COMP_f1: 0.44
OlympiadBench_OE_TO_maths_en_COMP_accuracy_given_attempted: 43.75
OlympiadBench_OE_TO_maths_en_COMP_attempted_ratio: 100.00
OlympiadBench_OE_TO_maths_en_COMP_correct_count: 7
OlympiadBench_OE_TO_maths_en_COMP_incorrect_count: 9
OlympiadBench_OE_TO_maths_en_COMP_not_attempted_count: 0
phybench-eed_accuracy: 27.94
PHYSICS_atomic_dataset_textonly_accuracy: 93.75
PHYSICS_atomic_dataset_textonly_f1: 0.94
PHYSICS_atomic_dataset_textonly_accuracy_given_attempted: 93.75
PHYSICS_atomic_dataset_textonly_attempted_ratio: 100
PHYSICS_atomic_dataset_textonly_correct_count: 15
PHYSICS_atomic_dataset_textonly_incorrect_count: 1
PHYSICS_atomic_dataset_textonly_not_attempted_count: 0
ProteinLMBench_accuracy: 68.75
ProteinLMBench_f1: 0.69
ProteinLMBench_accuracy_given_attempted: 68.75
ProteinLMBench_attempted_ratio: 100
ProteinLMBench_correct_count: 11
ProteinLMBench_incorrect_count: 5
ProteinLMBench_not_attempted_count: 0
R-Bench_en_accuracy: 50.00
R-Bench_en_f1: 0.50
R-Bench_en_accuracy_given_attempted: 50.00
R-Bench_en_attempted_ratio: 100.00
R-Bench_en_correct_count: 8
R-Bench_en_incorrect_count: 8
R-Bench_en_not_attempted_count: 0
NC-I2F-0shot-instruct_score: 12.5
NC-I2F-0shot-instruct_valid_score: 75
srbench_mean_RMSE: 61339597.01
srbench_mean_NMSE: 83.61
srbench_mean_R2: -82.61
srbench_SymbolicMatch: 0.14
supergpqa_Electronic_Science_and_Technology_accuracy: 56.25
lcb_code_generation_v6_pass@1: 50
chat_longtext:
babilong_qa1_256k_score: 0.00
LongBench_2wikimqa_score: 5.43
Length16000Depth0_2needle_en_128k_score: 100.00
ruler_cwe_128k_score: 0.00
chat_subjective:
alignment_bench_v1_1_总分: 0.48
arenahard_score: 92.07
Followbench_naive_average: 1
mtbench101_avg: 8.7
wildbench_average: 75.17
simpleqa_accuracy_given_attempted: 0.55
chinese_simpleqa_given_attempted_accuracy: 0.64
alignment_bench_v1_1_专业能力: 5.81
alignment_bench_v1_1_数学计算: 0
alignment_bench_v1_1_基本任务: 0
alignment_bench_v1_1_逻辑推理: 0
alignment_bench_v1_1_中文理解: 0
alignment_bench_v1_1_文本写作: 0
alignment_bench_v1_1_角色扮演: 0
alignment_bench_v1_1_综合问答: 0
compassarena_language_naive_average: 55
compassarena_knowledge_naive_average: 100
compassarena_reason_v2_naive_average: 78.95
compassarena_creationv2_zh_naive_average: 100
followbench_llmeval_en_HSR_AVG: 1
followbench_llmeval_en_SSR_AVG: 1
followbench_llmeval_en_HSR_L1: 1
followbench_llmeval_en_HSR_L2: 1
followbench_llmeval_en_HSR_L3: 1
followbench_llmeval_en_HSR_L4: 1
followbench_llmeval_en_HSR_L5: 1
followbench_llmeval_en_SSR_L1: 1
followbench_llmeval_en_SSR_L2: 1
followbench_llmeval_en_SSR_L3: 1
followbench_llmeval_en_SSR_L4: 1
followbench_llmeval_en_SSR_L5: 1
simpleqa_f1: 0.46
qwen3-8b-base-turbomind:
base_longtext:
LongBench_2wikimqa_score: 26.98
Length32000Depth0_origin_en_32000_score: 100.00
qwen-3-8b-base-hf-fullbench:
objective_base:
race-high_accuracy: 87.5
ARC-c_accuracy: 75
BoolQ_accuracy: 75
triviaqa_wiki_1shot_score: 37.5
nq_open_1shot_score: 6.25
drop_accuracy: 75
GPQA_diamond_accuracy: 81.25
hellaswag_accuracy: 81.25
TheoremQA_score: 18.75
winogrande_accuracy: 81.25
gsm8k_accuracy: 81.25
GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 87.5
wikibench-wiki-single_choice_cncircular_perf_4: 50
sanitized_mbpp_score: 93.75
mmlu-other_accuracy: 76.44
cmmlu-china-specific_accuracy: 82.5
mmlu_pro_math_accuracy: 50
bbh-logical_deduction_seven_objects_score: 37.5
bbh-multistep_arithmetic_two_score: 0
college_naive_average: 37.5
college_knowledge_naive_average: 87.5
qwen-3-8b-base-fullbench:
objective_base:
race-high_accuracy: 87.5
ARC-c_accuracy: 75
BoolQ_accuracy: 81.25
triviaqa_wiki_1shot_score: 37.5
nq_open_1shot_score: 6.25
drop_accuracy: 81.25
GPQA_diamond_accuracy: 81.25
hellaswag_accuracy: 75
TheoremQA_score: 18.75
winogrande_accuracy: 81.25
gsm8k_accuracy: 81.25
GaokaoBench_2010-2022_Math_II_MCQs_score: 100
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
math_accuracy: 81.25
wikibench-wiki-single_choice_cncircular_perf_4: 50
sanitized_mbpp_score: 93.75
mmlu-other_accuracy: 75.48
cmmlu-china-specific_accuracy: 83.75
mmlu_pro_math_accuracy: 50
bbh-logical_deduction_seven_objects_score: 62.5
bbh-multistep_arithmetic_two_score: 100
college_naive_average: 37.5
college_knowledge_naive_average: 87.5