qwen-3-8b-hf-fullbench: objective_other: C-MHChem_accuracy: 81.25 C-MHChem_f1: 0.81 C-MHChem_accuracy_given_attempted: 81.25 C-MHChem_attempted_ratio: 100 C-MHChem_correct_count: 13 C-MHChem_incorrect_count: 3 C-MHChem_not_attempted_count: 0 CPsyExam_accuracy: 87.5 CPsyExam_f1: 0.88 CPsyExam_accuracy_given_attempted: 87.5 CPsyExam_attempted_ratio: 100 CPsyExam_correct_count: 14 CPsyExam_incorrect_count: 2 CPsyExam_not_attempted_count: 0 MaScQA_accuracy: 75 MaScQA_f1: 0.75 MaScQA_accuracy_given_attempted: 75 MaScQA_attempted_ratio: 100 MaScQA_correct_count: 12 MaScQA_incorrect_count: 4 MaScQA_not_attempted_count: 0 UGPhysics_AtomicPhysics_zh_accuracy: 56.25 objective_v5: race-high_accuracy: 87.5 ARC-c_accuracy: 100 BoolQ_accuracy: 93.75 triviaqa_wiki_1shot_score: 0 nq_open_1shot_score: 0 IFEval_Prompt-level-strict-accuracy: 87.50 drop_accuracy: 93.75 GPQA_diamond_accuracy: 62.50 hellaswag_accuracy: 75 TheoremQA_score: 6.25 musr_average_naive_average: 14.58 korbench_single_naive_average: 75.00 gsm8k_accuracy: 75 math_accuracy: 93.75 cmo_fib_accuracy: 0 aime2024_accuracy: 6.25 wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 6.25 lcb_code_generation_pass@1: 6.25 lcb_code_execution_pass@1: 50 lcb_test_output_pass@1: 0 teval_naive_average: 69.16 bbh-logical_deduction_seven_objects_score: 12.5 bbh-multistep_arithmetic_two_score: 0 mmlu-other_accuracy: 77.88 cmmlu-china-specific_accuracy: 74.17 mmlu_pro_math_accuracy: 81.25 openai_mmmlu_lite_AR-XY_accuracy: 50 college_naive_average: 37.5 college_knowledge_naive_average: 50 objective_v6: aime2024_accuracy: 87.5 aime2024_f1: 0.88 aime2024_accuracy_given_attempted: 87.5 aime2024_attempted_ratio: 100 aime2024_correct_count: 14 aime2024_incorrect_count: 2 aime2024_not_attempted_count: 0 aime2025_accuracy: 56.25 aime2025_f1: 0.56 aime2025_accuracy_given_attempted: 56.25 aime2025_attempted_ratio: 100 aime2025_correct_count: 9 aime2025_incorrect_count: 7 aime2025_not_attempted_count: 0 bbh-temporal_sequences_score: 75 bbh-temporal_sequences_score_given_attempted: 75 bbh-temporal_sequences_attempted_ratio: 100 bbh-temporal_sequences_correct_count: 12 bbh-temporal_sequences_incorrect_count: 4 bbh-temporal_sequences_not_attempted_count: 0 cmo_fib_accuracy: 31.25 drop_accuracy: 56.25 drop_f1: 0.56 drop_accuracy_given_attempted: 56.25 drop_attempted_ratio: 100 drop_correct_count: 9 drop_incorrect_count: 7 drop_not_attempted_count: 0 GaokaoBench_2010-2022_Math_II_MCQs_score: 100 GPQA_diamond_accuracy: 56.25 GPQA_diamond_f1: 0.56 GPQA_diamond_accuracy_given_attempted: 56.25 GPQA_diamond_attempted_ratio: 100 GPQA_diamond_correct_count: 9 GPQA_diamond_incorrect_count: 7 GPQA_diamond_not_attempted_count: 0 gsm8k_accuracy: 93.75 hellaswag_accuracy: 100 hellaswag_f1: 1 hellaswag_accuracy_given_attempted: 100 hellaswag_attempted_ratio: 100 hellaswag_correct_count: 16 hellaswag_incorrect_count: 0 hellaswag_not_attempted_count: 0 korbench_cipher_accuracy: 93.75 korbench_cipher_f1: 0.94 korbench_cipher_accuracy_given_attempted: 93.75 korbench_cipher_attempted_ratio: 100 korbench_cipher_correct_count: 15 korbench_cipher_incorrect_count: 1 korbench_cipher_not_attempted_count: 0 math_prm800k_500-llmjudge_accuracy: 93.75 math_prm800k_500-llmjudge_f1: 0.94 math_prm800k_500-llmjudge_accuracy_given_attempted: 93.75 math_prm800k_500-llmjudge_attempted_ratio: 100 math_prm800k_500-llmjudge_correct_count: 15 math_prm800k_500-llmjudge_incorrect_count: 1 math_prm800k_500-llmjudge_not_attempted_count: 0 mathbench-college-single_choice_cn_acc_4: 87.5 mathbench-college-single_choice_cn_acc_1: 75 mathbench-college-single_choice_cn_more_1_0: 100 mathbench-college-single_choice_cn_more_1_1: 75 mathbench-college-single_choice_cn_more_4_0: 100 mathbench-college-single_choice_cn_more_4_1: 100 mathbench-college-single_choice_cn_more_4_2: 100 mathbench-college-single_choice_cn_more_4_3: 100 mathbench-college-single_choice_cn_more_4_4: 50 mathbench-college-single_choice_cn_perf_1: 75 mathbench-college-single_choice_cn_perf_4: 50 mathbench-college-single_choice_cn_vote_4: 100 mathbench-college-single_choice_cn_vote_1: 75 mathbench-college-single_choice_cn_prior_A: 31.25 mathbench-college-single_choice_cn_prior_B: 12.5 mathbench-college-single_choice_cn_prior_C: 25 mathbench-college-single_choice_cn_prior_D: 25 mathbench-college-single_choice_cn_prior_-: 6.25 musr_murder_mysteries_accuracy: 68.75 musr_murder_mysteries_f1: 0.69 musr_murder_mysteries_accuracy_given_attempted: 68.75 musr_murder_mysteries_attempted_ratio: 100 musr_murder_mysteries_correct_count: 11 musr_murder_mysteries_incorrect_count: 5 musr_murder_mysteries_not_attempted_count: 0 supergpqa_accuracy: 68.75 supergpqa_total_correct: 11 supergpqa_total_count: 16 supergpqa_SuperGPQA-Engineering: 80 supergpqa_SuperGPQA-Philosophy: 0 supergpqa_SuperGPQA-Medicine: 100 supergpqa_SuperGPQA-Economics: 50 supergpqa_SuperGPQA-Science: 66.67 supergpqa_SuperGPQA-Engineering-Electronic Science and Technology: 100 supergpqa_SuperGPQA-Philosophy-Philosophy: 0 supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine: 100 supergpqa_SuperGPQA-Economics-Applied Economics: 0 supergpqa_SuperGPQA-Science-Mathematics: 80 supergpqa_SuperGPQA-Science-Physics: 0 supergpqa_SuperGPQA-Medicine-Clinical Medicine: 100 supergpqa_SuperGPQA-Engineering-Computer Science and Technology: 100 supergpqa_SuperGPQA-Engineering-Information and Communication Engineering: 50 supergpqa_SuperGPQA-Engineering-Control Science and Engineering: 100 supergpqa_SuperGPQA-Economics-Theoretical Economics: 100 supergpqa_SuperGPQA-Engineering-Electronic Science and Technology-Circuits and Systems: 100 supergpqa_SuperGPQA-Philosophy-Philosophy-Philosophical Aesthetics: 0 supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine-Traditional Chinese Medicine Theory: 100 supergpqa_SuperGPQA-Economics-Applied Economics-Finance: 0 supergpqa_SuperGPQA-Science-Mathematics-Combinatorial Mathematics: 100 supergpqa_SuperGPQA-Science-Physics-Particle and Nuclear Physics: 0 supergpqa_SuperGPQA-Medicine-Clinical Medicine-Internal Medicine: 100 supergpqa_SuperGPQA-Engineering-Computer Science and Technology-Computer Architecture: 100 supergpqa_SuperGPQA-Science-Mathematics-Ordinary Differential Equations: 100 supergpqa_SuperGPQA-Science-Mathematics-Mathematical Analysis: 0 supergpqa_SuperGPQA-Engineering-Information and Communication Engineering-Signal and Information Processing: 50 supergpqa_SuperGPQA-Science-Mathematics-Advanced Algebra: 100 supergpqa_SuperGPQA-Engineering-Control Science and Engineering-Control Theory and Control Engineering: 100 supergpqa_SuperGPQA-Economics-Theoretical Economics-Political Economy: 100 triviaqa_wiki_1shot_score: 50 ARC_Prize_Public_Evaluation_accuracy: 0.06 objective_v7: aime2024_accuracy: 50 aime2025_accuracy: 25 bbeh_boolean_expressions_accuracy: 75 bbeh_boolean_expressions_f1: 0.75 bbeh_boolean_expressions_accuracy_given_attempted: 75 bbeh_boolean_expressions_attempted_ratio: 100 bbeh_boolean_expressions_correct_count: 12 bbeh_boolean_expressions_incorrect_count: 4 bbeh_boolean_expressions_not_attempted_count: 0 Chem_exam-competition_final_score: 40.39 Chem_exam-gaokao_final_score: 78.12 ChemBench_Name_Conversion_accuracy: 93.75 ChemBench_Name_Conversion_f1: 0.94 ChemBench_Name_Conversion_accuracy_given_attempted: 93.75 ChemBench_Name_Conversion_attempted_ratio: 100 ChemBench_Name_Conversion_correct_count: 15 ChemBench_Name_Conversion_incorrect_count: 1 ChemBench_Name_Conversion_not_attempted_count: 0 ClimaQA_Gold_mcq_accuracy: 87.5 ClimaQA_Gold_mcq_f1: 0.88 ClimaQA_Gold_mcq_accuracy_given_attempted: 87.5 ClimaQA_Gold_mcq_attempted_ratio: 100 ClimaQA_Gold_mcq_correct_count: 14 ClimaQA_Gold_mcq_incorrect_count: 2 ClimaQA_Gold_mcq_not_attempted_count: 0 cmmlu-agronomy_accuracy: 87.5 cmmlu-agronomy_f1: 0.88 cmmlu-agronomy_accuracy_given_attempted: 87.5 cmmlu-agronomy_attempted_ratio: 100 cmmlu-agronomy_correct_count: 14 cmmlu-agronomy_incorrect_count: 2 cmmlu-agronomy_not_attempted_count: 0 earth_silver_mcq_accuracy: 75 earth_silver_mcq_f1: 0.75 earth_silver_mcq_accuracy_given_attempted: 75 earth_silver_mcq_attempted_ratio: 100 earth_silver_mcq_correct_count: 12 earth_silver_mcq_incorrect_count: 4 earth_silver_mcq_not_attempted_count: 0 GPQA_diamond_accuracy: 56.25 hle_llmjudge_accuracy: 56.25 hle_llmjudge_f1: 0.56 hle_llmjudge_accuracy_given_attempted: 56.25 hle_llmjudge_attempted_ratio: 100 hle_llmjudge_correct_count: 9 hle_llmjudge_incorrect_count: 7 hle_llmjudge_not_attempted_count: 0 IFEval_Prompt-level-strict-accuracy: 87.5 IFEval_Inst-level-strict-accuracy: 91.67 IFEval_Prompt-level-loose-accuracy: 87.5 IFEval_Inst-level-loose-accuracy: 91.67 kcle_accuracy: 62.5 kcle_f1: 0.62 kcle_accuracy_given_attempted: 62.5 kcle_attempted_ratio: 100 kcle_correct_count: 10 kcle_incorrect_count: 6 kcle_not_attempted_count: 0 korbench_cipher_accuracy: 87.5 livemathbench_hard_custom_hard_cn_accuracy: 18.75 matbench_steels_mae: 635.88 math_prm800k_500_accuracy: 93.75 sanitized_mbpp_score: 6.25 sanitized_mbpp_pass: 1 sanitized_mbpp_timeout: 0 sanitized_mbpp_failed: 15 sanitized_mbpp_wrong_answer: 0 medxpertqa_accuracy: 50 medxpertqa_total_correct: 8 medxpertqa_total_count: 16 medxpertqa_MedXpertQA-Basic Science: 50 medxpertqa_MedXpertQA-Diagnosis: 37.5 medxpertqa_MedXpertQA-Treatment: 75 medxpertqa_MedXpertQA-Skeletal: 33.33 medxpertqa_MedXpertQA-Muscular: 100 medxpertqa_MedXpertQA-Respiratory: 0 medxpertqa_MedXpertQA-Endocrine: 100 medxpertqa_MedXpertQA-Cardiovascular: 50 medxpertqa_MedXpertQA-Lymphatic: 0 medxpertqa_MedXpertQA-Nervous: 100 medxpertqa_MedXpertQA-Reproductive: 0 medxpertqa_MedXpertQA-Reasoning: 50 medxpertqa_MedXpertQA-Understanding: 50 lukaemon_mmlu_college_biology_accuracy: 81.25 lukaemon_mmlu_college_biology_f1: 0.81 lukaemon_mmlu_college_biology_accuracy_given_attempted: 81.25 lukaemon_mmlu_college_biology_attempted_ratio: 100 lukaemon_mmlu_college_biology_correct_count: 13 lukaemon_mmlu_college_biology_incorrect_count: 3 lukaemon_mmlu_college_biology_not_attempted_count: 0 mmlu_pro_math_accuracy: 81.25 mmlu_pro_math_f1: 0.81 mmlu_pro_math_accuracy_given_attempted: 81.25 mmlu_pro_math_attempted_ratio: 100 mmlu_pro_math_correct_count: 13 mmlu_pro_math_incorrect_count: 3 mmlu_pro_math_not_attempted_count: 0 olymmath_llmjudge_en-hard_accuracy: 68.75 olymmath_llmjudge_en-hard_f1: 0.69 olymmath_llmjudge_en-hard_accuracy_given_attempted: 68.75 olymmath_llmjudge_en-hard_attempted_ratio: 100 olymmath_llmjudge_en-hard_correct_count: 11 olymmath_llmjudge_en-hard_incorrect_count: 5 olymmath_llmjudge_en-hard_not_attempted_count: 0 OlympiadBench_OE_TO_maths_en_COMP_accuracy: 50 OlympiadBench_OE_TO_maths_en_COMP_f1: 0.5 OlympiadBench_OE_TO_maths_en_COMP_accuracy_given_attempted: 50 OlympiadBench_OE_TO_maths_en_COMP_attempted_ratio: 100 OlympiadBench_OE_TO_maths_en_COMP_correct_count: 8 OlympiadBench_OE_TO_maths_en_COMP_incorrect_count: 8 OlympiadBench_OE_TO_maths_en_COMP_not_attempted_count: 0 phybench-eed_accuracy: 6.25 PHYSICS_atomic_dataset_textonly_accuracy: 81.25 PHYSICS_atomic_dataset_textonly_f1: 0.81 PHYSICS_atomic_dataset_textonly_accuracy_given_attempted: 81.25 PHYSICS_atomic_dataset_textonly_attempted_ratio: 100 PHYSICS_atomic_dataset_textonly_correct_count: 13 PHYSICS_atomic_dataset_textonly_incorrect_count: 3 PHYSICS_atomic_dataset_textonly_not_attempted_count: 0 ProteinLMBench_accuracy: 62.5 ProteinLMBench_f1: 0.62 ProteinLMBench_accuracy_given_attempted: 62.5 ProteinLMBench_attempted_ratio: 100 ProteinLMBench_correct_count: 10 ProteinLMBench_incorrect_count: 6 ProteinLMBench_not_attempted_count: 0 R-Bench_en_accuracy: 62.5 R-Bench_en_f1: 0.62 R-Bench_en_accuracy_given_attempted: 62.5 R-Bench_en_attempted_ratio: 100 R-Bench_en_correct_count: 10 R-Bench_en_incorrect_count: 6 R-Bench_en_not_attempted_count: 0 NC-I2F-0shot-instruct_score: 0 NC-I2F-0shot-instruct_valid_score: 68.75 srbench_mean_RMSE: 30057.14 srbench_mean_NMSE: 29.42 srbench_mean_R2: -28.42 srbench_SymbolicMatch: 0.18 supergpqa_Electronic_Science_and_Technology_accuracy: 68.75 lcb_code_generation_v6_pass@1: 37.5 chat_subjective: alignment_bench_v1_1_总分: 0.46 arenahard_score: 100 Followbench_naive_average: 1 mtbench101_avg: 9 wildbench_average: 66.72 simpleqa_accuracy_given_attempted: 0.73 chinese_simpleqa_given_attempted_accuracy: 0.5 alignment_bench_v1_1_专业能力: 5.56 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 alignment_bench_v1_1_中文理解: 0 alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 compassarena_language_naive_average: 52.63 compassarena_knowledge_naive_average: 85 compassarena_reason_v2_naive_average: 50 compassarena_creationv2_zh_naive_average: 95 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 followbench_llmeval_en_HSR_L2: 1 followbench_llmeval_en_HSR_L3: 1 followbench_llmeval_en_HSR_L4: 1 followbench_llmeval_en_HSR_L5: 1 followbench_llmeval_en_SSR_L1: 1 followbench_llmeval_en_SSR_L2: 1 followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L5: 1 simpleqa_f1: 0.62 qwen-3-8b-fullbench: objective_other: C-MHChem_accuracy: 75 C-MHChem_f1: 0.75 C-MHChem_accuracy_given_attempted: 75 C-MHChem_attempted_ratio: 100 C-MHChem_correct_count: 12 C-MHChem_incorrect_count: 4 C-MHChem_not_attempted_count: 0 CPsyExam_accuracy: 93.75 CPsyExam_f1: 0.94 CPsyExam_accuracy_given_attempted: 93.75 CPsyExam_attempted_ratio: 100 CPsyExam_correct_count: 15 CPsyExam_incorrect_count: 1 CPsyExam_not_attempted_count: 0 MaScQA_accuracy: 81.25 MaScQA_f1: 0.81 MaScQA_accuracy_given_attempted: 81.25 MaScQA_attempted_ratio: 100 MaScQA_correct_count: 13 MaScQA_incorrect_count: 3 MaScQA_not_attempted_count: 0 UGPhysics_AtomicPhysics_zh_accuracy: 68.75 objective_v5: race-high_accuracy: 81.25 ARC-c_accuracy: 100 BoolQ_accuracy: 87.5 triviaqa_wiki_1shot_score: 0 nq_open_1shot_score: 0 IFEval_Prompt-level-strict-accuracy: 87.50 drop_accuracy: 93.75 GPQA_diamond_accuracy: 75 hellaswag_accuracy: 87.5 TheoremQA_score: 6.25 musr_average_naive_average: 20.83 korbench_single_naive_average: 72.5 gsm8k_accuracy: 68.75 math_accuracy: 93.75 cmo_fib_accuracy: 0 aime2024_accuracy: 0 wikibench-wiki-single_choice_cncircular_perf_4: 50 sanitized_mbpp_score: 6.25 lcb_code_generation_pass@1: 6.25 lcb_code_execution_pass@1: 68.75 lcb_test_output_pass@1: 0 teval_naive_average: 67.43 bbh-logical_deduction_seven_objects_score: 6.25 bbh-multistep_arithmetic_two_score: 0 mmlu-other_accuracy: 81.25 cmmlu-china-specific_accuracy: 75.83 mmlu_pro_math_accuracy: 87.5 openai_mmmlu_lite_AR-XY_accuracy: 43.75 college_naive_average: 25 college_knowledge_naive_average: 37.5 objective_v6: aime2024_accuracy: 68.75 aime2024_f1: 0.69 aime2024_accuracy_given_attempted: 68.75 aime2024_attempted_ratio: 100 aime2024_correct_count: 11 aime2024_incorrect_count: 5 aime2024_not_attempted_count: 0 aime2025_accuracy: 56.25 aime2025_f1: 0.56 aime2025_accuracy_given_attempted: 56.25 aime2025_attempted_ratio: 100 aime2025_correct_count: 9 aime2025_incorrect_count: 7 aime2025_not_attempted_count: 0 bbh-temporal_sequences_score: 75 bbh-temporal_sequences_score_given_attempted: 75 bbh-temporal_sequences_attempted_ratio: 100 bbh-temporal_sequences_correct_count: 12 bbh-temporal_sequences_incorrect_count: 4 bbh-temporal_sequences_not_attempted_count: 0 cmo_fib_accuracy: 75 drop_accuracy: 56.25 drop_f1: 0.56 drop_accuracy_given_attempted: 56.25 drop_attempted_ratio: 100 drop_correct_count: 9 drop_incorrect_count: 7 drop_not_attempted_count: 0 GaokaoBench_2010-2022_Math_II_MCQs_score: 100 GPQA_diamond_accuracy: 62.5 GPQA_diamond_f1: 0.62 GPQA_diamond_accuracy_given_attempted: 62.5 GPQA_diamond_attempted_ratio: 100 GPQA_diamond_correct_count: 10 GPQA_diamond_incorrect_count: 6 GPQA_diamond_not_attempted_count: 0 gsm8k_accuracy: 100 hellaswag_accuracy: 100 hellaswag_f1: 1 hellaswag_accuracy_given_attempted: 100 hellaswag_attempted_ratio: 100 hellaswag_correct_count: 16 hellaswag_incorrect_count: 0 hellaswag_not_attempted_count: 0 korbench_cipher_accuracy: 93.75 korbench_cipher_f1: 0.94 korbench_cipher_accuracy_given_attempted: 93.75 korbench_cipher_attempted_ratio: 100 korbench_cipher_correct_count: 15 korbench_cipher_incorrect_count: 1 korbench_cipher_not_attempted_count: 0 math_prm800k_500-llmjudge_accuracy: 100 math_prm800k_500-llmjudge_f1: 1 math_prm800k_500-llmjudge_accuracy_given_attempted: 100 math_prm800k_500-llmjudge_attempted_ratio: 100 math_prm800k_500-llmjudge_correct_count: 16 math_prm800k_500-llmjudge_incorrect_count: 0 math_prm800k_500-llmjudge_not_attempted_count: 0 mathbench-college-single_choice_cn_acc_4: 93.75 mathbench-college-single_choice_cn_acc_1: 75 mathbench-college-single_choice_cn_more_1_0: 100 mathbench-college-single_choice_cn_more_1_1: 75 mathbench-college-single_choice_cn_more_4_0: 100 mathbench-college-single_choice_cn_more_4_1: 100 mathbench-college-single_choice_cn_more_4_2: 100 mathbench-college-single_choice_cn_more_4_3: 100 mathbench-college-single_choice_cn_more_4_4: 75 mathbench-college-single_choice_cn_perf_1: 75 mathbench-college-single_choice_cn_perf_4: 75 mathbench-college-single_choice_cn_vote_4: 100 mathbench-college-single_choice_cn_vote_1: 75 mathbench-college-single_choice_cn_prior_A: 31.25 mathbench-college-single_choice_cn_prior_B: 18.75 mathbench-college-single_choice_cn_prior_C: 25 mathbench-college-single_choice_cn_prior_D: 25 mathbench-college-single_choice_cn_prior_-: 0 musr_murder_mysteries_accuracy: 75 musr_murder_mysteries_f1: 0.75 musr_murder_mysteries_accuracy_given_attempted: 75 musr_murder_mysteries_attempted_ratio: 100 musr_murder_mysteries_correct_count: 12 musr_murder_mysteries_incorrect_count: 4 musr_murder_mysteries_not_attempted_count: 0 supergpqa_accuracy: 75 supergpqa_total_correct: 12 supergpqa_total_count: 16 supergpqa_SuperGPQA-Engineering: 80 supergpqa_SuperGPQA-Philosophy: 0 supergpqa_SuperGPQA-Medicine: 100 supergpqa_SuperGPQA-Economics: 100 supergpqa_SuperGPQA-Science: 66.67 supergpqa_SuperGPQA-Engineering-Electronic Science and Technology: 100 supergpqa_SuperGPQA-Philosophy-Philosophy: 0 supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine: 100 supergpqa_SuperGPQA-Economics-Applied Economics: 100 supergpqa_SuperGPQA-Science-Mathematics: 80 supergpqa_SuperGPQA-Science-Physics: 0 supergpqa_SuperGPQA-Medicine-Clinical Medicine: 100 supergpqa_SuperGPQA-Engineering-Computer Science and Technology: 100 supergpqa_SuperGPQA-Engineering-Information and Communication Engineering: 50 supergpqa_SuperGPQA-Engineering-Control Science and Engineering: 100 supergpqa_SuperGPQA-Economics-Theoretical Economics: 100 supergpqa_SuperGPQA-Engineering-Electronic Science and Technology-Circuits and Systems: 100 supergpqa_SuperGPQA-Philosophy-Philosophy-Philosophical Aesthetics: 0 supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine-Traditional Chinese Medicine Theory: 100 supergpqa_SuperGPQA-Economics-Applied Economics-Finance: 100 supergpqa_SuperGPQA-Science-Mathematics-Combinatorial Mathematics: 100 supergpqa_SuperGPQA-Science-Physics-Particle and Nuclear Physics: 0 supergpqa_SuperGPQA-Medicine-Clinical Medicine-Internal Medicine: 100 supergpqa_SuperGPQA-Engineering-Computer Science and Technology-Computer Architecture: 100 supergpqa_SuperGPQA-Science-Mathematics-Ordinary Differential Equations: 100 supergpqa_SuperGPQA-Science-Mathematics-Mathematical Analysis: 0 supergpqa_SuperGPQA-Engineering-Information and Communication Engineering-Signal and Information Processing: 50 supergpqa_SuperGPQA-Science-Mathematics-Advanced Algebra: 100 supergpqa_SuperGPQA-Engineering-Control Science and Engineering-Control Theory and Control Engineering: 100 supergpqa_SuperGPQA-Economics-Theoretical Economics-Political Economy: 100 triviaqa_wiki_1shot_score: 50 ARC_Prize_Public_Evaluation_accuracy: 0.06 objective_v7: aime2024_accuracy: 75 aime2025_accuracy: 56.25 bbeh_boolean_expressions_accuracy: 68.75 bbeh_boolean_expressions_f1: 0.69 bbeh_boolean_expressions_accuracy_given_attempted: 68.75 bbeh_boolean_expressions_attempted_ratio: 100 bbeh_boolean_expressions_correct_count: 11 bbeh_boolean_expressions_incorrect_count: 5 bbeh_boolean_expressions_not_attempted_count: 0 Chem_exam-competition_final_score: 42.29 Chem_exam-gaokao_final_score: 71.88 ChemBench_Name_Conversion_accuracy: 100.00 ChemBench_Name_Conversion_f1: 1.00 ChemBench_Name_Conversion_accuracy_given_attempted: 100 ChemBench_Name_Conversion_attempted_ratio: 100.00 ChemBench_Name_Conversion_correct_count: 16.00 ChemBench_Name_Conversion_incorrect_count: 0 ChemBench_Name_Conversion_not_attempted_count: 0 ClimaQA_Gold_mcq_accuracy: 87.5 ClimaQA_Gold_mcq_f1: 0.88 ClimaQA_Gold_mcq_accuracy_given_attempted: 87.5 ClimaQA_Gold_mcq_attempted_ratio: 100 ClimaQA_Gold_mcq_correct_count: 14 ClimaQA_Gold_mcq_incorrect_count: 2 ClimaQA_Gold_mcq_not_attempted_count: 0 cmmlu-agronomy_accuracy: 87.50 cmmlu-agronomy_f1: 0.88 cmmlu-agronomy_accuracy_given_attempted: 87.50 cmmlu-agronomy_attempted_ratio: 100.00 cmmlu-agronomy_correct_count: 14.00 cmmlu-agronomy_incorrect_count: 2 cmmlu-agronomy_not_attempted_count: 0.00 earth_silver_mcq_accuracy: 81.25 earth_silver_mcq_f1: 0.81 earth_silver_mcq_accuracy_given_attempted: 81.25 earth_silver_mcq_attempted_ratio: 100 earth_silver_mcq_correct_count: 13 earth_silver_mcq_incorrect_count: 3 earth_silver_mcq_not_attempted_count: 0 GPQA_diamond_accuracy: 68.75 hle_llmjudge_accuracy: 43.75 hle_llmjudge_f1: 0.43 hle_llmjudge_accuracy_given_attempted: 43.75 hle_llmjudge_attempted_ratio: 100.00 hle_llmjudge_correct_count: 7.00 hle_llmjudge_incorrect_count: 9.00 hle_llmjudge_not_attempted_count: 0.00 IFEval_Prompt-level-strict-accuracy: 87.5 IFEval_Inst-level-strict-accuracy: 91.67 IFEval_Prompt-level-loose-accuracy: 87.5 IFEval_Inst-level-loose-accuracy: 91.67 kcle_accuracy: 75.00 kcle_f1: 0.75 kcle_accuracy_given_attempted: 75.00 kcle_attempted_ratio: 100.00 kcle_correct_count: 12 kcle_incorrect_count: 4 kcle_not_attempted_count: 0 korbench_cipher_accuracy: 81.25 livemathbench_hard_custom_hard_cn_accuracy: 25 matbench_steels_mae: 816.50 math_prm800k_500_accuracy: 100 sanitized_mbpp_score: 6.25 sanitized_mbpp_pass: 1 sanitized_mbpp_timeout: 0 sanitized_mbpp_failed: 15 sanitized_mbpp_wrong_answer: 0 medxpertqa_accuracy: 62.5 medxpertqa_total_correct: 10 medxpertqa_total_count: 16 medxpertqa_MedXpertQA-Basic Science: 75 medxpertqa_MedXpertQA-Diagnosis: 50 medxpertqa_MedXpertQA-Treatment: 75 medxpertqa_MedXpertQA-Skeletal: 66.67 medxpertqa_MedXpertQA-Muscular: 100 medxpertqa_MedXpertQA-Respiratory: 33.33 medxpertqa_MedXpertQA-Endocrine: 66.67 medxpertqa_MedXpertQA-Cardiovascular: 100 medxpertqa_MedXpertQA-Lymphatic: 0 medxpertqa_MedXpertQA-Nervous: 100 medxpertqa_MedXpertQA-Reproductive: 0 medxpertqa_MedXpertQA-Reasoning: 64.29 medxpertqa_MedXpertQA-Understanding: 50 lukaemon_mmlu_college_biology_accuracy: 81.25 lukaemon_mmlu_college_biology_f1: 0.81 lukaemon_mmlu_college_biology_accuracy_given_attempted: 81.25 lukaemon_mmlu_college_biology_attempted_ratio: 100 lukaemon_mmlu_college_biology_correct_count: 13 lukaemon_mmlu_college_biology_incorrect_count: 3 lukaemon_mmlu_college_biology_not_attempted_count: 0 mmlu_pro_math_accuracy: 68.75 mmlu_pro_math_f1: 0.69 mmlu_pro_math_accuracy_given_attempted: 68.75 mmlu_pro_math_attempted_ratio: 100 mmlu_pro_math_correct_count: 11 mmlu_pro_math_incorrect_count: 5 mmlu_pro_math_not_attempted_count: 0 olymmath_llmjudge_en-hard_accuracy: 68.75 olymmath_llmjudge_en-hard_f1: 0.69 olymmath_llmjudge_en-hard_accuracy_given_attempted: 68.75 olymmath_llmjudge_en-hard_attempted_ratio: 100.00 olymmath_llmjudge_en-hard_correct_count: 11 olymmath_llmjudge_en-hard_incorrect_count: 5 olymmath_llmjudge_en-hard_not_attempted_count: 0 OlympiadBench_OE_TO_maths_en_COMP_accuracy: 43.75 OlympiadBench_OE_TO_maths_en_COMP_f1: 0.44 OlympiadBench_OE_TO_maths_en_COMP_accuracy_given_attempted: 43.75 OlympiadBench_OE_TO_maths_en_COMP_attempted_ratio: 100.00 OlympiadBench_OE_TO_maths_en_COMP_correct_count: 7 OlympiadBench_OE_TO_maths_en_COMP_incorrect_count: 9 OlympiadBench_OE_TO_maths_en_COMP_not_attempted_count: 0 phybench-eed_accuracy: 27.94 PHYSICS_atomic_dataset_textonly_accuracy: 93.75 PHYSICS_atomic_dataset_textonly_f1: 0.94 PHYSICS_atomic_dataset_textonly_accuracy_given_attempted: 93.75 PHYSICS_atomic_dataset_textonly_attempted_ratio: 100 PHYSICS_atomic_dataset_textonly_correct_count: 15 PHYSICS_atomic_dataset_textonly_incorrect_count: 1 PHYSICS_atomic_dataset_textonly_not_attempted_count: 0 ProteinLMBench_accuracy: 68.75 ProteinLMBench_f1: 0.69 ProteinLMBench_accuracy_given_attempted: 68.75 ProteinLMBench_attempted_ratio: 100 ProteinLMBench_correct_count: 11 ProteinLMBench_incorrect_count: 5 ProteinLMBench_not_attempted_count: 0 R-Bench_en_accuracy: 50.00 R-Bench_en_f1: 0.50 R-Bench_en_accuracy_given_attempted: 50.00 R-Bench_en_attempted_ratio: 100.00 R-Bench_en_correct_count: 8 R-Bench_en_incorrect_count: 8 R-Bench_en_not_attempted_count: 0 NC-I2F-0shot-instruct_score: 12.5 NC-I2F-0shot-instruct_valid_score: 75 srbench_mean_RMSE: 61339597.01 srbench_mean_NMSE: 83.61 srbench_mean_R2: -82.61 srbench_SymbolicMatch: 0.14 supergpqa_Electronic_Science_and_Technology_accuracy: 56.25 lcb_code_generation_v6_pass@1: 50 chat_longtext: babilong_qa1_256k_score: 0.00 LongBench_2wikimqa_score: 5.43 Length16000Depth0_2needle_en_128k_score: 100.00 ruler_cwe_128k_score: 0.00 chat_subjective: alignment_bench_v1_1_总分: 0.48 arenahard_score: 92.07 Followbench_naive_average: 1 mtbench101_avg: 8.7 wildbench_average: 75.17 simpleqa_accuracy_given_attempted: 0.55 chinese_simpleqa_given_attempted_accuracy: 0.64 alignment_bench_v1_1_专业能力: 5.81 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 alignment_bench_v1_1_中文理解: 0 alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 compassarena_language_naive_average: 55 compassarena_knowledge_naive_average: 100 compassarena_reason_v2_naive_average: 78.95 compassarena_creationv2_zh_naive_average: 100 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 followbench_llmeval_en_HSR_L2: 1 followbench_llmeval_en_HSR_L3: 1 followbench_llmeval_en_HSR_L4: 1 followbench_llmeval_en_HSR_L5: 1 followbench_llmeval_en_SSR_L1: 1 followbench_llmeval_en_SSR_L2: 1 followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L5: 1 simpleqa_f1: 0.46 qwen3-8b-base-turbomind: base_longtext: LongBench_2wikimqa_score: 26.98 Length32000Depth0_origin_en_32000_score: 100.00 qwen-3-8b-base-hf-fullbench: objective_base: race-high_accuracy: 87.5 ARC-c_accuracy: 75 BoolQ_accuracy: 75 triviaqa_wiki_1shot_score: 37.5 nq_open_1shot_score: 6.25 drop_accuracy: 75 GPQA_diamond_accuracy: 81.25 hellaswag_accuracy: 81.25 TheoremQA_score: 18.75 winogrande_accuracy: 81.25 gsm8k_accuracy: 81.25 GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 math_accuracy: 87.5 wikibench-wiki-single_choice_cncircular_perf_4: 50 sanitized_mbpp_score: 93.75 mmlu-other_accuracy: 76.44 cmmlu-china-specific_accuracy: 82.5 mmlu_pro_math_accuracy: 50 bbh-logical_deduction_seven_objects_score: 37.5 bbh-multistep_arithmetic_two_score: 0 college_naive_average: 37.5 college_knowledge_naive_average: 87.5 qwen-3-8b-base-fullbench: objective_base: race-high_accuracy: 87.5 ARC-c_accuracy: 75 BoolQ_accuracy: 81.25 triviaqa_wiki_1shot_score: 37.5 nq_open_1shot_score: 6.25 drop_accuracy: 81.25 GPQA_diamond_accuracy: 81.25 hellaswag_accuracy: 75 TheoremQA_score: 18.75 winogrande_accuracy: 81.25 gsm8k_accuracy: 81.25 GaokaoBench_2010-2022_Math_II_MCQs_score: 100 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 math_accuracy: 81.25 wikibench-wiki-single_choice_cncircular_perf_4: 50 sanitized_mbpp_score: 93.75 mmlu-other_accuracy: 75.48 cmmlu-china-specific_accuracy: 83.75 mmlu_pro_math_accuracy: 50 bbh-logical_deduction_seven_objects_score: 62.5 bbh-multistep_arithmetic_two_score: 100 college_naive_average: 37.5 college_knowledge_naive_average: 87.5