| qwen-3-8b-hf-fullbench: | |
| objective_other: | |
| C-MHChem_accuracy: 81.25 | |
| C-MHChem_f1: 0.81 | |
| C-MHChem_accuracy_given_attempted: 81.25 | |
| C-MHChem_attempted_ratio: 100 | |
| C-MHChem_correct_count: 13 | |
| C-MHChem_incorrect_count: 3 | |
| C-MHChem_not_attempted_count: 0 | |
| CPsyExam_accuracy: 87.5 | |
| CPsyExam_f1: 0.88 | |
| CPsyExam_accuracy_given_attempted: 87.5 | |
| CPsyExam_attempted_ratio: 100 | |
| CPsyExam_correct_count: 14 | |
| CPsyExam_incorrect_count: 2 | |
| CPsyExam_not_attempted_count: 0 | |
| MaScQA_accuracy: 75 | |
| MaScQA_f1: 0.75 | |
| MaScQA_accuracy_given_attempted: 75 | |
| MaScQA_attempted_ratio: 100 | |
| MaScQA_correct_count: 12 | |
| MaScQA_incorrect_count: 4 | |
| MaScQA_not_attempted_count: 0 | |
| UGPhysics_AtomicPhysics_zh_accuracy: 56.25 | |
| objective_v5: | |
| race-high_accuracy: 87.5 | |
| ARC-c_accuracy: 100 | |
| BoolQ_accuracy: 93.75 | |
| triviaqa_wiki_1shot_score: 0 | |
| nq_open_1shot_score: 0 | |
| IFEval_Prompt-level-strict-accuracy: 87.50 | |
| drop_accuracy: 93.75 | |
| GPQA_diamond_accuracy: 62.50 | |
| hellaswag_accuracy: 75 | |
| TheoremQA_score: 6.25 | |
| musr_average_naive_average: 14.58 | |
| korbench_single_naive_average: 75.00 | |
| gsm8k_accuracy: 75 | |
| math_accuracy: 93.75 | |
| cmo_fib_accuracy: 0 | |
| aime2024_accuracy: 6.25 | |
| wikibench-wiki-single_choice_cncircular_perf_4: 25 | |
| sanitized_mbpp_score: 6.25 | |
| lcb_code_generation_pass@1: 6.25 | |
| lcb_code_execution_pass@1: 50 | |
| lcb_test_output_pass@1: 0 | |
| teval_naive_average: 69.16 | |
| bbh-logical_deduction_seven_objects_score: 12.5 | |
| bbh-multistep_arithmetic_two_score: 0 | |
| mmlu-other_accuracy: 77.88 | |
| cmmlu-china-specific_accuracy: 74.17 | |
| mmlu_pro_math_accuracy: 81.25 | |
| openai_mmmlu_lite_AR-XY_accuracy: 50 | |
| college_naive_average: 37.5 | |
| college_knowledge_naive_average: 50 | |
| objective_v6: | |
| aime2024_accuracy: 87.5 | |
| aime2024_f1: 0.88 | |
| aime2024_accuracy_given_attempted: 87.5 | |
| aime2024_attempted_ratio: 100 | |
| aime2024_correct_count: 14 | |
| aime2024_incorrect_count: 2 | |
| aime2024_not_attempted_count: 0 | |
| aime2025_accuracy: 56.25 | |
| aime2025_f1: 0.56 | |
| aime2025_accuracy_given_attempted: 56.25 | |
| aime2025_attempted_ratio: 100 | |
| aime2025_correct_count: 9 | |
| aime2025_incorrect_count: 7 | |
| aime2025_not_attempted_count: 0 | |
| bbh-temporal_sequences_score: 75 | |
| bbh-temporal_sequences_score_given_attempted: 75 | |
| bbh-temporal_sequences_attempted_ratio: 100 | |
| bbh-temporal_sequences_correct_count: 12 | |
| bbh-temporal_sequences_incorrect_count: 4 | |
| bbh-temporal_sequences_not_attempted_count: 0 | |
| cmo_fib_accuracy: 31.25 | |
| drop_accuracy: 56.25 | |
| drop_f1: 0.56 | |
| drop_accuracy_given_attempted: 56.25 | |
| drop_attempted_ratio: 100 | |
| drop_correct_count: 9 | |
| drop_incorrect_count: 7 | |
| drop_not_attempted_count: 0 | |
| GaokaoBench_2010-2022_Math_II_MCQs_score: 100 | |
| GPQA_diamond_accuracy: 56.25 | |
| GPQA_diamond_f1: 0.56 | |
| GPQA_diamond_accuracy_given_attempted: 56.25 | |
| GPQA_diamond_attempted_ratio: 100 | |
| GPQA_diamond_correct_count: 9 | |
| GPQA_diamond_incorrect_count: 7 | |
| GPQA_diamond_not_attempted_count: 0 | |
| gsm8k_accuracy: 93.75 | |
| hellaswag_accuracy: 100 | |
| hellaswag_f1: 1 | |
| hellaswag_accuracy_given_attempted: 100 | |
| hellaswag_attempted_ratio: 100 | |
| hellaswag_correct_count: 16 | |
| hellaswag_incorrect_count: 0 | |
| hellaswag_not_attempted_count: 0 | |
| korbench_cipher_accuracy: 93.75 | |
| korbench_cipher_f1: 0.94 | |
| korbench_cipher_accuracy_given_attempted: 93.75 | |
| korbench_cipher_attempted_ratio: 100 | |
| korbench_cipher_correct_count: 15 | |
| korbench_cipher_incorrect_count: 1 | |
| korbench_cipher_not_attempted_count: 0 | |
| math_prm800k_500-llmjudge_accuracy: 93.75 | |
| math_prm800k_500-llmjudge_f1: 0.94 | |
| math_prm800k_500-llmjudge_accuracy_given_attempted: 93.75 | |
| math_prm800k_500-llmjudge_attempted_ratio: 100 | |
| math_prm800k_500-llmjudge_correct_count: 15 | |
| math_prm800k_500-llmjudge_incorrect_count: 1 | |
| math_prm800k_500-llmjudge_not_attempted_count: 0 | |
| mathbench-college-single_choice_cn_acc_4: 87.5 | |
| mathbench-college-single_choice_cn_acc_1: 75 | |
| mathbench-college-single_choice_cn_more_1_0: 100 | |
| mathbench-college-single_choice_cn_more_1_1: 75 | |
| mathbench-college-single_choice_cn_more_4_0: 100 | |
| mathbench-college-single_choice_cn_more_4_1: 100 | |
| mathbench-college-single_choice_cn_more_4_2: 100 | |
| mathbench-college-single_choice_cn_more_4_3: 100 | |
| mathbench-college-single_choice_cn_more_4_4: 50 | |
| mathbench-college-single_choice_cn_perf_1: 75 | |
| mathbench-college-single_choice_cn_perf_4: 50 | |
| mathbench-college-single_choice_cn_vote_4: 100 | |
| mathbench-college-single_choice_cn_vote_1: 75 | |
| mathbench-college-single_choice_cn_prior_A: 31.25 | |
| mathbench-college-single_choice_cn_prior_B: 12.5 | |
| mathbench-college-single_choice_cn_prior_C: 25 | |
| mathbench-college-single_choice_cn_prior_D: 25 | |
| mathbench-college-single_choice_cn_prior_-: 6.25 | |
| musr_murder_mysteries_accuracy: 68.75 | |
| musr_murder_mysteries_f1: 0.69 | |
| musr_murder_mysteries_accuracy_given_attempted: 68.75 | |
| musr_murder_mysteries_attempted_ratio: 100 | |
| musr_murder_mysteries_correct_count: 11 | |
| musr_murder_mysteries_incorrect_count: 5 | |
| musr_murder_mysteries_not_attempted_count: 0 | |
| supergpqa_accuracy: 68.75 | |
| supergpqa_total_correct: 11 | |
| supergpqa_total_count: 16 | |
| supergpqa_SuperGPQA-Engineering: 80 | |
| supergpqa_SuperGPQA-Philosophy: 0 | |
| supergpqa_SuperGPQA-Medicine: 100 | |
| supergpqa_SuperGPQA-Economics: 50 | |
| supergpqa_SuperGPQA-Science: 66.67 | |
| supergpqa_SuperGPQA-Engineering-Electronic Science and Technology: 100 | |
| supergpqa_SuperGPQA-Philosophy-Philosophy: 0 | |
| supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine: 100 | |
| supergpqa_SuperGPQA-Economics-Applied Economics: 0 | |
| supergpqa_SuperGPQA-Science-Mathematics: 80 | |
| supergpqa_SuperGPQA-Science-Physics: 0 | |
| supergpqa_SuperGPQA-Medicine-Clinical Medicine: 100 | |
| supergpqa_SuperGPQA-Engineering-Computer Science and Technology: 100 | |
| supergpqa_SuperGPQA-Engineering-Information and Communication Engineering: 50 | |
| supergpqa_SuperGPQA-Engineering-Control Science and Engineering: 100 | |
| supergpqa_SuperGPQA-Economics-Theoretical Economics: 100 | |
| supergpqa_SuperGPQA-Engineering-Electronic Science and Technology-Circuits and Systems: 100 | |
| supergpqa_SuperGPQA-Philosophy-Philosophy-Philosophical Aesthetics: 0 | |
| supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine-Traditional Chinese Medicine Theory: 100 | |
| supergpqa_SuperGPQA-Economics-Applied Economics-Finance: 0 | |
| supergpqa_SuperGPQA-Science-Mathematics-Combinatorial Mathematics: 100 | |
| supergpqa_SuperGPQA-Science-Physics-Particle and Nuclear Physics: 0 | |
| supergpqa_SuperGPQA-Medicine-Clinical Medicine-Internal Medicine: 100 | |
| supergpqa_SuperGPQA-Engineering-Computer Science and Technology-Computer Architecture: 100 | |
| supergpqa_SuperGPQA-Science-Mathematics-Ordinary Differential Equations: 100 | |
| supergpqa_SuperGPQA-Science-Mathematics-Mathematical Analysis: 0 | |
| supergpqa_SuperGPQA-Engineering-Information and Communication Engineering-Signal and Information Processing: 50 | |
| supergpqa_SuperGPQA-Science-Mathematics-Advanced Algebra: 100 | |
| supergpqa_SuperGPQA-Engineering-Control Science and Engineering-Control Theory and Control Engineering: 100 | |
| supergpqa_SuperGPQA-Economics-Theoretical Economics-Political Economy: 100 | |
| triviaqa_wiki_1shot_score: 50 | |
| ARC_Prize_Public_Evaluation_accuracy: 0.06 | |
| objective_v7: | |
| aime2024_accuracy: 50 | |
| aime2025_accuracy: 25 | |
| bbeh_boolean_expressions_accuracy: 75 | |
| bbeh_boolean_expressions_f1: 0.75 | |
| bbeh_boolean_expressions_accuracy_given_attempted: 75 | |
| bbeh_boolean_expressions_attempted_ratio: 100 | |
| bbeh_boolean_expressions_correct_count: 12 | |
| bbeh_boolean_expressions_incorrect_count: 4 | |
| bbeh_boolean_expressions_not_attempted_count: 0 | |
| Chem_exam-competition_final_score: 40.39 | |
| Chem_exam-gaokao_final_score: 78.12 | |
| ChemBench_Name_Conversion_accuracy: 93.75 | |
| ChemBench_Name_Conversion_f1: 0.94 | |
| ChemBench_Name_Conversion_accuracy_given_attempted: 93.75 | |
| ChemBench_Name_Conversion_attempted_ratio: 100 | |
| ChemBench_Name_Conversion_correct_count: 15 | |
| ChemBench_Name_Conversion_incorrect_count: 1 | |
| ChemBench_Name_Conversion_not_attempted_count: 0 | |
| ClimaQA_Gold_mcq_accuracy: 87.5 | |
| ClimaQA_Gold_mcq_f1: 0.88 | |
| ClimaQA_Gold_mcq_accuracy_given_attempted: 87.5 | |
| ClimaQA_Gold_mcq_attempted_ratio: 100 | |
| ClimaQA_Gold_mcq_correct_count: 14 | |
| ClimaQA_Gold_mcq_incorrect_count: 2 | |
| ClimaQA_Gold_mcq_not_attempted_count: 0 | |
| cmmlu-agronomy_accuracy: 87.5 | |
| cmmlu-agronomy_f1: 0.88 | |
| cmmlu-agronomy_accuracy_given_attempted: 87.5 | |
| cmmlu-agronomy_attempted_ratio: 100 | |
| cmmlu-agronomy_correct_count: 14 | |
| cmmlu-agronomy_incorrect_count: 2 | |
| cmmlu-agronomy_not_attempted_count: 0 | |
| earth_silver_mcq_accuracy: 75 | |
| earth_silver_mcq_f1: 0.75 | |
| earth_silver_mcq_accuracy_given_attempted: 75 | |
| earth_silver_mcq_attempted_ratio: 100 | |
| earth_silver_mcq_correct_count: 12 | |
| earth_silver_mcq_incorrect_count: 4 | |
| earth_silver_mcq_not_attempted_count: 0 | |
| GPQA_diamond_accuracy: 56.25 | |
| hle_llmjudge_accuracy: 56.25 | |
| hle_llmjudge_f1: 0.56 | |
| hle_llmjudge_accuracy_given_attempted: 56.25 | |
| hle_llmjudge_attempted_ratio: 100 | |
| hle_llmjudge_correct_count: 9 | |
| hle_llmjudge_incorrect_count: 7 | |
| hle_llmjudge_not_attempted_count: 0 | |
| IFEval_Prompt-level-strict-accuracy: 87.5 | |
| IFEval_Inst-level-strict-accuracy: 91.67 | |
| IFEval_Prompt-level-loose-accuracy: 87.5 | |
| IFEval_Inst-level-loose-accuracy: 91.67 | |
| kcle_accuracy: 62.5 | |
| kcle_f1: 0.62 | |
| kcle_accuracy_given_attempted: 62.5 | |
| kcle_attempted_ratio: 100 | |
| kcle_correct_count: 10 | |
| kcle_incorrect_count: 6 | |
| kcle_not_attempted_count: 0 | |
| korbench_cipher_accuracy: 87.5 | |
| livemathbench_hard_custom_hard_cn_accuracy: 18.75 | |
| matbench_steels_mae: 635.88 | |
| math_prm800k_500_accuracy: 93.75 | |
| sanitized_mbpp_score: 6.25 | |
| sanitized_mbpp_pass: 1 | |
| sanitized_mbpp_timeout: 0 | |
| sanitized_mbpp_failed: 15 | |
| sanitized_mbpp_wrong_answer: 0 | |
| medxpertqa_accuracy: 50 | |
| medxpertqa_total_correct: 8 | |
| medxpertqa_total_count: 16 | |
| medxpertqa_MedXpertQA-Basic Science: 50 | |
| medxpertqa_MedXpertQA-Diagnosis: 37.5 | |
| medxpertqa_MedXpertQA-Treatment: 75 | |
| medxpertqa_MedXpertQA-Skeletal: 33.33 | |
| medxpertqa_MedXpertQA-Muscular: 100 | |
| medxpertqa_MedXpertQA-Respiratory: 0 | |
| medxpertqa_MedXpertQA-Endocrine: 100 | |
| medxpertqa_MedXpertQA-Cardiovascular: 50 | |
| medxpertqa_MedXpertQA-Lymphatic: 0 | |
| medxpertqa_MedXpertQA-Nervous: 100 | |
| medxpertqa_MedXpertQA-Reproductive: 0 | |
| medxpertqa_MedXpertQA-Reasoning: 50 | |
| medxpertqa_MedXpertQA-Understanding: 50 | |
| lukaemon_mmlu_college_biology_accuracy: 81.25 | |
| lukaemon_mmlu_college_biology_f1: 0.81 | |
| lukaemon_mmlu_college_biology_accuracy_given_attempted: 81.25 | |
| lukaemon_mmlu_college_biology_attempted_ratio: 100 | |
| lukaemon_mmlu_college_biology_correct_count: 13 | |
| lukaemon_mmlu_college_biology_incorrect_count: 3 | |
| lukaemon_mmlu_college_biology_not_attempted_count: 0 | |
| mmlu_pro_math_accuracy: 81.25 | |
| mmlu_pro_math_f1: 0.81 | |
| mmlu_pro_math_accuracy_given_attempted: 81.25 | |
| mmlu_pro_math_attempted_ratio: 100 | |
| mmlu_pro_math_correct_count: 13 | |
| mmlu_pro_math_incorrect_count: 3 | |
| mmlu_pro_math_not_attempted_count: 0 | |
| olymmath_llmjudge_en-hard_accuracy: 68.75 | |
| olymmath_llmjudge_en-hard_f1: 0.69 | |
| olymmath_llmjudge_en-hard_accuracy_given_attempted: 68.75 | |
| olymmath_llmjudge_en-hard_attempted_ratio: 100 | |
| olymmath_llmjudge_en-hard_correct_count: 11 | |
| olymmath_llmjudge_en-hard_incorrect_count: 5 | |
| olymmath_llmjudge_en-hard_not_attempted_count: 0 | |
| OlympiadBench_OE_TO_maths_en_COMP_accuracy: 50 | |
| OlympiadBench_OE_TO_maths_en_COMP_f1: 0.5 | |
| OlympiadBench_OE_TO_maths_en_COMP_accuracy_given_attempted: 50 | |
| OlympiadBench_OE_TO_maths_en_COMP_attempted_ratio: 100 | |
| OlympiadBench_OE_TO_maths_en_COMP_correct_count: 8 | |
| OlympiadBench_OE_TO_maths_en_COMP_incorrect_count: 8 | |
| OlympiadBench_OE_TO_maths_en_COMP_not_attempted_count: 0 | |
| phybench-eed_accuracy: 6.25 | |
| PHYSICS_atomic_dataset_textonly_accuracy: 81.25 | |
| PHYSICS_atomic_dataset_textonly_f1: 0.81 | |
| PHYSICS_atomic_dataset_textonly_accuracy_given_attempted: 81.25 | |
| PHYSICS_atomic_dataset_textonly_attempted_ratio: 100 | |
| PHYSICS_atomic_dataset_textonly_correct_count: 13 | |
| PHYSICS_atomic_dataset_textonly_incorrect_count: 3 | |
| PHYSICS_atomic_dataset_textonly_not_attempted_count: 0 | |
| ProteinLMBench_accuracy: 62.5 | |
| ProteinLMBench_f1: 0.62 | |
| ProteinLMBench_accuracy_given_attempted: 62.5 | |
| ProteinLMBench_attempted_ratio: 100 | |
| ProteinLMBench_correct_count: 10 | |
| ProteinLMBench_incorrect_count: 6 | |
| ProteinLMBench_not_attempted_count: 0 | |
| R-Bench_en_accuracy: 62.5 | |
| R-Bench_en_f1: 0.62 | |
| R-Bench_en_accuracy_given_attempted: 62.5 | |
| R-Bench_en_attempted_ratio: 100 | |
| R-Bench_en_correct_count: 10 | |
| R-Bench_en_incorrect_count: 6 | |
| R-Bench_en_not_attempted_count: 0 | |
| NC-I2F-0shot-instruct_score: 0 | |
| NC-I2F-0shot-instruct_valid_score: 68.75 | |
| srbench_mean_RMSE: 30057.14 | |
| srbench_mean_NMSE: 29.42 | |
| srbench_mean_R2: -28.42 | |
| srbench_SymbolicMatch: 0.18 | |
| supergpqa_Electronic_Science_and_Technology_accuracy: 68.75 | |
| lcb_code_generation_v6_pass@1: 37.5 | |
| chat_subjective: | |
| alignment_bench_v1_1_总分: 0.46 | |
| arenahard_score: 100 | |
| Followbench_naive_average: 1 | |
| mtbench101_avg: 9 | |
| wildbench_average: 66.72 | |
| simpleqa_accuracy_given_attempted: 0.73 | |
| chinese_simpleqa_given_attempted_accuracy: 0.5 | |
| alignment_bench_v1_1_专业能力: 5.56 | |
| alignment_bench_v1_1_数学计算: 0 | |
| alignment_bench_v1_1_基本任务: 0 | |
| alignment_bench_v1_1_逻辑推理: 0 | |
| alignment_bench_v1_1_中文理解: 0 | |
| alignment_bench_v1_1_文本写作: 0 | |
| alignment_bench_v1_1_角色扮演: 0 | |
| alignment_bench_v1_1_综合问答: 0 | |
| compassarena_language_naive_average: 52.63 | |
| compassarena_knowledge_naive_average: 85 | |
| compassarena_reason_v2_naive_average: 50 | |
| compassarena_creationv2_zh_naive_average: 95 | |
| followbench_llmeval_en_HSR_AVG: 1 | |
| followbench_llmeval_en_SSR_AVG: 1 | |
| followbench_llmeval_en_HSR_L1: 1 | |
| followbench_llmeval_en_HSR_L2: 1 | |
| followbench_llmeval_en_HSR_L3: 1 | |
| followbench_llmeval_en_HSR_L4: 1 | |
| followbench_llmeval_en_HSR_L5: 1 | |
| followbench_llmeval_en_SSR_L1: 1 | |
| followbench_llmeval_en_SSR_L2: 1 | |
| followbench_llmeval_en_SSR_L3: 1 | |
| followbench_llmeval_en_SSR_L4: 1 | |
| followbench_llmeval_en_SSR_L5: 1 | |
| simpleqa_f1: 0.62 | |
| qwen-3-8b-fullbench: | |
| objective_other: | |
| C-MHChem_accuracy: 75 | |
| C-MHChem_f1: 0.75 | |
| C-MHChem_accuracy_given_attempted: 75 | |
| C-MHChem_attempted_ratio: 100 | |
| C-MHChem_correct_count: 12 | |
| C-MHChem_incorrect_count: 4 | |
| C-MHChem_not_attempted_count: 0 | |
| CPsyExam_accuracy: 93.75 | |
| CPsyExam_f1: 0.94 | |
| CPsyExam_accuracy_given_attempted: 93.75 | |
| CPsyExam_attempted_ratio: 100 | |
| CPsyExam_correct_count: 15 | |
| CPsyExam_incorrect_count: 1 | |
| CPsyExam_not_attempted_count: 0 | |
| MaScQA_accuracy: 81.25 | |
| MaScQA_f1: 0.81 | |
| MaScQA_accuracy_given_attempted: 81.25 | |
| MaScQA_attempted_ratio: 100 | |
| MaScQA_correct_count: 13 | |
| MaScQA_incorrect_count: 3 | |
| MaScQA_not_attempted_count: 0 | |
| UGPhysics_AtomicPhysics_zh_accuracy: 68.75 | |
| objective_v5: | |
| race-high_accuracy: 81.25 | |
| ARC-c_accuracy: 100 | |
| BoolQ_accuracy: 87.5 | |
| triviaqa_wiki_1shot_score: 0 | |
| nq_open_1shot_score: 0 | |
| IFEval_Prompt-level-strict-accuracy: 87.50 | |
| drop_accuracy: 93.75 | |
| GPQA_diamond_accuracy: 75 | |
| hellaswag_accuracy: 87.5 | |
| TheoremQA_score: 6.25 | |
| musr_average_naive_average: 20.83 | |
| korbench_single_naive_average: 72.5 | |
| gsm8k_accuracy: 68.75 | |
| math_accuracy: 93.75 | |
| cmo_fib_accuracy: 0 | |
| aime2024_accuracy: 0 | |
| wikibench-wiki-single_choice_cncircular_perf_4: 50 | |
| sanitized_mbpp_score: 6.25 | |
| lcb_code_generation_pass@1: 6.25 | |
| lcb_code_execution_pass@1: 68.75 | |
| lcb_test_output_pass@1: 0 | |
| teval_naive_average: 67.43 | |
| bbh-logical_deduction_seven_objects_score: 6.25 | |
| bbh-multistep_arithmetic_two_score: 0 | |
| mmlu-other_accuracy: 81.25 | |
| cmmlu-china-specific_accuracy: 75.83 | |
| mmlu_pro_math_accuracy: 87.5 | |
| openai_mmmlu_lite_AR-XY_accuracy: 43.75 | |
| college_naive_average: 25 | |
| college_knowledge_naive_average: 37.5 | |
| objective_v6: | |
| aime2024_accuracy: 68.75 | |
| aime2024_f1: 0.69 | |
| aime2024_accuracy_given_attempted: 68.75 | |
| aime2024_attempted_ratio: 100 | |
| aime2024_correct_count: 11 | |
| aime2024_incorrect_count: 5 | |
| aime2024_not_attempted_count: 0 | |
| aime2025_accuracy: 56.25 | |
| aime2025_f1: 0.56 | |
| aime2025_accuracy_given_attempted: 56.25 | |
| aime2025_attempted_ratio: 100 | |
| aime2025_correct_count: 9 | |
| aime2025_incorrect_count: 7 | |
| aime2025_not_attempted_count: 0 | |
| bbh-temporal_sequences_score: 75 | |
| bbh-temporal_sequences_score_given_attempted: 75 | |
| bbh-temporal_sequences_attempted_ratio: 100 | |
| bbh-temporal_sequences_correct_count: 12 | |
| bbh-temporal_sequences_incorrect_count: 4 | |
| bbh-temporal_sequences_not_attempted_count: 0 | |
| cmo_fib_accuracy: 75 | |
| drop_accuracy: 56.25 | |
| drop_f1: 0.56 | |
| drop_accuracy_given_attempted: 56.25 | |
| drop_attempted_ratio: 100 | |
| drop_correct_count: 9 | |
| drop_incorrect_count: 7 | |
| drop_not_attempted_count: 0 | |
| GaokaoBench_2010-2022_Math_II_MCQs_score: 100 | |
| GPQA_diamond_accuracy: 62.5 | |
| GPQA_diamond_f1: 0.62 | |
| GPQA_diamond_accuracy_given_attempted: 62.5 | |
| GPQA_diamond_attempted_ratio: 100 | |
| GPQA_diamond_correct_count: 10 | |
| GPQA_diamond_incorrect_count: 6 | |
| GPQA_diamond_not_attempted_count: 0 | |
| gsm8k_accuracy: 100 | |
| hellaswag_accuracy: 100 | |
| hellaswag_f1: 1 | |
| hellaswag_accuracy_given_attempted: 100 | |
| hellaswag_attempted_ratio: 100 | |
| hellaswag_correct_count: 16 | |
| hellaswag_incorrect_count: 0 | |
| hellaswag_not_attempted_count: 0 | |
| korbench_cipher_accuracy: 93.75 | |
| korbench_cipher_f1: 0.94 | |
| korbench_cipher_accuracy_given_attempted: 93.75 | |
| korbench_cipher_attempted_ratio: 100 | |
| korbench_cipher_correct_count: 15 | |
| korbench_cipher_incorrect_count: 1 | |
| korbench_cipher_not_attempted_count: 0 | |
| math_prm800k_500-llmjudge_accuracy: 100 | |
| math_prm800k_500-llmjudge_f1: 1 | |
| math_prm800k_500-llmjudge_accuracy_given_attempted: 100 | |
| math_prm800k_500-llmjudge_attempted_ratio: 100 | |
| math_prm800k_500-llmjudge_correct_count: 16 | |
| math_prm800k_500-llmjudge_incorrect_count: 0 | |
| math_prm800k_500-llmjudge_not_attempted_count: 0 | |
| mathbench-college-single_choice_cn_acc_4: 93.75 | |
| mathbench-college-single_choice_cn_acc_1: 75 | |
| mathbench-college-single_choice_cn_more_1_0: 100 | |
| mathbench-college-single_choice_cn_more_1_1: 75 | |
| mathbench-college-single_choice_cn_more_4_0: 100 | |
| mathbench-college-single_choice_cn_more_4_1: 100 | |
| mathbench-college-single_choice_cn_more_4_2: 100 | |
| mathbench-college-single_choice_cn_more_4_3: 100 | |
| mathbench-college-single_choice_cn_more_4_4: 75 | |
| mathbench-college-single_choice_cn_perf_1: 75 | |
| mathbench-college-single_choice_cn_perf_4: 75 | |
| mathbench-college-single_choice_cn_vote_4: 100 | |
| mathbench-college-single_choice_cn_vote_1: 75 | |
| mathbench-college-single_choice_cn_prior_A: 31.25 | |
| mathbench-college-single_choice_cn_prior_B: 18.75 | |
| mathbench-college-single_choice_cn_prior_C: 25 | |
| mathbench-college-single_choice_cn_prior_D: 25 | |
| mathbench-college-single_choice_cn_prior_-: 0 | |
| musr_murder_mysteries_accuracy: 75 | |
| musr_murder_mysteries_f1: 0.75 | |
| musr_murder_mysteries_accuracy_given_attempted: 75 | |
| musr_murder_mysteries_attempted_ratio: 100 | |
| musr_murder_mysteries_correct_count: 12 | |
| musr_murder_mysteries_incorrect_count: 4 | |
| musr_murder_mysteries_not_attempted_count: 0 | |
| supergpqa_accuracy: 75 | |
| supergpqa_total_correct: 12 | |
| supergpqa_total_count: 16 | |
| supergpqa_SuperGPQA-Engineering: 80 | |
| supergpqa_SuperGPQA-Philosophy: 0 | |
| supergpqa_SuperGPQA-Medicine: 100 | |
| supergpqa_SuperGPQA-Economics: 100 | |
| supergpqa_SuperGPQA-Science: 66.67 | |
| supergpqa_SuperGPQA-Engineering-Electronic Science and Technology: 100 | |
| supergpqa_SuperGPQA-Philosophy-Philosophy: 0 | |
| supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine: 100 | |
| supergpqa_SuperGPQA-Economics-Applied Economics: 100 | |
| supergpqa_SuperGPQA-Science-Mathematics: 80 | |
| supergpqa_SuperGPQA-Science-Physics: 0 | |
| supergpqa_SuperGPQA-Medicine-Clinical Medicine: 100 | |
| supergpqa_SuperGPQA-Engineering-Computer Science and Technology: 100 | |
| supergpqa_SuperGPQA-Engineering-Information and Communication Engineering: 50 | |
| supergpqa_SuperGPQA-Engineering-Control Science and Engineering: 100 | |
| supergpqa_SuperGPQA-Economics-Theoretical Economics: 100 | |
| supergpqa_SuperGPQA-Engineering-Electronic Science and Technology-Circuits and Systems: 100 | |
| supergpqa_SuperGPQA-Philosophy-Philosophy-Philosophical Aesthetics: 0 | |
| supergpqa_SuperGPQA-Medicine-Traditional Chinese Medicine-Traditional Chinese Medicine Theory: 100 | |
| supergpqa_SuperGPQA-Economics-Applied Economics-Finance: 100 | |
| supergpqa_SuperGPQA-Science-Mathematics-Combinatorial Mathematics: 100 | |
| supergpqa_SuperGPQA-Science-Physics-Particle and Nuclear Physics: 0 | |
| supergpqa_SuperGPQA-Medicine-Clinical Medicine-Internal Medicine: 100 | |
| supergpqa_SuperGPQA-Engineering-Computer Science and Technology-Computer Architecture: 100 | |
| supergpqa_SuperGPQA-Science-Mathematics-Ordinary Differential Equations: 100 | |
| supergpqa_SuperGPQA-Science-Mathematics-Mathematical Analysis: 0 | |
| supergpqa_SuperGPQA-Engineering-Information and Communication Engineering-Signal and Information Processing: 50 | |
| supergpqa_SuperGPQA-Science-Mathematics-Advanced Algebra: 100 | |
| supergpqa_SuperGPQA-Engineering-Control Science and Engineering-Control Theory and Control Engineering: 100 | |
| supergpqa_SuperGPQA-Economics-Theoretical Economics-Political Economy: 100 | |
| triviaqa_wiki_1shot_score: 50 | |
| ARC_Prize_Public_Evaluation_accuracy: 0.06 | |
| objective_v7: | |
| aime2024_accuracy: 75 | |
| aime2025_accuracy: 56.25 | |
| bbeh_boolean_expressions_accuracy: 68.75 | |
| bbeh_boolean_expressions_f1: 0.69 | |
| bbeh_boolean_expressions_accuracy_given_attempted: 68.75 | |
| bbeh_boolean_expressions_attempted_ratio: 100 | |
| bbeh_boolean_expressions_correct_count: 11 | |
| bbeh_boolean_expressions_incorrect_count: 5 | |
| bbeh_boolean_expressions_not_attempted_count: 0 | |
| Chem_exam-competition_final_score: 42.29 | |
| Chem_exam-gaokao_final_score: 71.88 | |
| ChemBench_Name_Conversion_accuracy: 100.00 | |
| ChemBench_Name_Conversion_f1: 1.00 | |
| ChemBench_Name_Conversion_accuracy_given_attempted: 100 | |
| ChemBench_Name_Conversion_attempted_ratio: 100.00 | |
| ChemBench_Name_Conversion_correct_count: 16.00 | |
| ChemBench_Name_Conversion_incorrect_count: 0 | |
| ChemBench_Name_Conversion_not_attempted_count: 0 | |
| ClimaQA_Gold_mcq_accuracy: 87.5 | |
| ClimaQA_Gold_mcq_f1: 0.88 | |
| ClimaQA_Gold_mcq_accuracy_given_attempted: 87.5 | |
| ClimaQA_Gold_mcq_attempted_ratio: 100 | |
| ClimaQA_Gold_mcq_correct_count: 14 | |
| ClimaQA_Gold_mcq_incorrect_count: 2 | |
| ClimaQA_Gold_mcq_not_attempted_count: 0 | |
| cmmlu-agronomy_accuracy: 87.50 | |
| cmmlu-agronomy_f1: 0.88 | |
| cmmlu-agronomy_accuracy_given_attempted: 87.50 | |
| cmmlu-agronomy_attempted_ratio: 100.00 | |
| cmmlu-agronomy_correct_count: 14.00 | |
| cmmlu-agronomy_incorrect_count: 2 | |
| cmmlu-agronomy_not_attempted_count: 0.00 | |
| earth_silver_mcq_accuracy: 81.25 | |
| earth_silver_mcq_f1: 0.81 | |
| earth_silver_mcq_accuracy_given_attempted: 81.25 | |
| earth_silver_mcq_attempted_ratio: 100 | |
| earth_silver_mcq_correct_count: 13 | |
| earth_silver_mcq_incorrect_count: 3 | |
| earth_silver_mcq_not_attempted_count: 0 | |
| GPQA_diamond_accuracy: 68.75 | |
| hle_llmjudge_accuracy: 43.75 | |
| hle_llmjudge_f1: 0.43 | |
| hle_llmjudge_accuracy_given_attempted: 43.75 | |
| hle_llmjudge_attempted_ratio: 100.00 | |
| hle_llmjudge_correct_count: 7.00 | |
| hle_llmjudge_incorrect_count: 9.00 | |
| hle_llmjudge_not_attempted_count: 0.00 | |
| IFEval_Prompt-level-strict-accuracy: 87.5 | |
| IFEval_Inst-level-strict-accuracy: 91.67 | |
| IFEval_Prompt-level-loose-accuracy: 87.5 | |
| IFEval_Inst-level-loose-accuracy: 91.67 | |
| kcle_accuracy: 75.00 | |
| kcle_f1: 0.75 | |
| kcle_accuracy_given_attempted: 75.00 | |
| kcle_attempted_ratio: 100.00 | |
| kcle_correct_count: 12 | |
| kcle_incorrect_count: 4 | |
| kcle_not_attempted_count: 0 | |
| korbench_cipher_accuracy: 81.25 | |
| livemathbench_hard_custom_hard_cn_accuracy: 25 | |
| matbench_steels_mae: 816.50 | |
| math_prm800k_500_accuracy: 100 | |
| sanitized_mbpp_score: 6.25 | |
| sanitized_mbpp_pass: 1 | |
| sanitized_mbpp_timeout: 0 | |
| sanitized_mbpp_failed: 15 | |
| sanitized_mbpp_wrong_answer: 0 | |
| medxpertqa_accuracy: 62.5 | |
| medxpertqa_total_correct: 10 | |
| medxpertqa_total_count: 16 | |
| medxpertqa_MedXpertQA-Basic Science: 75 | |
| medxpertqa_MedXpertQA-Diagnosis: 50 | |
| medxpertqa_MedXpertQA-Treatment: 75 | |
| medxpertqa_MedXpertQA-Skeletal: 66.67 | |
| medxpertqa_MedXpertQA-Muscular: 100 | |
| medxpertqa_MedXpertQA-Respiratory: 33.33 | |
| medxpertqa_MedXpertQA-Endocrine: 66.67 | |
| medxpertqa_MedXpertQA-Cardiovascular: 100 | |
| medxpertqa_MedXpertQA-Lymphatic: 0 | |
| medxpertqa_MedXpertQA-Nervous: 100 | |
| medxpertqa_MedXpertQA-Reproductive: 0 | |
| medxpertqa_MedXpertQA-Reasoning: 64.29 | |
| medxpertqa_MedXpertQA-Understanding: 50 | |
| lukaemon_mmlu_college_biology_accuracy: 81.25 | |
| lukaemon_mmlu_college_biology_f1: 0.81 | |
| lukaemon_mmlu_college_biology_accuracy_given_attempted: 81.25 | |
| lukaemon_mmlu_college_biology_attempted_ratio: 100 | |
| lukaemon_mmlu_college_biology_correct_count: 13 | |
| lukaemon_mmlu_college_biology_incorrect_count: 3 | |
| lukaemon_mmlu_college_biology_not_attempted_count: 0 | |
| mmlu_pro_math_accuracy: 68.75 | |
| mmlu_pro_math_f1: 0.69 | |
| mmlu_pro_math_accuracy_given_attempted: 68.75 | |
| mmlu_pro_math_attempted_ratio: 100 | |
| mmlu_pro_math_correct_count: 11 | |
| mmlu_pro_math_incorrect_count: 5 | |
| mmlu_pro_math_not_attempted_count: 0 | |
| olymmath_llmjudge_en-hard_accuracy: 68.75 | |
| olymmath_llmjudge_en-hard_f1: 0.69 | |
| olymmath_llmjudge_en-hard_accuracy_given_attempted: 68.75 | |
| olymmath_llmjudge_en-hard_attempted_ratio: 100.00 | |
| olymmath_llmjudge_en-hard_correct_count: 11 | |
| olymmath_llmjudge_en-hard_incorrect_count: 5 | |
| olymmath_llmjudge_en-hard_not_attempted_count: 0 | |
| OlympiadBench_OE_TO_maths_en_COMP_accuracy: 43.75 | |
| OlympiadBench_OE_TO_maths_en_COMP_f1: 0.44 | |
| OlympiadBench_OE_TO_maths_en_COMP_accuracy_given_attempted: 43.75 | |
| OlympiadBench_OE_TO_maths_en_COMP_attempted_ratio: 100.00 | |
| OlympiadBench_OE_TO_maths_en_COMP_correct_count: 7 | |
| OlympiadBench_OE_TO_maths_en_COMP_incorrect_count: 9 | |
| OlympiadBench_OE_TO_maths_en_COMP_not_attempted_count: 0 | |
| phybench-eed_accuracy: 27.94 | |
| PHYSICS_atomic_dataset_textonly_accuracy: 93.75 | |
| PHYSICS_atomic_dataset_textonly_f1: 0.94 | |
| PHYSICS_atomic_dataset_textonly_accuracy_given_attempted: 93.75 | |
| PHYSICS_atomic_dataset_textonly_attempted_ratio: 100 | |
| PHYSICS_atomic_dataset_textonly_correct_count: 15 | |
| PHYSICS_atomic_dataset_textonly_incorrect_count: 1 | |
| PHYSICS_atomic_dataset_textonly_not_attempted_count: 0 | |
| ProteinLMBench_accuracy: 68.75 | |
| ProteinLMBench_f1: 0.69 | |
| ProteinLMBench_accuracy_given_attempted: 68.75 | |
| ProteinLMBench_attempted_ratio: 100 | |
| ProteinLMBench_correct_count: 11 | |
| ProteinLMBench_incorrect_count: 5 | |
| ProteinLMBench_not_attempted_count: 0 | |
| R-Bench_en_accuracy: 50.00 | |
| R-Bench_en_f1: 0.50 | |
| R-Bench_en_accuracy_given_attempted: 50.00 | |
| R-Bench_en_attempted_ratio: 100.00 | |
| R-Bench_en_correct_count: 8 | |
| R-Bench_en_incorrect_count: 8 | |
| R-Bench_en_not_attempted_count: 0 | |
| NC-I2F-0shot-instruct_score: 12.5 | |
| NC-I2F-0shot-instruct_valid_score: 75 | |
| srbench_mean_RMSE: 61339597.01 | |
| srbench_mean_NMSE: 83.61 | |
| srbench_mean_R2: -82.61 | |
| srbench_SymbolicMatch: 0.14 | |
| supergpqa_Electronic_Science_and_Technology_accuracy: 56.25 | |
| lcb_code_generation_v6_pass@1: 50 | |
| chat_longtext: | |
| babilong_qa1_256k_score: 0.00 | |
| LongBench_2wikimqa_score: 5.43 | |
| Length16000Depth0_2needle_en_128k_score: 100.00 | |
| ruler_cwe_128k_score: 0.00 | |
| chat_subjective: | |
| alignment_bench_v1_1_总分: 0.48 | |
| arenahard_score: 92.07 | |
| Followbench_naive_average: 1 | |
| mtbench101_avg: 8.7 | |
| wildbench_average: 75.17 | |
| simpleqa_accuracy_given_attempted: 0.55 | |
| chinese_simpleqa_given_attempted_accuracy: 0.64 | |
| alignment_bench_v1_1_专业能力: 5.81 | |
| alignment_bench_v1_1_数学计算: 0 | |
| alignment_bench_v1_1_基本任务: 0 | |
| alignment_bench_v1_1_逻辑推理: 0 | |
| alignment_bench_v1_1_中文理解: 0 | |
| alignment_bench_v1_1_文本写作: 0 | |
| alignment_bench_v1_1_角色扮演: 0 | |
| alignment_bench_v1_1_综合问答: 0 | |
| compassarena_language_naive_average: 55 | |
| compassarena_knowledge_naive_average: 100 | |
| compassarena_reason_v2_naive_average: 78.95 | |
| compassarena_creationv2_zh_naive_average: 100 | |
| followbench_llmeval_en_HSR_AVG: 1 | |
| followbench_llmeval_en_SSR_AVG: 1 | |
| followbench_llmeval_en_HSR_L1: 1 | |
| followbench_llmeval_en_HSR_L2: 1 | |
| followbench_llmeval_en_HSR_L3: 1 | |
| followbench_llmeval_en_HSR_L4: 1 | |
| followbench_llmeval_en_HSR_L5: 1 | |
| followbench_llmeval_en_SSR_L1: 1 | |
| followbench_llmeval_en_SSR_L2: 1 | |
| followbench_llmeval_en_SSR_L3: 1 | |
| followbench_llmeval_en_SSR_L4: 1 | |
| followbench_llmeval_en_SSR_L5: 1 | |
| simpleqa_f1: 0.46 | |
| qwen3-8b-base-turbomind: | |
| base_longtext: | |
| LongBench_2wikimqa_score: 26.98 | |
| Length32000Depth0_origin_en_32000_score: 100.00 | |
| qwen-3-8b-base-hf-fullbench: | |
| objective_base: | |
| race-high_accuracy: 87.5 | |
| ARC-c_accuracy: 75 | |
| BoolQ_accuracy: 75 | |
| triviaqa_wiki_1shot_score: 37.5 | |
| nq_open_1shot_score: 6.25 | |
| drop_accuracy: 75 | |
| GPQA_diamond_accuracy: 81.25 | |
| hellaswag_accuracy: 81.25 | |
| TheoremQA_score: 18.75 | |
| winogrande_accuracy: 81.25 | |
| gsm8k_accuracy: 81.25 | |
| GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75 | |
| GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 | |
| math_accuracy: 87.5 | |
| wikibench-wiki-single_choice_cncircular_perf_4: 50 | |
| sanitized_mbpp_score: 93.75 | |
| mmlu-other_accuracy: 76.44 | |
| cmmlu-china-specific_accuracy: 82.5 | |
| mmlu_pro_math_accuracy: 50 | |
| bbh-logical_deduction_seven_objects_score: 37.5 | |
| bbh-multistep_arithmetic_two_score: 0 | |
| college_naive_average: 37.5 | |
| college_knowledge_naive_average: 87.5 | |
| qwen-3-8b-base-fullbench: | |
| objective_base: | |
| race-high_accuracy: 87.5 | |
| ARC-c_accuracy: 75 | |
| BoolQ_accuracy: 81.25 | |
| triviaqa_wiki_1shot_score: 37.5 | |
| nq_open_1shot_score: 6.25 | |
| drop_accuracy: 81.25 | |
| GPQA_diamond_accuracy: 81.25 | |
| hellaswag_accuracy: 75 | |
| TheoremQA_score: 18.75 | |
| winogrande_accuracy: 81.25 | |
| gsm8k_accuracy: 81.25 | |
| GaokaoBench_2010-2022_Math_II_MCQs_score: 100 | |
| GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 | |
| math_accuracy: 81.25 | |
| wikibench-wiki-single_choice_cncircular_perf_4: 50 | |
| sanitized_mbpp_score: 93.75 | |
| mmlu-other_accuracy: 75.48 | |
| cmmlu-china-specific_accuracy: 83.75 | |
| mmlu_pro_math_accuracy: 50 | |
| bbh-logical_deduction_seven_objects_score: 62.5 | |
| bbh-multistep_arithmetic_two_score: 100 | |
| college_naive_average: 37.5 | |
| college_knowledge_naive_average: 87.5 |