{ "date": "2025-12-08", "total_questions_num": 1500, "inference_iterations": 3, "total_samples_num": 4500, "fail_samples_num": 0, "inference_inconsistent_samples_num": 0, "average_overall_metric": 0.4499528005964066, "inference_iteration_1_overall_metric": 0.4519835462001885, "inference_iteration_2_overall_metric": 0.4481755772504262, "inference_iteration_3_overall_metric": 0.4496992783386054, "average_token_length_metric": { "8k": 0.485225729559654, "16k": 0.4524723240855649, "32k": 0.46920448352940436, "64k": 0.44046374240515457, "128k": 0.4133092627171987, "256k": 0.43904126128146514 }, "average_contextual_requirement_metric": { "Full": 0.4116545212336913, "Partial": 0.49869606523986354 }, "average_difficulty_metric": { "Easy": 0.6191934548978654, "Moderate": 0.4082147550465631, "Hard": 0.3801988071084879, "Extreme": 0.33778735493415807 }, "average_primary_task_metric": { "T1. Retrieval & Ranking": 0.6830330861399296, "T2. Sequencing & Structure Reconstruction": 0.6403219944448011, "T3. Evidence-Grounded QA": 0.4833333333333333, "T4. Summarization & Synthesis": 0.5086176566073063, "T5. Attribution & Citation Alignment": 0.416914270509611, "T6. Aggregation & Clustering": 0.4334853794839026, "T7. Consistency & Compliance Checking": 0.27119391146489646, "T8. Structured & Numeric Reasoning": 0.38966049382716056, "T9. Version & Code Diff Analysis": 0.4348929522191275, "T10. Rule Induction & In-Context Learning": 0.41300925925925924, "T11. Dialogue Memory & Long-Horizon Tracking": 0.4027777777777778 }, "average_language_metric": { "Chinese": 0.45819903421860664, "English": 0.4417065669742075 }, "BoN-1": { "overall_metric": 0.4519835462001885, "token_length": { "8k": 0.4879779124929164, "16k": 0.4554840853531918, "32k": 0.4648286187996774, "64k": 0.42985632449506034, "128k": 0.4307020670264534, "256k": 0.443052269033835 }, "contextual_requirement": { "Full": 0.41228070711895354, "Partial": 0.5025144323035801 }, "difficulty": { "Easy": 0.6285595261886431, "Moderate": 0.4057015689049336, "Hard": 0.37791019658117175, "Extreme": 0.33760415329971205 }, "primary_task": { "T1. Retrieval & Ranking": 0.6904671153390595, "T2. Sequencing & Structure Reconstruction": 0.6319390331890332, "T3. Evidence-Grounded QA": 0.44166666666666665, "T4. Summarization & Synthesis": 0.5079368349605524, "T5. Attribution & Citation Alignment": 0.3963567606333699, "T6. Aggregation & Clustering": 0.4315669444489273, "T7. Consistency & Compliance Checking": 0.26717481095169254, "T8. Structured & Numeric Reasoning": 0.40648148148148144, "T9. Version & Code Diff Analysis": 0.4533152836127507, "T10. Rule Induction & In-Context Learning": 0.4119444444444444, "T11. Dialogue Memory & Long-Horizon Tracking": 0.4583333333333333 }, "language": { "Chinese": 0.47043321599568005, "English": 0.43353387640469826 } }, "pass@1": 0.21, "BoN-2": { "overall_metric": 0.5523435379453717, "token_length": { "8k": 0.6041368153338821, "16k": 0.553143416205592, "32k": 0.5547357356840433, "64k": 0.5474714891955119, "128k": 0.5080001305944092, "256k": 0.5465736406587951 }, "contextual_requirement": { "Full": 0.5094377873914124, "Partial": 0.6069508568322307 }, "difficulty": { "Easy": 0.7582302423860908, "Moderate": 0.5069058318579235, "Hard": 0.47636905813527697, "Extreme": 0.40654974290892704 }, "primary_task": { "T1. Retrieval & Ranking": 0.7753422134076285, "T2. Sequencing & Structure Reconstruction": 0.7188864376364378, "T3. Evidence-Grounded QA": 0.6083333333333333, "T4. Summarization & Synthesis": 0.5276281423571613, "T5. Attribution & Citation Alignment": 0.573374443874177, "T6. Aggregation & Clustering": 0.5278895685136558, "T7. Consistency & Compliance Checking": 0.35338346649949204, "T8. Structured & Numeric Reasoning": 0.5027777777777779, "T9. Version & Code Diff Analysis": 0.552570101188694, "T10. Rule Induction & In-Context Learning": 0.5220833333333333, "T11. Dialogue Memory & Long-Horizon Tracking": 0.55 }, "language": { "Chinese": 0.5649919855932303, "English": 0.5396950902975145 } }, "pass@2": 0.2753333333333333, "BoN-3": { "overall_metric": 0.5997056103547938, "token_length": { "8k": 0.6457585156659336, "16k": 0.6123141997231359, "32k": 0.6242961953070552, "64k": 0.5876928890236057, "128k": 0.5497742714361217, "256k": 0.5783975909729129 }, "contextual_requirement": { "Full": 0.5540426758661396, "Partial": 0.6578220724312633 }, "difficulty": { "Easy": 0.8056166900447767, "Moderate": 0.5614066990728871, "Hard": 0.5178805893116146, "Extreme": 0.45303896497156343 }, "primary_task": { "T1. Retrieval & Ranking": 0.8083334868935526, "T2. Sequencing & Structure Reconstruction": 0.7593085155585156, "T3. Evidence-Grounded QA": 0.6833333333333333, "T4. Summarization & Synthesis": 0.5344316475303361, "T5. Attribution & Citation Alignment": 0.6383957562170883, "T6. Aggregation & Clustering": 0.5743997782942697, "T7. Consistency & Compliance Checking": 0.39861351698347697, "T8. Structured & Numeric Reasoning": 0.5527777777777778, "T9. Version & Code Diff Analysis": 0.5853585580965909, "T10. Rule Induction & In-Context Learning": 0.5984722222222222, "T11. Dialogue Memory & Long-Horizon Tracking": 0.6 }, "language": { "Chinese": 0.6144058958264887, "English": 0.5850053248830991 } }, "pass@3": 0.31933333333333336 }