{ "date": "2025-12-08", "total_questions_num": 1500, "inference_iterations": 3, "total_samples_num": 4500, "fail_samples_num": 0, "inference_inconsistent_samples_num": 0, "average_overall_metric": 0.5320685707653132, "inference_iteration_1_overall_metric": 0.535180398833494, "inference_iteration_2_overall_metric": 0.5311849506804371, "inference_iteration_3_overall_metric": 0.5298403627820072, "average_token_length_metric": { "8k": 0.654795970947119, "16k": 0.5832041701523042, "32k": 0.5830505446766833, "64k": 0.5201561955794758, "128k": 0.5060838591020447, "256k": 0.3451206841342513 }, "average_contextual_requirement_metric": { "Full": 0.4938467607068266, "Partial": 0.5807145108397509 }, "average_difficulty_metric": { "Easy": 0.7219874781362817, "Moderate": 0.599199335465557, "Hard": 0.4257653962693645, "Extreme": 0.34975019139747615 }, "average_primary_task_metric": { "T1. Retrieval & Ranking": 0.767571983047427, "T2. Sequencing & Structure Reconstruction": 0.7186696477094124, "T3. Evidence-Grounded QA": 0.4972222222222222, "T4. Summarization & Synthesis": 0.4696599254603241, "T5. Attribution & Citation Alignment": 0.54344042963745, "T6. Aggregation & Clustering": 0.5123089198769455, "T7. Consistency & Compliance Checking": 0.31381086481875964, "T8. Structured & Numeric Reasoning": 0.6038580246913581, "T9. Version & Code Diff Analysis": 0.5619188050015754, "T10. Rule Induction & In-Context Learning": 0.5529629629629632, "T11. Dialogue Memory & Long-Horizon Tracking": 0.39444444444444454 }, "average_language_metric": { "Chinese": 0.5354797348772966, "English": 0.5286574066533296 }, "BoN-1": { "overall_metric": 0.535180398833494, "token_length": { "8k": 0.6571274187960493, "16k": 0.5855452098864022, "32k": 0.6094638772285274, "64k": 0.5094373867375244, "128k": 0.5028727484199556, "256k": 0.34663575193250185 }, "contextual_requirement": { "Full": 0.4983496461876354, "Partial": 0.5820559022009494 }, "difficulty": { "Easy": 0.7348029222520711, "Moderate": 0.6076522249303262, "Hard": 0.4165082385065274, "Extreme": 0.3468482177079349 }, "primary_task": { "T1. Retrieval & Ranking": 0.7483964432006224, "T2. Sequencing & Structure Reconstruction": 0.6994320017261199, "T3. Evidence-Grounded QA": 0.5, "T4. Summarization & Synthesis": 0.46659438196842223, "T5. Attribution & Citation Alignment": 0.5466093432829364, "T6. Aggregation & Clustering": 0.5244645023077399, "T7. Consistency & Compliance Checking": 0.32026132009110975, "T8. Structured & Numeric Reasoning": 0.6097222222222223, "T9. Version & Code Diff Analysis": 0.5581618594200692, "T10. Rule Induction & In-Context Learning": 0.5638888888888888, "T11. Dialogue Memory & Long-Horizon Tracking": 0.425 }, "language": { "Chinese": 0.5363273796225, "English": 0.5340334180444871 } }, "pass@1": 0.30133333333333334, "BoN-2": { "overall_metric": 0.631079137318095, "token_length": { "8k": 0.7607802022833232, "16k": 0.6922523863155777, "32k": 0.696226877019834, "64k": 0.6061494664574102, "128k": 0.5803070617163435, "256k": 0.45075883011608214 }, "contextual_requirement": { "Full": 0.5863061857618213, "Partial": 0.6880628938442622 }, "difficulty": { "Easy": 0.8429412379045521, "Moderate": 0.7260325873797594, "Hard": 0.5037591992235261, "Extreme": 0.42025273404272534 }, "primary_task": { "T1. Retrieval & Ranking": 0.854967268320159, "T2. Sequencing & Structure Reconstruction": 0.7950413527388575, "T3. Evidence-Grounded QA": 0.6333333333333333, "T4. Summarization & Synthesis": 0.48790929716965253, "T5. Attribution & Citation Alignment": 0.6628232709674524, "T6. Aggregation & Clustering": 0.6070531962911038, "T7. Consistency & Compliance Checking": 0.4273910542891768, "T8. Structured & Numeric Reasoning": 0.6976851851851852, "T9. Version & Code Diff Analysis": 0.6589983180763119, "T10. Rule Induction & In-Context Learning": 0.663888888888889, "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333 }, "language": { "Chinese": 0.6243713115466586, "English": 0.6377869630895318 } }, "pass@2": 0.38133333333333336, "BoN-3": { "overall_metric": 0.6838483190204042, "token_length": { "8k": 0.8056053063071357, "16k": 0.754121676530954, "32k": 0.7309373525434467, "64k": 0.6802921620132278, "128k": 0.6428076963616377, "256k": 0.48932572036602695 }, "contextual_requirement": { "Full": 0.6418563038987366, "Partial": 0.7372927019025284 }, "difficulty": { "Easy": 0.8932565923719491, "Moderate": 0.7958991372015439, "Hard": 0.5571346872299623, "Extreme": 0.46408187669686923 }, "primary_task": { "T1. Retrieval & Ranking": 0.8856810957992655, "T2. Sequencing & Structure Reconstruction": 0.8257039309014353, "T3. Evidence-Grounded QA": 0.7083333333333334, "T4. Summarization & Synthesis": 0.5020042367803014, "T5. Attribution & Citation Alignment": 0.729994986816228, "T6. Aggregation & Clustering": 0.6651081533166491, "T7. Consistency & Compliance Checking": 0.49051401515979876, "T8. Structured & Numeric Reasoning": 0.7680555555555556, "T9. Version & Code Diff Analysis": 0.7036475958542688, "T10. Rule Induction & In-Context Learning": 0.7072222222222223, "T11. Dialogue Memory & Long-Horizon Tracking": 0.6 }, "language": { "Chinese": 0.6727473750313149, "English": 0.694949263009495 } }, "pass@3": 0.444 }