{
  "date": "2025-12-08",
  "total_questions_num": 1500,
  "inference_iterations": 3,
  "total_samples_num": 4500,
  "fail_samples_num": 0,
  "inference_inconsistent_samples_num": 0,
  "average_overall_metric": 0.5320685707653132,
  "inference_iteration_1_overall_metric": 0.535180398833494,
  "inference_iteration_2_overall_metric": 0.5311849506804371,
  "inference_iteration_3_overall_metric": 0.5298403627820072,
  "average_token_length_metric": {
    "8k": 0.654795970947119,
    "16k": 0.5832041701523042,
    "32k": 0.5830505446766833,
    "64k": 0.5201561955794758,
    "128k": 0.5060838591020447,
    "256k": 0.3451206841342513
  },
  "average_contextual_requirement_metric": {
    "Full": 0.4938467607068266,
    "Partial": 0.5807145108397509
  },
  "average_difficulty_metric": {
    "Easy": 0.7219874781362817,
    "Moderate": 0.599199335465557,
    "Hard": 0.4257653962693645,
    "Extreme": 0.34975019139747615
  },
  "average_primary_task_metric": {
    "T1. Retrieval & Ranking": 0.767571983047427,
    "T2. Sequencing & Structure Reconstruction": 0.7186696477094124,
    "T3. Evidence-Grounded QA": 0.4972222222222222,
    "T4. Summarization & Synthesis": 0.4696599254603241,
    "T5. Attribution & Citation Alignment": 0.54344042963745,
    "T6. Aggregation & Clustering": 0.5123089198769455,
    "T7. Consistency & Compliance Checking": 0.31381086481875964,
    "T8. Structured & Numeric Reasoning": 0.6038580246913581,
    "T9. Version & Code Diff Analysis": 0.5619188050015754,
    "T10. Rule Induction & In-Context Learning": 0.5529629629629632,
    "T11. Dialogue Memory & Long-Horizon Tracking": 0.39444444444444454
  },
  "average_language_metric": {
    "Chinese": 0.5354797348772966,
    "English": 0.5286574066533296
  },
  "BoN-1": {
    "overall_metric": 0.535180398833494,
    "token_length": {
      "8k": 0.6571274187960493,
      "16k": 0.5855452098864022,
      "32k": 0.6094638772285274,
      "64k": 0.5094373867375244,
      "128k": 0.5028727484199556,
      "256k": 0.34663575193250185
    },
    "contextual_requirement": {
      "Full": 0.4983496461876354,
      "Partial": 0.5820559022009494
    },
    "difficulty": {
      "Easy": 0.7348029222520711,
      "Moderate": 0.6076522249303262,
      "Hard": 0.4165082385065274,
      "Extreme": 0.3468482177079349
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.7483964432006224,
      "T2. Sequencing & Structure Reconstruction": 0.6994320017261199,
      "T3. Evidence-Grounded QA": 0.5,
      "T4. Summarization & Synthesis": 0.46659438196842223,
      "T5. Attribution & Citation Alignment": 0.5466093432829364,
      "T6. Aggregation & Clustering": 0.5244645023077399,
      "T7. Consistency & Compliance Checking": 0.32026132009110975,
      "T8. Structured & Numeric Reasoning": 0.6097222222222223,
      "T9. Version & Code Diff Analysis": 0.5581618594200692,
      "T10. Rule Induction & In-Context Learning": 0.5638888888888888,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.425
    },
    "language": {
      "Chinese": 0.5363273796225,
      "English": 0.5340334180444871
    }
  },
  "pass@1": 0.30133333333333334,
  "BoN-2": {
    "overall_metric": 0.631079137318095,
    "token_length": {
      "8k": 0.7607802022833232,
      "16k": 0.6922523863155777,
      "32k": 0.696226877019834,
      "64k": 0.6061494664574102,
      "128k": 0.5803070617163435,
      "256k": 0.45075883011608214
    },
    "contextual_requirement": {
      "Full": 0.5863061857618213,
      "Partial": 0.6880628938442622
    },
    "difficulty": {
      "Easy": 0.8429412379045521,
      "Moderate": 0.7260325873797594,
      "Hard": 0.5037591992235261,
      "Extreme": 0.42025273404272534
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.854967268320159,
      "T2. Sequencing & Structure Reconstruction": 0.7950413527388575,
      "T3. Evidence-Grounded QA": 0.6333333333333333,
      "T4. Summarization & Synthesis": 0.48790929716965253,
      "T5. Attribution & Citation Alignment": 0.6628232709674524,
      "T6. Aggregation & Clustering": 0.6070531962911038,
      "T7. Consistency & Compliance Checking": 0.4273910542891768,
      "T8. Structured & Numeric Reasoning": 0.6976851851851852,
      "T9. Version & Code Diff Analysis": 0.6589983180763119,
      "T10. Rule Induction & In-Context Learning": 0.663888888888889,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
    },
    "language": {
      "Chinese": 0.6243713115466586,
      "English": 0.6377869630895318
    }
  },
  "pass@2": 0.38133333333333336,
  "BoN-3": {
    "overall_metric": 0.6838483190204042,
    "token_length": {
      "8k": 0.8056053063071357,
      "16k": 0.754121676530954,
      "32k": 0.7309373525434467,
      "64k": 0.6802921620132278,
      "128k": 0.6428076963616377,
      "256k": 0.48932572036602695
    },
    "contextual_requirement": {
      "Full": 0.6418563038987366,
      "Partial": 0.7372927019025284
    },
    "difficulty": {
      "Easy": 0.8932565923719491,
      "Moderate": 0.7958991372015439,
      "Hard": 0.5571346872299623,
      "Extreme": 0.46408187669686923
    },
    "primary_task": {
      "T1. Retrieval & Ranking": 0.8856810957992655,
      "T2. Sequencing & Structure Reconstruction": 0.8257039309014353,
      "T3. Evidence-Grounded QA": 0.7083333333333334,
      "T4. Summarization & Synthesis": 0.5020042367803014,
      "T5. Attribution & Citation Alignment": 0.729994986816228,
      "T6. Aggregation & Clustering": 0.6651081533166491,
      "T7. Consistency & Compliance Checking": 0.49051401515979876,
      "T8. Structured & Numeric Reasoning": 0.7680555555555556,
      "T9. Version & Code Diff Analysis": 0.7036475958542688,
      "T10. Rule Induction & In-Context Learning": 0.7072222222222223,
      "T11. Dialogue Memory & Long-Horizon Tracking": 0.6
    },
    "language": {
      "Chinese": 0.6727473750313149,
      "English": 0.694949263009495
    }
  },
  "pass@3": 0.444
}