{
"date": "2025-12-08",
"total_questions_num": 1500,
"inference_iterations": 3,
"total_samples_num": 4500,
"fail_samples_num": 0,
"inference_inconsistent_samples_num": 0,
"average_overall_metric": 0.5320685707653132,
"inference_iteration_1_overall_metric": 0.535180398833494,
"inference_iteration_2_overall_metric": 0.5311849506804371,
"inference_iteration_3_overall_metric": 0.5298403627820072,
"average_token_length_metric": {
"8k": 0.654795970947119,
"16k": 0.5832041701523042,
"32k": 0.5830505446766833,
"64k": 0.5201561955794758,
"128k": 0.5060838591020447,
"256k": 0.3451206841342513
},
"average_contextual_requirement_metric": {
"Full": 0.4938467607068266,
"Partial": 0.5807145108397509
},
"average_difficulty_metric": {
"Easy": 0.7219874781362817,
"Moderate": 0.599199335465557,
"Hard": 0.4257653962693645,
"Extreme": 0.34975019139747615
},
"average_primary_task_metric": {
"T1. Retrieval & Ranking": 0.767571983047427,
"T2. Sequencing & Structure Reconstruction": 0.7186696477094124,
"T3. Evidence-Grounded QA": 0.4972222222222222,
"T4. Summarization & Synthesis": 0.4696599254603241,
"T5. Attribution & Citation Alignment": 0.54344042963745,
"T6. Aggregation & Clustering": 0.5123089198769455,
"T7. Consistency & Compliance Checking": 0.31381086481875964,
"T8. Structured & Numeric Reasoning": 0.6038580246913581,
"T9. Version & Code Diff Analysis": 0.5619188050015754,
"T10. Rule Induction & In-Context Learning": 0.5529629629629632,
"T11. Dialogue Memory & Long-Horizon Tracking": 0.39444444444444454
},
"average_language_metric": {
"Chinese": 0.5354797348772966,
"English": 0.5286574066533296
},
"BoN-1": {
"overall_metric": 0.535180398833494,
"token_length": {
"8k": 0.6571274187960493,
"16k": 0.5855452098864022,
"32k": 0.6094638772285274,
"64k": 0.5094373867375244,
"128k": 0.5028727484199556,
"256k": 0.34663575193250185
},
"contextual_requirement": {
"Full": 0.4983496461876354,
"Partial": 0.5820559022009494
},
"difficulty": {
"Easy": 0.7348029222520711,
"Moderate": 0.6076522249303262,
"Hard": 0.4165082385065274,
"Extreme": 0.3468482177079349
},
"primary_task": {
"T1. Retrieval & Ranking": 0.7483964432006224,
"T2. Sequencing & Structure Reconstruction": 0.6994320017261199,
"T3. Evidence-Grounded QA": 0.5,
"T4. Summarization & Synthesis": 0.46659438196842223,
"T5. Attribution & Citation Alignment": 0.5466093432829364,
"T6. Aggregation & Clustering": 0.5244645023077399,
"T7. Consistency & Compliance Checking": 0.32026132009110975,
"T8. Structured & Numeric Reasoning": 0.6097222222222223,
"T9. Version & Code Diff Analysis": 0.5581618594200692,
"T10. Rule Induction & In-Context Learning": 0.5638888888888888,
"T11. Dialogue Memory & Long-Horizon Tracking": 0.425
},
"language": {
"Chinese": 0.5363273796225,
"English": 0.5340334180444871
}
},
"pass@1": 0.30133333333333334,
"BoN-2": {
"overall_metric": 0.631079137318095,
"token_length": {
"8k": 0.7607802022833232,
"16k": 0.6922523863155777,
"32k": 0.696226877019834,
"64k": 0.6061494664574102,
"128k": 0.5803070617163435,
"256k": 0.45075883011608214
},
"contextual_requirement": {
"Full": 0.5863061857618213,
"Partial": 0.6880628938442622
},
"difficulty": {
"Easy": 0.8429412379045521,
"Moderate": 0.7260325873797594,
"Hard": 0.5037591992235261,
"Extreme": 0.42025273404272534
},
"primary_task": {
"T1. Retrieval & Ranking": 0.854967268320159,
"T2. Sequencing & Structure Reconstruction": 0.7950413527388575,
"T3. Evidence-Grounded QA": 0.6333333333333333,
"T4. Summarization & Synthesis": 0.48790929716965253,
"T5. Attribution & Citation Alignment": 0.6628232709674524,
"T6. Aggregation & Clustering": 0.6070531962911038,
"T7. Consistency & Compliance Checking": 0.4273910542891768,
"T8. Structured & Numeric Reasoning": 0.6976851851851852,
"T9. Version & Code Diff Analysis": 0.6589983180763119,
"T10. Rule Induction & In-Context Learning": 0.663888888888889,
"T11. Dialogue Memory & Long-Horizon Tracking": 0.5333333333333333
},
"language": {
"Chinese": 0.6243713115466586,
"English": 0.6377869630895318
}
},
"pass@2": 0.38133333333333336,
"BoN-3": {
"overall_metric": 0.6838483190204042,
"token_length": {
"8k": 0.8056053063071357,
"16k": 0.754121676530954,
"32k": 0.7309373525434467,
"64k": 0.6802921620132278,
"128k": 0.6428076963616377,
"256k": 0.48932572036602695
},
"contextual_requirement": {
"Full": 0.6418563038987366,
"Partial": 0.7372927019025284
},
"difficulty": {
"Easy": 0.8932565923719491,
"Moderate": 0.7958991372015439,
"Hard": 0.5571346872299623,
"Extreme": 0.46408187669686923
},
"primary_task": {
"T1. Retrieval & Ranking": 0.8856810957992655,
"T2. Sequencing & Structure Reconstruction": 0.8257039309014353,
"T3. Evidence-Grounded QA": 0.7083333333333334,
"T4. Summarization & Synthesis": 0.5020042367803014,
"T5. Attribution & Citation Alignment": 0.729994986816228,
"T6. Aggregation & Clustering": 0.6651081533166491,
"T7. Consistency & Compliance Checking": 0.49051401515979876,
"T8. Structured & Numeric Reasoning": 0.7680555555555556,
"T9. Version & Code Diff Analysis": 0.7036475958542688,
"T10. Rule Induction & In-Context Learning": 0.7072222222222223,
"T11. Dialogue Memory & Long-Horizon Tracking": 0.6
},
"language": {
"Chinese": 0.6727473750313149,
"English": 0.694949263009495
}
},
"pass@3": 0.444
}