{
    "date": "2025-12-08",
    "total_questions_num": 1500,
    "inference_iterations": 3,
    "total_samples_num": 4500,
    "fail_samples_num": 0,
    "inference_inconsistent_samples_num": 0,
    "average_overall_metric": 0.4113523778378889,
    "inference_iteration_1_overall_metric": 0.4026546189679395,
    "inference_iteration_2_overall_metric": 0.41422198000018023,
    "inference_iteration_3_overall_metric": 0.41718053454554826,
    "average_token_length_metric": {
        "8k": 0.45750122785552744,
        "16k": 0.40648581074103435,
        "32k": 0.41953181726499883,
        "64k": 0.3963813527019971,
        "128k": 0.41323756281622565,
        "256k": 0.3749764956475515
    },
    "average_contextual_requirement_metric": {
        "Full": 0.37732447212646125,
        "Partial": 0.45466062147061553
    },
    "average_difficulty_metric": {
        "Easy": 0.5125950929989945,
        "Moderate": 0.38228847113922254,
        "Hard": 0.3867421547849868,
        "Extreme": 0.33569972963459577
    },
    "average_primary_task_metric": {
        "T1. Retrieval & Ranking": 0.7019344870456294,
        "T2. Sequencing & Structure Reconstruction": 0.6935557265385518,
        "T3. Evidence-Grounded QA": 0.5000000000000001,
        "T4. Summarization & Synthesis": 0.525289467915154,
        "T5. Attribution & Citation Alignment": 0.40960389859884994,
        "T6. Aggregation & Clustering": 0.3855189408594916,
        "T7. Consistency & Compliance Checking": 0.2570183735053335,
        "T8. Structured & Numeric Reasoning": 0.16126543209876543,
        "T9. Version & Code Diff Analysis": 0.3763262824393013,
        "T10. Rule Induction & In-Context Learning": 0.3850462962962962,
        "T11. Dialogue Memory & Long-Horizon Tracking": 0.34444444444444444
    },
    "average_language_metric": {
        "Chinese": 0.4206201869029405,
        "English": 0.40208456877283766
    },
    "BoN-1": {
        "overall_metric": 0.4026546189679395,
        "token_length": {
            "8k": 0.4484815946744958,
            "16k": 0.40023341947584756,
            "32k": 0.39365195091822286,
            "64k": 0.4050265329266902,
            "128k": 0.40626760527764794,
            "256k": 0.3622666105347326
        },
        "contextual_requirement": {
            "Full": 0.3618050193216267,
            "Partial": 0.45464501851779227
        },
        "difficulty": {
            "Easy": 0.49189201078601713,
            "Moderate": 0.3847984308236515,
            "Hard": 0.39113612973801154,
            "Extreme": 0.32419293466074633
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7071317552591362,
            "T2. Sequencing & Structure Reconstruction": 0.692217342415818,
            "T3. Evidence-Grounded QA": 0.475,
            "T4. Summarization & Synthesis": 0.5252872492452957,
            "T5. Attribution & Citation Alignment": 0.3965042482839467,
            "T6. Aggregation & Clustering": 0.38900319686695384,
            "T7. Consistency & Compliance Checking": 0.24881818692821855,
            "T8. Structured & Numeric Reasoning": 0.1462962962962963,
            "T9. Version & Code Diff Analysis": 0.35572673286895423,
            "T10. Rule Induction & In-Context Learning": 0.36347222222222214,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.3416666666666667
        },
        "language": {
            "Chinese": 0.4101250015901318,
            "English": 0.3951842363457473
        }
    },
    "pass@1": 0.15533333333333332,
    "BoN-2": {
        "overall_metric": 0.4807038949944852,
        "token_length": {
            "8k": 0.5303281501019884,
            "16k": 0.4819497908714715,
            "32k": 0.47954691765928337,
            "64k": 0.48083012165065453,
            "128k": 0.465293133114307,
            "256k": 0.44627525656921
        },
        "contextual_requirement": {
            "Full": 0.44656912126099607,
            "Partial": 0.5241481524734732
        },
        "difficulty": {
            "Easy": 0.596920990471646,
            "Moderate": 0.4603818463137054,
            "Hard": 0.4590470067460482,
            "Extreme": 0.3809658785230146
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7449388025193358,
            "T2. Sequencing & Structure Reconstruction": 0.7399444536944532,
            "T3. Evidence-Grounded QA": 0.6166666666666667,
            "T4. Summarization & Synthesis": 0.54417036696111,
            "T5. Attribution & Citation Alignment": 0.5088222013004289,
            "T6. Aggregation & Clustering": 0.4705462063266301,
            "T7. Consistency & Compliance Checking": 0.3211976903039678,
            "T8. Structured & Numeric Reasoning": 0.2083333333333333,
            "T9. Version & Code Diff Analysis": 0.4528903513431796,
            "T10. Rule Induction & In-Context Learning": 0.45958333333333334,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.44166666666666665
        },
        "language": {
            "Chinese": 0.48759226495133245,
            "English": 0.47381552503763963
        }
    },
    "pass@2": 0.206,
    "BoN-3": {
        "overall_metric": 0.5286875532565248,
        "token_length": {
            "8k": 0.5760868208214227,
            "16k": 0.5315447995369911,
            "32k": 0.5297979513353553,
            "64k": 0.5139951126923608,
            "128k": 0.5285145377275431,
            "256k": 0.49218609742548064
        },
        "contextual_requirement": {
            "Full": 0.4922563052647642,
            "Partial": 0.5750545961551319
        },
        "difficulty": {
            "Easy": 0.6571166886876132,
            "Moderate": 0.5008690293131257,
            "Hard": 0.5070400734318661,
            "Extreme": 0.42048944373649116
        },
        "primary_task": {
            "T1. Retrieval & Ranking": 0.7851733453547192,
            "T2. Sequencing & Structure Reconstruction": 0.7758457283457282,
            "T3. Evidence-Grounded QA": 0.7083333333333334,
            "T4. Summarization & Synthesis": 0.5534698517064113,
            "T5. Attribution & Citation Alignment": 0.5639720868179612,
            "T6. Aggregation & Clustering": 0.503918026189678,
            "T7. Consistency & Compliance Checking": 0.34945026972752397,
            "T8. Structured & Numeric Reasoning": 0.25277777777777777,
            "T9. Version & Code Diff Analysis": 0.5104976262726122,
            "T10. Rule Induction & In-Context Learning": 0.5270833333333333,
            "T11. Dialogue Memory & Long-Horizon Tracking": 0.525
        },
        "language": {
            "Chinese": 0.5330279709661675,
            "English": 0.5243471355468845
        }
    },
    "pass@3": 0.24266666666666667
}