| { |
| "method": "Wilcoxon signed-rank test on paired per-sample first-answer-token log-probabilities", |
| "description": "Log P(first correct answer token | input + thoughts) under each model. No answer tokens appended; logits from last position predict first answer token. All *_diff values are computed as M5 minus M3 (positive means M5 assigns higher log-probability).", |
| "bonferroni_k": 5, |
| "adjusted_alpha": 0.01, |
| "results": { |
| "prosqa_id": { |
| "n": 500, |
| "median_diff": 5.5448455810546875, |
| "mean_diff": 5.900060585021973, |
| "std_diff": 4.304266040095551, |
| "wilcoxon_stat": 2294.0, |
| "p_value": 9.558665789472875e-78, |
| "p_bonferroni": 4.779332894736438e-77, |
| "effect_size_r": 0.8347188840290514, |
| "z_score": 18.664881667917225, |
| "m3_median_logprob": -36.25274085998535, |
| "m5_median_logprob": -30.16720485687256, |
| "m3_mean_logprob": -36.290785385131834, |
| "m5_mean_logprob": -30.39072480010986, |
| "direction": "M5 > M3" |
| }, |
| "7hop": { |
| "n": 1000, |
| "median_diff": 4.609404563903809, |
| "mean_diff": 4.781453802108764, |
| "std_diff": 3.692269596981685, |
| "wilcoxon_stat": 14216.0, |
| "p_value": 3.419355734386437e-147, |
| "p_bonferroni": 1.7096778671932184e-146, |
| "effect_size_r": 0.8170330113820354, |
| "z_score": 25.83685239513508, |
| "m3_median_logprob": -34.421878814697266, |
| "m5_median_logprob": -30.148743629455566, |
| "m3_mean_logprob": -34.93062486076355, |
| "m5_mean_logprob": -30.149171058654787, |
| "direction": "M5 > M3" |
| }, |
| "8hop": { |
| "n": 1000, |
| "median_diff": 4.223997116088867, |
| "mean_diff": 4.518639762878418, |
| "std_diff": 3.4717507040890796, |
| "wilcoxon_stat": 13275.0, |
| "p_value": 2.3666197544095806e-148, |
| "p_bonferroni": 1.1833098772047903e-147, |
| "effect_size_r": 0.8202902881460207, |
| "z_score": 25.93985653057244, |
| "m3_median_logprob": -34.220977783203125, |
| "m5_median_logprob": -29.836036682128906, |
| "m3_mean_logprob": -34.49567288208008, |
| "m5_mean_logprob": -29.97703311920166, |
| "direction": "M5 > M3" |
| }, |
| "dag": { |
| "n": 1000, |
| "median_diff": 5.726862907409668, |
| "mean_diff": 5.837504280090332, |
| "std_diff": 3.5533476769526535, |
| "wilcoxon_stat": 4161.0, |
| "p_value": 7.992937269512403e-160, |
| "p_bonferroni": 3.9964686347562014e-159, |
| "effect_size_r": 0.8518384501300393, |
| "z_score": 26.937497009186796, |
| "m3_median_logprob": -35.94425582885742, |
| "m5_median_logprob": -30.378952980041504, |
| "m3_mean_logprob": -36.28265309143067, |
| "m5_mean_logprob": -30.44514881134033, |
| "direction": "M5 > M3" |
| }, |
| "dense": { |
| "n": 1000, |
| "median_diff": 4.454345703125, |
| "mean_diff": 4.632727506637573, |
| "std_diff": 3.4062677335293263, |
| "wilcoxon_stat": 12447.0, |
| "p_value": 2.237606774559982e-149, |
| "p_bonferroni": 1.118803387279991e-148, |
| "effect_size_r": 0.82315641477788, |
| "z_score": 26.030491412763865, |
| "m3_median_logprob": -34.61273956298828, |
| "m5_median_logprob": -30.307485580444336, |
| "m3_mean_logprob": -35.10507657432556, |
| "m5_mean_logprob": -30.47234906768799, |
| "direction": "M5 > M3" |
| } |
| } |
| } |