{
"method": "Wilcoxon signed-rank test on paired per-sample first-answer-token log-probabilities",
"description": "Log P(first correct answer token | input + thoughts) under each model. No answer tokens appended; logits from last position predict first answer token.",
"bonferroni_k": 5,
"adjusted_alpha": 0.01,
"results": {
"prosqa_id": {
"n": 500,
"median_diff": 5.5448455810546875,
"mean_diff": 5.900060585021973,
"std_diff": 4.304266040095551,
"wilcoxon_stat": 2294.0,
"p_value": 9.558665789472875e-78,
"p_bonferroni": 4.779332894736438e-77,
"effect_size_r": 0.8347188840290514,
"z_score": 18.664881667917225,
"m3_median_logprob": -36.25274085998535,
"m5_median_logprob": -30.16720485687256,
"m3_mean_logprob": -36.290785385131834,
"m5_mean_logprob": -30.39072480010986,
"direction": "M5 > M3"
},
"7hop": {
"n": 1000,
"median_diff": 4.609404563903809,
"mean_diff": 4.781453802108764,
"std_diff": 3.692269596981685,
"wilcoxon_stat": 14216.0,
"p_value": 3.419355734386437e-147,
"p_bonferroni": 1.7096778671932184e-146,
"effect_size_r": 0.8170330113820354,
"z_score": 25.83685239513508,
"m3_median_logprob": -34.421878814697266,
"m5_median_logprob": -30.148743629455566,
"m3_mean_logprob": -34.93062486076355,
"m5_mean_logprob": -30.149171058654787,
"direction": "M5 > M3"
},
"8hop": {
"n": 1000,
"median_diff": 4.223997116088867,
"mean_diff": 4.518639762878418,
"std_diff": 3.4717507040890796,
"wilcoxon_stat": 13275.0,
"p_value": 2.3666197544095806e-148,
"p_bonferroni": 1.1833098772047903e-147,
"effect_size_r": 0.8202902881460207,
"z_score": 25.93985653057244,
"m3_median_logprob": -34.220977783203125,
"m5_median_logprob": -29.836036682128906,
"m3_mean_logprob": -34.49567288208008,
"m5_mean_logprob": -29.97703311920166,
"direction": "M5 > M3"
},
"dag": {
"n": 1000,
"median_diff": 5.726862907409668,
"mean_diff": 5.837504280090332,
"std_diff": 3.5533476769526535,
"wilcoxon_stat": 4161.0,
"p_value": 7.992937269512403e-160,
"p_bonferroni": 3.9964686347562014e-159,
"effect_size_r": 0.8518384501300393,
"z_score": 26.937497009186796,
"m3_median_logprob": -35.94425582885742,
"m5_median_logprob": -30.378952980041504,
"m3_mean_logprob": -36.28265309143067,
"m5_mean_logprob": -30.44514881134033,
"direction": "M5 > M3"
},
"dense": {
"n": 1000,
"median_diff": 4.454345703125,
"mean_diff": 4.632727506637573,
"std_diff": 3.4062677335293263,
"wilcoxon_stat": 12447.0,
"p_value": 2.237606774559982e-149,
"p_bonferroni": 1.118803387279991e-148,
"effect_size_r": 0.82315641477788,
"z_score": 26.030491412763865,
"m3_median_logprob": -34.61273956298828,
"m5_median_logprob": -30.307485580444336,
"m3_mean_logprob": -35.10507657432556,
"m5_mean_logprob": -30.47234906768799,
"direction": "M5 > M3"
}
}
}