coconut-curriculum-checkpoints/experiments/wilcoxon_sensitivity_v3.json
{
"method": "Wilcoxon signed-rank test on paired per-sample first-answer-token log-probabilities",
"description": "Log P(first correct answer token | input + thoughts + ### separator). v3: Fixed extraction point (### position) AND tokenization (leading space).",
"fixes": [
"Appended ### separator token so logits[-1] is at the answer-predicting position",
"Tokenized answer with leading space to match model generation format (### Sally...)"
],
"bonferroni_k": 5,
"adjusted_alpha": 0.01,
"results": {
"prosqa_id": {
"n": 500,
"median_diff": 0.0,
"mean_diff": 1.9073484480713887e-09,
"std_diff": 3.2808333491232266e-08,
"wilcoxon_stat": 59.0,
"p_value": 0.22669185385013768,
"p_bonferroni": 1.0,
"effect_size_r": 0.05406474709320131,
"z_score": 1.208924496867323,
"m3_median_logprob": 0.0,
"m5_median_logprob": 0.0,
"m3_mean_logprob": -3.814696910353632e-09,
"m5_mean_logprob": -1.9073484622822432e-09,
"m3_prob_median": 1.0,
"m5_prob_median": 1.0,
"direction": "equal"
},
"7hop": {
"n": 1000,
"median_diff": 0.0,
"mean_diff": 1.5545431362859574e-05,
"std_diff": 0.000491480095991523,
"wilcoxon_stat": 817.5,
"p_value": 2.825225828337809e-10,
"p_bonferroni": 1.4126129141689046e-09,
"effect_size_r": 0.19947892842275394,
"z_score": 6.308077590256019,
"m3_median_logprob": 0.0,
"m5_median_logprob": 0.0,
"m3_mean_logprob": -1.5582147784044765e-05,
"m5_mean_logprob": -3.671642118519003e-08,
"m3_prob_median": 1.0,
"m5_prob_median": 1.0,
"direction": "equal"
},
"8hop": {
"n": 1000,
"median_diff": 0.0,
"mean_diff": -2.5510785491178468e-08,
"std_diff": 3.456691393519963e-07,
"wilcoxon_stat": 556.0,
"p_value": 6.337349739272441e-11,
"p_bonferroni": 3.1686748696362206e-10,
"effect_size_r": 0.20667243975845923,
"z_score": 6.535556392206708,
"m3_median_logprob": 0.0,
"m5_median_logprob": 0.0,
"m3_mean_logprob": -1.1801690376955776e-08,
"m5_mean_logprob": -3.7312475868134245e-08,
"m3_prob_median": 1.0,
"m5_prob_median": 1.0,
"direction": "equal"
},
"dag": {
"n": 1000,
"median_diff": 0.0,
"mean_diff": -3.8504580892606554e-08,
"std_diff": 2.2670660050335698e-07,
"wilcoxon_stat": 1072.5,
"p_value": 7.740160468120871e-16,
"p_bonferroni": 3.870080234060436e-15,
"effect_size_r": 0.25482388020266233,
"z_score": 8.058238636423024,
"m3_median_logprob": 0.0,
"m5_median_logprob": 0.0,
"m3_mean_logprob": -6.675716875292892e-09,
"m5_mean_logprob": -4.518029776789945e-08,
"m3_prob_median": 1.0,
"m5_prob_median": 1.0,
"direction": "equal"
},
"dense": {
"n": 1000,
"median_diff": 0.0,
"mean_diff": 1.8954253092857698e-06,
"std_diff": 6.556349770050734e-05,
"wilcoxon_stat": 3283.5,
"p_value": 4.2083786008929025e-43,
"p_bonferroni": 2.104189300446451e-42,
"effect_size_r": 0.4352494406715319,
"z_score": 13.763795828363676,
"m3_median_logprob": 0.0,
"m5_median_logprob": 0.0,
"m3_mean_logprob": -2.1150066327138006e-06,
"m5_mean_logprob": -2.1958132342803083e-07,
"m3_prob_median": 1.0,
"m5_prob_median": 1.0,
"direction": "equal"
}
}
}
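
For reference, a minimal sketch of the scoring step described by the "method", "description", and "fixes" fields above, assuming a Hugging Face causal LM. The checkpoint name ("gpt2"), the exact prompt layout, and the function name are illustrative assumptions, not taken from this file; the actual m3/m5 checkpoints being compared are not named here.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint, standing in for an m3 or m5 curriculum checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def first_answer_token_logprob(prompt_with_thoughts: str, answer: str) -> float:
    # Fix 1: append the "###" separator so logits[-1] sits at the
    # answer-predicting position. Whether a space precedes "###" is an
    # assumption about the prompt format.
    input_ids = tokenizer(prompt_with_thoughts + " ###", return_tensors="pt").input_ids
    # Fix 2: tokenize the answer with a leading space so the token id matches
    # what the model generates after "###" (e.g. " Sally", not "Sally").
    first_answer_id = tokenizer(" " + answer, add_special_tokens=False).input_ids[0]
    with torch.no_grad():
        logits = model(input_ids).logits  # shape (1, seq_len, vocab_size)
    log_probs = torch.log_softmax(logits[0, -1], dim=-1)
    return log_probs[first_answer_id].item()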
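And a sketch of how the per-dataset statistics could be computed from paired per-sample log-probs, using SciPy's wilcoxon. The reported effect_size_r values match r = Z / sqrt(N) (e.g. 1.2089 / sqrt(500) = 0.0541 for prosqa_id), with the z-score recovered from the two-sided p-value, and adjusted_alpha = 0.05 / bonferroni_k = 0.01. Array names m3/m5 mirror the m3_*/m5_* keys; the usage data below is synthetic, not the real experiment data.

import numpy as np
from scipy.stats import norm, wilcoxon

def paired_wilcoxon_report(m3: np.ndarray, m5: np.ndarray, k: int = 5) -> dict:
    diff = m3 - m5
    n = len(diff)
    # SciPy's default zero_method="wilcox" discards zero differences, which is
    # consistent with wilcoxon_stat being small even when n is 500-1000 and
    # median_diff is exactly 0.
    stat, p = wilcoxon(m3, m5)
    z = norm.isf(p / 2.0)  # z-score implied by the two-sided p-value
    return {
        "n": n,
        "median_diff": float(np.median(diff)),
        "mean_diff": float(np.mean(diff)),
        "std_diff": float(np.std(diff)),
        "wilcoxon_stat": float(stat),
        "p_value": float(p),
        "p_bonferroni": float(min(1.0, p * k)),  # bonferroni_k = 5 datasets
        "z_score": float(z),
        "effect_size_r": float(z / np.sqrt(n)),  # r = Z / sqrt(N)
    }

# Synthetic usage example:
rng = np.random.default_rng(0)
m3 = rng.normal(-2e-6, 1e-6, size=1000)
m5 = rng.normal(-1e-6, 1e-6, size=1000)
print(paired_wilcoxon_report(m3, m5))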