| { |
| "method": "Wilcoxon signed-rank test on paired per-sample first-answer-token log-probabilities", |
| "description": "Log P(first correct answer token | input + thoughts + ### separator). v3: Fixed extraction point (### position) AND tokenization (leading space).", |
| "fixes": [ |
| "Appended ### separator token so logits[-1] is at the answer-predicting position", |
| "Tokenized answer with leading space to match model generation format (### Sally...)" |
| ], |
| "bonferroni_k": 5, |
| "adjusted_alpha": 0.01, |
| "results": { |
| "prosqa_id": { |
| "n": 500, |
| "median_diff": 0.0, |
| "mean_diff": 1.9073484480713887e-09, |
| "std_diff": 3.2808333491232266e-08, |
| "wilcoxon_stat": 59.0, |
| "p_value": 0.22669185385013768, |
| "p_bonferroni": 1.0, |
| "effect_size_r": 0.05406474709320131, |
| "z_score": 1.208924496867323, |
| "m3_median_logprob": 0.0, |
| "m5_median_logprob": 0.0, |
| "m3_mean_logprob": -3.814696910353632e-09, |
| "m5_mean_logprob": -1.9073484622822432e-09, |
| "m3_prob_median": 1.0, |
| "m5_prob_median": 1.0, |
| "direction": "equal" |
| }, |
| "7hop": { |
| "n": 1000, |
| "median_diff": 0.0, |
| "mean_diff": 1.5545431362859574e-05, |
| "std_diff": 0.000491480095991523, |
| "wilcoxon_stat": 817.5, |
| "p_value": 2.825225828337809e-10, |
| "p_bonferroni": 1.4126129141689046e-09, |
| "effect_size_r": 0.19947892842275394, |
| "z_score": 6.308077590256019, |
| "m3_median_logprob": 0.0, |
| "m5_median_logprob": 0.0, |
| "m3_mean_logprob": -1.5582147784044765e-05, |
| "m5_mean_logprob": -3.671642118519003e-08, |
| "m3_prob_median": 1.0, |
| "m5_prob_median": 1.0, |
| "direction": "equal" |
| }, |
| "8hop": { |
| "n": 1000, |
| "median_diff": 0.0, |
| "mean_diff": -2.5510785491178468e-08, |
| "std_diff": 3.456691393519963e-07, |
| "wilcoxon_stat": 556.0, |
| "p_value": 6.337349739272441e-11, |
| "p_bonferroni": 3.1686748696362206e-10, |
| "effect_size_r": 0.20667243975845923, |
| "z_score": 6.535556392206708, |
| "m3_median_logprob": 0.0, |
| "m5_median_logprob": 0.0, |
| "m3_mean_logprob": -1.1801690376955776e-08, |
| "m5_mean_logprob": -3.7312475868134245e-08, |
| "m3_prob_median": 1.0, |
| "m5_prob_median": 1.0, |
| "direction": "equal" |
| }, |
| "dag": { |
| "n": 1000, |
| "median_diff": 0.0, |
| "mean_diff": -3.8504580892606554e-08, |
| "std_diff": 2.2670660050335698e-07, |
| "wilcoxon_stat": 1072.5, |
| "p_value": 7.740160468120871e-16, |
| "p_bonferroni": 3.870080234060436e-15, |
| "effect_size_r": 0.25482388020266233, |
| "z_score": 8.058238636423024, |
| "m3_median_logprob": 0.0, |
| "m5_median_logprob": 0.0, |
| "m3_mean_logprob": -6.675716875292892e-09, |
| "m5_mean_logprob": -4.518029776789945e-08, |
| "m3_prob_median": 1.0, |
| "m5_prob_median": 1.0, |
| "direction": "equal" |
| }, |
| "dense": { |
| "n": 1000, |
| "median_diff": 0.0, |
| "mean_diff": 1.8954253092857698e-06, |
| "std_diff": 6.556349770050734e-05, |
| "wilcoxon_stat": 3283.5, |
| "p_value": 4.2083786008929025e-43, |
| "p_bonferroni": 2.104189300446451e-42, |
| "effect_size_r": 0.4352494406715319, |
| "z_score": 13.763795828363676, |
| "m3_median_logprob": 0.0, |
| "m5_median_logprob": 0.0, |
| "m3_mean_logprob": -2.1150066327138006e-06, |
| "m5_mean_logprob": -2.1958132342803083e-07, |
| "m3_prob_median": 1.0, |
| "m5_prob_median": 1.0, |
| "direction": "equal" |
| } |
| } |
| } |