coconut-curriculum-checkpoints / experiments /wilcoxon_diagnostics.json
bmarti44's picture
Upload folder using huggingface_hub
6bf0e17 verified
{
"diagnostic_1_probability_scale": {
"prosqa_id": {
"m3_median_prob": 1.8015017595499927e-16,
"m5_median_prob": 7.91679494328894e-14,
"ratio_m5_over_m3": 439.4552989649325
},
"7hop": {
"m3_median_prob": 1.124004303514157e-15,
"m5_median_prob": 8.064306124379313e-14,
"ratio_m5_over_m3": 71.74622107020912
},
"8hop": {
"m3_median_prob": 1.374099505807729e-15,
"m5_median_prob": 1.1024880913607591e-13,
"ratio_m5_over_m3": 80.23349740692105
},
"dag": {
"m3_median_prob": 2.452494473518025e-16,
"m5_median_prob": 6.406020952062677e-14,
"ratio_m5_over_m3": 261.20429714459027
},
"dense": {
"m3_median_prob": 9.287058992349747e-16,
"m5_median_prob": 6.880599064425863e-14,
"ratio_m5_over_m3": 74.08803012981595
}
},
"diagnostic_2_entropy": {
"prosqa_id": {
"n": 500,
"m3_mean_entropy": 8.245684373986754e-08,
"m5_mean_entropy": 7.790865615004972e-07,
"m3_std_entropy": 5.380577548978713e-07,
"m5_std_entropy": 2.5222992223835106e-06,
"m3_median_entropy": 3.2982319098096013e-09,
"m5_median_entropy": 9.583473570273782e-08,
"entropy_diff_mean": -6.966297177606296e-07,
"entropy_diff_p_value": 3.2357157290647014e-09,
"direction": "M5 > M3"
},
"dag": {
"n": 1000,
"m3_mean_entropy": 5.54877669094885e-08,
"m5_mean_entropy": 9.562996519757449e-07,
"m3_std_entropy": 1.9521541945357018e-07,
"m5_std_entropy": 4.271332058753898e-06,
"m3_median_entropy": 8.63850679877487e-09,
"m5_median_entropy": 2.2466630866802006e-07,
"entropy_diff_mean": -9.008118850662564e-07,
"entropy_diff_p_value": 3.5962255165486594e-11,
"direction": "M5 > M3"
}
},
"diagnostic_3_subsampling": {
"prosqa_id": {
"50": {
"median_r": 1.0111228967991008,
"mean_r": 1.0051491497447864,
"std_r": 0.06029258401254942,
"min_r": 0.8318166676389841,
"max_r": 1.1251537019813398,
"n_successful": 200
},
"100": {
"median_r": 0.8386074293556046,
"mean_r": 0.836016651585947,
"std_r": 0.015510156365912298,
"min_r": 0.7880640541545902,
"max_r": 0.8616442058077676,
"n_successful": 200
},
"200": {
"median_r": 0.8358284039529764,
"mean_r": 0.8359841376581776,
"std_r": 0.009534859818316218,
"min_r": 0.8060189883590921,
"max_r": 0.8577864249910184,
"n_successful": 200
},
"500": {
"median_r": 0.8347188840290514,
"mean_r": 0.8347188840290514,
"std_r": 0.0,
"min_r": 0.8347188840290514,
"max_r": 0.8347188840290514,
"n_successful": 200
}
},
"7hop": {
"50": {
"median_r": 0.9481486999740725,
"mean_r": 0.9531135622917283,
"std_r": 0.06293261512730695,
"min_r": 0.7928139676366417,
"max_r": 1.1251537019813398,
"n_successful": 200
},
"100": {
"median_r": 0.819008977747048,
"mean_r": 0.8156927134353896,
"std_r": 0.02111324353944751,
"min_r": 0.7382083439222973,
"max_r": 0.8575182159954398,
"n_successful": 200
},
"200": {
"median_r": 0.8189177079865471,
"mean_r": 0.8169341523729338,
"std_r": 0.015910137227400267,
"min_r": 0.7601185278787841,
"max_r": 0.8519194488394003,
"n_successful": 200
},
"500": {
"median_r": 0.8164834911273876,
"mean_r": 0.8167422178703777,
"std_r": 0.007052610683216399,
"min_r": 0.8009183795838579,
"max_r": 0.8364206695578106,
"n_successful": 200
},
"1000": {
"median_r": 0.8170330113820354,
"mean_r": 0.8170330113820353,
"std_r": 1.1102230246251565e-16,
"min_r": 0.8170330113820354,
"max_r": 0.8170330113820354,
"n_successful": 200
}
},
"8hop": {
"50": {
"median_r": 0.9739360145177666,
"mean_r": 0.9693214129779848,
"std_r": 0.06744026838119704,
"min_r": 0.75397404039132,
"max_r": 1.1251537019813398,
"n_successful": 200
},
"100": {
"median_r": 0.8214158051375724,
"mean_r": 0.8201642548944997,
"std_r": 0.019890168055637775,
"min_r": 0.7581506280152144,
"max_r": 0.8633633682295706,
"n_successful": 200
},
"200": {
"median_r": 0.8193922428223398,
"mean_r": 0.8200169032243652,
"std_r": 0.0133925371240747,
"min_r": 0.7714210848767548,
"max_r": 0.8509703791678149,
"n_successful": 200
},
"500": {
"median_r": 0.8197694591199104,
"mean_r": 0.819961774720315,
"std_r": 0.007195415498732053,
"min_r": 0.801264270951492,
"max_r": 0.839699719722981,
"n_successful": 200
},
"1000": {
"median_r": 0.8202902881460207,
"mean_r": 0.8202902881460207,
"std_r": 0.0,
"min_r": 0.8202902881460207,
"max_r": 0.8202902881460207,
"n_successful": 200
}
},
"dag": {
"50": {
"median_r": 1.0673612216706767,
"mean_r": 1.0618530277163836,
"std_r": 0.04394317499378165,
"min_r": 0.9294377393890321,
"max_r": 1.1251537019813398,
"n_successful": 200
},
"100": {
"median_r": 0.8544237236361942,
"mean_r": 0.8535108483902165,
"std_r": 0.00847521599972869,
"min_r": 0.8245102974968181,
"max_r": 0.8678331905262591,
"n_successful": 200
},
"200": {
"median_r": 0.8526959603888792,
"mean_r": 0.8522434267136549,
"std_r": 0.006530786353488246,
"min_r": 0.8320752647971617,
"max_r": 0.8655515404858075,
"n_successful": 200
},
"500": {
"median_r": 0.8517021501798804,
"mean_r": 0.8516264691486423,
"std_r": 0.003518625131681763,
"min_r": 0.8409449286464634,
"max_r": 0.862016630762726,
"n_successful": 200
},
"1000": {
"median_r": 0.8518384501300393,
"mean_r": 0.851838450130039,
"std_r": 3.3306690738754696e-16,
"min_r": 0.8518384501300393,
"max_r": 0.8518384501300393,
"n_successful": 200
}
},
"dense": {
"50": {
"median_r": 0.9893792786083057,
"mean_r": 0.9841720531880661,
"std_r": 0.06501513630435339,
"min_r": 0.8275968732060146,
"max_r": 1.1251537019813398,
"n_successful": 200
},
"100": {
"median_r": 0.8289801197935066,
"mean_r": 0.8266678463361812,
"std_r": 0.020971271348192252,
"min_r": 0.7578067955308537,
"max_r": 0.8661140281044558,
"n_successful": 200
},
"200": {
"median_r": 0.8226708471423619,
"mean_r": 0.8233761784664719,
"std_r": 0.013990142267202568,
"min_r": 0.780135270043129,
"max_r": 0.8511429372899214,
"n_successful": 200
},
"500": {
"median_r": 0.823643442437411,
"mean_r": 0.8231771116955668,
"std_r": 0.007503303921795524,
"min_r": 0.8019007110679384,
"max_r": 0.8400041041264988,
"n_successful": 200
},
"1000": {
"median_r": 0.82315641477788,
"mean_r": 0.8231564147778804,
"std_r": 3.3306690738754696e-16,
"min_r": 0.82315641477788,
"max_r": 0.82315641477788,
"n_successful": 200
}
}
},
"diagnostic_4_species_token": {
"dag": {
"first_token_wilcoxon": {
"n": 1000,
"direction": "M5 > M3",
"p": 7.992937269512403e-160,
"r": 0.8518384501300393,
"m3_median_lp": -35.94425582885742,
"m5_median_lp": -30.378952980041504
},
"species_token_wilcoxon": {
"n": 1000,
"direction": "M5 > M3",
"p": 1.2486659436314225e-164,
"r": 0.8647152509398429,
"m3_median_lp": -51.22725296020508,
"m5_median_lp": -38.59332847595215
},
"species_same_as_first": false,
"n_species_valid": 1000
}
},
"diagnostic_5_quartile_accuracy": {
"prosqa_id": {
"by_m3_confidence": {
"Q1": {
"n": 125,
"m3_acc": 0.984,
"m5_acc": 0.976,
"diff_m3_minus_m5": 0.008000000000000007,
"m3_lp_range": [
-50.72520446777344,
-38.92578887939453
]
},
"Q2": {
"n": 125,
"m3_acc": 0.984,
"m5_acc": 0.944,
"diff_m3_minus_m5": 0.040000000000000036,
"m3_lp_range": [
-38.88484191894531,
-36.26152801513672
]
},
"Q3": {
"n": 125,
"m3_acc": 0.96,
"m5_acc": 0.984,
"diff_m3_minus_m5": -0.02400000000000002,
"m3_lp_range": [
-36.243953704833984,
-33.15000915527344
]
},
"Q4": {
"n": 125,
"m3_acc": 0.952,
"m5_acc": 0.96,
"diff_m3_minus_m5": -0.008000000000000007,
"m3_lp_range": [
-33.142913818359375,
-25.003509521484375
]
}
},
"by_m5_confidence": {}
},
"7hop": {
"by_m3_confidence": {
"Q1": {
"n": 250,
"m3_acc": 0.66,
"m5_acc": 0.772,
"diff_m3_minus_m5": -0.11199999999999999,
"m3_lp_range": [
-47.91731262207031,
-38.02136993408203
]
},
"Q2": {
"n": 250,
"m3_acc": 0.712,
"m5_acc": 0.74,
"diff_m3_minus_m5": -0.028000000000000025,
"m3_lp_range": [
-38.012367248535156,
-34.42694091796875
]
},
"Q3": {
"n": 250,
"m3_acc": 0.608,
"m5_acc": 0.756,
"diff_m3_minus_m5": -0.14800000000000002,
"m3_lp_range": [
-34.41681671142578,
-31.62476348876953
]
},
"Q4": {
"n": 250,
"m3_acc": 0.66,
"m5_acc": 0.748,
"diff_m3_minus_m5": -0.08799999999999997,
"m3_lp_range": [
-31.606525421142578,
-21.095178604125977
]
}
},
"by_m5_confidence": {}
},
"8hop": {
"by_m3_confidence": {
"Q1": {
"n": 250,
"m3_acc": 0.676,
"m5_acc": 0.776,
"diff_m3_minus_m5": -0.09999999999999998,
"m3_lp_range": [
-47.47686767578125,
-37.30986022949219
]
},
"Q2": {
"n": 250,
"m3_acc": 0.652,
"m5_acc": 0.744,
"diff_m3_minus_m5": -0.09199999999999997,
"m3_lp_range": [
-37.29325866699219,
-34.222503662109375
]
},
"Q3": {
"n": 250,
"m3_acc": 0.676,
"m5_acc": 0.732,
"diff_m3_minus_m5": -0.05599999999999994,
"m3_lp_range": [
-34.219451904296875,
-31.44131088256836
]
},
"Q4": {
"n": 250,
"m3_acc": 0.696,
"m5_acc": 0.752,
"diff_m3_minus_m5": -0.05600000000000005,
"m3_lp_range": [
-31.43999481201172,
-23.230289459228516
]
}
},
"by_m5_confidence": {}
},
"dag": {
"by_m3_confidence": {
"Q1": {
"n": 250,
"m3_acc": 0.568,
"m5_acc": 0.516,
"diff_m3_minus_m5": 0.051999999999999935,
"m3_lp_range": [
-50.762939453125,
-39.221099853515625
]
},
"Q2": {
"n": 250,
"m3_acc": 0.556,
"m5_acc": 0.5,
"diff_m3_minus_m5": 0.05600000000000005,
"m3_lp_range": [
-39.2105598449707,
-35.946693420410156
]
},
"Q3": {
"n": 250,
"m3_acc": 0.636,
"m5_acc": 0.56,
"diff_m3_minus_m5": 0.07599999999999996,
"m3_lp_range": [
-35.94181823730469,
-32.98561096191406
]
},
"Q4": {
"n": 250,
"m3_acc": 0.608,
"m5_acc": 0.5,
"diff_m3_minus_m5": 0.10799999999999998,
"m3_lp_range": [
-32.979331970214844,
-25.686195373535156
]
}
},
"by_m5_confidence": {}
},
"dense": {
"by_m3_confidence": {
"Q1": {
"n": 250,
"m3_acc": 0.648,
"m5_acc": 0.692,
"diff_m3_minus_m5": -0.04399999999999993,
"m3_lp_range": [
-48.519752502441406,
-37.903221130371094
]
},
"Q2": {
"n": 250,
"m3_acc": 0.632,
"m5_acc": 0.696,
"diff_m3_minus_m5": -0.06399999999999995,
"m3_lp_range": [
-37.88905334472656,
-34.615211486816406
]
},
"Q3": {
"n": 250,
"m3_acc": 0.568,
"m5_acc": 0.648,
"diff_m3_minus_m5": -0.08000000000000007,
"m3_lp_range": [
-34.610267639160156,
-31.857315063476562
]
},
"Q4": {
"n": 250,
"m3_acc": 0.6,
"m5_acc": 0.7,
"diff_m3_minus_m5": -0.09999999999999998,
"m3_lp_range": [
-31.839698791503906,
-22.86867332458496
]
}
},
"by_m5_confidence": {}
}
}
}