| logit_gpt-4o,logit_DeepSeek-V3-0324,logit_Llama-4-maverick-17b-128e-instruct-fp8,logit_qwen25-coder-32b-instruct,logit_gpt-4.1-mini,chosen_executor,correct_prediction,true_gpt-4o,true_DeepSeek-V3-0324,true_Llama-4-maverick-17b-128e-instruct-fp8,true_qwen25-coder-32b-instruct,true_gpt-4.1-mini | |
| -0.0625,-0.0712890625,-0.12109375,-0.08837890625,-0.0966796875,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.07666015625,-0.0830078125,-0.1474609375,-0.0986328125,-0.10546875,gpt-4o,1,1.0,1.0,0.0,1.0,1.0 | |
| -0.07861328125,-0.07373046875,-0.1396484375,-0.10986328125,-0.1337890625,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1552734375,-0.140625,-0.208984375,-0.1494140625,-0.1806640625,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1416015625,-0.10009765625,-0.1923828125,-0.107421875,-0.1865234375,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1748046875,-0.1533203125,-0.1982421875,-0.1806640625,-0.2080078125,DeepSeek-V3-0324,1,0.0,1.0,0.0,0.0,0.0 | |
| -0.1171875,-0.1015625,-0.15234375,-0.125,-0.1669921875,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1728515625,-0.1455078125,-0.2294921875,-0.1513671875,-0.2392578125,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.134765625,-0.12890625,-0.2412109375,-0.1357421875,-0.19140625,DeepSeek-V3-0324,1,1.0,1.0,0.0,1.0,1.0 | |
| -0.045166015625,-0.0537109375,-0.1337890625,-0.078125,-0.080078125,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.09423828125,-0.0693359375,-0.1455078125,-0.08935546875,-0.10986328125,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.058837890625,-0.0693359375,-0.12060546875,-0.09130859375,-0.099609375,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1240234375,-0.09228515625,-0.1806640625,-0.1494140625,-0.14453125,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1171875,-0.1005859375,-0.1826171875,-0.1533203125,-0.1337890625,DeepSeek-V3-0324,0,1.0,0.0,0.0,1.0,0.0 | |
| -0.1318359375,-0.0947265625,-0.16015625,-0.146484375,-0.1357421875,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.10546875,-0.11083984375,-0.2109375,-0.1142578125,-0.1953125,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1806640625,-0.1630859375,-0.224609375,-0.1298828125,-0.23828125,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.09326171875,-0.07861328125,-0.16015625,-0.119140625,-0.126953125,DeepSeek-V3-0324,1,0.0,1.0,0.0,0.0,0.0 | |
| -0.07763671875,-0.1103515625,-0.1728515625,-0.09033203125,-0.10986328125,gpt-4o,1,1.0,1.0,1.0,1.0,0.0 | |
| -0.119140625,-0.10888671875,-0.171875,-0.12451171875,-0.1474609375,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.00811767578125,-0.05029296875,-0.0888671875,-0.07763671875,-0.043701171875,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1904296875,-0.15625,-0.220703125,-0.15234375,-0.2431640625,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.0810546875,-0.123046875,-0.2060546875,-0.109375,-0.10498046875,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1669921875,-0.138671875,-0.1806640625,-0.10595703125,-0.2314453125,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.0927734375,-0.08984375,-0.1396484375,-0.0947265625,-0.1318359375,DeepSeek-V3-0324,0,0.0,0.0,1.0,0.0,0.0 | |
| -0.0595703125,-0.057373046875,-0.10302734375,-0.09326171875,-0.125,DeepSeek-V3-0324,1,1.0,1.0,0.0,1.0,1.0 | |
| -0.1455078125,-0.130859375,-0.203125,-0.1640625,-0.2041015625,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.0927734375,-0.10009765625,-0.18359375,-0.09716796875,-0.1279296875,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.0595703125,-0.0751953125,-0.1376953125,-0.10498046875,-0.06396484375,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1064453125,-0.10400390625,-0.1845703125,-0.130859375,-0.150390625,DeepSeek-V3-0324,1,0.0,1.0,1.0,1.0,1.0 | |
| -0.10546875,-0.0888671875,-0.1484375,-0.0888671875,-0.1201171875,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.062255859375,-0.052978515625,-0.125,-0.095703125,-0.111328125,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1552734375,-0.1103515625,-0.1845703125,-0.1025390625,-0.2099609375,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.083984375,-0.0712890625,-0.15625,-0.1357421875,-0.09619140625,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.07958984375,-0.05126953125,-0.1728515625,-0.06640625,-0.08203125,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.08203125,-0.08740234375,-0.169921875,-0.10888671875,-0.115234375,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.171875,-0.158203125,-0.201171875,-0.146484375,-0.2041015625,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.10009765625,-0.064453125,-0.142578125,-0.107421875,-0.10888671875,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.146484375,-0.146484375,-0.1923828125,-0.1298828125,-0.1953125,qwen25-coder-32b-instruct,0,0.0,1.0,1.0,0.0,1.0 | |
| -0.09033203125,-0.0966796875,-0.1669921875,-0.12890625,-0.10400390625,gpt-4o,1,1.0,0.0,0.0,1.0,0.0 | |
| -0.11474609375,-0.12451171875,-0.251953125,-0.12890625,-0.2001953125,gpt-4o,0,0.0,1.0,1.0,1.0,1.0 | |
| -0.1572265625,-0.138671875,-0.1904296875,-0.1337890625,-0.1982421875,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,1.0 | |
| -0.01214599609375,-0.0771484375,-0.146484375,-0.07373046875,-0.08349609375,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.044677734375,-0.028564453125,-0.1376953125,-0.056640625,-0.0537109375,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.076171875,-0.06884765625,-0.1298828125,-0.11376953125,-0.10791015625,DeepSeek-V3-0324,1,1.0,1.0,0.0,1.0,1.0 | |
| -0.04736328125,-0.06689453125,-0.146484375,-0.08349609375,-0.06787109375,gpt-4o,0,0.0,0.0,0.0,1.0,0.0 | |
| -0.04736328125,-0.041259765625,-0.1318359375,-0.0751953125,-0.0693359375,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.12890625,-0.0908203125,-0.169921875,-0.1025390625,-0.150390625,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1728515625,-0.138671875,-0.2275390625,-0.1396484375,-0.2060546875,DeepSeek-V3-0324,0,1.0,0.0,0.0,1.0,0.0 | |
| -0.06396484375,-0.076171875,-0.130859375,-0.08349609375,-0.10302734375,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.115234375,-0.1142578125,-0.158203125,-0.1396484375,-0.1337890625,DeepSeek-V3-0324,1,0.0,1.0,1.0,0.0,1.0 | |
| -0.1669921875,-0.1328125,-0.2080078125,-0.154296875,-0.2060546875,DeepSeek-V3-0324,0,0.0,0.0,1.0,0.0,1.0 | |
| -0.0208740234375,-0.036376953125,-0.095703125,-0.0849609375,-0.045654296875,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.08056640625,-0.09716796875,-0.1865234375,-0.0888671875,-0.099609375,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1279296875,-0.09423828125,-0.162109375,-0.1123046875,-0.158203125,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1484375,-0.146484375,-0.26171875,-0.12255859375,-0.2021484375,qwen25-coder-32b-instruct,0,1.0,0.0,1.0,0.0,1.0 | |
| -0.12353515625,-0.150390625,-0.2470703125,-0.1328125,-0.2109375,gpt-4o,1,1.0,1.0,0.0,1.0,0.0 | |
| -0.05322265625,-0.05859375,-0.11767578125,-0.08203125,-0.068359375,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.119140625,-0.1328125,-0.15234375,-0.140625,-0.1416015625,gpt-4o,1,1.0,1.0,1.0,0.0,1.0 | |
| -0.1572265625,-0.15234375,-0.25390625,-0.150390625,-0.2138671875,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.064453125,-0.0595703125,-0.1376953125,-0.0966796875,-0.09228515625,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.03173828125,-0.0751953125,-0.1357421875,-0.080078125,-0.1181640625,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.14453125,-0.138671875,-0.248046875,-0.1416015625,-0.205078125,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1591796875,-0.126953125,-0.2421875,-0.1416015625,-0.228515625,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.08349609375,-0.0927734375,-0.1533203125,-0.126953125,-0.1201171875,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.0859375,-0.10595703125,-0.17578125,-0.09521484375,-0.12451171875,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.09130859375,-0.087890625,-0.1572265625,-0.091796875,-0.119140625,DeepSeek-V3-0324,0,0.0,0.0,0.0,1.0,0.0 | |
| -0.07666015625,-0.0771484375,-0.12109375,-0.11279296875,-0.09765625,gpt-4o,1,1.0,0.0,1.0,1.0,1.0 | |
| -0.0732421875,-0.0673828125,-0.1318359375,-0.1259765625,-0.1328125,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.09033203125,-0.09814453125,-0.169921875,-0.1103515625,-0.1416015625,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.072265625,-0.060791015625,-0.171875,-0.09716796875,-0.09521484375,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.162109375,-0.12060546875,-0.21484375,-0.10302734375,-0.212890625,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.0712890625,-0.1279296875,-0.173828125,-0.10107421875,-0.142578125,gpt-4o,1,1.0,0.0,0.0,0.0,0.0 | |
| -0.109375,-0.095703125,-0.1962890625,-0.10693359375,-0.162109375,DeepSeek-V3-0324,1,0.0,1.0,0.0,0.0,0.0 | |
| -0.078125,-0.06689453125,-0.15625,-0.1103515625,-0.1142578125,DeepSeek-V3-0324,1,1.0,1.0,1.0,0.0,1.0 | |
| -0.1328125,-0.1435546875,-0.2041015625,-0.1455078125,-0.1708984375,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.0673828125,-0.0634765625,-0.15625,-0.08935546875,-0.11181640625,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.09033203125,-0.0732421875,-0.1474609375,-0.1298828125,-0.1298828125,DeepSeek-V3-0324,1,0.0,1.0,1.0,0.0,0.0 | |
| -0.107421875,-0.134765625,-0.173828125,-0.0732421875,-0.171875,qwen25-coder-32b-instruct,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.09619140625,-0.07470703125,-0.1328125,-0.11181640625,-0.1015625,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,0.0 | |
| -0.0810546875,-0.07177734375,-0.1630859375,-0.09228515625,-0.115234375,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.13671875,-0.142578125,-0.2216796875,-0.12451171875,-0.1669921875,qwen25-coder-32b-instruct,1,1.0,1.0,1.0,1.0,0.0 | |
| -0.09716796875,-0.0966796875,-0.203125,-0.10595703125,-0.1591796875,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,1.0 | |
| -0.1240234375,-0.1259765625,-0.2265625,-0.125,-0.1572265625,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.05615234375,-0.08154296875,-0.1435546875,-0.07421875,-0.11328125,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1259765625,-0.13671875,-0.2470703125,-0.11572265625,-0.1806640625,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.064453125,-0.053955078125,-0.125,-0.099609375,-0.11279296875,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1337890625,-0.1279296875,-0.2197265625,-0.1396484375,-0.1943359375,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1455078125,-0.1416015625,-0.1640625,-0.10791015625,-0.1865234375,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.08203125,-0.0732421875,-0.1337890625,-0.0869140625,-0.07275390625,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.087890625,-0.076171875,-0.130859375,-0.1201171875,-0.1162109375,DeepSeek-V3-0324,0,1.0,0.0,1.0,0.0,1.0 | |
| -0.1201171875,-0.09716796875,-0.158203125,-0.09716796875,-0.1640625,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.07861328125,-0.1025390625,-0.2060546875,-0.099609375,-0.1689453125,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.07177734375,-0.09521484375,-0.17578125,-0.08935546875,-0.1259765625,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.158203125,-0.1298828125,-0.1953125,-0.14453125,-0.208984375,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1591796875,-0.12353515625,-0.2353515625,-0.1279296875,-0.205078125,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1396484375,-0.1376953125,-0.19921875,-0.1396484375,-0.1669921875,DeepSeek-V3-0324,0,1.0,0.0,0.0,0.0,1.0 | |
| -0.0849609375,-0.09521484375,-0.1669921875,-0.12109375,-0.134765625,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.09130859375,-0.12451171875,-0.1962890625,-0.115234375,-0.1455078125,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.14453125,-0.08837890625,-0.177734375,-0.150390625,-0.158203125,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.11865234375,-0.08056640625,-0.1767578125,-0.1201171875,-0.1337890625,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1025390625,-0.103515625,-0.171875,-0.134765625,-0.130859375,gpt-4o,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.11474609375,-0.0966796875,-0.146484375,-0.09521484375,-0.09619140625,qwen25-coder-32b-instruct,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.080078125,-0.0908203125,-0.1435546875,-0.07958984375,-0.10791015625,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.138671875,-0.12060546875,-0.1982421875,-0.1328125,-0.1455078125,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.1337890625,-0.107421875,-0.1689453125,-0.12060546875,-0.1640625,DeepSeek-V3-0324,1,1.0,1.0,0.0,0.0,1.0 | |
| -0.138671875,-0.1435546875,-0.1982421875,-0.1533203125,-0.18359375,gpt-4o,1,1.0,1.0,1.0,0.0,1.0 | |
| -0.08056640625,-0.07861328125,-0.1318359375,-0.10302734375,-0.0986328125,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.0135498046875,-0.057861328125,-0.126953125,-0.07470703125,-0.068359375,gpt-4o,0,0.0,0.0,0.0,0.0,0.0 | |
| -0.10302734375,-0.08544921875,-0.1767578125,-0.09912109375,-0.1337890625,DeepSeek-V3-0324,1,0.0,1.0,0.0,0.0,0.0 | |
| -0.15625,-0.126953125,-0.2265625,-0.126953125,-0.2099609375,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.050537109375,-0.048583984375,-0.11767578125,-0.1259765625,-0.091796875,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.1376953125,-0.134765625,-0.1845703125,-0.11572265625,-0.1806640625,qwen25-coder-32b-instruct,1,1.0,1.0,1.0,1.0,1.0 | |
| -0.08251953125,-0.08056640625,-0.1591796875,-0.0908203125,-0.107421875,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0 | |
| predicted_proportions,0.3246,0.5175,0.0000,0.1491,0.0088 | |
| true_proportions,0.1288,0.1442,0.1156,0.1222,0.1295 | |
| correct_fraction,0.5263 | |